def url(self, selenium):
    """Return ``(pdf_url, issue)`` for this RSC article.

    When no issue is known, an :class:`RSCQuery` locates the article
    page directly; otherwise the issue's article list is scanned for a
    matching start page.  The PDF link is then extracted from the
    resulting page.

    Raises:
        HTMLException: if no article matching the page number is found.
    """
    # FIX: the original imported only fetch_url here although URLLister
    # is also used below; import both (matching the sibling Springer
    # method) so the function does not depend on an enclosing-scope name.
    from webutils.htmlparser import URLLister, fetch_url
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("template", "year1")
    response = None
    if not issue:
        # No issue known: let the journal query find the article page.
        query = RSCQuery(self.name, volume, page, selenium)
        query.run()
        response = query.rschtml
    else:
        # Issue known: scan its article list for a matching start page.
        parser = self.get_articles(volume, issue)
        for article in parser:
            if article.start_page == page:
                response = fetch_url(article.url)
                break
    if not response:
        raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
    url_list = URLLister()
    url_list.feed(response)
    pdflink = "http://www.rsc.org" + url_list["PDF"]
    return pdflink, issue
def url(self, selenium):
    """Return ``(article_url, issue)`` for this Springer article.

    Runs a :class:`SpringerQuery`, follows the volume link and then the
    issue link, and scans the issue's article list for the requested
    start page.

    Raises:
        HTMLException: if the volume page or a matching article is not
            found.
    """
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl", "maxvolume")
    from webutils.htmlparser import URLLister, fetch_url
    import re
    query = SpringerQuery(volume, self.maxvolume, selenium)
    query.run()
    url_list = URLLister()
    url_list.feed(query.html)
    # NOTE(review): `url` is never assigned in this function -- inside
    # the method body this name is not a local and will raise NameError
    # (or resolve to an unrelated module-level name) when evaluated.
    # Presumably a lookup such as `url = url_list[<volume link>]` was
    # lost; restore it before relying on this code path.
    if not url:
        raise HTMLException("No page found for volume %d for %s" % (volume, self.name))
    nexturl = self.mainurl + url
    response = fetch_url(nexturl)
    issparser = IssueParser()
    issparser.feed(response, volume, issue)
    #now have the url for the issue
    nexturl = self.mainurl + issparser.url
    response = fetch_url(nexturl)
    parser = SpringerParser()
    parser.feed(response)
    # Scan the issue's articles for one starting on the requested page.
    for article in parser:
        if article.start_page == page:
            return article.url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
def url(self, selenium):
    """Return ``(url, issue)`` for this AIP article.

    For volumes at or past ``volstart`` the issue number is derived
    from the page number.  When the issue is still unknown an
    :class:`AIPQuery` resolves the article and the issue is parsed from
    the link names; otherwise the issue's article list is scanned.

    Raises:
        HTMLException: if no article matching the page number is found.
    """
    self.validate("baseurl", "volstart")
    if self.volume >= self.volstart:
        # Get the issue from the page number.
        self.issue = self.page.get_issue()
    # BUG FIX: the original tested the never-assigned local name
    # `issue` (NameError); the attribute self.issue is what the code
    # above assigns and what the branches below consume.
    if not self.issue:
        query = AIPQuery(self.volume, self.page, self.baseurl, selenium)
        query.run()
        url_list = URLLister()
        url_list.feed(query.aiphtml)
        pdfurl = url_list["Download PDF"]
        # Raw string so \s and \d are regex escapes, not string escapes.
        regexp = re.compile(r"Issue\s(\d+)")
        for name in url_list:
            match = regexp.search(name)
            if match:
                self.issue = int(match.groups()[0])
        return pdfurl, self.issue
    else:
        parser = self.get_articles(self.volume, self.issue)
        for article in parser:
            if article.start_page == self.page:
                return article.url, self.issue
    raise HTMLException("No match found for %s %d %s" % (self.name, self.volume, self.page))
def walk_references(self):
    """Process every cited-full-record link on the current page.

    Fetches the page HTML via ``self.run("get_html")``, collects its
    links, and hands each link containing "CitedFullRecord" to
    :meth:`process_article`.
    """
    # FIX: removed `import time`, which was never used in this function.
    url_list = URLLister()
    text = self.run("get_html")
    url_list.feed(text)
    for name in url_list:
        link = url_list[name]
        # Only follow citation links.
        if "CitedFullRecord" in link:
            self.process_article(link)
def url(self, selenium):
    """Return ``(pdf_url, issue)`` for this Informa article.

    Runs an :class:`InformaQuery`, extracts the "Full Text PDF" link,
    and parses the issue number from the query's page text.

    Raises:
        KeyError: if no "Full Text PDF" link is present.
        AttributeError: if no "Issue N" text is found in the page.
    """
    self.validate()
    query = InformaQuery(self.name, self.volume, self.page, selenium)
    query.run()
    url_list = URLLister()
    url_list.feed(query.html)
    pdfurl = "http://www.informaworld.com/" + url_list["Full Text PDF"]
    # Raw string so \s and \d are regex escapes, not string escapes.
    self.issue = int(re.compile(r"Issue\s+(\d+)").search(query.text).groups()[0])
    return pdfurl, self.issue
def get_issue(self, volume, page):
    """Return the issue number of *volume* containing *page*, or 0.

    Scrapes the journal's list-of-issues page and parses each
    "Volume V, Issue I, pp. S-E" link, returning the issue whose page
    range covers *page*.  Returns 0 if no issue matches.
    """
    mainurl = "http://pubs.acs.org/loi/%s/%d" % (self.id, volume)
    response = fetch_url(mainurl)
    url_list = URLLister()
    url_list.feed(response)
    # Compile once outside the loop; raw string so \s, \d stay regex
    # escapes rather than (invalid) string escapes.
    regexp = re.compile(r"Volume\s(\d+)[,]\sIssue\s(\d+)[,]\spp[.]\s(\d+).*?(\d+)")
    for name in url_list:
        if "Issue" not in name or "Volume" not in name:
            continue
        match = regexp.search(name)
        # Robustness: skip link text that mentions Volume/Issue but is
        # not in the expected format (the original called .groups() on
        # the search result unconditionally and would raise
        # AttributeError here).
        if not match:
            continue
        volcheck, issue, start, end = map(int, match.groups())
        if volcheck == volume and start <= page <= end:
            return issue
    return 0
def url(self, selenium):
    """Return ``(pdf_url, 0)`` for this JSTOR article.

    JSTOR does not expose an issue number here, so 0 is returned in its
    place.  The PDF link wraps the real target in a ``redirectUri``
    query parameter, which is extracted and re-rooted on jstor.org.

    Raises:
        HTMLException: if the PDF link lacks the expected redirect URI.
    """
    volume = self.volume
    page = self.page
    query = JstorQuery()
    query.run(self.name, volume, page, selenium)
    url_list = URLLister()
    url_list.feed(query.html)
    # Renamed from `url` (which shadowed this method's name); also
    # dropped the unused `issue` local.
    pdflink = url_list["PDF"]
    # Parse away the nonsense: keep only what follows "redirectUri=".
    # Raw string keeps the regex free of string-escape surprises.
    match = re.compile(r"redirectUri[=](.*)").search(pdflink)
    if not match:
        raise HTMLException("No page found for volume %d for %s" % (volume, self.name))
    nextlink = match.groups()[0]
    fullurl = "http://www.jstor.org" + nextlink
    return fullurl, 0
def url(self, selenium):
    """Return ``(pdf_url, issue)`` for this ACS article.

    When the issue is already known, its table of contents is scanned
    for an article starting on the requested page.  Otherwise an
    :class:`ACSQuery` locates the article page, the PDF link is taken
    from it, and the issue number is read off the table-of-contents
    link.

    Raises:
        HTMLException: if no article matching the page number is found.
    """
    self.validate("id")
    page = self.page
    if self.issue:
        # Known issue: fetch its table of contents and scan for a
        # matching start page.
        toc = "http://pubs.acs.org/toc/%s/%d/%d" % (self.id, self.volume, self.issue)
        parser = ACSParser()
        parser.feed(fetch_url(toc))
        for entry in parser:
            if entry.start_page == page:
                return entry.url, self.issue
        raise HTMLException("No match found for %s %d %s" % (self.name, self.volume, self.page))
    # Unknown issue: query for the article page itself.
    query = ACSQuery(self.id, self.volume, self.page, selenium)
    query.run()
    links = URLLister("Abstract", "Tools")
    links.feed(query.html)
    try:
        pdfurl = "http://pubs.acs.org" + links["PDF w/ Links"]
    except KeyError:
        # Fall back to the high-resolution PDF when the linked PDF is
        # not offered.
        pdfurl = "http://pubs.acs.org" + links["Hi-Res PDF"]
    # The issue number is the final path component of the TOC link.
    self.issue = int(os.path.split(links["Table of Contents"])[-1])
    return pdfurl, self.issue
def url(self, selenium):
    """Return ``(article_url, issue)`` for this ScienceDirect article.

    Navigation proceeds in three stages: (1) from the journal's base
    page, follow the "Volume N" / "Volumes A-B" link covering this
    volume (skipped when the base page already shows the volume);
    (2) follow the "Volume N, Issue(s) ..." link whose "pp. S-E" range
    covers the page, falling back to a banner match when the current
    page is already the right issue; (3) scan the issue's article list
    for a matching start page.

    Raises:
        HTMLException: if any navigation stage fails to find a link or
            no article matches the page number.
    """
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl")
    response = fetch_url(self.baseurl)
    url_list = URLLister()
    url_list.feed(response)
    #check to see if we are already on the top page
    match = re.compile("Volume\s%d[,]\sIssue" % volume).search(response)
    nexturl = None
    if not match:
        # Stage 1: find the link whose volume (or volume range) covers
        # this volume.
        for name in url_list:
            match1 = re.compile("Volumes\s(\d+)\s*[-]\s*(\d+)").search(name)
            match2 = re.compile("Volume\s(\d+)").search(name)
            if not match1 and not match2:
                continue
            start = finish = 0
            if match1:
                start, finish = map(int, match1.groups())
            elif match2:
                # Single-volume link: degenerate one-volume "range".
                start = finish = int(match2.groups()[0])
            if volume >= start and volume <= finish:
                nexturl = url_list[name]
                break
        if not nexturl:
            raise HTMLException("Unable to find link for volume %d" % volume)
        nexturl = "http://www.sciencedirect.com%s" % nexturl
        response = fetch_url(nexturl)
        # Reuse the lister for the volume page's links.
        url_list.reset()
        url_list.feed(response)
    # Remember the volume page URL for the fallback below.
    # NOTE(review): when the base page already showed the volume (match
    # above), nexturl is still None here, so baseurl is None and the
    # "already on the correct page" fallback cannot produce a usable
    # URL -- confirm whether that path can actually occur.
    baseurl = nexturl
    nexturl = None
    # Stage 2: find the issue link whose page range covers this page.
    for name in url_list:
        match1 = re.compile("Volume\s(\d+)[,]\sIssue\s(\d+)").search(name)
        match2 = re.compile("Volume\s(\d+)[,]\sIssues\s(\d+)[-](\d+)").search(name)
        if not match1 and not match2:
            continue
        start_issue = 0
        end_issue = 0
        volcheck = 0
        if match1:
            volcheck, start_issue = map(int, match1.groups())
            end_issue = start_issue
        elif match2:
            # Combined-issues link, e.g. "Issues 3-4".
            volcheck, start_issue, end_issue = map(int, match2.groups())
        page_text = url_list.get_text(name)
        start_page, end_page = map(Page, re.compile("pp[.]\s+(\d+)[-](\d+)").search(page_text).groups())
        if volume == volcheck and page >= start_page and page <= end_page:
            nexturl = url_list[name]
            if not issue:
                issue = start_issue
            break
    if not nexturl:
        #all is not lost... we might already be on the correct page
        regexp = "Volume\s%d[,]\sIssue[s]?\s(\d+)[-]?\d*[,]\sPages\s(\d+)[-](\d+)" % volume
        match = re.compile(regexp).search(response)
        if match:
            number, start, end = map(int, match.groups())
            if start <= page and end >= page:
                nexturl = baseurl
                issue = number
    else:
        nexturl = "http://www.sciencedirect.com%s" % nexturl
    if not nexturl:
        raise HTMLException("Unable to find link for volume %d issue %d" % (volume, issue))
    # Stage 3: scan the issue page for the matching article.
    response = fetch_url(nexturl)
    sdparser = SDParser()
    sdparser.feed(response)
    for article in sdparser:
        if article.start_page == page:
            return article.url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))