Example #1
0
    def url(self, selenium):
        """Resolve the ScienceDirect URL for this journal article.

        Starting from ``self.baseurl`` (the journal's volume index), follows
        the link covering ``self.volume``, then the issue link whose page
        range contains ``self.page``, and finally scans that issue's table
        of contents for the article whose start page matches.

        Parameters:
            selenium: unused here; kept for interface compatibility with
                sibling resolvers. (NOTE(review): confirm against callers.)

        Returns:
            tuple: ``(article_url, issue)`` where ``issue`` is the issue
            number that was either given via ``self.issue`` or discovered.

        Raises:
            HTMLException: if no volume link, issue link, or matching
                article can be found.
        """
        volume = self.volume
        page = self.page
        issue = self.issue

        self.validate("baseurl")

        response = fetch_url(self.baseurl)
        url_list = URLLister()
        url_list.feed(response)

        # Check to see if we are already on the volume's top page.
        nexturl = None
        if not re.search(r"Volume\s%d[,]\sIssue" % volume, response):
            # Hoist regex compilation out of the loop.
            vol_range_re = re.compile(r"Volumes\s(\d+)\s*[-]\s*(\d+)")
            vol_single_re = re.compile(r"Volume\s(\d+)")
            for name in url_list:
                match1 = vol_range_re.search(name)
                match2 = vol_single_re.search(name)
                if not match1 and not match2:
                    continue

                if match1:
                    start, finish = map(int, match1.groups())
                else:
                    start = finish = int(match2.group(1))

                if start <= volume <= finish:
                    nexturl = url_list[name]
                    break

            if not nexturl:
                raise HTMLException("Unable to find link for volume %d" % volume)

            nexturl = "http://www.sciencedirect.com%s" % nexturl
            response = fetch_url(nexturl)
            url_list.reset()
            url_list.feed(response)

        # BUGFIX: when the top-page check above matched, nexturl is still
        # None; fall back to self.baseurl so the "already on the correct
        # page" branch below can return a usable URL instead of failing.
        baseurl = nexturl if nexturl else self.baseurl
        nexturl = None

        issue_single_re = re.compile(r"Volume\s(\d+)[,]\sIssue\s(\d+)")
        issue_range_re = re.compile(r"Volume\s(\d+)[,]\sIssues\s(\d+)[-](\d+)")
        pages_re = re.compile(r"pp[.]\s+(\d+)[-](\d+)")
        for name in url_list:
            match1 = issue_single_re.search(name)
            match2 = issue_range_re.search(name)
            if not match1 and not match2:
                continue

            if match1:
                volcheck, start_issue = map(int, match1.groups())
            else:
                # Issue range "Issues N-M": only the first issue number is
                # ever reported back to the caller.
                volcheck, start_issue = map(int, match2.groups()[:2])

            page_text = url_list.get_text(name)

            # BUGFIX: the original called .groups() on a possibly-None
            # match; skip entries that carry no "pp. N-M" page range.
            pages_match = pages_re.search(page_text)
            if not pages_match:
                continue
            start_page, end_page = map(Page, pages_match.groups())

            if volume == volcheck and start_page <= page <= end_page:
                nexturl = url_list[name]
                if not issue:
                    issue = start_issue
                break

        if not nexturl:  # all is not lost... we might already be on the correct page
            regexp = "Volume\\s%d[,]\\sIssue[s]?\\s(\\d+)[-]?\\d*[,]\\sPages\\s(\\d+)[-](\\d+)" % volume
            match = re.search(regexp, response)
            if match:
                number, start, end = map(int, match.groups())
                if start <= page <= end:
                    nexturl = baseurl
                    issue = number
        else:
            # Links harvested from the listing are site-relative.
            nexturl = "http://www.sciencedirect.com%s" % nexturl

        if not nexturl:
            # BUGFIX: %s instead of %d — issue may still be None here,
            # which would make the error-formatting itself raise TypeError.
            raise HTMLException("Unable to find link for volume %d issue %s" % (volume, issue))

        response = fetch_url(nexturl)

        sdparser = SDParser()
        sdparser.feed(response)

        for article in sdparser:
            if article.start_page == page:
                return article.url, issue

        raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))