def url(self, selenium):
    """Locate the article URL on ScienceDirect for this volume and page.

    The lookup starts at ``self.baseurl`` and proceeds in three steps:

    1. Unless the fetched page already mentions ``Volume <n>, Issue``, follow
       the link whose label covers ``self.volume`` (either a
       ``Volumes a - b`` range or a single ``Volume a``).
    2. Among the issue links (``Volume v, Issue i`` or
       ``Volume v, Issues i-j``), pick the one whose ``pp. a-b`` page range
       contains ``self.page``. If none is found, check whether the page
       currently loaded is itself the matching issue.
    3. Parse the issue page and return the article whose start page equals
       ``self.page``.

    Args:
        selenium: Not used by this method; kept so the signature stays
            compatible with callers.

    Returns:
        A tuple ``(article_url, issue)``. ``issue`` is ``self.issue`` when it
        is set and an issue link was matched; otherwise it is the issue
        number discovered on the site.

    Raises:
        HTMLException: If no link for the volume, no issue covering the page,
            or no article starting on the page can be found.
    """
    volume = self.volume
    page = self.page
    issue = self.issue

    self.validate("baseurl")
    # Start from the configured page so that `baseurl` is always bound, even
    # when no volume link has to be followed below.
    baseurl = self.baseurl
    response = fetch_url(baseurl)
    url_list = URLLister()
    url_list.feed(response)

    # Check to see if we are already on the top page for this volume.
    on_volume_page = re.compile(r"Volume\s%d[,]\sIssue" % volume).search(response)
    if not on_volume_page:
        volume_range_re = re.compile(r"Volumes\s(\d+)\s*[-]\s*(\d+)")
        single_volume_re = re.compile(r"Volume\s(\d+)")

        volume_link = None
        for name in url_list:
            range_match = volume_range_re.search(name)
            if range_match:
                start, finish = map(int, range_match.groups())
            else:
                single_match = single_volume_re.search(name)
                if not single_match:
                    continue
                start = finish = int(single_match.group(1))

            if start <= volume <= finish:
                volume_link = url_list[name]
                break

        if not volume_link:
            raise HTMLException("Unable to find link for volume %d" % volume)

        baseurl = "http://www.sciencedirect.com%s" % volume_link
        response = fetch_url(baseurl)
        url_list.reset()
        url_list.feed(response)

    single_issue_re = re.compile(r"Volume\s(\d+)[,]\sIssue\s(\d+)")
    issue_range_re = re.compile(r"Volume\s(\d+)[,]\sIssues\s(\d+)[-](\d+)")
    page_range_re = re.compile(r"pp[.]\s+(\d+)[-](\d+)")

    nexturl = None
    for name in url_list:
        # The single-issue pattern takes precedence over the issue-range one.
        issue_match = single_issue_re.search(name)
        if issue_match:
            volcheck, start_issue = map(int, issue_match.groups())
        else:
            issue_match = issue_range_re.search(name)
            if not issue_match:
                continue
            volcheck, start_issue, _ = map(int, issue_match.groups())

        pages_match = page_range_re.search(url_list.get_text(name))
        if not pages_match:
            # No page range next to this link: it cannot be checked, so skip
            # it instead of failing on a missing match.
            continue

        start_page, end_page = map(Page, pages_match.groups())
        if volume == volcheck and page >= start_page and page <= end_page:
            nexturl = url_list[name]
            if not issue:
                issue = start_issue
            break

    if not nexturl:
        # All is not lost... we might already be on the correct page.
        regexp = r"Volume\s%d[,]\sIssue[s]?\s(\d+)[-]?\d*[,]\sPages\s(\d+)[-](\d+)" % volume
        current = re.compile(regexp).search(response)
        if current:
            number, start, end = map(int, current.groups())
            # NOTE(review): the bounds are plain ints here, while the loop
            # above wraps them in Page -- confirm `page` compares correctly
            # against both.
            if start <= page and end >= page:
                nexturl = baseurl
                issue = number
    else:
        nexturl = "http://www.sciencedirect.com%s" % nexturl

    if not nexturl:
        # %s rather than %d for the issue: it may still be unset at this
        # point, and %d would raise TypeError and hide this error.
        raise HTMLException("Unable to find link for volume %d issue %s" % (volume, issue))

    response = fetch_url(nexturl)

    sdparser = SDParser()
    sdparser.feed(response)

    for article in sdparser:
        if article.start_page == page:
            return article.url, issue

    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))