def url(self, selenium):
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl", "maxvolume")
    from webutils.htmlparser import URLLister, fetch_url
    import re
    query = SpringerQuery(volume, self.maxvolume, selenium)
    query.run()
    url_list = URLLister()
    url_list.feed(query.html)
    #the original lookup of the volume link was lost in flattening; scanning
    #the link text for the volume number is a reconstruction, not the original
    url = None
    for name in url_list:
        if re.compile(r"Volume\s*%d" % volume).search(name):
            url = url_list[name]
            break
    if not url:
        raise HTMLException("No page found for volume %d for %s" % (volume, self.name))
    nexturl = self.mainurl + url
    response = fetch_url(nexturl)
    issparser = IssueParser()
    issparser.feed(response, volume, issue)
    #now have the url for the issue
    nexturl = self.mainurl + issparser.url
    response = fetch_url(nexturl)
    parser = SpringerParser()
    parser.feed(response)
    for article in parser:
        if article.start_page == page:
            return article.url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
def url(self, selenium):
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl")
    toc = fetch_url("%s/%d" % (self.baseurl, volume))
    if not toc:
        raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
    #figure out the issue number
    issue_parser = IssueParser()
    issue_parser.feed(toc)
    for entry in issue_parser:
        start, end = issue_parser[entry]
        if start <= page <= end:
            issue = entry
            break
    toc = fetch_url("%s/%d/%d" % (self.baseurl, volume, issue))
    parser = IOPParser()
    parser.feed(toc)
    if hasattr(self, "baseurl2"):
        toc = fetch_url("%s/%d/%d" % (self.baseurl2, volume, issue))
        parser.feed(toc)
    for article in parser:
        if article.start_page == page:
            url = "http://iopscience.iop.org" + article.url
            return url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
def run(self):
    sel = self.selenium
    sel.open(self.baseurl)
    sel.click("link=*ll Issue*")
    sel.wait_for_page_to_load(30000)
    loc = sel.get_location()
    text = sel.get_body_text()
    year = re.compile(r"(\d+)\s*[-]\s*Volume\s%s\s" % self.volume).search(text).groups()[0]
    url = "%s?activeYear=%s" % (loc, year)
    text = fetch_url(url)
    regexp = r"Volume\s*%d.*?Issue[s]?\s*(\d+)[-]?\d*.*?Pages\s*(\d+)[-](\d+)" % self.volume
    matches = re.compile(regexp, re.DOTALL).findall(text)
    issue = 0
    for iss, start, end in matches:
        iss, start, end = map(int, (iss, start, end))
        if start <= self.page <= end:
            issue = iss
            break
    if not issue:
        raise HTMLException("Could not find issue number")
    sel.click("link=*Volume %d*" % self.volume)
    time.sleep(0.5)
    sel.click("link=*Issue*%d*" % issue)
    sel.wait_for_page_to_load(30000)
    sel.click("link=*page*%s*" % self.page)
    sel.wait_for_page_to_load(30000)
    #build the pdf url by swapping the last path component for "pdf"
    location = sel.get_location()
    base = location.split("/")[:-1]
    base.append("pdf")
    pdfurl = "/".join(base)
    return pdfurl, issue
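#A minimal sketch of the year regex above, run on an invented line of the sort
#the Springer archive page appears to contain; the sample text is hypothetical.
def _demo_springer_year_regex(volume=131):
    import re
    text = "2009 - Volume 131 "
    #captures the year preceding "- Volume <volume>"
    return re.compile(r"(\d+)\s*[-]\s*Volume\s%s\s" % volume).search(text).groups()[0]
#_demo_springer_year_regex() returns "2009"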
def url(self, selenium):
    volume = self.volume
    page = self.page
    issue = self.issue
    from webutils.htmlparser import fetch_url
    self.validate("template", "year1")
    response = None
    if not issue:
        query = RSCQuery(self.name, volume, page, selenium)
        query.run()
        response = query.rschtml
    else:
        parser = self.get_articles(volume, issue)
        for article in parser:
            if article.start_page == page:
                response = fetch_url(article.url)
                break
    if not response:
        raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
    url_list = URLLister()
    url_list.feed(response)
    pdflink = "http://www.rsc.org" + url_list["PDF"]
    return pdflink, issue
def get_articles(self, volume, issue):
    mainurl = "%s/v%d/i%d" % (self.baseurl, volume, issue)
    response = fetch_url(mainurl)
    if not response:
        return []
    parser = AIPParser()
    parser.feed(response)
    return parser
def url(self, selenium):
    volume = self.volume
    page = self.page
    issue = self.issue
    query = WileyQuery(self.baseurl, volume, page, selenium)
    url, issue = query.run()
    text = fetch_url(url)
    regexp = r'pdfDocument.*?(http.*?)["]'
    pdfurl = re.compile(regexp, re.DOTALL).search(text).groups()[0]
    return pdfurl, issue
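#A minimal sketch of the pdfDocument regex above, applied to an invented
#fragment of Wiley page source; the sample string is hypothetical.
def _demo_wiley_pdf_regex():
    import re
    text = 'var pdfDocument = "http://onlinelibrary.wiley.com/doi/10.1002/example/pdf";'
    #lazily captures the first http url following "pdfDocument", up to a quote
    return re.compile(r'pdfDocument.*?(http.*?)["]', re.DOTALL).search(text).groups()[0]
#_demo_wiley_pdf_regex() returns the captured pdf link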
def get_articles(self, volume, issue):
    #volumes are numbered from year1, one volume per year
    year = self.year1 + volume - 1
    mainurl = self.template % (year, volume, volume, year, issue)
    response = fetch_url(mainurl)
    if not response:
        return []
    parser = RSCParser()
    parser.feed(response)
    return parser
def get_issue(self, volume, page):
    mainurl = "http://pubs.acs.org/loi/%s/%d" % (self.id, volume)
    response = fetch_url(mainurl)
    url_list = URLLister()
    url_list.feed(response)
    for name in url_list:
        if "Issue" not in name or "Volume" not in name:
            continue
        regexp = r"Volume\s(\d+)[,]\sIssue\s(\d+)[,]\spp[.]\s(\d+).*?(\d+)"
        volcheck, issue, start, end = map(int, re.compile(regexp).search(name).groups())
        if volcheck == volume and start <= page <= end:
            return issue
    return 0
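#A minimal sketch of the TOC regex above, using an invented link text; the
#sample string and page number are hypothetical, not a real ACS TOC entry.
def _demo_acs_issue_regex(page=1500):
    import re
    name = "Volume 131, Issue 4, pp. 1385-1796"
    regexp = r"Volume\s(\d+)[,]\sIssue\s(\d+)[,]\spp[.]\s(\d+).*?(\d+)"
    volcheck, issue, start, end = map(int, re.compile(regexp).search(name).groups())
    if start <= page <= end:
        return issue  #page 1500 falls inside 1385-1796, so this yields issue 4
    return 0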
def url(self, selenium):
    #the flattened original referenced page without binding it; bind it here
    #as the sibling methods do
    page = self.page
    self.validate("baseurl", "abbrev", "volstart", "doi")
    if self.volume >= self.volstart:
        #get the issue from the page number; assumes the page object carries
        #a get_issue() method
        self.issue = page.get_issue()
    else:
        url = "%s.%d.%s" % (self.doi, self.volume, self.page)
        text = fetch_url(url)
        regexp = r"/toc/%s/v%d/i(\d+)" % (self.abbrev, self.volume)
        self.issue = int(re.compile(regexp).search(text).groups()[0])
    parser = self.get_articles(self.volume, self.issue)
    for article in parser:
        if article.start_page == page:
            url = self.baseurl + article.url
            return url, self.issue
    raise HTMLException("No match found for %s %d %s" % (self.name, self.volume, self.page))
def url(self, selenium):
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("id")
    if not self.issue:
        query = ACSQuery(self.id, self.volume, self.page, selenium)
        query.run()
        url_list = URLLister("Abstract", "Tools")
        url_list.feed(query.html)
        pdfurl = None
        try:
            pdfurl = "http://pubs.acs.org" + url_list["PDF w/ Links"]
        except KeyError:
            pass
        if not pdfurl:
            pdfurl = "http://pubs.acs.org" + url_list["Hi-Res PDF"]
        tocurl = url_list["Table of Contents"]
        self.issue = int(os.path.split(tocurl)[-1])
        return pdfurl, self.issue
    else:
        mainurl = "http://pubs.acs.org/toc/%s/%d/%d" % (self.id, self.volume, self.issue)
        response = fetch_url(mainurl)
        parser = ACSParser()
        parser.feed(response)
        for article in parser:
            if article.start_page == page:
                return article.url, self.issue
        raise HTMLException("No match found for %s %d %s" % (self.name, self.volume, self.page))
def url(self, selenium):
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl")
    response = fetch_url(self.baseurl)
    url_list = URLLister()
    url_list.feed(response)
    #check to see if we are already on the top page
    match = re.compile(r"Volume\s%d[,]\sIssue" % volume).search(response)
    #initializing baseurl here is a reconstruction: the flattened original
    #left it unbound when the volume page was already loaded
    baseurl = self.baseurl
    nexturl = None
    if not match:
        for name in url_list:
            match1 = re.compile(r"Volumes\s(\d+)\s*[-]\s*(\d+)").search(name)
            match2 = re.compile(r"Volume\s(\d+)").search(name)
            if not match1 and not match2:
                continue
            start = finish = 0
            if match1:
                start, finish = map(int, match1.groups())
            elif match2:
                start = finish = int(match2.groups()[0])
            if start <= volume <= finish:
                nexturl = url_list[name]
                break
        if not nexturl:
            raise HTMLException("Unable to find link for volume %d" % volume)
        nexturl = "http://www.sciencedirect.com%s" % nexturl
        response = fetch_url(nexturl)
        url_list.reset()
        url_list.feed(response)
        baseurl = nexturl
    nexturl = None
    for name in url_list:
        match1 = re.compile(r"Volume\s(\d+)[,]\sIssue\s(\d+)").search(name)
        match2 = re.compile(r"Volume\s(\d+)[,]\sIssues\s(\d+)[-](\d+)").search(name)
        if not match1 and not match2:
            continue
        start_issue = 0
        end_issue = 0
        volcheck = 0
        if match1:
            volcheck, start_issue = map(int, match1.groups())
            end_issue = start_issue
        elif match2:
            volcheck, start_issue, end_issue = map(int, match2.groups())
        page_text = url_list.get_text(name)
        start_page, end_page = map(Page, re.compile(r"pp[.]\s+(\d+)[-](\d+)").search(page_text).groups())
        if volume == volcheck and start_page <= page <= end_page:
            nexturl = url_list[name]
            if not issue:
                issue = start_issue
            break
    if not nexturl:
        #all is not lost... we might already be on the correct page
        regexp = r"Volume\s%d[,]\sIssue[s]?\s(\d+)[-]?\d*[,]\sPages\s(\d+)[-](\d+)" % volume
        match = re.compile(regexp).search(response)
        if match:
            number, start, end = map(int, match.groups())
            if start <= page <= end:
                nexturl = baseurl
                issue = number
    else:
        nexturl = "http://www.sciencedirect.com%s" % nexturl
    if not nexturl:
        raise HTMLException("Unable to find link for volume %d issue %d" % (volume, issue))
    response = fetch_url(nexturl)
    sdparser = SDParser()
    sdparser.feed(response)
    for article in sdparser:
        if article.start_page == page:
            return article.url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
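#A minimal sketch of the ScienceDirect volume-link matching above, run over
#invented link names; the names are hypothetical, not real ScienceDirect text.
def _demo_sd_volume_match(volume=50):
    import re
    names = ["Volume 48", "Volumes 49 - 52"]
    for name in names:
        match1 = re.compile(r"Volumes\s(\d+)\s*[-]\s*(\d+)").search(name)
        match2 = re.compile(r"Volume\s(\d+)").search(name)
        if not match1 and not match2:
            continue
        if match1:
            start, finish = map(int, match1.groups())
        else:
            start = finish = int(match2.groups()[0])
        if start <= volume <= finish:
            return name  #the link text whose volume range contains the volume
    return None
#_demo_sd_volume_match() returns "Volumes 49 - 52"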