def url(self, selenium):
    """Return ``(pdf_url, issue)`` for this RSC article.

    When no issue is known, an :class:`RSCQuery` locates the article
    page directly; otherwise the issue's article list is scanned for a
    matching start page.  The PDF link is then extracted from the
    resulting page.

    Raises:
        HTMLException: if no article matching the page number is found.
    """
    # FIX: the original imported only fetch_url here although URLLister
    # is also used below; import both (matching the sibling Springer
    # method) so the function does not depend on an enclosing-scope name.
    from webutils.htmlparser import URLLister, fetch_url
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("template", "year1")
    response = None
    if not issue:
        # No issue known: let the journal query find the article page.
        query = RSCQuery(self.name, volume, page, selenium)
        query.run()
        response = query.rschtml
    else:
        # Issue known: scan its article list for a matching start page.
        parser = self.get_articles(volume, issue)
        for article in parser:
            if article.start_page == page:
                response = fetch_url(article.url)
                break
    if not response:
        raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
    url_list = URLLister()
    url_list.feed(response)
    pdflink = "http://www.rsc.org" + url_list["PDF"]
    return pdflink, issue
def url(self, selenium):
    """Return ``(article_url, issue)`` for this Springer article.

    Runs a :class:`SpringerQuery`, follows the volume link and then the
    issue link, and scans the issue's article list for the requested
    start page.

    Raises:
        HTMLException: if the volume page or a matching article is not
            found.
    """
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl", "maxvolume")
    from webutils.htmlparser import URLLister, fetch_url
    import re
    query = SpringerQuery(volume, self.maxvolume, selenium)
    query.run()
    url_list = URLLister()
    url_list.feed(query.html)
    # NOTE(review): `url` is never assigned in this function -- inside
    # the method body this name is not a local and will raise NameError
    # (or resolve to an unrelated module-level name) when evaluated.
    # Presumably a lookup such as `url = url_list[<volume link>]` was
    # lost; restore it before relying on this code path.
    if not url:
        raise HTMLException("No page found for volume %d for %s" % (volume, self.name))
    nexturl = self.mainurl + url
    response = fetch_url(nexturl)
    issparser = IssueParser()
    issparser.feed(response, volume, issue)
    #now have the url for the issue
    nexturl = self.mainurl + issparser.url
    response = fetch_url(nexturl)
    parser = SpringerParser()
    parser.feed(response)
    # Scan the issue's articles for one starting on the requested page.
    for article in parser:
        if article.start_page == page:
            return article.url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))
def url(self, selenium):
    """Return ``(url, issue)`` for this AIP article.

    For volumes at or past ``volstart`` the issue number is derived
    from the page number.  When the issue is still unknown an
    :class:`AIPQuery` resolves the article and the issue is parsed from
    the link names; otherwise the issue's article list is scanned.

    Raises:
        HTMLException: if no article matching the page number is found.
    """
    self.validate("baseurl", "volstart")
    if self.volume >= self.volstart:
        # Get the issue from the page number.
        self.issue = self.page.get_issue()
    # BUG FIX: the original tested the never-assigned local name
    # `issue` (NameError); the attribute self.issue is what the code
    # above assigns and what the branches below consume.
    if not self.issue:
        query = AIPQuery(self.volume, self.page, self.baseurl, selenium)
        query.run()
        url_list = URLLister()
        url_list.feed(query.aiphtml)
        pdfurl = url_list["Download PDF"]
        # Raw string so \s and \d are regex escapes, not string escapes.
        regexp = re.compile(r"Issue\s(\d+)")
        for name in url_list:
            match = regexp.search(name)
            if match:
                self.issue = int(match.groups()[0])
        return pdfurl, self.issue
    else:
        parser = self.get_articles(self.volume, self.issue)
        for article in parser:
            if article.start_page == self.page:
                return article.url, self.issue
    raise HTMLException("No match found for %s %d %s" % (self.name, self.volume, self.page))
def walk_references(self):
    """Process every cited-full-record link on the current page.

    Fetches the page HTML via ``self.run("get_html")``, collects its
    links, and hands each link containing "CitedFullRecord" to
    :meth:`process_article`.
    """
    # FIX: removed `import time`, which was never used in this function.
    url_list = URLLister()
    text = self.run("get_html")
    url_list.feed(text)
    for name in url_list:
        link = url_list[name]
        # Only follow citation links.
        if "CitedFullRecord" in link:
            self.process_article(link)
def url(self, selenium):
    """Return ``(pdf_url, issue)`` for this Informa article.

    Runs an :class:`InformaQuery`, extracts the "Full Text PDF" link,
    and parses the issue number from the query's page text.

    Raises:
        KeyError: if no "Full Text PDF" link is present.
        AttributeError: if no "Issue N" text is found in the page.
    """
    self.validate()
    query = InformaQuery(self.name, self.volume, self.page, selenium)
    query.run()
    url_list = URLLister()
    url_list.feed(query.html)
    pdfurl = "http://www.informaworld.com/" + url_list["Full Text PDF"]
    # Raw string so \s and \d are regex escapes, not string escapes.
    self.issue = int(re.compile(r"Issue\s+(\d+)").search(query.text).groups()[0])
    return pdfurl, self.issue
def get_issue(self, volume, page):
    """Return the issue number of *volume* containing *page*, or 0.

    Scrapes the journal's list-of-issues page and parses each
    "Volume V, Issue I, pp. S-E" link, returning the issue whose page
    range covers *page*.  Returns 0 if no issue matches.
    """
    mainurl = "http://pubs.acs.org/loi/%s/%d" % (self.id, volume)
    response = fetch_url(mainurl)
    url_list = URLLister()
    url_list.feed(response)
    # Compile once outside the loop; raw string so \s, \d stay regex
    # escapes rather than (invalid) string escapes.
    regexp = re.compile(r"Volume\s(\d+)[,]\sIssue\s(\d+)[,]\spp[.]\s(\d+).*?(\d+)")
    for name in url_list:
        if "Issue" not in name or "Volume" not in name:
            continue
        match = regexp.search(name)
        # Robustness: skip link text that mentions Volume/Issue but is
        # not in the expected format (the original called .groups() on
        # the search result unconditionally and would raise
        # AttributeError here).
        if not match:
            continue
        volcheck, issue, start, end = map(int, match.groups())
        if volcheck == volume and start <= page <= end:
            return issue
    return 0
def url(self, selenium):
    """Return ``(pdf_url, 0)`` for this JSTOR article.

    JSTOR does not expose an issue number here, so 0 is returned in its
    place.  The PDF link wraps the real target in a ``redirectUri``
    query parameter, which is extracted and re-rooted on jstor.org.

    Raises:
        HTMLException: if the PDF link lacks the expected redirect URI.
    """
    volume = self.volume
    page = self.page
    query = JstorQuery()
    query.run(self.name, volume, page, selenium)
    url_list = URLLister()
    url_list.feed(query.html)
    # Renamed from `url` (which shadowed this method's name); also
    # dropped the unused `issue` local.
    pdflink = url_list["PDF"]
    # Parse away the nonsense: keep only what follows "redirectUri=".
    # Raw string keeps the regex free of string-escape surprises.
    match = re.compile(r"redirectUri[=](.*)").search(pdflink)
    if not match:
        raise HTMLException("No page found for volume %d for %s" % (volume, self.name))
    nextlink = match.groups()[0]
    fullurl = "http://www.jstor.org" + nextlink
    return fullurl, 0
def url(self, selenium):
    """Return ``(pdf_url, issue)`` for this ACS article.

    When the issue is already known, its table of contents is scanned
    for an article starting on the requested page.  Otherwise an
    :class:`ACSQuery` locates the article page, the PDF link is taken
    from it, and the issue number is read off the table-of-contents
    link.

    Raises:
        HTMLException: if no article matching the page number is found.
    """
    self.validate("id")
    page = self.page
    if self.issue:
        # Known issue: fetch its table of contents and scan for a
        # matching start page.
        toc = "http://pubs.acs.org/toc/%s/%d/%d" % (self.id, self.volume, self.issue)
        parser = ACSParser()
        parser.feed(fetch_url(toc))
        for entry in parser:
            if entry.start_page == page:
                return entry.url, self.issue
        raise HTMLException("No match found for %s %d %s" % (self.name, self.volume, self.page))
    # Unknown issue: query for the article page itself.
    query = ACSQuery(self.id, self.volume, self.page, selenium)
    query.run()
    links = URLLister("Abstract", "Tools")
    links.feed(query.html)
    try:
        pdfurl = "http://pubs.acs.org" + links["PDF w/ Links"]
    except KeyError:
        # Fall back to the high-resolution PDF when the linked PDF is
        # not offered.
        pdfurl = "http://pubs.acs.org" + links["Hi-Res PDF"]
    # The issue number is the final path component of the TOC link.
    self.issue = int(os.path.split(links["Table of Contents"])[-1])
    return pdfurl, self.issue
def url(self, selenium):
    """Return ``(article_url, issue)`` for this ScienceDirect article.

    Navigation proceeds in three stages: (1) from the journal's base
    page, follow the "Volume N" / "Volumes A-B" link covering this
    volume (skipped when the base page already shows the volume);
    (2) follow the "Volume N, Issue(s) ..." link whose "pp. S-E" range
    covers the page, falling back to a banner match when the current
    page is already the right issue; (3) scan the issue's article list
    for a matching start page.

    Raises:
        HTMLException: if any navigation stage fails to find a link or
            no article matches the page number.
    """
    volume = self.volume
    page = self.page
    issue = self.issue
    self.validate("baseurl")
    response = fetch_url(self.baseurl)
    url_list = URLLister()
    url_list.feed(response)
    #check to see if we are already on the top page
    match = re.compile("Volume\s%d[,]\sIssue" % volume).search(response)
    nexturl = None
    if not match:
        # Stage 1: find the link whose volume (or volume range) covers
        # this volume.
        for name in url_list:
            match1 = re.compile("Volumes\s(\d+)\s*[-]\s*(\d+)").search(name)
            match2 = re.compile("Volume\s(\d+)").search(name)
            if not match1 and not match2:
                continue
            start = finish = 0
            if match1:
                start, finish = map(int, match1.groups())
            elif match2:
                # Single-volume link: degenerate one-volume "range".
                start = finish = int(match2.groups()[0])
            if volume >= start and volume <= finish:
                nexturl = url_list[name]
                break
        if not nexturl:
            raise HTMLException("Unable to find link for volume %d" % volume)
        nexturl = "http://www.sciencedirect.com%s" % nexturl
        response = fetch_url(nexturl)
        # Reuse the lister for the volume page's links.
        url_list.reset()
        url_list.feed(response)
    # Remember the volume page URL for the fallback below.
    # NOTE(review): when the base page already showed the volume (match
    # above), nexturl is still None here, so baseurl is None and the
    # "already on the correct page" fallback cannot produce a usable
    # URL -- confirm whether that path can actually occur.
    baseurl = nexturl
    nexturl = None
    # Stage 2: find the issue link whose page range covers this page.
    for name in url_list:
        match1 = re.compile("Volume\s(\d+)[,]\sIssue\s(\d+)").search(name)
        match2 = re.compile("Volume\s(\d+)[,]\sIssues\s(\d+)[-](\d+)").search(name)
        if not match1 and not match2:
            continue
        start_issue = 0
        end_issue = 0
        volcheck = 0
        if match1:
            volcheck, start_issue = map(int, match1.groups())
            end_issue = start_issue
        elif match2:
            # Combined-issues link, e.g. "Issues 3-4".
            volcheck, start_issue, end_issue = map(int, match2.groups())
        page_text = url_list.get_text(name)
        start_page, end_page = map(Page, re.compile("pp[.]\s+(\d+)[-](\d+)").search(page_text).groups())
        if volume == volcheck and page >= start_page and page <= end_page:
            nexturl = url_list[name]
            if not issue:
                issue = start_issue
            break
    if not nexturl:
        #all is not lost... we might already be on the correct page
        regexp = "Volume\s%d[,]\sIssue[s]?\s(\d+)[-]?\d*[,]\sPages\s(\d+)[-](\d+)" % volume
        match = re.compile(regexp).search(response)
        if match:
            number, start, end = map(int, match.groups())
            if start <= page and end >= page:
                nexturl = baseurl
                issue = number
    else:
        nexturl = "http://www.sciencedirect.com%s" % nexturl
    if not nexturl:
        raise HTMLException("Unable to find link for volume %d issue %d" % (volume, issue))
    # Stage 3: scan the issue page for the matching article.
    response = fetch_url(nexturl)
    sdparser = SDParser()
    sdparser.feed(response)
    for article in sdparser:
        if article.start_page == page:
            return article.url, issue
    raise HTMLException("No match found for %s %d %s" % (self.name, volume, page))