# Module-level imports assumed by these excerpts.
import os
import re
import sys
import traceback

def process(self, obj):
    ret = ISIVoid()  # default: return nothing
    try:
        # dispatch to the handler named by the request object
        method = getattr(self, obj.method)
        println("Running %s\n" % obj.method)
        if obj.args:
            ret = method(obj.args)
        else:
            ret = method()
    except Exception as error:
        sys.stderr.write("ERROR: %s\n%s\n" % (traceback.format_exc(), error))
    return ret
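# Illustration only (not part of the original module): the request objects
# handled by process() are assumed to carry a "method" name and an optional
# "args" payload, e.g. something shaped like:
#
#   class Request(object):              # hypothetical stand-in
#       def __init__(self, method, args=None):
#           self.method = method        # name of the handler to dispatch to
#           self.args = args
#
#   server.process(Request("fetch_article", args))
#
# getattr(self, obj.method) then resolves "fetch_article" to the bound
# handler, so new commands only require defining a new method.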
def walkISI(files, archive, notes):
    from papers.pdfget import download_pdf

    parser = SavedRecordParser(archive)
    for file in files:
        text = open(file).read()
        parser.feed(text, notes)
        println("%d new articles\n" % len(parser.archive))
        for article in parser:
            journal = article.get_journal()
            abbrev = article.get_abbrev()
            volume = article.get_volume()
            start = article.get_start_page()
            name = "%s %d %s" % (abbrev, volume, start)
            println("Downloading %s" % name)
            # check to see if we already have the pdf on disk
            path = name + ".pdf"
            if os.path.isfile(path):
                println(" -> exists %s" % path)
                article.set_pdf(path)
                continue
            # don't require the issue number for the download
            path = download_pdf(ISIArticle.get_journal(journal), volume, 0, start)
            if path:
                println(" -> %s" % path)
                article.set_pdf(path)
            else:
                sys.stdout.write(" -> FAILED\n")
    parser.archive.commit()
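# Illustration only: a typical driver for walkISI, with hypothetical file
# names and archive constructor. The input files are plain-text "saved
# records" exported from ISI/Web of Science.
#
#   archive = Archive("papers")                      # hypothetical
#   walkISI(["savedrecs1.txt", "savedrecs2.txt"], archive, notes="dft survey")
#
# Each parsed article is fetched at most once: an existing
# "<abbrev> <volume> <page>.pdf" on disk short-circuits the download.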
def feed(self, text, notes):
    # each saved record runs from "PT J" to "ER"
    blocks = re.compile(r"PT\sJ(.*?)\nER", re.DOTALL).findall(text)
    for block in blocks:
        try:
            self.block = block
            self.article = self.archive.create_article()

            get_number = lambda x: re.compile(r"(\d+)").search(x).groups()[0]
            get_page = lambda x: Page(get_number(x))
            #clean_title = lambda x: clean_line(clean_entry(x))
            clean_title = Cleanup.clean_title

            self.get_entry("journal", entries=(("so", "la"), ("so", "ab"), ("so", "sn")))
            self.get_entry("volume", method=int, entries=(("vl", "is"), ("vl", "bp")))
            self.get_entry("issue", method=lambda x: int(get_number(x)), require=False,
                           entries=(("is", "bp"),))
            self.get_entry("start_page", method=get_page, exclude=("art. no.",),
                           entries=(("bp", "ep"), ("bp", "ut"), ("ar", "di"), ("ar", "ut")))
            self.get_entry("end_page", method=get_page, require=False,
                           entries=(("ep", "di"), ("ep", "ut")))
            self.get_entry("authors", method=lambda x: get_authors(x, "\n", ","),
                           entries=(("af", "ti"), ("au", "ti"), ("au", "so")))
            self.get_entry("title", method=clean_title, entries=(("ti", "so"),))
            self.get_entry("abstract", method=clean_entry, require=False,
                           entries=(("ab", "sn"),))
            self.get_entry("year", method=int, entries=(("py", "vl"), ("py", "tc")))
            self.get_entry("doi", require=False,
                           entries=(("di", "pg"), ("di", "ut"), ("di", "er")))
            self.article.set_notes(notes)

            journal = ISIArticle.get_journal(self.article.get_journal())
            volume = self.article.get_volume()
            page = self.article.get_page()
            name = "%s %d %s" % (journal, volume, page)
            if not self.master.has(self.article):
                self.archive.test_and_add(self.article)
            else:
                println("%s exists in archive\n" % name)
        except Exception as error:
            sys.stderr.write("ERROR: %s\n%s\n" % (error, block))
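# For reference, an abridged saved record of the kind feed() splits out
# (field values invented for illustration). Each field starts with a
# two-letter tag; the (start, end) pairs passed to get_entry delimit a field
# by its own tag and the tag of whichever field can follow it, which is why
# several fallback pairs are listed per field.
#
#   PT J
#   AU Doe, J.
#   TI An example title
#   SO JOURNAL OF EXAMPLES
#   LA English
#   AB An example abstract.
#   SN 0000-0000
#   PY 2008
#   VL 12
#   IS 3
#   BP 345
#   EP 360
#   DI 10.1000/xyz123
#   UT ISI:000000000000001
#   ER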
def store(self, download=False, notes=[], keywords=[]):
    journal = self.article.get_journal()
    volume = self.article.get_volume()
    page = self.article.get_page()
    year = self.article.get_year()
    name = "%s %d %s (%d)" % (self.article.get_abbrev(), volume, page, year)

    local_match = self.archive.find_match(self.article)
    if local_match:
        # only download if we don't already have the pdf
        download = download and not local_match.has_pdf()
        self.article = local_match
        println("Already have article %s in local archive\n" % name)

    master_match = None
    if not local_match:
        # query the master archive
        artreq = ArchiveRequest(self.article)
        master_match = artreq.run()
    if master_match:
        println("Already have article %s in master archive\n" % name)
        # only download if we don't already have the pdf
        download = download and not master_match.has_pdf()
        #self.article = master_match

    if not local_match and not master_match:
        self.archive.add(self.article)

    if download:
        path = download_pdf(journal, volume=volume, page=page)
        if path:
            println(" -> downloaded %s\n" % path)
            self.article.set_pdf(path)

    if keywords:
        self.add_keywords(keywords)
    if notes:
        self.add_notes(notes)

    println("Completed storage of %s\n%s\n%s\n" % (name, keywords, notes))
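# Illustration only: a typical call, with hypothetical notes and keywords.
#
#   storer.store(download=True, notes=["read for ch. 3"], keywords=["dft"])
#
# Note that download=True is masked whenever a local or master match already
# has a pdf, so the flag means "download if missing", not "always download".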