def pdf2xml(pdffile, xmlfile, keep_tempfiles=False, ocr_ranges=None):
    """
    converts pdf to xml using pdftohtml or, if that fails, ocr2xml;
    returns 'pdftohtml' or 'ocr2xml' depending on which process was
    used.

    ocr_ranges (optional) is a list of pairs such as [(1,3),(7,10)]
    which would specify that only pages 1-3 and 7-10 should get ocr'ed.

    TODO: check quality to see if ocr is needed?
    """
    if not exists(pdffile):
        raise FileNotFoundError("{} not found".format(pdffile))
    # first try pdftohtml
    try:
        pdftohtml(pdffile, xmlfile)
        return "pdftohtml"
    except NoTextInPDFException:
        debug(2, "no text in xml produced by pdftohtml")
    except Exception as e:
        debug(2, "pdftohtml failed: %s -- %s", pdffile, str(e))
    # then try ocr2xml (not catching exceptions here)
    if ocr_ranges:
        # bugfix: use maxsplit=1 so a name like 'a.b.pdf' keeps its
        # stem 'a.b' instead of being truncated to 'a'
        shortened_pdf = pdffile.rsplit(".", 1)[0] + "-short.pdf"
        pdfcut(pdffile, shortened_pdf, ocr_ranges)
        pdffile = shortened_pdf
    ocr2xml(pdffile, xmlfile, keep_tempfiles=keep_tempfiles)
    if not keep_tempfiles and ocr_ranges:
        try:
            os.remove(shortened_pdf)
        except OSError:
            # best-effort cleanup; a leftover temp file is not fatal
            pass
    return "ocr2xml"
def doc2text(doc):
    """
    Returns a text representation of <doc> for classification,
    enriched with pseudo-words encoding document features: length
    bucket, title, author names, url path, and filetype.
    """
    if len(doc.content) < 100000:
        text = doc.content
    else:
        # very long document: keep beginning and end only
        text = doc.content[:50000] + doc.content[-50000:]
    # Simple hack to add authors etc. to document features:
    if len(text):
        if len(text) < 4000:
            text += " XLEN_TINY" * 2
        elif len(text) < 8000:
            text += " XLEN_VSHORT" * 2
        elif len(text) < 15000:
            text += " XLEN_SHORT" * 2
        elif len(text) < 40000:
            text += " XLEN_MEDIUM" * 2
        elif len(text) < 80000:
            text += " XLEN_LONG" * 2
        else:
            # bugfix: was " XLEN_VLONG {}", which appended a stray
            # literal '{}' to the feature token
            text += " XLEN_VLONG" * 2
    if doc.title:
        text += (" " + doc.title) * 2
    if doc.authors:
        # mark author words with an XAU_ prefix
        for au in doc.authors.split(","):
            text += " " + re.sub(r' (\w+)\s*', r' XAU_\1', au)
    m = doc.url and re.match(r'(.+)/[^/]*', doc.url)  # url path
    if m:
        text += " XPATH_" + re.sub(r'\W', '_', m.group(1))
    if doc.filetype:
        text += " XTYPE_" + doc.filetype
    debug(5, "doc text for classification:\n%s\n", text)
    return text
def old_link(self, url):
    """
    If a link to (a session variant of) url is already known on this
    page (as stored in the database), returns the stored Link,
    otherwise returns None.
    """
    if not hasattr(self, '_links'):
        # lazily fetch and cache all stored links for this source
        cur = db.dict_cursor()
        query = "SELECT * FROM links WHERE source_id = %s"
        cur.execute(query, (self.source_id,))
        debug(5, cur._last_executed)
        self._links = [Link(source=self, **row) for row in cur.fetchall()]
        #debug(2, 'xxx old links:\n%s', '\n'.join([li.url for li in self._links]))
    # exact match first
    exact = next((link for link in self._links if link.url == url), None)
    if exact is not None:
        return exact
    # otherwise compare with session variables stripped
    stripped = self.strip_session_variables(url)
    if stripped != url:
        return next(
            (link for link in self._links
             if stripped == self.strip_session_variables(link.url)),
            None)
    return None
def update_db(self, **kwargs):
    """
    update self.**kwargs and write present state to db, also set
    'last_checked'; inserts a new row (setting self.link_id) if the
    link has no id yet, otherwise updates the existing row
    """
    for k,v in kwargs.items():
        setattr(self, k, v)
    cur = db.cursor()
    self.last_checked = datetime.now()
    # only write fields that are set; link_id is the autoincrement key
    fields = [f for f in self.db_fields.keys()
              if f != 'link_id' and getattr(self, f) is not None]
    values = [getattr(self, f) for f in fields]
    if self.link_id:
        # existing row: update in place (urlhash kept in sync with url)
        query = "UPDATE links SET {},urlhash=MD5(url) WHERE link_id = %s".format(
            ",".join(k+"=%s" for k in fields))
        cur.execute(query, values + [self.link_id])
    else:
        # new row: insert and remember the assigned id
        query = "INSERT INTO links ({},urlhash) VALUES ({},MD5(url))".format(
            ",".join(fields), ",".join(("%s",)*len(fields)))
        try:
            cur.execute(query, values)
        except:
            # log the failing query before re-raising
            debug(1, "oops, %s: %s", query, ','.join(map(str, values)))
            raise
        self.link_id = cur.lastrowid
    debug(4, cur._last_executed)
    db.commit()
def scholarquery(author, title):
    """
    Queries Google Scholar for an article by <author> with phrase
    <title>; returns the result whose title matches <title> exactly
    (case-insensitively), or None.

    TODO: check if we're locked out of google scholar.
    """
    from . import scholar
    # throttle requests a little
    time.sleep(1)
    scholar.ScholarConf.COOKIE_JAR_FILE = os.path.join(tempdir(), 'scholar.cookie')
    querier = scholar.ScholarQuerier()
    querier.apply_settings(scholar.ScholarSettings())
    q = scholar.SearchScholarQuery()
    q.set_author(author)
    q.set_phrase(title)
    q.set_include_patents(False)
    querier.send_query(q)
    debug(4, 'google scholar query %s', q)
    for art in querier.articles:
        debug(4, 'result: %s (%s)', art['title'], art['year'])
        # Testing for exact equality of titles means that false
        # negatives are likely. On the other hand, we don't want to
        # treat "Desire as Belief II" as old just because there has
        # been "Desire as Belief". We err on the side of false
        # negatives:
        if art['title'].lower() == title.lower():
            return art
    return None
def test_debug(caplog):
    """Check that debug() respects the current debuglevel threshold."""
    # at debuglevel 4, a level-4 message is emitted
    debuglevel(4)
    debug(4, 'hi there')
    assert 'hi there' in caplog.text()
    # a level-5 message is suppressed at debuglevel 4
    debug(5, 'secret')
    assert 'secret' not in caplog.text()
    # NOTE(review): in modern pytest, caplog.text is a property, not
    # callable -- confirm which pytest/capture plugin this targets
    debuglevel(5)
def evaluate(*docs):
    """
    Returns the classifier's probability estimate(s) that the given
    document(s) are about philosophy: a single value for one document,
    a sequence of values for several.
    """
    prob = clf.classify(*docs)
    if len(docs) > 1:
        # bugfix: str.join requires strings; prob holds the raw
        # classifier output, so stringify each element first
        debug(4, 'probability that documents are about philosophy: %s',
              ','.join(map(str, prob)))
        return prob
    else:
        debug(4, 'probability that document is about philosophy: %s', prob)
        return prob
def default_author(self):
    """
    returns doc.source.default_author if that is defined (i.e., if
    doc.source is a personal page), otherwise tries to extract an
    author candidate from doc.link.context.

    The metadata extractor (docparser.paperparser) uses this property
    as default author if no author string can be found in the
    document, and to evaluate the plausibility of candidate author
    strings.

    Unfortunately, journal pages tend to put the author name in
    unpredictable places, often outside what is recognized as the link
    context. On the other hand, journal publications reliably contain
    the author name(s) in the document. So here we don't bother
    setting default_author at the moment.

    On repository pages, people do sometimes upload papers that don't
    contain any author names. The metadata extractor assumes that
    default_author is a single author, because personal homepages only
    have a single default author. People also usually don't forget to
    put their names in the paper if there are co-authors. So we return
    the first author only.

    On philsci-archive, the format is

    Teller, Paul (2016) Role-Player Realism.
    Livengood, Jonathan and Sytsma, Justin and Rose, David (2016) Following...

    On philpapers, it is

    Stefan Dragulinescu, Mechanisms and Difference-Making.
    Michael Baumgartner & Lorenzo Casini, An Abductive Theory of Constitution.

    How do we know "Stefan Dragulinescu, Mechanisms" isn't the name of
    a person called "Mechanisms Stefan Dragulinescu" in
    last-comma-first format? Ultimately, we should use some clever
    general heuristics here. For now we simply split at /,| &| and|\(/;
    if the first element contains a whitespace, we return that
    element, otherwise we concatenate the first two elements in
    reverse order. This will only retrieve the surname on
    philsci-archive for authors with a double surname. TODO: improve.
    """
    try:
        if self.source.sourcetype != 'repo':
            # personal page: its default author applies
            return self.source.default_author
        # repo page: guess the (first) author from the link context
        re_split = re.compile(',| & | and|\(')
        au, rest = re_split.split(self.link.context.strip(), 1)
        if len(au.split()) == 1:
            # single word before the separator: probably "Last, First"
            # format, so reverse-concatenate the first two elements
            au2, rest2 = re_split.split(rest, 1)
            au = au2 + ' ' + au
        debug(3, 'setting "%s" as default_author', au)
        return au
    except Exception as e:
        # context missing or not in any recognized format
        return ''
def update_db(self, **kwargs):
    """write **kwargs to db, also update 'last_checked'"""
    if not self.source_id:
        # no db row yet, nothing to update
        return
    kwargs['last_checked'] = time.strftime('%Y-%m-%d %H:%M:%S')
    assignments = ",".join(field + "=%s" for field in kwargs.keys())
    query = "UPDATE sources SET {},urlhash=MD5(url) WHERE source_id = %s".format(assignments)
    cur = db.cursor()
    cur.execute(query, tuple(kwargs.values()) + (self.source_id,))
    debug(3, cur._last_executed)
    db.commit()
def load(self):
    """
    Restore vectorizer and classifier from the pickle file if one
    exists, otherwise initialize a fresh model.
    """
    if not os.path.isfile(self.picklefile):
        self.reset()
        return
    debug(4, "loading classifier model from disk")
    with open(self.picklefile, 'rb') as f:
        self.vectorizer, self.classifier = pickle.load(f)
    self.ready = True
def assign_category(self, cat_id, strength):
    """inserts or updates a docs2cats entry in the db"""
    if not self.doc_id:
        raise Exception("cannot assign category: document has no id")
    # upsert: on a duplicate (cat_id, doc_id) pair just refresh strength
    query = ("INSERT INTO docs2cats (cat_id, doc_id, strength) VALUES (%s,%s,%s)"
             " ON DUPLICATE KEY UPDATE strength=%s")
    params = (cat_id, self.doc_id, strength, strength)
    cur = db.cursor()
    cur.execute(query, params)
    debug(4, cur._last_executed)
    db.commit()
def convert_to_pdf(tempfile):
    """
    Convert <tempfile> to pdf via unoconv; returns the name of the
    pdf file. Re-raises any conversion error after logging it.
    """
    outfile = tempfile.rsplit('.', 1)[0] + '.pdf'
    cmd = ['/usr/bin/python3', '/usr/bin/unoconv',
           '-f', 'pdf', '-o', outfile, tempfile]
    try:
        debug(2, ' '.join(cmd))
        subprocess.check_call(cmd, timeout=20)
    except Exception as e:
        debug(1, "cannot convert %s to pdf: %s", tempfile, str(e))
        raise
    return outfile
def save_to_db(self):
    """write object to db"""
    # collect the fields that are actually set (link_id excluded)
    fields = [f for f in self.db_fields.keys()
              if f != 'link_id' and getattr(self, f) is not None]
    values = [getattr(self, f) for f in fields]
    placeholders = ",".join(("%s",) * len(fields))
    query = "INSERT INTO sources ({}, urlhash) VALUES ({}, MD5(url))".format(
        ",".join(fields), placeholders)
    cur = db.cursor()
    cur.execute(query, values)
    debug(3, cur._last_executed)
    db.commit()
    # remember the autoincrement id of the new row
    self.source_id = cur.lastrowid
def parse(doc):
    """
    tries to enrich doc by metadata (authors, title, abstract,
    numwords, doctype, content); returns True if successful, False if
    doc.page doesn't look like an article.
    """
    page = doc.page
    debug(2, "parsing page %s", page.url)
    if "stanford.edu/entries" not in page.url:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    # title:
    h1s = page.xpath("//h1/text()")
    if not h1s:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    doc.title = h1s[0]
    # abstract: taken from the entry's preamble div
    preamble_divs = page.xpath("//div[@id='preamble']")
    if not preamble_divs:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    preamble_html = etree.tostring(preamble_divs[0], encoding="unicode")
    doc.abstract = get_abstract(preamble_html)
    # authors: extracted from the copyright notice
    copyright_divs = page.xpath("//div[@id='article-copyright']")
    if not copyright_divs:
        debug(2, "page is not a Stanford Encyclopedia entry")
        return False
    copyright_html = etree.tostring(copyright_divs[0], encoding="unicode")
    # drop the "Copyright ..." link and all remaining markup
    copyright_html = re.sub("<a.+Copyright.+", "", copyright_html)
    copyright_html = re.sub("<.+?>", "", copyright_html)
    # NOTE(review): the previous sub already removed all tags, so this
    # split on "<br/>" probably never splits -- confirm intended order
    authors = [strip_tags(frag).strip() for frag in copyright_html.split("<br/>")]
    doc.authors = ", ".join([a for a in authors if a])
    # text content:
    # textnodes = page.xpath("//div[@id='article-content']//text()")
    # if not textnodes:
    #     debug(2, "page is not a Stanford Encyclopedia entry")
    #     return False
    # doc.content = ' '.join([n.strip() for n in textnodes if n.strip()])
    doc.content = page.text()
    doc.numwords = len(doc.content.split())
    doc.numpages = int(doc.numwords / 300)  # rough guess, just for classifiers
    doc.doctype = "article"
    doc.meta_confidence = 90
    return True
def evaluate(doc):
    """
    Guess the document type of <doc>; returns 'book', 'chapter',
    'thesis', 'review', or 'article' (the fallback when no filter
    scores above 0.5).
    """
    debug(4, 'trying to guess document type')
    verbose = debuglevel() > 3
    probs = {
        'book': bookfilter.test(doc, debug=verbose, smooth=False),
        'chapter': chapterfilter.test(doc, debug=verbose, smooth=True),
        'thesis': thesisfilter.test(doc, debug=verbose, smooth=False),
        'review': reviewfilter.test(doc, debug=verbose, smooth=True),
    }
    debug(2, 'doctyper: %s',
          ', '.join('{} {}'.format(k, v) for k, v in probs.items()))
    best = max(probs, key=probs.get)
    return best if probs[best] > 0.5 else 'article'
def parse(doc):
    """
    main method: fixes title and content of blogpost <doc> and adds
    authors, abstract, numwords
    """
    debug(3, "fetching blog post %s", doc.url)
    # robustness fix: a timeout so a stalled server cannot hang the
    # process indefinitely (requests has no default timeout)
    bytehtml = requests.get(doc.url, timeout=30).content.decode('utf-8', 'ignore')
    doc.content = extract_content(bytehtml, doc) or strip_tags(doc.content)
    doc.numwords = len(doc.content.split())
    doc.abstract = get_abstract(doc.content)
    if doc.title.isupper():
        # ALL-CAPS titles: "FOO BAR" -> "Foo bar"
        # NOTE(review): .title() would give "Foo Bar"; confirm which
        # capitalization is wanted
        doc.title = doc.title.capitalize()
    debug(2, "\npost abstract: %s\n", doc.abstract)
def pdftohtml(pdffile, xmlfile):
    """
    Run the pdftohtml command on <pdffile>, writing xml output to
    <xmlfile>, then fix and tidy the output. Raises
    PdftohtmlFailedException if no output file appears, and
    NoTextInPDFException if the produced xml contains no text.
    """
    cmd = [
        PDFTOHTML,
        "-i",  # ignore images
        "-xml",  # xml output
        "-enc", "UTF-8",
        "-nodrm",  # ignore copy protection
        pdffile,
        xmlfile,
    ]
    debug(2, " ".join(cmd))
    try:
        stdout = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=10)
    except subprocess.CalledProcessError as e:
        # log the command's combined output before re-raising
        debug(1, e.output)
        raise
    if not exists(xmlfile):
        raise PdftohtmlFailedException(stdout)
    xml = readfile(xmlfile)
    if not xml_ok(xml):
        debug(4, "No text in pdf: %s", xml)
        raise NoTextInPDFException
    else:
        debug(3, "pdftohtml output ok")
        # post-process and tidy the xml in place
        writefile(xmlfile, fix_pdftohtml(xml))
        doctidy(xmlfile)
def load_from_db(self, url=''):
    """Populate this object from its db row, looked up by url hash."""
    url = url or self.url
    if not url:
        raise TypeError("need source url to load Source from db")
    cur = db.dict_cursor()
    query = "SELECT * FROM sources WHERE urlhash = MD5(%s)"
    cur.execute(query, (url,))
    debug(5, cur._last_executed)
    rows = cur.fetchall()
    if not rows:
        debug(4, "%s not in sources table", url)
        return
    for field, value in rows[0].items():
        setattr(self, field, value)
def ocr2xml(pdffile, xmlfile, keep_tempfiles=False, write_hocr=False):
    """ocr pdffile and write pdftohtml-type parsing to xmlfile"""
    start_time = timer()
    debug(2, "ocr2xml %s %s", pdffile, xmlfile)
    try:
        numpages = int(pdfinfo(pdffile)['Pages'])
    except Exception as e:
        # bugfix: was 'except e:', which raised a NameError instead of
        # catching the pdfinfo failure
        raise MalformedPDFError('pdfinfo failed') from e
    debug(2, '%s pages to process', numpages)
    xml = init_xml()
    hocr = b''
    for p in range(numpages):
        page_hocr = ocr_page(pdffile, p+1)
        xml_add_page(xml, page_hocr)
        hocr += page_hocr
    xmlstr = lxml.etree.tostring(xml, encoding='utf-8',
                                 pretty_print=True, xml_declaration=True)
    # write either the raw hocr or the pdftohtml-style xml
    if write_hocr:
        with open(xmlfile, 'wb') as f:
            f.write(hocr)
    else:
        with open(xmlfile, 'wb') as f:
            f.write(xmlstr)
    doctidy(xmlfile)
    end_time = timer()
    if not keep_tempfiles:
        debug(3, 'cleaning up')
        remove_tempdir()
    debug(2, 'Time: %s seconds', str(end_time - start_time))
def context_suggests_published(context):
    """
    returns True if the link context makes it fairly certain that the
    linked document has already been published before this year.
    """
    # uncomment to test paper processing:
    # return False
    if re.search('forthcoming|unpublished', context, re.I):
        debug(4, 'forthcoming/unpublished in context suggests not yet published')
        return False
    # require a plausible publication year somewhere in the context
    this_year = datetime.today().year
    years = (int(m.group(0)) for m in re.finditer(r'\b\d{4}\b', context))
    if not any(1950 < y <= this_year for y in years):
        debug(4, 'no suitable year in context suggests not yet published')
        return False
    # See https://github.com/wo/opp-tools/issues/54
    pubterms = [r'\beds?\b', r'edit(?:ed|ors?)', r'\d-+\d\d', r'\d:\s*\d',
                'journal', r'philosophical\b']
    for term in pubterms:
        if re.search(term, context, re.I):
            debug(1, "ignoring published paper ('%s' in context)", term)
            return True
    debug(4, 'no publication keywords, assuming not yet published')
    return False
def save_local(r):
    """
    Save the content of response <r> to a recognizably named file in
    tempdir(); returns the path of the saved file.
    """
    # use recognizable tempfile name:
    m = re.search('/([^/]+?)(?:\.\w+)?(?:[\?\#].+)*$', r.url)
    basename = m.group(1) if m else r.url
    basename = re.sub('\W', '_', basename) + '.' + r.filetype
    tempfile = os.path.join(tempdir(), basename)
    debug(2, "saving %s to %s", r.url, tempfile)
    try:
        with open(tempfile, 'wb') as f:
            # stream the body to disk in 1K chunks
            for block in r.iter_content(1024):
                f.write(block)
    except EnvironmentError as e:
        debug(1, "cannot save %s to %s: %s", r.url, tempfile, str(e))
        raise
    return tempfile
def load_from_db(self, url='', source_id=0):
    """Populate this Link from its db row, looked up by url hash and source."""
    url = url or self.url
    source_id = source_id or self.source_id
    if not url or not source_id:
        raise TypeError("need url and source_id to load Link from db")
    cur = db.dict_cursor()
    query = "SELECT * FROM links WHERE urlhash = MD5(%s) AND source_id = %s LIMIT 1"
    cur.execute(query, (url, source_id))
    debug(5, cur._last_executed)
    rows = cur.fetchall()
    if not rows:
        debug(4, "link to %s not in database", url)
        return
    for field, value in rows[0].items():
        setattr(self, field, value)
def pdfinfo(filename):
    """returns dictionary of pdfinfo (poppler) data"""
    cmd = [PDFINFO, filename]
    debug(3, " ".join(cmd))
    try:
        output = subprocess.check_output(cmd, stderr=subprocess.STDOUT, timeout=2)
        output = output.decode("utf-8")
    except subprocess.CalledProcessError as e:
        # fix: Logger.warn is deprecated; warning() is the documented call
        logger.warning(e.output)
        raise
    res = {}
    # each "Key: value" line becomes a dict entry
    for line in output.split("\n"):
        if ":" in line:
            k, v = line.split(":", 1)
            res[k] = v.strip()
    return res
def is_bad_url(url):
    """
    Returns True if <url> should not be followed: overlong urls,
    non-http schemes, bare top-level domains, and known-uninteresting
    targets (stylesheets, media files, twitter, etc.).
    """
    if len(url) > 512:
        debug(1, 'url %s is too long', url)
        return True
    # fix: raw string -- '\#', '\.', '\W' are invalid escape sequences
    # in a plain string literal (deprecated since Python 3.6)
    re_bad_url = re.compile(r"""
                ^\#|
                ^mailto|
                ^data|
                ^javascript|
                ^.+//[^/]+/?$|          # TLD
                twitter\.com|
                fonts\.googleapis\.com|
                philpapers\.org/asearch|
                \.(?:css|mp3|avi|mov|jpg|gif|ppt|png|ico|mso|xml)(?:\?.+)?$   # .css?version=12
                """, re.I | re.X)
    return re_bad_url.search(url) is not None
def run():
    """
    retrieve and process new blog posts that have been put in the db
    by opp-web:feedhandler
    """
    cur = db.cursor()
    query = "SELECT doc_id FROM docs WHERE doctype = 'blogpost' AND status = 0"
    cur.execute(query)
    debug(4, cur._last_executed)
    posts = cur.fetchall()
    if not posts:
        debug(3, "no new blog posts")
        return
    for row in posts:
        # bugfix: fetchall() yields row tuples, so pass the first
        # column; the old code passed the whole tuple as doc_id (and
        # shadowed the builtin 'id')
        post = Doc(doc_id=row[0])
        post.load_from_db()
        process_blogpost(post)
def next_source():
    """return the next source from db that's due to be checked"""
    # sources checked within the last 16 hours are not yet due
    cutoff = (datetime.now() - timedelta(hours=16)).strftime('%Y-%m-%d %H:%M:%S')
    cur = db.dict_cursor()
    query = ("SELECT * FROM sources WHERE"
             " sourcetype != 'blog'"  # ignore rss feeds
             " AND (last_checked IS NULL OR last_checked < %s)"
             " ORDER BY last_checked LIMIT 1")
    cur.execute(query, (cutoff,))
    debug(4, cur._last_executed)
    rows = cur.fetchall()
    if not rows:
        debug(1, "all pages recently checked")
        return None
    return Source(**rows[0])
def paper_is_old(doc):
    """checks online if document has been published earlier than this year"""
    debug(4, "checking if paper is old")
    title = re.sub('<[\S]+?>', '', doc.title)  # strip tags
    match = scholarquery(doc.authors, title)
    if not match or not match['year']:
        return False
    # Unfortunately, Google Scholar gives publication dates even
    # for unpublished manuscripts e.g. if they were cited with a
    # certain date once; so we only ignore papers if the given
    # date is at least two years old. TODO: improve! (If I finally
    # upload my "Generalizing Kripke Semantics" paper, I don't
    # want it to be treated as published in 2011!)
    if 1950 < int(match['year']) < datetime.today().year-2:
        debug(1, "paper already published in %s", match['year'])
        return True
    return False
def update_db(self, **kwargs):
    """update self.**kwargs and write present state to db"""
    for key, value in kwargs.items():
        setattr(self, key, value)
    # only write fields that are set; doc_id is the autoincrement key
    fields = [f for f in self.db_fields.keys()
              if f != 'doc_id' and getattr(self, f) is not None]
    values = [getattr(self, f) for f in fields]
    cur = db.cursor()
    if self.doc_id:
        # existing row: update in place
        assignments = ",".join(f + "=%s" for f in fields)
        query = "UPDATE docs SET {},urlhash=MD5(url) WHERE doc_id = %s".format(assignments)
        cur.execute(query, values + [self.doc_id])
    else:
        # new row: insert and remember the assigned id
        query = "INSERT INTO docs ({},urlhash) VALUES ({},MD5(url))".format(
            ",".join(fields), ",".join(("%s",) * len(fields)))
        cur.execute(query, values)
        self.doc_id = cur.lastrowid
    debug(4, cur._last_executed)
    db.commit()
def extract_links(self, browser):
    """
    extracts links from source page; sets self.new_links and
    self.old_links, both lists of Link objects.
    """
    self.new_links = []
    self.old_links = []
    new_links = {}  # url => Link
    old_links = {}  # url => Link
    # lots of try/except because selenium easily crashes:
    try:
        els = browser.find_elements_by_tag_name("a")
    except:
        debug(1, "cannot retrieve links from page %s", self.url)
        return [],[]
    for el in els:
        try:
            if not el.is_displayed():
                continue
            href = el.get_attribute('href')
            anchortext = el.text
            if not href:
                continue
        except:
            # element went stale or selenium hiccuped; skip it
            continue
        if is_bad_url(href):
            debug(3, 'ignoring link to %s (bad url)', href)
            continue
        if href in old_links or href in new_links:
            debug(3, 'ignoring repeated link to %s', href)
            # bugfix: without this continue, repeated links were
            # re-processed despite the "ignoring" log message
            continue
        old_link = self.old_link(href)
        if old_link:
            debug(3, 'link to %s is old', href)
            old_links[href] = old_link
            old_links[href].element = el
        else:
            debug(1, 'new link: "%s" %s', anchortext, href)
            new_links[href] = Link(url=href, source=self, element=el)
    # fix: store lists as the docstring promises, not dict views
    self.new_links = list(new_links.values())
    self.old_links = list(old_links.values())
def get_authors(full_html, post_html, post_text):
    """
    Guess the author(s) of a blog post by looking for a 'by Foo Bar'
    (or 'by Foo Bar and Jane Doe') pattern near the start of the post;
    returns a comma-separated string of names, or '' if none found.
    """
    # look for 'by (Foo Bar)' near the start of the post
    post_start = full_html.find(post_html)
    tagsoup = r'(?:<[^>]+>|\s)*'
    by = r'[Bb]y\b'+tagsoup
    name = r'[\w\.\-]+(?: (?!and)[\w\.\-]+){0,3}'
    separator = tagsoup+r'(?: and |, )'+tagsoup
    re_str = r'{}({})(?:{}({}))*'.format(by,name,separator,name)
    regex = re.compile(re_str)
    best_match = None
    for m in regex.finditer(full_html):
        # NOTE(review): find() returns -1 when the candidate does not
        # occur in post_text at all; -1 > 20 is False, so such
        # candidates are accepted -- confirm that is intended
        if post_text.find(m.group(1)) > 20:
            # bugfix: log message was missing its verb
            debug(2, 'ignoring author candidate "%s": too far into text', m.group(1))
            continue
        # prefer the byline closest to where the post body starts
        if not best_match or abs(m.start()-post_start) < abs(best_match.start()-post_start):
            best_match = m
    if best_match:
        # NOTE(review): a repeated regex group only keeps its last
        # capture, so with three or more authors the middle names are
        # dropped
        names = [n for n in best_match.groups() if n]
        return ', '.join(names)
    return ''