def cache_pdfs(self, cachepath="data/cache/", refresh=False):
    """Extract plain text from every indexed PDF and cache it under *cachepath*.

    Each PDF referenced by self.pdf_index (keyed by the pmids found in
    self.index_data) is converted with PdfReader and written to
    ``<cachepath>/<pdf basename>.txt``.

    cachepath -- directory for the cached .txt files; created if missing
    refresh   -- if True, re-convert every PDF even when a cached copy exists

    Prints progress to stdout; returns nothing.
    """
    if not os.path.exists(cachepath):
        os.makedirs(cachepath)
    # basenames (without directory or extension) of every indexed PDF
    all_pdfs = set(
        os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0]
        for entry in self.index_data)
    if refresh:
        todo = list(all_pdfs)
    else:
        # skip PDFs already converted on a previous run
        # (os.path.join, rather than string concatenation, so cachepath
        # works with or without a trailing slash)
        already_done = set(
            os.path.splitext(os.path.basename(filename))[0]
            for filename in glob(os.path.join(cachepath, "*.txt")))
        todo = list(all_pdfs - already_done)
    if not todo:
        print("cache up to date")
        return
    pb = ProgressBar(len(todo), timer=True)
    for pdf_filename in todo:
        pb.tap()
        pm = PdfReader(os.path.join(PDF_PATH, pdf_filename + '.pdf'))
        text = pm.get_text()
        with open(os.path.join(cachepath, pdf_filename + '.txt'), 'wb') as f:
            f.write(text)
def cache_pdfs(self, cachepath="data/cache/", refresh=False): if not os.path.exists(cachepath): os.makedirs(cachepath) all_pdfs = set(os.path.splitext(os.path.basename(self.pdf_index[entry['pmid']]))[0] for entry in self.index_data) if not refresh: already_done = set(os.path.splitext(os.path.basename(filename))[0] for filename in glob(cachepath + "*.txt")) todo = list(all_pdfs - already_done) else: todo = list(all_pdfs) if not todo: print "cache up to date" else: pb = ProgressBar(len(todo), timer=True) for pdf_filename in todo: pb.tap() pm = PdfReader(PDF_PATH + pdf_filename + '.pdf') text = pm.get_text() with open(cachepath + pdf_filename + '.txt', 'wb') as f: f.write(text)
def second_view(self, study, cachepath="data/cache/"):
    """Return the full text of the PDF associated with *study*.

    Overrides code which gets the pubmed abstract and instead returns the
    full text of an associated PDF. Prefers a previously cached plain-text
    conversion in *cachepath*; falls back to extracting text from the PDF
    directly via PdfReader.

    study     -- dict with at least a 'pmid' key present in self.pdf_index
    cachepath -- directory holding cached .txt conversions
    Returns {"text": ..., "pmid": ...}.
    Raises KeyError if the pmid is not in self.pdf_index.
    """
    # Lookup outside the try so a missing pmid still raises KeyError,
    # as it ultimately did before, instead of being masked.
    pdf_path = self.pdf_index[study['pmid']]
    stem = os.path.splitext(os.path.basename(pdf_path))[0]
    try:
        # fast path: read the cached plain-text conversion if it exists
        # (os.path.join so cachepath works with or without trailing slash)
        with open(os.path.join(cachepath, stem + '.txt'), 'rb') as f:
            text = f.read()
    except (IOError, OSError):
        # cache miss: run the PDF through pdftotext
        # (narrowed from a bare except, which also hid unrelated errors)
        text = PdfReader(pdf_path).get_text()
    return {"text": text, "pmid": study['pmid']}
def doc_demo(models, testfile="testdata/demo.pdf", test_mode=False):
    """Demo: predict risk-of-bias for a single PDF, one core domain at a time.

    models    -- four parallel sequences (document models, document
                 vectorizers, sentence models, sentence vectorizers),
                 aligned with CORE_DOMAINS; consumed via
                 zip(CORE_DOMAINS[...], *models)
    testfile  -- path of the PDF to analyse
    test_mode -- if True, only the first core domain is processed

    Prints the per-domain predictions to stdout; returns nothing.
    """
    import color
    print "Document demo: " + testfile
    print "=" * 40
    print
    # extract the raw PDF text, transliterate to ASCII, and flatten newlines
    raw_text = PdfReader(testfile).get_text()
    text = unidecode(raw_text)
    text = re.sub('\n', ' ', text)
    # tokenize into sentences
    sents = sent_tokenizer.tokenize(text)
    domain_limiter = 1 if test_mode else len(
        CORE_DOMAINS)  # only parse first domain in test mode
    for test_domain, doc_model, doc_vec, sent_model, sent_vec in zip(
            CORE_DOMAINS[:domain_limiter], *models):

        ####
        ## PART ONE - get the predicted sentences with risk of bias information
        ####

        # vectorize the sentences
        X_sents = sent_vec.transform(sents)
        # get predicted 1 / -1 for the sentences
        pred_sents = sent_model.predict(X_sents)
        # get the sentences which are predicted 1
        positive_sents = [
            sent for sent, pred in zip(sents, pred_sents) if pred == 1
        ]
        # make a single string per doc
        summary_text = " ".join(positive_sents)

        ####
        ## PART TWO - integrate summarized and full text, then predict the document class
        ####

        # feed the full text plus the prefixed high-probability sentences
        # into one combined document vector
        doc_vec.builder_clear()
        doc_vec.builder_add_docs([text])
        doc_vec.builder_add_docs([summary_text], prefix="high-prob-sent-")
        X_doc = doc_vec.builder_transform()
        prediction = doc_model.predict(X_doc)[0]
        print "-" * 30
        print test_domain
        # map the classifier output (1 / -1) to a human-readable label
        prediction = {1: "Low", -1: "Unknown or high"}[prediction]
        # NOTE(review): the label is printed twice — once plain here and
        # once colorized below — confirm the duplicate print is intended
        print prediction
        if prediction == "Low":
            text_color = color.GREEN
        elif prediction == "Unknown or high":
            text_color = color.YELLOW
        color.printout(prediction, text_color)
        print "-" * 30