def get_sent_predictions_for_domain(self, domain):

    uids = self.domain_uids(domain)
    predictions = []

    for uid in uids:

        # get the index of the study with specified uid
        study_index = np.nonzero(self.X_uids==uid)[0][0]

        # tokenize into sentences
        sents = sent_tokenizer.tokenize(self.X_list[study_index])

        # vectorize the sentences
        X_sents = self.sent_vectorizer.transform(sents)

        # get predicted 1 / -1 for the sentences
        pred_class = self.sent_clf.predict(X_sents)

        # get the sentences which are predicted 1
        positive_sents = [sent for sent, pred in zip(sents, pred_class) if pred == 1]

        # make a single string per doc
        doc = " ".join(positive_sents)

        predictions.append(doc)

    return predictions
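# Usage sketch for get_sent_predictions_for_domain. The instance below is a
# hypothetical stand-in: any object exposing domain_uids(), X_uids, X_list,
# sent_vectorizer, and sent_clf in the shapes used above would behave the same.
#
#     model = SentenceModel()  # hypothetical trained model instance
#     summaries = model.get_sent_predictions_for_domain(CORE_DOMAINS[0])
#     print summaries[0]       # the positive sentences joined into one string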
def _get_sentence_level_X_y(test_domain=CORE_DOMAINS[0]):

    q = QualityQuoteReader(quotes_only=False)
    y = []
    X_words = []

    study_indices = []
    study_sent_indices = [] # list of (start, end) indices corresponding to each study
    sent_index_counter = 0

    for i, study in enumerate(q):

        if i > 200:
            print "WARNING RETURNING SMALL SUBSET OF DATA!"
            break

        study_indices.append(i)

        # fast forward to the matching domain
        for domain in study.cochrane["QUALITY"]:
            if domain["DOMAIN"] == test_domain:
                break
        else:
            # no matching domain found; `domain` is left as the last entry
            # (kept as in the original; a `continue` here would instead skip
            # studies which lack the target domain)
            pass

        quote = None
        has_quote = False
        try:
            quote = QUALITY_QUOTE_REGEX.search(domain["DESCRIPTION"]).group(1)
            has_quote = True
        except (AttributeError, KeyError):
            # no quote could be extracted for this study; all of its
            # sentences are treated as negative examples below
            pass

        pdf_sents = sent_tokenizer.tokenize(study.studypdf)

        if has_quote:
            quote_words = word_tokenizer.tokenize(quote)
            quote_sent_bow = set(word.lower() for word in quote_words)

            # rank each PDF sentence by how much of the quote's vocabulary it covers
            rankings = []
            for pdf_i, pdf_sent in enumerate(pdf_sents):
                pdf_words = word_tokenizer.tokenize(pdf_sent)
                pdf_sent_bow = set(word.lower() for word in pdf_words)

                if not pdf_sent_bow or not quote_sent_bow:
                    prop_quote_in_sent = 0
                else:
                    prop_quote_in_sent = 100 * (1 - (float(len(quote_sent_bow - pdf_sent_bow)) / float(len(quote_sent_bow))))

                rankings.append((prop_quote_in_sent, pdf_i))

            rankings.sort(key=lambda x: x[0], reverse=True)
            best_match_index = rankings[0][1]

        y_study = np.zeros(len(pdf_sents)) # all zeros when we don't have a quote
        if has_quote:
            y_study[best_match_index] = 1

        X_words.extend(pdf_sents)

        sent_end_index = sent_index_counter + len(pdf_sents)
        study_sent_indices.append((sent_index_counter, sent_end_index))
        sent_index_counter = sent_end_index

        y.extend(y_study)

    print len(X_words)
    print X_words[0]

    print "fitting vectorizer"
    vectorizer = CountVectorizer(max_features=10000)
    X = vectorizer.fit_transform(X_words)
    print "done!"
    y = np.array(y)

    return X, y, X_words, vectorizer, study_sent_indices, study_indices
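# A minimal training sketch on top of _get_sentence_level_X_y. SGDClassifier
# is an assumed choice for illustration only; the model actually used
# elsewhere in the pipeline may differ.
def _train_sent_clf_demo(test_domain=CORE_DOMAINS[0]):
    from sklearn.linear_model import SGDClassifier

    # build the sentence-level design matrix and labels for one domain
    X, y, X_words, vectorizer, study_sent_indices, study_indices = \
        _get_sentence_level_X_y(test_domain=test_domain)

    # fit a linear classifier on the bag-of-words sentence vectors
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X, y)

    # return the classifier together with the vectorizer needed to
    # transform new sentences at prediction time
    return clf, vectorizer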
def word_sent_tokenize(raw_text):
    # split into sentences, then split each sentence into words
    return [word_tokenizer.tokenize(sent) for sent in sent_tokenizer.tokenize(raw_text)]
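# e.g. with a Punkt sentence tokenizer and a Treebank-style word tokenizer,
# word_sent_tokenize("Patients were randomised. Allocation was concealed.")
# would return roughly:
#     [['Patients', 'were', 'randomised', '.'],
#      ['Allocation', 'was', 'concealed', '.']]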
def doc_demo(models, testfile="testdata/demo.pdf", test_mode=False):

    import color

    print "Document demo: " + testfile
    print "=" * 40
    print

    raw_text = PdfReader(testfile).get_text()
    text = unidecode(raw_text)
    text = re.sub('\n', ' ', text)

    # tokenize into sentences
    sents = sent_tokenizer.tokenize(text)

    domain_limiter = 1 if test_mode else len(CORE_DOMAINS) # only parse the first domain in test mode

    for test_domain, doc_model, doc_vec, sent_model, sent_vec in zip(CORE_DOMAINS[:domain_limiter], *models):

        ####
        ## PART ONE - get the predicted sentences with risk of bias information
        ####

        # vectorize the sentences
        X_sents = sent_vec.transform(sents)

        # get predicted 1 / -1 for the sentences
        pred_sents = sent_model.predict(X_sents)

        # get the sentences which are predicted 1
        positive_sents = [sent for sent, pred in zip(sents, pred_sents) if pred == 1]

        # make a single string per doc
        summary_text = " ".join(positive_sents)

        ####
        ## PART TWO - integrate summarized and full text, then predict the document class
        ####

        doc_vec.builder_clear()
        doc_vec.builder_add_docs([text])
        doc_vec.builder_add_docs([summary_text], prefix="high-prob-sent-")

        X_doc = doc_vec.builder_transform()
        prediction = doc_model.predict(X_doc)[0]

        print "-" * 30
        print test_domain

        prediction = {1: "Low", -1: "Unknown or high"}[prediction]
        text_color = color.GREEN if prediction == "Low" else color.YELLOW
        color.printout(prediction, text_color)

        print "-" * 30
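# Usage sketch: doc_demo unpacks `models` via zip(CORE_DOMAINS[...], *models),
# so it expects four parallel sequences (document models, document vectorizers,
# sentence models, sentence vectorizers), one entry per core domain.
# `load_models` below is a hypothetical loader, not defined in this module:
#
#     models = load_models()
#     doc_demo(models, testfile="testdata/demo.pdf", test_mode=True)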