Example #1
    def get_sent_predictions_for_domain(self, domain):
        uids = self.domain_uids(domain)
        predictions = []

        for uid in uids:
            # get the index of the study with the specified uid
            study_index = np.nonzero(self.X_uids == uid)[0][0]

            # tokenize into sentences
            sents = sent_tokenizer.tokenize(self.X_list[study_index])

            # vectorize the sentences
            X_sents = self.sent_vectorizer.transform(sents)

            # get predicted 1 / -1 for the sentences
            pred_class = self.sent_clf.predict(X_sents)

            # keep the sentences which are predicted 1
            positive_sents = [sent for sent, pred in zip(sents, pred_class) if pred == 1]

            # make a single string per doc
            doc = " ".join(positive_sents)
            predictions.append(doc)

        return predictions
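A minimal usage sketch. The owning class is not shown in the source, so `model` and the domain label below are hypothetical; it assumes `sent_tokenizer` is a module-level sentence tokenizer and that `sent_clf` / `sent_vectorizer` are already fitted:

# hypothetical driver -- "model" stands in for a trained instance of the class above,
# and "BLINDING" for whichever domain labels self.domain_uids() actually accepts
summaries = model.get_sent_predictions_for_domain("BLINDING")
for uid, summary in zip(model.domain_uids("BLINDING"), summaries):
    print(uid, summary[:120])  # each summary is the predicted-positive sentences joined by spaces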
Example #2
def _get_sentence_level_X_y(test_domain=CORE_DOMAINS[0]):
    # NOTE: a former sample_negative_examples=n option randomly subsampled n
    # negative examples per article (positive examples are rare); n=0 meant
    # use everything. This version always uses every example.

    q = QualityQuoteReader(quotes_only=False)
    y = []
    X_words = []

    study_indices = []

    study_sent_indices = []  # list of (start, end) indices corresponding to each study
    sent_index_counter = 0

    for i, study in enumerate(q):

        if i > 200:
            print("WARNING: returning a small subset of the data!")
            break

        study_indices.append(i)

        # fast-forward to the matching domain
        for domain in study.cochrane["QUALITY"]:
            if domain["DOMAIN"] == test_domain:
                break
        else:
            # no matching domain found: deliberately fall through with the last
            # domain seen (an earlier version skipped to the next study here);
            # the quote extraction below will simply fail to match
            pass

        quote = None
        has_quote = False
        try:
            quote = QUALITY_QUOTE_REGEX.search(domain["DESCRIPTION"]).group(1)
            has_quote = True
        except AttributeError:
            # search() returned None: this description has no extractable quote
            pass

        pdf_sents = sent_tokenizer.tokenize(study.studypdf)

        if has_quote:
            quote_words = word_tokenizer.tokenize(quote)
            quote_sent_bow = set((word.lower() for word in quote_words))
            rankings = []

            for pdf_i, pdf_sent in enumerate(pdf_sents):
                pdf_words = word_tokenizer.tokenize(pdf_sent)
                pdf_sent_bow = set((word.lower() for word in pdf_words))

                if not pdf_sent_bow or not quote_sent_bow:
                    prop_quote_in_sent = 0
                else:
                    # percentage of the quote's unique words found in this sentence
                    prop_quote_in_sent = 100 * (
                        1 - (float(len(quote_sent_bow - pdf_sent_bow)) / float(len(quote_sent_bow)))
                    )

                # print "%.0f" % (prop_quote_in_sent,)

                rankings.append((prop_quote_in_sent, pdf_i))

            rankings.sort(key=lambda x: x[0], reverse=True)
            best_match_index = rankings[0][1]

        y_study = np.zeros(len(pdf_sents))  # all zeros when we don't have a quote
        if has_quote:
            y_study[best_match_index] = 1
        X_words.extend(pdf_sents)

        sent_end_index = sent_index_counter + len(pdf_sents)
        study_sent_indices.append((sent_index_counter, sent_end_index))
        sent_index_counter = sent_end_index
        y.extend(y_study)

    print(len(X_words))
    print(X_words[0])

    print("fitting vectorizer")
    vectorizer = CountVectorizer(max_features=10000)
    X = vectorizer.fit_transform(X_words)
    print("done!")
    y = np.array(y)

    return X, y, X_words, vectorizer, study_sent_indices, study_indices

    print "Finished! %d studies included domain %s" % (counter, test_domain)
Example #3
def word_sent_tokenize(raw_text):
    return [word_tokenizer.tokenize(sent) for sent in sent_tokenizer.tokenize(raw_text)]
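These examples never show where `sent_tokenizer` and `word_tokenizer` come from; a plausible setup (an assumption, not confirmed by the source) is NLTK's Punkt sentence model plus the Treebank word tokenizer:

# assumed tokenizer setup -- requires a one-time nltk.download("punkt")
import nltk
from nltk.tokenize import TreebankWordTokenizer

sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
word_tokenizer = TreebankWordTokenizer()

print(word_sent_tokenize("Allocation was concealed. Assessors were blinded."))
# [['Allocation', 'was', 'concealed', '.'], ['Assessors', 'were', 'blinded', '.']]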
Example #4
def doc_demo(models, testfile="testdata/demo.pdf", test_mode=False):

    import color  # local helper module for colored terminal output

    print("Document demo: " + testfile)
    print("=" * 40)
    print()

    raw_text = PdfReader(testfile).get_text()
    text = unidecode(raw_text)
    text = re.sub('\n', ' ', text)

    # tokenize into sentences
    sents = sent_tokenizer.tokenize(text)

    # only parse the first domain in test mode
    domain_limiter = 1 if test_mode else len(CORE_DOMAINS)

    for test_domain, doc_model, doc_vec, sent_model, sent_vec in zip(
            CORE_DOMAINS[:domain_limiter], *models):

        ####
        ## PART ONE - get the predicted sentences with risk of bias information
        ####

        # vectorize the sentences
        X_sents = sent_vec.transform(sents)

        # get predicted 1 / -1 for the sentences
        pred_sents = sent_model.predict(X_sents)

        # keep the sentences which are predicted 1
        positive_sents = [sent for sent, pred in zip(sents, pred_sents) if pred == 1]

        # make a single string per doc
        summary_text = " ".join(positive_sents)

        ####
        ## PART TWO - integrate summarized and full text, then predict the document class
        ####

        doc_vec.builder_clear()
        doc_vec.builder_add_docs([text])
        doc_vec.builder_add_docs([summary_text], prefix="high-prob-sent-")

        X_doc = doc_vec.builder_transform()

        prediction = doc_model.predict(X_doc)[0]
        print("-" * 30)
        print(test_domain)

        prediction = {1: "Low", -1: "Unknown or high"}[prediction]

        # prediction is always one of the two labels, so else is safe here
        if prediction == "Low":
            text_color = color.GREEN
        else:
            text_color = color.YELLOW

        color.printout(prediction, text_color)

        print("-" * 30)