from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor


def get_answer_from_doc(rep_doc):
    extractor = MasterExtractor()
    # The leading spaces in the keys are kept as-is; they evidently match the
    # headers of the source records.
    doc = Document(rep_doc['title'], rep_doc[' description'], rep_doc[' body'],
                   rep_doc[' time'])
    doc = extractor.parse(doc)
    answers = []
    for question in ('who', 'what', 'when', 'where', 'why', 'how'):
        try:
            answers.append(doc.get_top_answer(question).get_parts_as_text())
        except Exception:  # no top answer found for this question
            answers.append('unknown')
    return tuple(answers)
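A quick smoke test of this helper might look like the sketch below; the record and its values are hypothetical, with the space-prefixed keys mirroring the lookups above:

sample = {
    'title': 'Severe storm hits the coast',
    ' description': 'A powerful storm made landfall on Tuesday.',
    ' body': 'Residents of the coastal town were evacuated on Tuesday '
             'as a powerful storm approached.',
    ' time': '2016-10-04 10:00:00',
}
who, what, when, where, why, how = get_answer_from_doc(sample)
print(who, '|', when, '|', where)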
Example #2
from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor


def extract_article(news_json):
    extractor = MasterExtractor()
    # doc = Document.from_text(sample["text"], sample["date_publish"])
    doc = Document(news_json["title"], news_json["description"],
                   news_json["text"], news_json["date_publish"])
    # or: doc = Document(title, lead, text, date_publish)
    doc = extractor.parse(doc)
    # Note that this example does not extract 'when'.
    result = {}
    for question in ('who', 'what', 'where', 'why', 'how'):
        result[question] = (doc.get_top_answer(question).get_parts_as_text()
                            if len(doc.get_answers(question)) > 0 else "")
    return result
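The commented-out line above shows the alternative constructor that builds a Document from raw text alone. A minimal sketch of that path, with an illustrative sample record:

sample = {"text": "The mayor opened the new bridge in Hamburg on Friday.",
          "date_publish": "2019-05-03 12:00:00"}
doc = MasterExtractor().parse(Document.from_text(sample["text"],
                                                 sample["date_publish"]))
print(doc.get_top_answer('who').get_parts_as_text()
      if len(doc.get_answers('who')) > 0 else "")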
Example #3
import json

from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor

extractor = MasterExtractor()  # used below but not defined in the original excerpt
wordNetLemmatizer = WordNetLemmatizer()

errorIndexes = []
print('process start')
with open(keywordsTextFilePath, 'a') as f:
    for i in range(startTextFileLength, len(huffPostData)):
        # for i in range(startTextFileLength, 22552):
        try:
            print(f'{i}th index start')
            huffPostDatum = huffPostData[i]
            keywords = []
            doc = Document(huffPostDatum['title'], huffPostDatum['subtitle'],
                           huffPostDatum['content'], huffPostDatum['date'])
            print(i, 'extractor.parse(doc) start')
            doc = extractor.parse(doc)
            print(i, 'doc.get_answers() start')
            answers = doc.get_answers()

            for (fivew1h, answer) in answers.items():
                if len(answer) != 0:
                    text = answer[0].get_parts_as_text()
                    words = word_tokenize(text)
                    wordPosTuples = pos_tag(words)

                    for wordPosTuple in wordPosTuples:
                        if wordPosTuple[1].startswith('N'):  # keep nouns only
                            lemmatizedWord = wordNetLemmatizer.lemmatize(
                                wordPosTuple[0].lower())
                            keywords.append({
                                'keyword': lemmatizedWord,
                                # Assumed field: the original snippet is cut
                                # off after 'keyword'; recording the question
                                # type alongside it is a plausible completion.
                                'question': fivew1h
                            })
            # Assumed: the original write call was also cut off.
            f.write(json.dumps(keywords) + '\n')
        except Exception:
            # Assumed: record failing indexes, matching errorIndexes above.
            errorIndexes.append(i)
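The keyword step above boils down to: tokenize each top answer, POS-tag it, keep the nouns, and lemmatize them. A self-contained sketch of just that NLTK pipeline (it needs the punkt, averaged_perceptron_tagger, and wordnet NLTK data packages):

from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
tokens = word_tokenize('Protesters marched through the cities')
for word, tag in pos_tag(tokens):
    if tag.startswith('N'):  # noun tags: NN, NNS, NNP, NNPS
        print(lemmatizer.lemmatize(word.lower()))  # -> protester, city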
Example #4
class WHPhrasesBatchProcessor(BatchProcessorBase):
    """
    Extracts the WH phrases (who, what, when, where, why, how) from text.

    This is intended to be run from within a Docker network, since
    access to a Stanford CoreNLP server API at http://corenlp-service:9000
    is required. Please see the readme file at https://github.com/stevengt/whatwhy
    for more information.
    """
    def __init__(self,
                 source,
                 dest,
                 id_col_name="ID",
                 source_col_name="Preprocessed Text",
                 dest_col_name=None,
                 include_cols=None):

        super().__init__(source=source,
                         dest=dest,
                         id_col_name=id_col_name,
                         source_col_name=source_col_name,
                         include_cols=include_cols)
        configure_nltk()
        sleep(60)  # Wait for Stanford CoreNLP server to start.
        extractor_preprocessor = Preprocessor("http://corenlp-service:9000")
        extractors = [
            action_extractor.ActionExtractor(),  # who, what
            cause_extractor.CauseExtractor(),  # why
            method_extractor.MethodExtractor()  # how
        ]
        self.extractor = MasterExtractor(preprocessor=extractor_preprocessor,
                                         extractors=extractors)

    def get_top_wh_phrases(self, text_segment):
        top_phrases = {question_type: None for question_type in QUESTION_WORDS}

        if text_segment is not None and text_segment is not np.nan:
            try:
                doc = Document.from_text(text_segment)
                doc = self.extractor.parse(doc)
                for question_type in QUESTION_WORDS:
                    if question_type == "where" or question_type == "when":
                        top_phrases[question_type] = "NOT PROCESSED"
                    else:
                        try:
                            top_phrases[question_type] = doc.get_top_answer(
                                question_type).get_parts_as_text()
                        except:
                            continue
            except:
                pass

        return top_phrases

    def get_batch_results(self, batch):
        batch_as_df = get_df_from_csv_string(batch)
        for question_type in QUESTION_WORDS:
            batch_as_df[question_type] = None
        for i, row in batch_as_df.iterrows():
            top_wh_phrases = self.get_top_wh_phrases(row[self.source_col_name])
            for question_type in QUESTION_WORDS:
                batch_as_df.at[i, question_type] = top_wh_phrases.get(
                    question_type)

        results_df_cols = [self.id_col_name]
        results_df_cols.extend(QUESTION_WORDS)
        results_df_cols.extend(self.include_cols)
        results_df = batch_as_df[results_df_cols]
        results_csv_string = get_csv_string_from_df(results_df)

        results = {
            "target_results_file_name":
                f"batch{batch_as_df[self.id_col_name].iloc[0]}.csv",
            "file_content": results_csv_string
        }
        return results
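For reference, the same Giveme5W1H configuration can be sketched standalone. The import paths for the sub-extractors and the CoreNLP preprocessor below are assumptions based on the package layout (the class and keyword names themselves appear in the example above), and the URL points at a local CoreNLP server:

from Giveme5W1H.extractor.document import Document
from Giveme5W1H.extractor.extractor import MasterExtractor
from Giveme5W1H.extractor.extractors import (action_extractor, cause_extractor,
                                             method_extractor)
from Giveme5W1H.extractor.preprocessors.preprocessor_core_nlp import Preprocessor

preprocessor = Preprocessor("http://localhost:9000")  # local CoreNLP server
extractor = MasterExtractor(preprocessor=preprocessor,
                            extractors=[action_extractor.ActionExtractor(),
                                        cause_extractor.CauseExtractor(),
                                        method_extractor.MethodExtractor()])
doc = extractor.parse(Document.from_text('The CEO resigned on Monday.'))
print(doc.get_top_answer('who').get_parts_as_text())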