Example No. 1
    def test_with_wiki_article(self):
        # build one passage per sentence of the Wikipedia article
        arnold_article = read_file('arnold.txt')
        arnold_article_doc = nlp(arnold_article)
        arnold_sections = [sent.text.strip() for sent in arnold_article_doc.sents]
        passages = Passages()
        for section in arnold_sections:
            passages.add(Passage(section))
        question_str = "Who is the brother of Arnold Schwarzenegger"
        question = QuestionModel(text_to_token_list(question_str), question_str)

        tfidf = TfIdfRanker(text_to_token_list)
        ranked_passages = tfidf.calc_passage_ranks(passages, question)

        # the top-ranked passage should mention Arnold's brother, Meinhard
        self.assertIn('Meinhard', ranked_passages.get_passage_at(0).text)
        for p in ranked_passages.passages:
            print(str(p.tfidf_score) + ': ' + p.text)
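The text_to_token_list helper passed to QuestionModel and TfIdfRanker above is not shown on this page. A minimal sketch of such a tokenizer, assuming spaCy is available and that lowercased, stop-word-free lemmas are what the ranker expects (both assumptions, not the project's actual implementation):

import spacy

nlp = spacy.load('en_core_web_sm')


def text_to_token_list(text):
    # hypothetical tokenizer: lowercased lemmas without stop words or punctuation
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc
            if not token.is_stop and not token.is_punct]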
Example No. 2
    def test_with_simple_input(self):
        passages = Passages()
        passages.add(Passage("c++ c++ object class oriented"))
        passages.add(Passage("class oriented code code sort sort python python"))
        passages.add(Passage("java java overflow sort stack stack"))
        passages.add(Passage("c++ pointer code code array"))
        passages.add(Passage("c++ object class oriented java"))
        passages.add(Passage("sort python python"))
        passages.add(Passage("object class java java array"))
        passages.add(Passage("c++ c++java java python python"))
        passages.add(Passage("loop loop code sort sort"))
        passages.add(Passage("class class code array array stack stack"))
        question_str = "java stack overflow"
        question = QuestionModel(question_str.split(), question_str)

        tfidf = TfIdfRanker(lambda text: text.split())
        ranked_passages = tfidf.calc_passage_ranks(passages, question)

        self.assertEqual("java java overflow sort stack stack", ranked_passages.get_passage_at(0).text)
Example No. 3
    def calc_passage_ranks(self, passages: Passages,
                           question_model: QuestionModel) -> Passages:
        # tokenize every passage and collect the question keywords
        query_keywords = question_model.get_keywords()
        list_of_passage_tokens = passages.map(
            lambda passage: self.convert_text_to_token_list(passage.text))

        # build term/score dictionaries for the query and for each passage
        query_dict = self._build_query_dictionary(query_keywords)
        score_list = self._calculate_score_list(list_of_passage_tokens)

        # vectorize passages and query together so they share one vocabulary,
        # then compare each passage vector against the query vector
        vectors = vectorize(score_list + [query_dict])
        list_of_passage_vectors = vectors[:-1]
        query_vector = vectors[-1]
        similarities_array = self._calculate_similarities(
            list_of_passage_vectors, query_vector)

        # attach the similarity scores and return the passages sorted by rank
        ranked_passages = self._build_ranked_passages_from(
            passages, similarities_array)
        ranked_passages.sort()
        return ranked_passages
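The vectorize and _calculate_similarities helpers are not shown on this page. The sketch below illustrates the same pipeline end to end: build a tf-idf dictionary per passage, vectorize passages and query over one shared vocabulary, and score each passage by cosine similarity against the query. The helper names and the plain tf-idf weighting are assumptions rather than the project's actual code.

import math
from collections import Counter


def tfidf_dicts(token_lists):
    # document frequency of every term across the passages
    df = Counter()
    for tokens in token_lists:
        df.update(set(tokens))
    n_docs = len(token_lists)
    dicts = []
    for tokens in token_lists:
        tf = Counter(tokens)
        dicts.append({term: count * math.log(n_docs / df[term])
                      for term, count in tf.items()})
    return dicts


def vectorize(dicts):
    # align all dictionaries on one shared vocabulary
    vocabulary = sorted({term for d in dicts for term in d})
    return [[d.get(term, 0.0) for term in vocabulary] for d in dicts]


def cosine_similarity(a, b):
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0


passage_tokens = [
    "java java overflow sort stack stack".split(),
    "sort python python".split(),
    "c++ pointer code code array".split(),
]
query_tokens = "java stack overflow".split()

dicts = tfidf_dicts(passage_tokens) + [dict(Counter(query_tokens))]
vectors = vectorize(dicts)
scores = [cosine_similarity(v, vectors[-1]) for v in vectors[:-1]]
print(scores)

With these toy passages, only the first one shares any terms with the query, so it is the only one with a non-zero cosine score.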
Example No. 4
def process_question(question: str, nlp_toolkit: NLPToolkit) -> QPResult:
    # start logging
    Logger.info('started')
    start = datetime.now()

    # start question processing
    clf_name = get_clf_name(question)
    clf = get_clf_from_disk(clf_name)
    label = get_predicted_label(question, clf)
    keywords = nlp_toolkit.get_headwords(question)
    # keywords = get_key_words(question)
    # print(keywords)
    # print(AnswerType[label.upper()])

    # end logging
    end = datetime.now()
    diff = end - start
    Logger.info('AnswerType: ' + str(AnswerType[label]))
    Logger.info('finished ({:.2f} s)'.format(diff.total_seconds()))
    Logger.small_seperator()

    return QPResult(QuestionModel(keywords, question), AnswerType[label])
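AnswerType[label] is a name-based enum lookup. A minimal sketch of such an enum follows; only HUM_ind and ENTY_dismed appear elsewhere on this page, so the remaining members are illustrative guesses rather than the project's definition.

from enum import Enum, auto


class AnswerType(Enum):
    # HUM_ind and ENTY_dismed appear elsewhere on this page; the rest are assumed
    HUM_ind = auto()
    ENTY_dismed = auto()
    LOC_city = auto()
    NUM_date = auto()


label = 'HUM_ind'                  # what get_predicted_label might return
answer_type = AnswerType[label]    # name-based lookup used in process_question
print(answer_type)                 # -> AnswerType.HUM_ind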
Example No. 5
# iterate over a SQuAD-style dataset: every article has paragraphs, and every
# paragraph carries a list of question/answer sets
for dataset in data['data']:
    title = dataset['title']
    Logger.info('Dataset: ' + title)

    for paragraph in dataset['paragraphs'][:5]:
        context = paragraph['context']
        for question_answer_set in paragraph['qas']:
            question_counter += 1
            question = question_answer_set['question']
            correct_answers = question_answer_set['answers']
            Logger.info(question)

            # wrap the paragraph as a single document and rank its passages
            doc = Document(title, context)
            docs = Documents()
            docs.add(doc)

            question_model = QuestionModel(question.split(), question)
            qp_result = QPResult(question_model, AnswerType.ENTY_dismed)

            ranked_passages = receive_passages(docs, qp_result, nlptoolkit)
            answer = ranked_passages.get_passage_at(0).text

            Logger.info(answer)
            Logger.info(str(correct_answers))
            if text_contains_any_answer(answer, correct_answers):
                correct_answers_counter += 1
            Logger.info("Result: " + str(correct_answers_counter) + " / " +
                        str(question_counter))
            Logger.info('')
            Logger.info('')
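text_contains_any_answer is called above but not defined on this page. One plausible implementation, assuming SQuAD-style answer dicts with a 'text' field and a case-insensitive substring match:

def text_contains_any_answer(text, answers):
    # hypothetical helper: True if any gold answer string occurs in the passage
    lowered = text.lower()
    return any(answer['text'].lower() in lowered for answer in answers)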
Example No. 6
def process_question(question: str, nlp_toolkit):
    if not isinstance(question, str):
        raise Exception("process_question expects parameter of type str")
    return QPResult(QuestionModel([], ""), AnswerType.HUM_ind)
Example No. 7
        for question_answer_set in paragraph['qas']:
            Logger.error(
                '####################################################')
            Logger.error('Question ' + str(question_counter))
            Logger.error('')
            question_counter += 1
            question = question_answer_set['question']
            correct_answers = question_answer_set['answers']
            Logger.error(question)

            # answer = process_answer_question(question)

            keywords = nlp.get_headwords(question)
            Logger.error('Headwords: ' + ', '.join(keywords))
            keywords.append(title_ext)
            qm = QuestionModel(keywords, question)
            # error("Keywords: " + ', '.join(keywords))
            docs = receive_docs(qm, nlp)
            Logger.error("Correct Doc: " + title_ext)

            # check whether the gold article appears among the top five retrieved documents
            found = False
            for idx, doc in enumerate(docs.docs[:5]):
                if doc.title in title_ext:
                    correct_answer_dict[idx] += 1
                    found = True
                    break

            if not found:
                Logger.error("No Matching Doc found: ")
                Logger.error("Docs: " + docs.get_doc(1).title + ", " +
                             docs.get_doc(2).title + ", " +
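question_counter and correct_answer_dict are initialized outside this snippet. A plausible setup that matches how they are used above (an assumption, not taken from the project):

from collections import defaultdict

question_counter = 0
# maps the rank position (0-4) at which the gold article was found to a hit count
correct_answer_dict = defaultdict(int)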
Example No. 8
    def test_returns_main_article(self):
        question_str = "Arnold Schwarzenegger"
        question_model = QuestionModel(question_str.split(), question_str)
        result = document_retrieval.receive_docs(question_model, NLPToolkit())
        self.assertEqual(result.get_doc_with_highest_rank().title,
                         'Arnold Schwarzenegger')