示例#1
0
def receive_passages(docs: Documents, qp_result: QPResult,
                     nlp_toolkit: NLPToolkit) -> Passages:
    """Rank the passages of the leading documents against the question.

    The passages of the first ``TOP_N_DOCS`` entries of ``docs.docs`` are
    pooled, ranked with a ``TfIdfRanker`` against
    ``qp_result.question_model``, logged, and handed to
    ``Scorer.min_max_norm``.

    Args:
        docs: Container whose ``docs`` attribute is sliced to the first
            ``TOP_N_DOCS`` documents.
        qp_result: Query-processing result providing ``question_model``.
        nlp_toolkit: Supplies ``remove_stop_words``, which is passed to the
            ranker as its text-to-token converter.

    Returns:
        The ranked passages as produced by ``calc_passage_ranks``.
        NOTE(review): ``min_max_norm`` is presumably applied in place, since
        its return value is ignored here; confirm against ``Scorer``.
    """
    # start logging
    Logger.info('started')
    start = datetime.now()

    # Pool the passages of the first TOP_N_DOCS documents.
    passages = Passages()
    for doc in docs.docs[:TOP_N_DOCS]:
        for p in doc.passages:
            passages.add(p)

    # Rank the pooled passages with tf-idf.
    tfidf = TfIdfRanker(nlp_toolkit.remove_stop_words)
    ranked_passages = tfidf.calc_passage_ranks(passages,
                                               qp_result.question_model)
    log_passages(ranked_passages)
    Logger.info('#Passages: ' + str(len(ranked_passages.passages)) +
                ' considering ' + str(TOP_N_PASSAGES))
    scorer = Scorer()
    scorer.min_max_norm(ranked_passages)

    # total_seconds() includes the days component and the fixed six decimals
    # keep the fraction correct; joining ``seconds`` and ``microseconds``
    # with a dot rendered e.g. 1 s + 5000 us as "1.5000".
    elapsed = (datetime.now() - start).total_seconds()
    Logger.info('finished ({:.6f} s)'.format(elapsed))
    Logger.small_seperator()

    return ranked_passages
示例#2
0
 def _build_ranked_passages_from(passages: Passages,
                                 similarities_array: list) -> Passages:
     """Attach each similarity to its passage and collect the passages.

     The i-th entry of ``similarities_array`` is stored as ``tfidf_score`` on
     the passage returned by ``passages.get_passage_at(i)``. The passage
     objects themselves are updated and added to a new ``Passages``
     container, in the same order as the similarities.
     """
     result = Passages()
     for position, score in enumerate(similarities_array):
         scored = passages.get_passage_at(position)
         scored.tfidf_score = score
         result.add(scored)
     return result
示例#3
0
    def test_with_wiki_article(self):
        """Check that the top-ranked sentence of the article names the brother.

        The article in ``arnold.txt`` is split into sentences, each sentence
        becomes a ``Passage``, and the passages are ranked against the
        question. The best passage is expected to contain 'Meinhard'.
        """
        arnold_article = read_file('arnold.txt')
        arnold_article_doc = nlp(arnold_article)
        # NOTE(review): assumes ``nlp`` is a spaCy pipeline. ``Span.string``
        # was removed in spaCy 3; ``Span.text`` exists in both major versions
        # and yields the same value once stripped.
        # (Splitting the raw article on "\n" was the earlier alternative.)
        arnold_sections = [sent.text.strip() for sent in arnold_article_doc.sents]
        passages = Passages()
        for section in arnold_sections:
            passages.add(Passage(section))
        question_str = "Who is the brother of Arnold Schwarzenegger"
        question = QuestionModel(text_to_token_list(question_str), question_str)

        tfidf = TfIdfRanker(text_to_token_list)
        ranked_passages = tfidf.calc_passage_ranks(passages, question)

        # assertIn reports both operands on failure, unlike assertTrue(find).
        self.assertIn('Meinhard', ranked_passages.get_passage_at(0).text)
        for p in ranked_passages.passages:
            print(str(p.tfidf_score) + ': ' + p.text)
示例#4
0
class Document:
    """A retrieved document together with the passages attached to it."""

    def __init__(self, title: str, text: str, elastic_score: float = 0, doc_index=0, passages: Passages = None) -> None:
        """Store the given fields; create an empty ``Passages`` if none is given."""
        if passages is None:
            # Build a fresh container per instance instead of sharing one.
            passages = Passages()
        self.passages = passages
        self.doc_index = doc_index
        self.elastic_score = elastic_score
        self.text = text
        self.title = title

    def add_passage(self, passage: Passage):
        """Add ``passage`` to this document's passage container."""
        self.passages.add(passage)

    def get_passages(self):
        """Return the passage container held by this document."""
        return self.passages

    def get_id(self) -> str:
        """Return ``doc_index`` converted to a string."""
        return str(self.doc_index)
示例#5
0
    def calc_passage_ranks(self, passages: Passages,
                           question_model: QuestionModel) -> Passages:
        """Score every passage against the question and return them sorted.

        The question keywords and the tokenised passage texts are turned into
        dictionaries, vectorised together, and each passage vector is compared
        with the query vector. The resulting similarities are attached to the
        passages, and the new ``Passages`` container is sorted before it is
        returned.
        """
        keywords = question_model.get_keywords()
        passage_tokens = passages.map(
            lambda passage: self.convert_text_to_token_list(passage.text))

        query_terms = self._build_query_dictionary(keywords)
        passage_scores = self._calculate_score_list(passage_tokens)

        # The query is appended last so it can be split off again afterwards.
        all_vectors = vectorize(passage_scores + [query_terms])
        similarities = self._calculate_similarities(
            all_vectors[:-1], all_vectors[-1])

        result = self._build_ranked_passages_from(passages, similarities)
        result.sort()
        return result
示例#6
0
    def test_with_simple_input(self):
        """Rank ten whitespace-tokenised passages for 'java stack overflow'.

        The passage containing all three query words is expected at rank 0.
        """
        corpus = (
            "c++ c++ object class oriented",
            "class oriented code code sort sort python python",
            "java java overflow sort stack stack",
            "c++ pointer code code array",
            "c++ object class oriented java",
            "sort python python",
            "object class java java array",
            "c++ c++java java python python",
            "loop loop code sort sort",
            "class class code array array stack stack",
        )
        passages = Passages()
        for passage_text in corpus:
            passages.add(Passage(passage_text))

        question_str = "java stack overflow"
        question = QuestionModel(question_str.split(), question_str)

        ranker = TfIdfRanker(lambda text: text.split())
        best = ranker.calc_passage_ranks(passages, question).get_passage_at(0)

        self.assertEqual("java java overflow sort stack stack", best.text)
示例#7
0
def receive_passages(docs: Documents, question_model: QuestionModel,
                     nlp_toolkit):
    """Validate ``docs`` and return an empty ``Passages`` container.

    ``question_model`` and ``nlp_toolkit`` are accepted but not used.

    Raises:
        TypeError: If ``docs`` is not a ``Documents`` instance.
    """
    if not isinstance(docs, Documents):
        # TypeError subclasses Exception, so handlers written against the
        # previous bare Exception keep working.
        raise TypeError("receive_passages expects parameter of type Documents")
    return Passages()
示例#8
0
 def __init__(self, title: str, text: str, elastic_score: float = 0, doc_index=0, passages: Passages = None) -> None:
     """Store the given fields; create an empty ``Passages`` if none is given."""
     if passages is None:
         # Build a fresh container per instance instead of sharing one.
         passages = Passages()
     self.passages = passages
     self.doc_index = doc_index
     self.elastic_score = elastic_score
     self.text = text
     self.title = title