def receive_passages(docs: Documents, qp_result: QPResult, nlp_toolkit: NLPToolkit) -> Passages:
    """Gather the passages of the leading documents and rank them with tf-idf.

    The passages of the first ``TOP_N_DOCS`` entries of ``docs.docs`` are
    pooled, ranked by ``TfIdfRanker.calc_passage_ranks`` against
    ``qp_result.question_model`` and then handed to ``Scorer.min_max_norm``.

    :param docs: container whose ``docs`` attribute is sliced for the top documents
    :param qp_result: supplies the ``question_model`` used for ranking
    :param nlp_toolkit: supplies ``remove_stop_words``, used as the ranker's tokenizer
    :return: the ranked passages as returned by the tf-idf ranker
    """
    # start logging
    Logger.info('started')
    start = datetime.now()

    # Rank passages of the TOP_N_DOCS documents with tfidf
    passages = Passages()
    for doc in docs.docs[:TOP_N_DOCS]:
        for p in doc.passages:
            passages.add(p)

    tfidf = TfIdfRanker(nlp_toolkit.remove_stop_words)
    ranked_passages = tfidf.calc_passage_ranks(passages, qp_result.question_model)
    log_passages(ranked_passages)
    # TOP_N_PASSAGES is only reported here; this function does not truncate the result.
    Logger.info('#Passages: ' + str(len(ranked_passages.passages)) + ' considering ' + str(TOP_N_PASSAGES))

    scorer = Scorer()
    # presumably normalises the scores in place; the return value is not used here
    scorer.min_max_norm(ranked_passages)

    # total_seconds() yields a correct decimal value. Joining `seconds` and
    # `microseconds` with '.' dropped the zero padding of the fraction
    # (1 s + 5000 us was logged as "1.5000") and ignored the days component.
    elapsed = datetime.now() - start
    Logger.info('finished (' + format(elapsed.total_seconds(), '.6f') + ' s)')
    Logger.small_seperator()
    return ranked_passages
def _build_ranked_passages_from(passages: Passages, similarities_array: list) -> Passages:
    """Pair every similarity with the passage at the same index.

    For each position ``i`` of ``similarities_array`` the passage returned by
    ``passages.get_passage_at(i)`` gets the value stored in its
    ``tfidf_score`` attribute and is added to a new ``Passages`` collection.
    The passage objects themselves are reused, not copied.

    :param passages: source collection, addressed by index
    :param similarities_array: one similarity value per passage index
    :return: new ``Passages`` holding the scored passages in input order
    """
    scored = Passages()
    for position, score in enumerate(similarities_array):
        current = passages.get_passage_at(position)
        current.tfidf_score = score
        scored.add(current)
    return scored
def test_with_wiki_article(self):
    """Rank the sentences of 'arnold.txt' and expect 'Meinhard' in the top hit."""
    arnold_article = read_file('arnold.txt')
    arnold_article_doc = nlp(arnold_article)
    # One passage per sentence of the parsed article (instead of one per line).
    # NOTE(review): `nlp` looks like a spaCy pipeline; `Span.string` is no longer
    # available in spaCy v3 (use `.text`) - confirm against the pinned version.
    arnold_sections = [sent.string.strip() for sent in arnold_article_doc.sents]

    passages = Passages()
    for section in arnold_sections:
        passages.add(Passage(section))

    question_str = "Who is the brother of Arnold Schwarzenegger"
    question = QuestionModel(text_to_token_list(question_str), question_str)

    tfidf = TfIdfRanker(text_to_token_list)
    ranked_passages = tfidf.calc_passage_ranks(passages, question)

    # assertIn reports both operands on failure, unlike assertTrue(find(...) != -1).
    self.assertIn('Meinhard', ranked_passages.get_passage_at(0).text)

    # Dump the ranking for manual inspection.
    for p in ranked_passages.passages:
        print(str(p.tfidf_score) + ': ' + p.text)
class Document:
    """A document with its title, text, search score, index and passages."""

    def __init__(self, title: str, text: str, elastic_score: float = 0, doc_index=0,
                 passages: Passages = None) -> None:
        """Store the given fields; create a fresh ``Passages`` when none is passed."""
        # A new container per instance avoids sharing one default between documents.
        if passages is None:
            passages = Passages()
        self.passages = passages
        self.doc_index = doc_index
        self.elastic_score = elastic_score
        self.text = text
        self.title = title

    def add_passage(self, passage: Passage):
        """Add ``passage`` to this document's passage collection."""
        self.passages.add(passage)

    def get_passages(self):
        """Return the passage collection held by this document."""
        return self.passages

    def get_id(self) -> str:
        """Return ``doc_index`` converted to a string."""
        identifier = str(self.doc_index)
        return identifier
def calc_passage_ranks(self, passages: Passages, question_model: QuestionModel) -> Passages:
    """Score all passages against the question and return them after sorting.

    Every passage text is tokenized with ``self.convert_text_to_token_list``;
    the question keywords are turned into a query dictionary. Passage scores
    and the query dictionary are vectorized in a single ``vectorize`` call,
    each passage vector is compared with the query vector, and the resulting
    similarities are attached to the passages.

    :param passages: the passages to rank
    :param question_model: provides the query keywords via ``get_keywords()``
    :return: the collection built by ``_build_ranked_passages_from`` after
        its ``sort()`` method was called
    """
    keywords = question_model.get_keywords()
    passage_tokens = passages.map(
        lambda passage: self.convert_text_to_token_list(passage.text))

    query_dict = self._build_query_dictionary(keywords)
    passage_scores = self._calculate_score_list(passage_tokens)

    # The query is appended as the last entry so that it is vectorized together
    # with the passages; it is split off again right afterwards.
    all_vectors = vectorize(passage_scores + [query_dict])
    passage_vectors, query_vec = all_vectors[:-1], all_vectors[-1]

    similarities = self._calculate_similarities(passage_vectors, query_vec)
    ranked = self._build_ranked_passages_from(passages, similarities)
    ranked.sort()
    return ranked
def test_with_simple_input(self):
    """Rank ten tiny keyword passages for 'java stack overflow' and check the top hit."""
    passage_texts = (
        "c++ c++ object class oriented",
        "class oriented code code sort sort python python",
        "java java overflow sort stack stack",
        "c++ pointer code code array",
        "c++ object class oriented java",
        "sort python python",
        "object class java java array",
        # NOTE(review): "c++java" has no separating space - confirm this is intended.
        "c++ c++java java python python",
        "loop loop code sort sort",
        "class class code array array stack stack",
    )
    passages = Passages()
    for passage_text in passage_texts:
        passages.add(Passage(passage_text))

    question_str = "java stack overflow"
    question = QuestionModel(question_str.split(), question_str)

    # Whitespace splitting is used as tokenizer for both question and passages.
    ranker = TfIdfRanker(lambda text: text.split())
    ranked = ranker.calc_passage_ranks(passages, question)

    best = ranked.get_passage_at(0)
    self.assertEqual("java java overflow sort stack stack", best.text)
def receive_passages(docs: Documents, question_model: QuestionModel, nlp_toolkit):
    """Check the type of ``docs`` and return a newly constructed ``Passages``.

    ``question_model`` and ``nlp_toolkit`` are accepted but not used.

    :raises Exception: if ``docs`` is not a ``Documents`` instance
    """
    if isinstance(docs, Documents):
        return Passages()
    raise Exception("receive_passages expects parameter of type Documents")
def __init__(self, title: str, text: str, elastic_score: float = 0, doc_index=0,
             passages: Passages = None) -> None:
    """Store the given fields; create a fresh ``Passages`` when none is passed."""
    # A new container per instance avoids sharing one default between instances.
    if passages is None:
        passages = Passages()
    self.passages = passages
    self.doc_index = doc_index
    self.elastic_score = elastic_score
    self.text = text
    self.title = title