def direct_matching(title, concept):
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    # Exact match: same words and same number of words after cleaning.
    return set(concept).issubset(set(title)) and len(concept) == len(title)
def title_in_concept(title, concept):
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    # Partial match: every title word appears somewhere in the concept.
    return set(title).issubset(set(concept))
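# Hedged usage sketch (not from the original source): the strings below are invented examples
# and assume clean_text lower-cases and strips punctuation.
if __name__ == "__main__":
    print(direct_matching("Binary Search Trees", "binary search trees"))            # True: same words, same length
    print(direct_matching("Balanced Binary Search Trees", "binary search trees"))   # False: extra title word
    print(title_in_concept("Binary Search", "binary search trees"))                 # True: title words are a subset of the concept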
from nltk import pos_tag

import text_cleaning  # project-local text-cleaning helpers


def compute_sentences_scores(vocabulary, dataset, headline_similarity_influence):
    sentences_scores = {}
    for headline, _, ctext in dataset:
        article_sentences = text_cleaning.clean_text(ctext)  # [[words]]
        article_sentences = text_cleaning.stem(article_sentences)
        headline_sentences = text_cleaning.clean_text(headline)
        headline_sentences = text_cleaning.stem(headline_sentences)
        headline_words = set(text_cleaning.get_all_words(headline_sentences))

        for i, sentence in enumerate(article_sentences, start=1):
            sentence_score = 0
            nouns_count = 0
            count_article_words_in_headline = 0  # reset per sentence so the headline bonus is computed per sentence
            # 3.1: Compute score using nouns only (tagging the set means each distinct word is counted once)
            tagged_sentence = pos_tag(set(sentence))
            for word, pos in tagged_sentence:
                if word in vocabulary and pos[:2] == 'NN':  # word is a noun
                    nouns_count += 1
                    tf, idf = vocabulary[word]
                    sentence_score += tf * idf  # TODO: Check if correct
                    if word in headline_words:
                        # 3.2: Additional score for words shared with the headline
                        count_article_words_in_headline += 1
            if nouns_count > 0:
                sentence_score /= nouns_count  # Normalize by the number of nouns (guard against sentences with none)
            sentences_scores[' '.join(sentence)] = \
                (sentence_score
                 + (count_article_words_in_headline / len(headline_words)) * headline_similarity_influence) \
                * (i / len(article_sentences))  # 3.3: weight by the sentence's location in the text
    # Return the three highest-scoring sentences.
    return sorted(sentences_scores.items(), key=lambda d: d[1], reverse=True)[:3]
from math import log


def compute_tf_idf(dataset):  # dataset = [(headline, text, ctext)]
    vocabulary = {}        # {word: (tf, idf)}
    ctext_words = []       # one set of words per document
    dataset_sentences = []
    for _, _, ctext in dataset:
        sentences = text_cleaning.clean_text(ctext)  # [[words]]
        sentences = text_cleaning.stem(sentences)
        dataset_sentences.append(sentences)
        words = text_cleaning.get_all_words(sentences)
        ctext_words.append(set(words))
        for word in words:
            if word not in vocabulary:
                vocabulary[word] = (1, 0)
            else:
                tf, idf = vocabulary[word]
                vocabulary[word] = (tf + 1, idf)
    print("Done computing TF")

    for word in vocabulary:
        tf, _ = vocabulary[word]
        occurrences = 0
        for words in ctext_words:
            if word in words:
                # Found a document that contains the word, so increase the document count for the IDF.
                occurrences += 1
        vocabulary[word] = (tf, log(len(ctext_words) / occurrences))
    print('Done with IDF')
    return vocabulary, dataset_sentences
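# Hedged usage sketch: a toy dataset in the (headline, text, ctext) layout the two functions
# above expect. The rows are invented, the weight 0.3 is arbitrary, and the exact output depends
# on this project's text_cleaning module.
toy_dataset = [
    ("Solar power grows quickly", "",
     "Solar power capacity grew quickly last year. Analysts expect solar energy growth to continue."),
    ("Rivers flood the valley", "",
     "Heavy rain caused rivers to flood the valley. Farmers reported damage to crops."),
]
toy_vocabulary, _ = compute_tf_idf(toy_dataset)
for sentence, score in compute_sentences_scores(toy_vocabulary, toy_dataset, headline_similarity_influence=0.3):
    print(round(score, 3), sentence)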
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences


def get_answer(text):
    # `model`, `word_index`, `get_key`, `padding_type`, `truncating_type` and `max_length`
    # are defined elsewhere in the project.
    # Map each cleaned input word to its vocabulary index, falling back to the <OOV> index.
    index_array = []
    for word in text_cleaning.clean_text(text).split(" "):
        index = word_index.get(word)
        if index is None:
            index_array.append(word_index.get('<OOV>'))
        else:
            index_array.append(index)
    index_array = pad_sequences([index_array], padding=padding_type,
                                truncating=truncating_type, maxlen=max_length)[0]

    # Decode greedily: feed the tokens predicted so far back into the decoder input.
    dec_text = np.zeros(max_length)
    dec_text[0] = 1  # start token
    words = []
    for i in range(max_length):
        val_pred = model.predict([np.array([index_array]), np.array([dec_text])])[0][i]
        index_values = val_pred.argsort()[-10:][::-1]  # ten most likely token indices
        word = get_key(index_values[0])
        index_val = index_values[0]
        if word == "<OOV>":
            # Skip <OOV> and show the second-best candidate in parentheses instead.
            word = "(" + get_key(index_values[1]) + ")"
            index_val = index_values[1]
        if i != max_length - 1:
            dec_text[i + 1] = index_val
        if word is not None:
            words.append(word)
    return words
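# Hedged usage sketch: get_answer depends on the trained seq2seq `model`, the `word_index`
# mapping, `get_key`, and the padding settings defined elsewhere in this project, so it only
# runs in that context; the sample question is invented.
if __name__ == "__main__":
    predicted_tokens = get_answer("how are you doing today")
    print(" ".join(predicted_tokens))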
def get_final_section_list(documents, final_book_sections, concept, wikipedia_data_file):
    # For each candidate combination, keep the first section of the combination whose content
    # is most similar (by TF-IDF) to the concept's Wikipedia article.
    wiki_summary, wiki_content = get_wiki_data(concept, wikipedia_data_file)
    wiki_content = clean_text(wiki_content)
    final_section = []
    for section_combination in final_book_sections:
        if len(section_combination) == 0:
            final_section.append([])
        elif len(section_combination) == 1:
            final_section.append(section_combination[0])
        else:
            all_documents = [wiki_content]
            for sections in section_combination:
                content = ""
                for section in sections:
                    content += documents[section]["content"] + "\n"
                all_documents.append(content)
            # First row of the similarity matrix: similarity of the Wikipedia text to every candidate.
            score = tfidf_document_similarity(all_documents)[0]
            max_score = score[1]
            index = 1
            for i in range(1, len(score)):
                if score[i] > max_score:
                    max_score = score[i]
                    index = i
            best_section = section_combination[index - 1]
            final_section.append([best_section[0]])
    return final_section
def get_best_section(concept, title_section, content_section, wikipedia_data_file, book_content_file):
    wiki_summary, wiki_content = get_wiki_data(concept, wikipedia_data_file)
    wiki_content = clean_text(wiki_content)
    all_documents = [wiki_content]

    content = get_book_data(title_section, book_content_file)
    content = clean_text(content)
    all_documents.append(content)

    content = get_book_data(content_section, book_content_file)
    content = clean_text(content)
    all_documents.append(content)

    # Return 1 if the title-matched section is more similar to the Wikipedia text, 2 otherwise.
    score = tfidf_document_similarity(all_documents)[0]
    return 1 if score[1] > score[2] else 2
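# Hedged usage sketch: every identifier and file name below is a placeholder, not a value taken
# from the original project. get_best_section returns 1 when the title-matched book section is
# closer to the concept's Wikipedia text, 2 when the content-matched section is.
# best = get_best_section("binary search tree", "2.3", "2.4",
#                         "wikipedia_data.json", "book_content.json")
# print("Keep section", best)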
def compare_sections(section, concept, wikipedia_data_file, book_content_file):
    wiki_summary, wiki_content = get_wiki_data(concept, wikipedia_data_file)
    wiki_content = clean_text(wiki_content)
    documents = [wiki_content]
    for index in section:
        content = get_book_data(index, book_content_file)
        content = clean_text(content)
        documents.append(content)
    score = tfidf_document_similarity(documents)[0]
    max_score = score[1]
    index = 1
    for i in range(1, len(score)):
        if score[i] > max_score:
            max_score = score[i]
            index = i
    best_section = section[index - 1]
    return best_section
def resolve_multi_sections(sections, concept, chapter_distribution, wikipedia_data_file, book_content_file):
    documents = {}
    for section in sections:
        documents[section] = {
            "content": clean_text(get_book_data(section, book_content_file))
        }
    section_combination = get_section_combination(sections, chapter_distribution)
    resulted_sections = get_final_section_list(documents, section_combination, concept, wikipedia_data_file)
    return resulted_sections
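# Hedged usage sketch: the section ids, chapter distribution, and file names are placeholders.
# resolve_multi_sections cleans each candidate section, builds combinations via
# get_section_combination, and then picks the best match against the concept's Wikipedia page.
# resolved = resolve_multi_sections(["2.3", "2.4", "5.1"], "binary search tree",
#                                   chapter_distribution, "wikipedia_data.json", "book_content.json")
# print(resolved)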
def test_clean_text(test_case):
    input_text, output_text = test_case
    assert clean_text(input_text, twitter=True, han2zen=True) == output_text
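# Hedged companion sketch: test_clean_text takes an (input, expected) pair, so it can be driven
# with pytest.mark.parametrize. The pair below is invented purely to show the shape of a case;
# the real expected output depends on this project's clean_text.
import pytest

@pytest.mark.parametrize("test_case", [
    ("Hello,   WORLD!!", "hello world"),  # placeholder pair, not a verified fixture
])
def test_clean_text_examples(test_case):
    test_clean_text(test_case)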
# Create a list of all of the conversations' line ids.
convs = []
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    convs.append(_line.split(','))

# Pair each line with the next line of the same conversation to build (question, answer) pairs,
# keeping only pairs whose cleaned lengths fall within the configured bounds.
clean_questions = []
clean_answers = []
print("Cleaning the data.....")
for conv in convs:
    for i in range(len(conv) - 1):
        question = id2line[conv[i]]
        answer = id2line[conv[i + 1]]
        q_clean = clean_text(question)
        a_clean = clean_text(answer)
        q_len = len(q_clean.split())
        a_len = len(a_clean.split())
        if min_line_length <= q_len <= max_length and min_line_length <= a_len <= max_length:
            clean_questions.append(q_clean)
            clean_answers.append(a_clean)

# Count word frequencies over all kept questions and answers.
vocab = {}
for sentence in clean_answers + clean_questions:
    for word in sentence.split(" "):
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1

# Sort the vocabulary by frequency (ascending).
vocab_sort = {k: v for k, v in sorted(vocab.items(), key=lambda item: item[1])}
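# Hedged follow-up sketch: get_answer above looks words up in a `word_index` mapping with an
# '<OOV>' entry, so a typical next step is to turn `vocab` into that mapping while dropping rare
# words. The threshold value and the special-token layout here are assumptions, not from the source.
threshold = 5
word_index = {'<OOV>': 1}
for word, count in vocab_sort.items():
    if count >= threshold:
        word_index[word] = len(word_index) + 1
index_word = {index: word for word, index in word_index.items()}  # reverse lookup for decoding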