Example #1
0
def direct_matching(title, concept):
    # Direct match: every concept word appears in the title and both have
    # the same number of words after cleaning.
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    return set(concept).issubset(set(title)) and len(concept) == len(title)
Example #2
0
def title_in_concept(title, concept):
    # True when every cleaned title word also appears in the concept.
    title = clean_text(title).split(" ")
    concept = clean_text(concept).split(" ")
    return set(title).issubset(set(concept))
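# Usage sketch for the two matching helpers above (illustrative strings; it
# assumes the project's clean_text lowercases and strips punctuation):
#   direct_matching("Binary Search Trees", "trees search binary")  -> True  (same words, same length)
#   direct_matching("Binary Search Trees", "search trees")         -> False (concept is only a subset)
#   title_in_concept("search trees", "binary search trees")        -> True  (all title words in concept)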
from math import log

from nltk import pos_tag  # requires the NLTK POS tagger data to be installed

import text_cleaning


def compute_sentences_scores(vocabulary, dataset,
                             headline_similarity_influence):
    sentences_scores = {}
    for headline, _, ctext in dataset:
        article_sentences = text_cleaning.clean_text(ctext)  # [[words]]
        article_sentences = text_cleaning.stem(article_sentences)
        headline_sentences = text_cleaning.clean_text(headline)
        headline_sentences = text_cleaning.stem(headline_sentences)
        headline_words = set(text_cleaning.get_all_words(headline_sentences))
        count_article_words_in_headline = 0
        for i, sentence in enumerate(article_sentences, start=1):
            sentence_score = 0
            nouns_count = 0
            # 3.1: Compute the base score from the TF-IDF weights of nouns only
            tagged_sentence = pos_tag(set(sentence))
            for word, pos in tagged_sentence:
                if word in vocabulary and pos[:2] == 'NN':  # word is a noun
                    nouns_count += 1
                    tf, idf = vocabulary[word]
                    sentence_score += tf * idf  # TODO: verify this weighting; tf is the corpus-wide count from compute_tf_idf
                    # 3.2: Additional score for nouns that also appear in the headline
                    if word in headline_words:
                        count_article_words_in_headline += 1
            if nouns_count:  # guard against sentences with no nouns
                sentence_score /= nouns_count  # normalize by the number of nouns
            sentences_scores[' '.join(sentence)] = (
                (sentence_score
                 + (count_article_words_in_headline / len(headline_words))
                 * headline_similarity_influence)
                * (i / len(article_sentences)))  # 3.3: weight by the sentence's position in the text

    # Keep the three highest-scoring sentences.
    return sorted(sentences_scores.items(), key=(lambda d: d[1]),
                  reverse=True)[:3]
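# The text_cleaning helpers used above and below live elsewhere in the project;
# the stand-ins here are only a minimal sketch of the interface the scoring
# code assumes (sentence-split, lowercase, stem, flatten) and may differ from
# the real text_cleaning module.
import re

from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()


def clean_text(text):
    # Split into sentences, lowercase, keep alphabetic tokens -> [[words], ...]
    sentences = re.split(r'[.!?]+', text.lower())
    return [re.findall(r'[a-z]+', s) for s in sentences if s.strip()]


def stem(sentences):
    return [[_stemmer.stem(word) for word in sentence] for sentence in sentences]


def get_all_words(sentences):
    return [word for sentence in sentences for word in sentence]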
def compute_tf_idf(dataset):
    # dataset rows: (headline, text, ctext)
    vocabulary = {}  # {word: (tf, idf)}
    ctext_words = []  # one set of words per document, used for the IDF counts
    dataset_sentences = []
    for _, _, ctext in dataset:
        sentences = text_cleaning.clean_text(ctext)  # [[words]]
        sentences = text_cleaning.stem(sentences)
        dataset_sentences.append(sentences)
        words = text_cleaning.get_all_words(sentences)
        ctext_words.append(set(words))

        for word in words:
            if word not in vocabulary:
                vocabulary[word] = (1, 0)
            else:
                tf, idf = vocabulary[word]
                vocabulary[word] = (tf + 1, idf)

    print("Done computing TF")

    for word in vocabulary:
        tf, _ = vocabulary[word]
        occurrences = 0
        for words in ctext_words:
            if word in words:
                # Found a document that contains word, so we increase the idf count
                occurrences += 1
        vocabulary[word] = (tf, log(len(ctext_words) / occurrences))

    print('Done with IDF')
    return vocabulary, dataset_sentences
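A hedged sketch of how the two functions above are presumably chained together; the dataset shape follows the comments in the code, and the 0.5 influence weight is only an illustration:
# dataset = [(headline, text, ctext), ...]
# vocabulary, _ = compute_tf_idf(dataset)
# top_three = compute_sentences_scores(vocabulary, dataset,
#                                      headline_similarity_influence=0.5)
# for sentence, score in top_three:
#     print(round(score, 3), sentence)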
Example #5
0
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences  # or keras.utils.pad_sequences in newer Keras


def get_answer(text):
    # Map every cleaned input word to its vocabulary index, falling back to <OOV>.
    index_array = []
    for word in text_cleaning.clean_text(text).split(" "):
        index = word_index.get(word)
        if index is None:
            index_array.append(word_index.get('<OOV>'))
        else:
            index_array.append(index)
    index_array = pad_sequences([index_array], padding=padding_type,
                                truncating=truncating_type, maxlen=max_length)[0]
    # Seed the decoder input with token index 1 (presumably the start-of-sequence token).
    dec_text = np.zeros(max_length)
    dec_text[0] = 1
    words = []
    for i in range(max_length):
        # Predict the vocabulary distribution for position i.
        val_pred = model.predict([np.array([index_array]), np.array([dec_text])])[0][i]

        # Ten most likely indices, best first.
        index_values = val_pred.argsort()[-10:][::-1]
        word = get_key(index_values[0])
        index_val = index_values[0]

        # If the best prediction is <OOV>, fall back to the second-best word.
        if word == "<OOV>":
            word = "(" + get_key(index_values[1]) + ")"
            index_val = index_values[1]

        # Feed the chosen index back into the decoder input for the next step.
        if i != max_length - 1:
            dec_text[i + 1] = index_val
        if word is not None:
            words.append(word)
    return words
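A brief usage sketch; model, word_index, max_length, and the other globals are assumed to have been built during training, so the call is shown as comments only:
# reply_tokens = get_answer("how are you")
# print(" ".join(reply_tokens))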
def get_final_section_list(documents, final_book_sections, concept, wikipedia_data_file):
    wiki_summary, wiki_content = get_wiki_data(concept, wikipedia_data_file)
    wiki_content = clean_text(wiki_content)
    final_section = []
    for section_combination in final_book_sections:
        if len(section_combination) == 0:
            final_section.append([])
        elif len(section_combination) == 1:
            final_section.append(section_combination[0])
        else:
            # Compare the wiki article against the concatenated content of
            # each candidate section group and keep the most similar one.
            all_documents = [wiki_content]
            for sections in section_combination:
                content = ""
                for section in sections:
                    content += documents[section]["content"] + "\n"
                all_documents.append(content)
            score = tfidf_document_similarity(all_documents)[0]
            # score[0] is the wiki text compared with itself, so scan from index 1.
            max_score = score[1]
            index = 1
            for i in range(1, len(score)):
                if score[i] > max_score:
                    max_score = score[i]
                    index = i
            best_section = section_combination[index - 1]
            final_section.append([best_section[0]])
    return final_section
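The section-selection functions here and below rely on a tfidf_document_similarity helper defined elsewhere; judging by how score[0] is indexed, it is assumed to return a pairwise similarity matrix. A minimal sketch of such a helper, built on scikit-learn:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def tfidf_document_similarity(documents):
    # Pairwise cosine similarity between the TF-IDF vectors of the documents;
    # row 0 compares the first document (the wiki text) with every document.
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    return cosine_similarity(tfidf_matrix, tfidf_matrix)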
Example #7
0
def get_best_section(concept, title_section, content_section,
                     wikipedia_data_file, book_content_file):
    wiki_summary, wiki_content = get_wiki_data(concept, wikipedia_data_file)
    wiki_content = clean_text(wiki_content)
    all_documents = [wiki_content]

    content = get_book_data(title_section, book_content_file)
    content = clean_text(content)
    all_documents.append(content)

    content = get_book_data(content_section, book_content_file)
    content = clean_text(content)
    all_documents.append(content)

    score = tfidf_document_similarity(all_documents)[0]
    # score[1] and score[2] compare the wiki text with the title-based and
    # content-based sections respectively; return whichever is more similar.
    return 1 if score[1] > score[2] else 2
def compare_sections(section, concept, wikipedia_data_file, book_content_file):
    wiki_summary, wiki_content = get_wiki_data(concept, wikipedia_data_file)
    wiki_content = clean_text(wiki_content)
    documents = [wiki_content]
    for index in section:
        content = get_book_data(index, book_content_file)
        content = clean_text(content)
        documents.append(content)
    score = tfidf_document_similarity(documents)[0]
    # score[0] is the wiki text compared with itself, so scan from index 1
    # for the section most similar to the wiki text.
    max_score = score[1]
    index = 1
    for i in range(1, len(score)):
        if score[i] > max_score:
            max_score = score[i]
            index = i
    best_section = section[index - 1]
    return best_section
def resolve_multi_sections(sections, concept, chapter_distribution, wikipedia_data_file, book_content_file):
    # Pre-clean the content of every candidate section once, then pick the
    # best section per combination via get_final_section_list.
    documents = {}
    for section in sections:
        documents[section] = {
            "content": clean_text(get_book_data(section, book_content_file))
        }
    section_combination = get_section_combination(sections, chapter_distribution)
    return get_final_section_list(documents, section_combination, concept, wikipedia_data_file)
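An illustrative call; the section ids and concept below are placeholders, and chapter_distribution plus the two data files come from elsewhere in the pipeline:
# sections = ["2.1", "4.3", "7.2"]  # placeholder section ids
# best_sections = resolve_multi_sections(sections, "binary search tree",
#                                        chapter_distribution,
#                                        wikipedia_data_file, book_content_file)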
Example #10
0
def test_clean_text(test_case):
    input_text, output_text = test_case
    assert clean_text(input_text, twitter=True, han2zen=True) == output_text
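test_clean_text receives its (input, expected) pair from the test harness; a hedged sketch of how it might be parametrized with pytest, with placeholder strings standing in for the real fixtures:
# import pytest
#
# @pytest.mark.parametrize("test_case", [
#     ("raw input text", "expected cleaned text"),  # placeholder pair
# ])
# def test_clean_text(test_case):
#     input_text, output_text = test_case
#     assert clean_text(input_text, twitter=True, han2zen=True) == output_text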
Example #11
0
# Build a list of the line ids that make up each conversation.
convs = []
for line in conv_lines[:-1]:
    _line = line.split(' +++$+++ ')[-1][1:-1].replace("'", "").replace(" ", "")
    convs.append(_line.split(','))
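# Note: the split above assumes records separated by " +++$+++ " whose last
# field is a bracketed list of line ids, e.g. "['L123', 'L124', 'L125']"
# (illustrative ids), as in the Cornell Movie-Dialogs conversation file.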

clean_questions = []
clean_answers = []
print("Cleaning the data.....")
for conv in convs:
    for i in range(len(conv) - 1):
        question = id2line[conv[i]]
        answer = id2line[conv[i + 1]]

        q_clean = clean_text(question)
        a_clean = clean_text(answer)
        q_len = len(q_clean.split())
        a_len = len(a_clean.split())
        if min_line_length <= q_len <= max_length and min_line_length <= a_len <= max_length:
            clean_questions.append(q_clean)
            clean_answers.append(a_clean)

# Count word frequencies over all cleaned questions and answers.
vocab = {}
for sentence in clean_answers + clean_questions:
    for word in sentence.split(" "):
        if word not in vocab:
            vocab[word] = 1
        else:
            vocab[word] += 1
# Vocabulary sorted by frequency (ascending).
vocab_sort = {k: v for k, v in sorted(vocab.items(), key=lambda item: item[1])}