Example #1
def extract_fact_set_mapped(factsets):
    # Map each cleaned knowledge sentence back to its original surface
    # form; the dict's keys double as the set of cleaned sentences.
    original_sentences = dict()
    for data in factsets.values():
        fun_facts = data.get("fun_facts")
        if fun_facts:
            for fact in fun_facts:
                original_sentences[clean(fact)] = fact

        short_wiki = data.get("shortened_wiki_lead_section")
        if short_wiki:
            original_sentences[clean(short_wiki)] = short_wiki

        summarized_wiki = data.get("summarized_wiki_lead_section")
        if summarized_wiki:
            original_sentences[clean(summarized_wiki)] = summarized_wiki
    return original_sentences
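A minimal usage sketch, assuming clean() is a lowercasing, punctuation-stripping normalizer (the real clean() is a project helper not shown here) and a factset shaped like a Topical-Chat reading-set entry; the FS1 key and its contents are illustrative only:

import string

def clean(text):
    # Stand-in for the project's clean(); assumed to lowercase and
    # strip punctuation.
    return text.lower().translate(
        str.maketrans("", "", string.punctuation)).strip()

factsets = {
    "FS1": {
        "fun_facts": ["Abraham Lincoln was a wrestler!"],
        "shortened_wiki_lead_section": "Abraham Lincoln was the 16th "
                                       "president of the United States.",
    }
}
mapping = extract_fact_set_mapped(factsets)
# {'abraham lincoln was a wrestler': 'Abraham Lincoln was a wrestler!', ...}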
Example #2
def prepare_reading_set_for_conversation(conv_id, reading_set):
    conv_reading_set = reading_set[conv_id]
    fact_mapping_1 = extract_fact_set_mapped(conv_reading_set["agent_1"])
    fact_mapping_2 = extract_fact_set_mapped(conv_reading_set["agent_2"])
    fact_set_1 = set(fact_mapping_1.keys())
    fact_set_2 = set(fact_mapping_2.keys())
    article_data = conv_reading_set["article"]
    article_indices = ['AS1', 'AS2', 'AS3', 'AS4']
    common_knowledge_mapping = dict()
    if "AS1" in article_data:
        for idx in article_indices:
            sentence = article_data[idx]
            if len(word_tokenize(sentence)) < 5:
                continue

            cleaned_sentence = clean(sentence)
            common_knowledge_mapping[cleaned_sentence] = sentence
    common_knowledge_set = set(common_knowledge_mapping.keys())
    fact_set_1.update(common_knowledge_set)
    fact_set_2.update(common_knowledge_set)
    fact_mapping_1.update(common_knowledge_mapping)
    fact_mapping_2.update(common_knowledge_mapping)
    agent_knowledge = {
        "agent_1": list(fact_set_1),
        "agent_2": list(fact_set_2)
    }
    agent_mapping = {"agent_1": fact_mapping_1, "agent_2": fact_mapping_2}
    return agent_knowledge, agent_mapping
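A rough sketch of the reading-set shape prepare_reading_set_for_conversation expects, inferred from the lookups above (agent_1/agent_2 factsets plus an article with sentences AS1..AS4, as in the Topical-Chat reading sets); the conversation id and contents are mock values, and clean(), word_tokenize (from nltk), and extract_fact_set_mapped must be in scope:

reading_set = {
    "t_conv_001": {  # hypothetical conversation id
        "agent_1": {"FS1": {"fun_facts": ["Honey never spoils."]}},
        "agent_2": {"FS1": {"fun_facts": ["Octopuses have three hearts."]}},
        "article": {
            "AS1": "This first article sentence easily passes the filter.",
            "AS2": "Too short.",   # under five tokens, so it is skipped
            "AS3": "Another article sentence with more than five tokens.",
            "AS4": "A final article sentence with more than five tokens.",
        },
    }
}
agent_knowledge, agent_mapping = prepare_reading_set_for_conversation(
    "t_conv_001", reading_set)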
Example #3
def knowledge_selection_strategy(text, available_knowledge):
    # Rank every knowledge candidate by embedding cosine similarity to
    # the cleaned input text, most similar first.
    fact_sims = get_cosine_similarity_embs_all(clean(text),
                                               available_knowledge,
                                               model,
                                               knowledge_policy="bert")
    fact_sims.sort(key=lambda x: x[1], reverse=True)

    return fact_sims
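This snippet closes over model and get_cosine_similarity_embs_all from an enclosing scope, neither of which is shown. One plausible shape for the helper, assuming a sentence-transformers-style model with an encode() method (an assumption, not the project's actual code):

import numpy as np

def get_cosine_similarity_embs_all(text, knowledge, model,
                                   knowledge_policy="bert"):
    # Embed the query and every candidate, then pair each candidate with
    # its cosine similarity to the query. Hypothetical stand-in.
    query_emb = model.encode([text])[0]
    cand_embs = model.encode(knowledge)
    sims = cand_embs @ query_emb / (
        np.linalg.norm(cand_embs, axis=1) * np.linalg.norm(query_emb) + 1e-12)
    return list(zip(knowledge, sims.tolist()))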
Example #4
def prepare_sentence_knowledge_data(agent_mapping, conv_id, dialog_act,
                                    tokenizer, turn, sentence, ranker,
                                    da_index):
    knowledge_sentence = ranker.get_top_fact(clean(sentence),
                                             conv_id,
                                             threshold=True)
    original_knowledge_sentence = agent_mapping[turn["agent"]].get(
        knowledge_sentence, "")
    # Return (encoded sentence, its dialog-act label, encoded knowledge).
    return tokenizer.encode(sentence), [
        turn[dialog_act][da_index]
    ], tokenizer.encode(original_knowledge_sentence)
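Example #4 assumes a tokenizer with an encode() method (e.g. a HuggingFace-style tokenizer) and a project-level fact ranker exposing get_top_fact(). Toy stand-ins, purely to show the expected interfaces; the behavior here is invented for illustration:

class StubTokenizer:
    def encode(self, text):
        # Toy integer ids; a real tokenizer would do subword encoding.
        return [hash(tok) % 30000 for tok in text.split()]

class StubRanker:
    def get_top_fact(self, cleaned_sentence, conv_id, threshold=True):
        # Pretend the cleaned sentence itself is the best-matching fact.
        return cleaned_sentence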
Example #5
def knowledge_selection_strategy(text, available_knowledge):
    # TF-IDF variant: score every candidate against the cleaned text and
    # return the top three (candidate, similarity) pairs.
    text_tfidf = vectorizer.transform([clean(text)])
    knowledge_tfidf = vectorizer.transform(available_knowledge)
    similarity = np.squeeze(
        np.asarray(
            text_tfidf.dot(knowledge_tfidf.transpose()).todense()))

    top_n_indices = similarity.argsort()[-3:][::-1].tolist()
    top_similarities = [similarity[i] for i in top_n_indices]
    top_n_knowledges = [available_knowledge[i] for i in top_n_indices]

    return list(zip(top_n_knowledges, top_similarities))
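This TF-IDF variant closes over a fitted vectorizer (a scikit-learn TfidfVectorizer), np, and clean. A self-contained usage sketch under those assumptions, with a stand-in clean() and toy knowledge:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def clean(text):
    return text.lower().strip()  # stand-in for the project's clean()

available_knowledge = [
    "abraham lincoln was a wrestler",
    "the eiffel tower is in paris",
    "honey never spoils",
    "octopuses have three hearts",
]
vectorizer = TfidfVectorizer().fit(available_knowledge)
# With knowledge_selection_strategy defined as above:
print(knowledge_selection_strategy("did lincoln really wrestle",
                                   available_knowledge))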
Example #6
def get_tfidf_conv_knowledge(conv_id, test_freq_reading_set):
    conv_reading_set = test_freq_reading_set[conv_id]
    fact_set_1 = set(extract_fact_set(conv_reading_set["agent_1"]))
    fact_set_2 = set(extract_fact_set(conv_reading_set["agent_2"]))
    article_data = conv_reading_set["article"]
    article_indices = ['AS1', 'AS2', 'AS3', 'AS4']
    common_knowledge_set = set()
    if "AS1" in article_data:
        for idx in article_indices:
            sentence = article_data[idx]
            if len(word_tokenize(sentence)) < 5:
                continue
            common_knowledge_set.add(clean(sentence))
    fact_set_1.update(common_knowledge_set)
    fact_set_2.update(common_knowledge_set)
    agent_knowledge = {
        "agent_1": list(fact_set_1),
        "agent_2": list(fact_set_2)
    }
    return agent_knowledge
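Note that get_tfidf_conv_knowledge mirrors Example #2 but calls extract_fact_set (presumably a variant that returns only the cleaned sentences) rather than extract_fact_set_mapped, so it builds the per-agent knowledge pools without keeping a mapping back to the original, uncleaned text.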
Example #7
def prepare_turn_data(agent_mapping,
                      available_knowledge,
                      conv_id,
                      dialog_act,
                      knowledge_policy,
                      response,
                      tokenizer,
                      turn,
                      vec,
                      sentiment=None,
                      ranker=None):

    knowledge_sentence = ""
    for segment in turn["segments"]:
        sentence = segment["text"]

        if knowledge_policy == "none":
            # Always return an empty sentence
            break
        if knowledge_policy == "tf_idf":
            # With regards to knowledge selection, this is a highly approximate heuristic.
            # Both Gopalakrishnan et al. 2019 and Hedayatnia et al. 2020
            # acknowledge they don't have anything better for this issue
            text_tfidf = vec.transform([clean(sentence)])
            """
            In this section, we find the knowledge sentence that is closest
            to the ground truth response expected from the model.
            This is so that the model learns to appropriately condition on
            the knowledge
            """
            knowledge_tfidf = vec.transform(available_knowledge)

            similarities = linear_kernel(knowledge_tfidf, text_tfidf).flatten()
            closest_knowledge_index = similarities.argsort()[-1]

            if similarities[closest_knowledge_index] > 0.3:
                knowledge_sentence = available_knowledge[
                    closest_knowledge_index]
                break
        else:
            knowledge_sentence = ranker.get_top_fact(clean(sentence),
                                                     conv_id,
                                                     threshold=True)
            if knowledge_sentence != "":
                break
    else:
        # The loop never broke: no segment produced a knowledge sentence,
        # so fall back to matching against the full ground-truth response.
        if knowledge_policy == "tf_idf":
            text_tfidf = vec.transform([clean(response)])
            knowledge_tfidf = vec.transform(available_knowledge)
            similarities = linear_kernel(knowledge_tfidf, text_tfidf).flatten()
            closest_knowledge_index = similarities.argsort()[-1]

            knowledge_sentence = available_knowledge[closest_knowledge_index] \
                if similarities[closest_knowledge_index] > 0.3 else ""
    original_knowledge_sentence = agent_mapping[turn["agent"]].get(
        knowledge_sentence, "")
    if sentiment:
        current_turn_data = (tokenizer.encode(response),
                             turn["sentiment_vader"],
                             tokenizer.encode(original_knowledge_sentence))
    else:
        current_turn_data = (tokenizer.encode(response), turn[dialog_act],
                             tokenizer.encode(original_knowledge_sentence))
    return current_turn_data
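Example #7 leans on Python's for/else: the else block runs only when the loop over turn["segments"] finishes without hitting break, i.e. when no segment yielded a knowledge sentence, and in that case the TF-IDF policy falls back to matching against the full ground-truth response. A minimal illustration of the construct:

for segment in ["a", "b"]:
    if segment == "z":
        break
else:
    print("loop ended without break, so the fallback runs")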