def test_partial_load_invalid_end_index(self):
        """Reloading a dumped corpus with utterance_end_index=-1 yields no utterances."""
        alice_meta = {'speaker_binary_data': bytearray([120, 3, 255, 0, 100])}
        bob_meta = {'speaker_binary_data': bytearray([110, 3, 255, 90])}
        utt0_meta = {'utt_binary_data': bytearray([99, 44, 33])}
        utt1_meta = {'utt_binary_data': bytearray([110, 200, 220, 28])}

        original = Corpus(utterances=[
            Utterance(id="0",
                      text="hello world",
                      speaker=Speaker(id="alice", meta=alice_meta),
                      meta=utt0_meta),
            Utterance(id="1",
                      text="my name is bob",
                      speaker=Speaker(id="bob", meta=bob_meta),
                      meta=utt1_meta),
            Utterance(id="2",
                      text="this is a test",
                      speaker=Speaker(id="charlie")),
        ])
        original.dump('test_corpus', './')

        # A negative end index selects an empty slice of the dumped utterances.
        reloaded = Corpus(filename="test_corpus", utterance_end_index=-1)

        self.assertEqual(len(list(reloaded.iter_utterances())), 0)
Exemplo n.º 2
0
    def test_partial_load_start_idx_specified_only(self):
        """Reload with only utterance_start_index=1: utterance 0 is skipped,
        utterances 1 and 2 round-trip unchanged."""
        alice = User(name="alice",
                     meta={'user_binary_data': bytearray([120, 3, 255, 0, 100])})
        bob = User(name="bob",
                   meta={'user_binary_data': bytearray([110, 3, 255, 90])})

        original = Corpus(utterances=[
            Utterance(id="0",
                      text="hello world",
                      user=alice,
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1",
                      text="my name is bob",
                      user=bob,
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id="2",
                      text="this is a test",
                      user=User(name="charlie")),
        ])
        original.dump('test_corpus', './')

        reloaded = Corpus(filename="test_corpus", utterance_start_index=1)

        # Only the last two utterances survive, and they compare equal to the originals.
        self.assertEqual(len(list(reloaded.iter_utterances())), 2)
        self.assertEqual(original.get_utterance("1"),
                         reloaded.get_utterance("1"))
        self.assertEqual(original.get_utterance("2"),
                         reloaded.get_utterance("2"))
Exemplo n.º 3
0
 def transform(self, corpus: Corpus) -> Corpus:
     """Annotate each utterance with its perplexity score.

     Utterances passing self.utt_selector get the perplexity of their text
     (via self.model); all others get None, so the metadata key is always set.
     """
     for utterance in corpus.iter_utterances():
         score = None
         if self.utt_selector(utterance):
             score = self.model.str_perplexity(self.utt_text_func(utterance))
         utterance.add_meta(self.perplexity_feat_name, score)
     return corpus
Exemplo n.º 4
0
def get_corpus_leaf_ids(c: Corpus) -> set:
    """Return the ids of leaf utterances: those no other utterance replies to."""
    seen_ids = set()
    replied_to = set()
    for utt in c.iter_utterances():
        seen_ids.add(utt.id)
        replied_to.add(utt.reply_to)
    # A leaf is an utterance whose id never appears as any reply_to target.
    # (reply_to of None for root utterances lands in replied_to harmlessly.)
    return seen_ids - replied_to
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 23:20:11 2020

@author: kach

Export the Conversations Gone Awry corpus to two parallel text files:
data/reviews.txt (one utterance per line, newlines flattened) and
data/labels.txt (1 if the utterance has a personal attack, else 0).
"""

from convokit import Corpus, download

corpus = Corpus(filename=download("conversations-gone-awry-corpus"))

corpus.print_summary_stats()

# Context managers guarantee both files are flushed and closed even if
# iteration raises; both are written as UTF-8 explicitly.
with open("data/reviews.txt", "w", encoding="utf-8") as reviews, \
     open("data/labels.txt", "w", encoding="utf-8") as labels:
    for utt in corpus.iter_utterances():
        # Flatten embedded newlines so each utterance occupies exactly one line.
        txt = str(utt.text).replace('\n', ' ')
        reviews.write(txt + '\n')
        label_value = '1' if utt.meta['comment_has_personal_attack'] else '0'
        labels.write(label_value + '\n')
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
#   one of the steps in the Conversations Gone Awry paper (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
#   For code reproducing the full results of the paper, see the example notebook in the
#   `conversations-gone-awry` example subdirectory.

import pandas as pd
from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# extract the politeness strategies.
# Note: politeness strategies are a hand-engineered feature set, so no fitting is needed.
ps = PolitenessStrategies(verbose=100)
print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

# Collect (id, strategy-dict) pairs, then split them into index and rows.
records = [(utterance.id, utterance.meta["politeness_strategies"])
           for utterance in corpus.iter_utterances()]
row_ids = [rec[0] for rec in records]
rows = [rec[1] for rec in records]
pd.DataFrame(rows, index=row_ids).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
    def get_examples(self, filename, ns_name, anserini_folder, sent_bert_model, loss, output_dir, input_pair=True, eval_data=False,
        denoise_negatives=False, num_ns_for_denoising=100, generative_model = 'facebook/blenderbot-3B', remove_cand_subsets=True,
        last_utterance_only=False, use_external_corpus=False):
        """
        Build sentence-transformers InputExample objects for one data split.

        filename specified which data split to use (train.csv, dev.csv, test.csv).
        ns_name selects the negative sampler ("random", "bm25",
        "sentence_transformer" or "generative"); loss shapes the examples
        (triplets with score-difference labels for pair input, or single
        pairs labeled 1.0/0.0 for contrastive losses).

        Returns the list of InputExample objects. As a side effect, sets
        self.data and, for MarginMSELoss, writes the margin scores to a CSV
        in output_dir.
        """
        # The split is a tab-separated file with the context in column 0 and
        # the relevant response in column 1.
        filepath = os.path.join(self.dataset_folder, filename)
        self.data = pd.read_csv(filepath, sep="\t")


        # When denoising, over-sample negatives now; only the last 10 are kept below.
        if denoise_negatives:
            num_ns = num_ns_for_denoising
        else:
            num_ns = 10

        # Candidate pool for negative sampling starts as this split's responses.
        candidates = list(self.data["response"].values)
        if use_external_corpus:
            # Optionally grow the pool with utterances from public ConvoKit corpora.
            external_datasets = [
                'movie-corpus',
                'wiki-corpus',
                'subreddit-Ubuntu',
                'subreddit-microsoft',
                'subreddit-apple',
                'subreddit-Database',
                'subreddit-DIY',
                'subreddit-electronics',
                'subreddit-ENGLISH',
                'subreddit-gis',
                'subreddit-Physics',
                'subreddit-scifi',
                'subreddit-statistics',
                'subreddit-travel',
                'subreddit-worldbuilding'
            ]
            for ds_name in external_datasets:
                corpus = Corpus(download(ds_name))
                corpus.print_summary_stats()
                for utt in corpus.iter_utterances():
                    if utt.text != "":
                        candidates.append(utt.text)

        # Pick the negative sampler. Evaluation data always uses random sampling.
        if ns_name == "random" or eval_data:
            self.negative_sampler = negative_sampling.RandomNegativeSampler(candidates, num_ns)
        elif ns_name == "bm25":
            index_folder = "/anserini_train_-1/"
            if use_external_corpus:
                index_folder = index_folder.replace("train", "train_expanded_")
            self.negative_sampler = negative_sampling.BM25NegativeSamplerPyserini(candidates, num_ns,
                self.dataset_folder+index_folder, -1, anserini_folder)
        elif ns_name == "sentence_transformer":
            self.negative_sampler = negative_sampling.SentenceBERTNegativeSampler(candidates, num_ns, 
                self.dataset_folder+"/train_sentenceBERTembeds", -1, sent_bert_model, large_index=use_external_corpus)
        elif ns_name == "generative":
            self.negative_sampler = negative_sampling.GenerativeNegativeSamplerForDialogue(num_ns, generative_model)
            
        # MarginMSELoss needs relevance scores for the positive documents too;
        # contrastive losses train on single (context, candidate) pairs.
        if loss == 'MarginMSELoss':
            self.negative_sampler.score_relevant_docs = True
        if loss == "ContrastiveLoss" and not eval_data:
            input_pair = False
        if loss == "OnlineContrastiveLoss" and not eval_data:
            input_pair = False
        examples = []
        scores_df = []

        # Code used to annotate some samples
        # samples_to_annotate = []
        # self.data = self.data.sample(200, random_state=42)
        # self.negative_sampler.score_relevant_docs = True
        count_ns_part_of_context = 0
        for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
            context = row[0]
            if last_utterance_only:
                # NOTE(review): assumes contexts use [TURN_SEP]/[UTTERANCE_SEP]
                # markers, with msdialog placing the target utterance first in
                # the last turn and other datasets second-to-last — confirm
                # against the preprocessing that builds these files.
                if 'msdialog' in self.dataset_folder:
                    context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[0].strip()
                else:
                    context = context.split("[TURN_SEP]")[-1].split("[UTTERANCE_SEP]")[-2].strip()

            relevant_response = row[1]
            if not input_pair:
                # Single-pair mode: emit the positive example with label 1.0.
                examples.append(InputExample(guid=filename+str(idx)+"_pos",
                    texts=[context, relevant_response], label=1.0))
            if ns_name == "bm25" and not eval_data:
                ns_candidates, ns_scores , _ , _ , rel_scores = self.negative_sampler.sample(context, [relevant_response], max_query_len = 512, normalize_scores = False, rel_doc_id = str(idx))
            else:
                ns_candidates, ns_scores , _ , _ , rel_scores = self.negative_sampler.sample(context, [relevant_response])
            rel_score = rel_scores[0]

            # Denoising keeps only the 10 lowest-ranked of the over-sampled negatives.
            if denoise_negatives:
                zipped = zip(ns_candidates[-10:], ns_scores[-10:])
            else: 
                zipped = zip(ns_candidates, ns_scores)

            for ns, score_ns in zipped:
                # Skip negatives that are literally contained in the context
                # (after stripping the agent marker) — they are trivially wrong.
                if remove_cand_subsets and ns.replace("<<<AGENT>>>: ", "") in context:
                    count_ns_part_of_context+=1
                else: 
                    if input_pair:
                        # Triplet example; label is the positive-minus-negative margin.
                        examples.append(InputExample(texts=[context, relevant_response, ns], label=float(rel_score-score_ns)))
                        scores_df.append(rel_score-score_ns)
                        # samples_to_annotate.append([self.dataset_folder.split("/")[-1], ns_name, context, relevant_response, ns, rel_score, score_ns])
                    else:
                        examples.append(InputExample(guid=filename+str(idx)+"_neg", 
                            texts=[context, ns], label=0.0))
        logging.info("{} {} count of ns which are part of the context: {} out of {}.".format(self.dataset_folder.split("/")[-1],
         ns_name, count_ns_part_of_context, len(examples)))
        # print(pd.DataFrame(scores_df).describe())
        # pd.DataFrame(samples_to_annotate, columns=['task', 'ns', 'context', 'rel_response', 'negative_sample', 'rel_score', 'score_negative']).\
        #     to_csv(output_dir+"neg_samples_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]), index=False)        

        # Persist the margin distribution for MarginMSELoss runs.
        if loss == 'MarginMSELoss':
            pd.DataFrame(scores_df).to_csv(output_dir+"MarginScores_{}_{}.csv".format(ns_name, self.dataset_folder.split("/")[-1]))
        return examples