def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    fit_transform() retrieves features from the corpus conversational
    threads using retrieve_feats().

    :param corpus: Corpus object to retrieve feature information from

    :return: corpus with conversations having a new meta field "hyperconvo"
        containing the stats generated by retrieve_feats(). Each
        conversation's metadata then contains the stats for the thread(s)
        it contains.
    """
    feats = HyperConvo.retrieve_feats(corpus,
                                      prefix_len=self.prefix_len,
                                      min_thread_len=self.min_thread_len,
                                      include_root=self.include_root)
    if self.include_root:
        # Threads start at the root post, so thread ids double as
        # conversation ids: annotate each conversation directly.
        for root_id in feats:
            convo = corpus.get_conversation(root_id)
            convo.add_meta("hyperconvo", {root_id: feats[root_id]})
    else:
        # Threads start at top-level comments (tlc): group each thread's
        # features under the root conversation that contains it.
        threads = corpus.utterance_threads(prefix_len=self.prefix_len,
                                           include_root=False)
        root_to_tlc = dict()
        for tlc_id, utts in threads.items():
            # Skip threads too short to have been featurized.
            if len(utts) < self.min_thread_len:
                continue
            # The top-level comment utterance records its conversation root.
            # (Was threads[tlc_id][tlc_id] — a redundant second lookup.)
            thread_root = utts[tlc_id].root
            root_to_tlc.setdefault(thread_root, dict())[tlc_id] = feats[tlc_id]
        for root_id, tlc_feats in root_to_tlc.items():
            corpus.get_conversation(root_id).add_meta("hyperconvo", tlc_feats)
    return corpus
def test_broken_convos(self):
    """Traversing a malformed conversation should raise ValueError."""
    def make_utt(uid, text, parent, spkr, ts):
        return Utterance(id=uid, text=text, reply_to=parent,
                         speaker=Speaker(id=spkr), timestamp=ts)

    # Conversation with two reply_to=None utterances (multiple roots).
    multi_root = Corpus(utterances=[
        make_utt("0", "hello world", None, "alice", 0),
        make_utt("1", "my name is bob", "0", "bob", 2),
        make_utt("2", "this is a test", "1", "charlie", 1),
        make_utt("3", "hello world 2", None, "alice2", 0),
    ])
    # Conversation where "3" replies to an id ("9") that does not exist.
    dangling_reply = Corpus(utterances=[
        make_utt("0", "hello world", None, "alice", 0),
        make_utt("1", "my name is bob", "0", "bob", 2),
        make_utt("2", "this is a test", "1", "charlie", 1),
        make_utt("3", "hello world 2", "9", "alice2", 0),
    ])

    # Multiple roots in a single conversation.
    convo = multi_root.get_conversation(None)
    with self.assertRaises(ValueError):
        list(convo.traverse("dfs", as_utterance=True))

    # Utterance replying to something not in the Conversation.
    convo = dangling_reply.get_conversation(None)
    with self.assertRaises(ValueError):
        list(convo.traverse("dfs", as_utterance=True))
def transform(self, corpus: Corpus) -> Corpus:
    """Computes the average number of questions asked in a conversation.

    Writes the per-conversation average (mean over that conversation's
    utterances of the count of '?'-runs in each utterance's text) to the
    conversation metadata under ``self.ATTR_NAME``.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, annotated in place.
    """
    if self.verbose:
        print("Finding questions per utterance")
    # Map each utterance id to its question count. A run of one or more
    # '?' characters counts as a single question.
    questions_by_utt = {}
    for utt_id in corpus.get_utterance_ids():
        text = corpus.get_utterance(utt_id).text
        questions_by_utt[utt_id] = len(re.findall(r'\?+', text))
    if self.verbose:
        print("Finding questions per conversation")
    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        # BUG FIX: the old code did
        # np.asarray(questions)[np.asarray(convo_utters)], indexing a
        # positional array with utterance *ids* — a TypeError for string
        # ids and wrong even for integer ids. Average the counts of
        # exactly this conversation's utterances instead.
        counts = [questions_by_utt[uid] for uid in convo._utterance_ids]
        # Adds average questions per conversation to conversation metadata.
        convo._meta[self.ATTR_NAME] = np.mean(counts)
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Computes the count of pause and hesitancy words for each utterance,
    then aggregates them for each conversation.

    Per-utterance counts are written to utterance metadata under
    ``self.NAME1`` (pause) and ``self.NAME2`` (hesitancy); per-conversation
    averages are written to conversation metadata under ``self.NAME3`` and
    ``self.NAME4``.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, annotated in place.
    """
    if self.verbose:
        print("Finding counts of pause and hesitancy words...")
    # Sets give O(1) membership tests in the per-token loop below.
    # NOTE: 'uh huh' (contains a space) and '...' (periods are stripped
    # below) can never match a whitespace-split token; kept for fidelity
    # with the original word list.
    pause_words = {
        'um', 'umm', 'ummm', 'uh', 'uhh', 'uhhh', 'hm', 'hmm', 'hmmm',
        'er', 'err', 'uh huh', 'huh', 'mhm', 'mhmm', 'erm', '...', 'ah',
        'ahh', 'ahem', 'eh', 'ehh', 'ehhh', 'meh'
    }
    hesitant_words = {
        'maybe', 'not', 'sure', 'unsure', 'probably', 'well', 'okay',
        'like', 'actually', 'basically', 'seriously', 'totally',
        'literally', 'know', 'mean', 'guess', 'suppose', 'but',
        'something', 'so', 'wow', 'just', 'really', 'later', 'wait',
        'future', 'almost', 'slightly', 'perhaps', 'somehow', 'sort',
        'kind', 'little', 'somewhat', 'hey', 'alas', 'see', 'sounds',
        'ok', 'roughly', 'why', 'how', 'yep', 'yup', 'may', 'possibly',
        'might', 'could', 'doubt', 'skeptical', "don't", "won't", 'nah'
    }
    # One C-level pass strips punctuation (same character set as before,
    # which previously used a char-by-char generator join).
    strip_punct = str.maketrans('', '', "!.:?',\"@#$%^&*()-~`_+=><[]{}")
    pause_by_utt = {}
    hesitancy_by_utt = {}
    for utt_id in corpus.get_utterance_ids():
        utt = corpus.get_utterance(utt_id)
        tokens = utt.text.translate(strip_punct).split()
        # Number of pause / hesitant words in this utterance.
        npause = sum(1 for tok in tokens if tok in pause_words)
        nhesitant = sum(1 for tok in tokens if tok in hesitant_words)
        pause_by_utt[utt_id] = npause
        hesitancy_by_utt[utt_id] = nhesitant
        utt.meta[self.NAME1] = npause
        utt.meta[self.NAME2] = nhesitant
    for convo_id in corpus.get_conversation_ids():
        convo = corpus.get_conversation(convo_id)
        # BUG FIX: the old code indexed positional count arrays with
        # utterance *ids* (np.asarray(pause)[np.asarray(convo_utters)]),
        # which fails for string ids and is wrong even for integer ids.
        # Average the counts of exactly this conversation's utterances.
        utt_ids = convo._utterance_ids
        convo._meta[self.NAME3] = np.mean(
            [pause_by_utt[uid] for uid in utt_ids])
        convo._meta[self.NAME4] = np.mean(
            [hesitancy_by_utt[uid] for uid in utt_ids])
    return corpus
def transform(self, corpus: Corpus):
    """Adds metadata about political references to each utterance.

    For every utterance with non-None text, annotates:
      - "num_pol_refs": number of lower-cased tokens found in self.key_words
      - "num_pol_refs_incidence": that count divided by the token count
        (0 for empty token lists)
      - "pol_words": the matching tokens, in order of appearance

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, annotated in place.
    """
    for conv_id in corpus.conversations:
        conv = corpus.get_conversation(conv_id)
        for utt in conv.iter_utterances():
            # Idiom fix: compare against None with `is`, not `!=`;
            # guard clause keeps the body flat.
            if utt.text is None:
                continue
            tokenized = word_tokenize(utt.text.lower())
            # Collect matches once; the invocation count is the length.
            pol_words = [token for token in tokenized
                         if token in self.key_words]
            invocations = len(pol_words)
            utt.meta["num_pol_refs"] = invocations
            utt.meta["num_pol_refs_incidence"] = (
                invocations / len(tokenized) if tokenized else 0)
            utt.meta["pol_words"] = pol_words
    return corpus
def test_corpus_dump(self):
    """Dumping a corpus and reloading it should preserve every metadata
    index (utterance, speaker, conversation, and overall)."""
    original = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie")),
    ])
    # Populate metadata at every level so each index is non-trivial.
    original.get_utterance("0").meta['foo'] = 'bar'
    original.get_utterance("1").meta['foo'] = 'bar2'
    original.get_utterance("2").meta['hey'] = 'jude'
    original.get_conversation(None).meta['convo_meta'] = 1
    original.get_speaker("alice").meta['surname'] = 1.0

    original.dump('test_index_meta_corpus', base_path="./")
    reloaded = Corpus(filename="test_index_meta_corpus")

    for attr in ('utterances_index', 'speakers_index',
                 'conversations_index', 'overall_index'):
        self.assertEqual(getattr(original.meta_index, attr),
                         getattr(reloaded.meta_index, attr))
def test_key_insertion_deletion(self):
    """Inserting metadata keys should register their value types in the
    corpus index; deleting a key from one object should purge it from the
    index and from every other object of the same type."""
    corpus = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie")),
    ])
    corpus.get_utterance("0").meta['foo'] = 'bar'
    corpus.get_utterance("1").meta['foo'] = 'bar2'
    corpus.get_utterance("2").meta['hey'] = 'jude'
    corpus.get_conversation(None).meta['convo_meta'] = 1
    corpus.get_speaker("alice").meta['surname'] = 1.0

    index = corpus.meta_index
    # Each key is indexed under the string form of its value's type.
    self.assertEqual(index.utterances_index['foo'], str(type('bar')))
    self.assertEqual(index.conversations_index['convo_meta'], str(type(1)))
    self.assertEqual(index.speakers_index['surname'], str(type(1.0)))

    # Deleting a key from an utterance removes it from the index.
    del corpus.get_utterance("2").meta['hey']
    with self.assertRaises(KeyError):
        index.utterances_index['hey']

    # Deleting a key also removes it from all other objects of same type.
    del corpus.get_utterance("1").meta['foo']
    with self.assertRaises(KeyError):
        index.utterances_index['foo']
    with self.assertRaises(KeyError):
        corpus.get_utterance("0").meta["foo"]
def test_key_insertion_deletion(self):
    """Metadata keys should be indexed as lists of type strings. Deleting
    a key from a single object must NOT unindex it; only
    Corpus.delete_metadata removes a key from the index and all objects."""
    corpus = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(id="bob")),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie")),
    ])
    corpus.get_utterance("0").meta['foo'] = 'bar'
    corpus.get_utterance("1").meta['foo'] = 'bar2'
    corpus.get_utterance("2").meta['hey'] = 'jude'
    corpus.get_conversation(None).meta['convo_meta'] = 1
    corpus.get_speaker("alice").meta['surname'] = 1.0

    index = corpus.meta_index
    # Keys are indexed as *lists* of type strings.
    self.assertEqual(index.utterances_index['foo'], [str(type('bar'))])
    self.assertEqual(index.conversations_index['convo_meta'],
                     [str(type(1))])
    self.assertEqual(index.speakers_index['surname'], [str(type(1.0))])

    # Deleting from one utterance does not remove the attribute:
    # accessing it afterwards should still succeed.
    del corpus.get_utterance("2").meta['hey']
    corpus.get_utterance("2").meta['hey']

    # delete_metadata removes the key from the index and every object.
    corpus.delete_metadata('utterance', 'foo')
    with self.assertRaises(KeyError):
        index.utterances_index['foo']
    with self.assertRaises(KeyError):
        corpus.get_utterance("0").meta["foo"]
def test_overlap_convo_metadata(self):
    """
    Merge with overlap in conversation with metadata differences.
    Expect second corpus convo metadata to override if keys are the same.
    """
    def make_utt(uid, text, spkr):
        return Utterance(id=uid, conversation_id='convo1', text=text,
                         speaker=Speaker(id=spkr))

    corpus1 = Corpus(utterances=[
        make_utt("0", "hello world", "alice"),
        make_utt("1", "my name is bob", "bob"),
        make_utt("2", "this is a test", "charlie"),
    ])
    corpus2 = Corpus(utterances=[
        make_utt("2", "this is a test", "charlie"),
        make_utt("4", "this is a sentence", "echo"),
        make_utt("5", "goodbye", "foxtrot"),
    ])

    # 'hey' agrees across corpora; 'hello' conflicts; 'what' is new.
    for key, value in [('hey', 'jude'), ('hello', 'world')]:
        corpus1.get_conversation('convo1').add_meta(key, value)
    for key, value in [('hey', 'jude'), ('hello', 'food'),
                       ('what', 'a mood')]:
        corpus2.get_conversation('convo1').add_meta(key, value)

    merged = corpus1.merge(corpus2)
    merged_meta = merged.get_conversation('convo1').meta
    self.assertEqual(len(merged_meta), 3)
    self.assertEqual(merged_meta['hello'], 'food')
def transform(self, corpus: Corpus):
    """Annotates each user with four engagement statistics:
    'politeness_complexity' (bootstrapped fraction of politeness strategies
    used), 'utterance_per_conversation', 'first_last_word', and 'raw_count'
    (user's share of all utterances in the corpus).

    Requires utterances to carry a "politeness_strategies" meta field
    (i.e. a politeness transformer must have run first).

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, with user metadata added.
    """
    for character in corpus.get_usernames():
        user1 = corpus.get_user(character)
        utterances = user1.get_utterance_ids()
        # (conversation root, #participants, #utterances) per utterance.
        utterances_per_conversation = []
        conversations = []
        for uid in utterances:
            utterance = corpus.get_utterance(uid)
            conversation = corpus.get_conversation(utterance.root)
            conversations.append(utterance.root)
            utterances_per_conversation.append(
                (utterance.root, len(conversation.get_usernames()),
                 len(conversation.get_utterance_ids())))
            # NOTE(review): first_last is reset to 0 on every loop
            # iteration, so only the user's LAST utterance contributes to
            # first_last_count below. The reset looks like it was meant to
            # sit before the loop — confirm intent before relying on this.
            first_last = 0
            if uid in (utterance.root,
                       list(conversation.get_utterance_ids())[-1]):
                first_last += 1
        # Fraction of all corpus utterances produced by this user.
        raw_count = len(utterances) / len(list(corpus.utterances.values()))
        total_conversations = len(set(conversations))
        #bootstrapping
        # 20 bootstrap rounds of 25 sampled utterances each (with
        # replacement); each round contributes the fraction of politeness
        # strategies that were used at least once in the sample.
        iterations = 0
        for i in range(20):
            samples = random.choices(utterances, k=25)
            #for politeness complexity#
            politeness_rows = []
            #many operations#
            for uid in samples:
                politeness_rows.append(
                    list(
                        corpus.get_utterance(
                            uid).meta["politeness_strategies"].values()))
            #politeness#
            # Column-wise totals over the sampled strategy vectors.
            politeness_results = np.sum(politeness_rows, 0)
            # Fraction of strategies with a non-zero total in this sample.
            politeness_results_count = len([
                i / len(politeness_rows) for i in politeness_results
                if i != 0.0
            ]) / len(politeness_rows)
            iterations += politeness_results_count
        #politness_final#
        # Mean over the 20 bootstrap rounds.
        politeness_final = iterations / 20
        #first/last#
        first_last_count = first_last / total_conversations
        #utterances_per_conversation#
        # Counter key k = (root, #participants, #utterances); value v =
        # how many of this user's utterances fall in that conversation.
        utterances_per_conversations = Counter(utterances_per_conversation)
        upc_final = []
        for k, v in utterances_per_conversations.items():
            # Average utterances per participant in that conversation.
            average = k[2] / k[1]
            upc_final.append(v / average)
        upc_count = sum(upc_final) / len(utterances_per_conversations)
        user1.add_meta('politeness_complexity', politeness_final)
        user1.add_meta('utterance_per_conversation', upc_count)
        user1.add_meta('first_last_word', first_last_count)
        user1.add_meta('raw_count', raw_count)
    return (corpus)
def transform(self, corpus: Corpus):
    """Adds metadata about readability of the corpus to each utterance.

    For every utterance with non-None text, stores a 'complexity' meta
    dict with Gunning fog, Flesch reading-ease, and Flesch-Kincaid scores
    plus word and sentence counts; all values are None when the text has
    no sentences or no syllable-countable words.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, annotated in place.
    """
    for conv_id in corpus.conversations:
        conv = corpus.get_conversation(conv_id)
        for utt in conv.iter_utterances():
            if utt.text != None:
                # Per-utterance accumulators for the readability formulas.
                cumu_sentences = 0
                cumu_words = 0
                cumu_syllables = 0
                cumu_syll_counted_words = 0
                cumu_words_over2_syllables = 0
                for sentence in sent_tokenize(utt.text):
                    cumu_sentences += 1
                    tokenized = word_tokenize(sentence)
                    cumu_words += len(tokenized)
                    for token in tokenized:
                        try:
                            # presumably __num_syllables returns a sequence
                            # whose first element is the syllable count —
                            # TODO confirm against the helper's definition
                            syll = self.__num_syllables(token)[0]
                            cumu_syllables += syll
                            cumu_syll_counted_words += 1
                            if syll > 2:
                                cumu_words_over2_syllables += 1
                        except Exception as e:
                            # Tokens whose syllable count cannot be
                            # computed (e.g. punctuation) are skipped.
                            pass
                # readability formulas from https://www.geeksforgeeks.org/readability-index-pythonnlp/
                if cumu_sentences > 0 and cumu_syll_counted_words > 0:
                    gunning_fog = 0.4 * ((cumu_words / cumu_sentences) + \
                        (cumu_words_over2_syllables / cumu_syll_counted_words))
                    flesch = 206.835 - (1.015 * (cumu_words / cumu_sentences)) - \
                        (84.6 * (cumu_syllables / cumu_syll_counted_words))
                    flesch_kincaid = (0.39 * cumu_words / cumu_sentences) + \
                        (11.8 * cumu_syllables / cumu_syll_counted_words) - 15.59
                    utt.meta['complexity'] = \
                        {"gunning_fog": gunning_fog,
                         "flesch": flesch,
                         "flesch_kincaid": flesch_kincaid,
                         "num_words": cumu_words,
                         "num_sentences": cumu_sentences}
                else:
                    # Nothing countable: record null scores so downstream
                    # consumers can distinguish "unscorable" from missing.
                    utt.meta['complexity'] = \
                        {"gunning_fog": None,
                         "flesch": None,
                         "flesch_kincaid": None,
                         "num_words": None,
                         "num_sentences": None}
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    transform() retrieves features from the corpus conversational threads
    using retrieve_feats() and annotates Conversations with this data.

    :param corpus: Corpus object to retrieve feature information from

    :return: corpus with conversations having a new meta field
        "hyperconvo" containing the stats generated by retrieve_feats().
        Each conversation's metadata then contains the stats for the
        thread(s) it contains.
    """
    # retrieve_feats maps conversation id -> feature dict; attach each
    # feature dict to its conversation under the "hyperconvo" key.
    for convo_id, convo_feats in self.retrieve_feats(corpus).items():
        corpus.get_conversation(convo_id).add_meta("hyperconvo", convo_feats)
    return corpus
def transform(self, corpus: Corpus):
    """Adds metadata about self-reflection to each utterance.

    Each utterance with non-None text gets a "num_self_invocations" meta
    field: the number of lower-cased tokens found in self.key_words.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, annotated in place.
    """
    for conv_id in corpus.conversations:
        conv = corpus.get_conversation(conv_id)
        for utt in conv.iter_utterances():
            # Idiom fix: compare against None with `is`, not `!=`;
            # guard clause keeps the body flat.
            if utt.text is None:
                continue
            tokenized = word_tokenize(utt.text.lower())
            utt.meta["num_self_invocations"] = sum(
                1 for token in tokenized if token in self.key_words)
    return corpus
def transform(self, corpus: Corpus):
    """Annotates each utterance with its position within its conversation.

    Stores 'length_tracker' on every utterance: a two-element list of the
    utterance's index in iteration order and that index divided by the
    conversation's utterance count.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the same corpus, annotated in place.
    """
    for conv_id in corpus.conversations:
        conversation = corpus.get_conversation(conv_id)
        total = len(conversation.get_utterance_ids())
        position = 0
        for utterance in conversation.iter_utterances():
            # Fractional progress through the conversation (0 for the
            # first utterance, approaching 1 for the last).
            utterance.meta['length_tracker'] = [position, position / total]
            position += 1
    return corpus
class CorpusTraversal(unittest.TestCase):
    """Tests for Conversation traversal orders (bfs/dfs/preorder/postorder),
    subtree extraction, root-to-leaf path enumeration, error handling on
    malformed conversations, and conversation reindexing."""

    def setUp(self) -> None:
        """
        Basic Conversation tree (left to right within subtree => earliest
        to latest):

                    0
                 /  |  \
                1   2   3
              / | \ / \  \
             4  5 6 7  8  9
             |            |
            10           11

        plus a second single-utterance conversation rooted at "other".
        Note utterance "2" is inserted before "1" to exercise
        timestamp-based ordering.
        """
        self.corpus = Corpus(utterances=[
            Utterance(id="0", reply_to=None, root="0",
                      speaker=Speaker(id="alice"), timestamp=0),
            Utterance(id="2", reply_to="0", root="0",
                      speaker=Speaker(id="alice"), timestamp=2),
            Utterance(id="1", reply_to="0", root="0",
                      speaker=Speaker(id="alice"), timestamp=1),
            Utterance(id="3", reply_to="0", root="0",
                      speaker=Speaker(id="alice"), timestamp=3),
            Utterance(id="4", reply_to="1", root="0",
                      speaker=Speaker(id="alice"), timestamp=4),
            Utterance(id="5", reply_to="1", root="0",
                      speaker=Speaker(id="alice"), timestamp=5),
            Utterance(id="6", reply_to="1", root="0",
                      speaker=Speaker(id="alice"), timestamp=6),
            Utterance(id="7", reply_to="2", root="0",
                      speaker=Speaker(id="alice"), timestamp=4),
            Utterance(id="8", reply_to="2", root="0",
                      speaker=Speaker(id="alice"), timestamp=5),
            Utterance(id="9", reply_to="3", root="0",
                      speaker=Speaker(id="alice"), timestamp=4),
            Utterance(id="10", reply_to="4", root="0",
                      speaker=Speaker(id="alice"), timestamp=5),
            Utterance(id="11", reply_to="9", root="0",
                      speaker=Speaker(id="alice"), timestamp=10),
            Utterance(id="other", reply_to=None, root="other",
                      speaker=Speaker(id="alice"), timestamp=99)
        ])
        # Seed conversation- and corpus-level metadata so the reindexing
        # tests can check whether it is preserved.
        self.corpus.get_conversation("0").meta['hey'] = 'jude'
        self.corpus.meta['foo'] = 'bar'

    def test_broken_convos(self):
        """ Test basic meta functions """
        corpus1 = Corpus(utterances=[
            Utterance(id="0", text="hello world", reply_to=None,
                      speaker=Speaker(id="alice"), timestamp=0),
            Utterance(id="1", text="my name is bob", reply_to="0",
                      speaker=Speaker(id="bob"), timestamp=2),
            Utterance(id="2", text="this is a test", reply_to="1",
                      speaker=Speaker(id="charlie"), timestamp=1),
            Utterance(id="3", text="hello world 2", reply_to=None,
                      speaker=Speaker(id="alice2"), timestamp=0),
        ])
        corpus2 = Corpus(utterances=[
            Utterance(id="0", text="hello world", reply_to=None,
                      speaker=Speaker(id="alice"), timestamp=0),
            Utterance(id="1", text="my name is bob", reply_to="0",
                      speaker=Speaker(id="bob"), timestamp=2),
            Utterance(id="2", text="this is a test", reply_to="1",
                      speaker=Speaker(id="charlie"), timestamp=1),
            Utterance(id="3", text="hello world 2", reply_to="9",
                      speaker=Speaker(id="alice2"), timestamp=0),
        ])
        # test broken convo where there are multiple roots
        convo = corpus1.get_conversation(None)
        self.assertRaises(
            ValueError,
            lambda: list(convo.traverse("dfs", as_utterance=True)))
        # test broken convo where utterance replies to something not in Conversation
        convo = corpus2.get_conversation(None)
        self.assertRaises(
            ValueError,
            lambda: list(convo.traverse("dfs", as_utterance=True)))

    def test_bfs_traversal(self):
        """Breadth-first order visits each level left-to-right."""
        convo = self.corpus.get_conversation("0")
        bfs_traversal = [
            utt.id for utt in convo.traverse("bfs", as_utterance=True)
        ]
        self.assertEqual(bfs_traversal, [str(i) for i in range(12)])

    def test_dfs_traversal(self):
        """Depth-first order explores each child subtree fully."""
        convo = self.corpus.get_conversation("0")
        dfs_traversal = [
            utt.id for utt in convo.traverse("dfs", as_utterance=True)
        ]
        self.assertEqual(
            dfs_traversal,
            [str(i) for i in [0, 1, 4, 10, 5, 6, 2, 7, 8, 3, 9, 11]])

    def test_postorder_traversal(self):
        """Postorder visits children before their parent; root comes last."""
        convo = self.corpus.get_conversation("0")
        postorder_traversal = [
            utt.id for utt in convo.traverse("postorder", as_utterance=True)
        ]
        self.assertEqual(
            postorder_traversal,
            ['10', '4', '5', '6', '1', '7', '8', '2', '11', '9', '3', '0'])

    def test_preorder_traversal(self):
        """Preorder visits each parent before its children; root comes first."""
        convo = self.corpus.get_conversation("0")
        preorder_traversal = [
            utt.id for utt in convo.traverse("preorder", as_utterance=True)
        ]
        self.assertEqual(
            preorder_traversal,
            ['0', '1', '4', '10', '5', '6', '2', '7', '8', '3', '9', '11'])

    def test_subtree(self):
        """get_subtree returns the node rooted at the given utterance id."""
        convo = self.corpus.get_conversation("0")
        node = convo.get_subtree("1")
        self.assertEqual([node.utt.id for node in node.bfs_traversal()],
                         ['1', '4', '5', '6', '10'])

    def test_root_to_leaf_paths(self):
        """Every root-to-leaf path of the tree should be enumerated."""
        convo = self.corpus.get_conversation("0")
        paths = convo.get_root_to_leaf_paths()
        path_tuples = [tuple(utt.id for utt in paths[i]) for i in range(6)]
        self.assertIn(('0', '1', '4', '10'), path_tuples)
        self.assertIn(('0', '1', '5'), path_tuples)
        self.assertIn(('0', '1', '6'), path_tuples)
        self.assertIn(('0', '2', '7'), path_tuples)
        self.assertIn(('0', '2', '8'), path_tuples)
        self.assertIn(('0', '3', '9', '11'), path_tuples)

    def test_one_utt_convo(self):
        """All traversal orders degenerate to one node for a 1-utt convo."""
        convo = self.corpus.get_conversation("other")
        self.assertEqual([utt.id for utt in convo.traverse('bfs')],
                         ["other"])
        self.assertEqual([utt.id for utt in convo.traverse('dfs')],
                         ["other"])
        self.assertEqual([utt.id for utt in convo.traverse('postorder')],
                         ["other"])
        self.assertEqual([utt.id for utt in convo.traverse('preorder')],
                         ["other"])

    def test_reindex_corpus(self):
        """Reindexing on subtree roots preserves convo/corpus metadata
        by default."""
        new_convo_roots = ['1', '2', '3']
        new_corpus = self.corpus.reindex_conversations(new_convo_roots)
        # checking for correct number of conversations and utterances
        self.assertEqual(len(list(new_corpus.iter_conversations())), 3)
        self.assertEqual(len(list(new_corpus.iter_utterances())), 11)
        # checking that corpus and conversation metadata was preserved
        for convo in new_corpus.iter_conversations():
            self.assertEqual(convo.meta['original_convo_meta'],
                             self.corpus.get_conversation("0").meta)
        self.assertEqual(self.corpus.meta, new_corpus.meta)

    def test_reindex_corpus2(self):
        """Reindexing with preservation flags off drops convo/corpus
        metadata."""
        new_convo_roots = ['1', '2', '3']
        new_corpus = self.corpus.reindex_conversations(
            new_convo_roots,
            preserve_convo_meta=False,
            preserve_corpus_meta=False)
        # checking for correct number of conversations and utterances
        self.assertEqual(len(list(new_corpus.iter_conversations())), 3)
        self.assertEqual(len(list(new_corpus.iter_utterances())), 11)
        # checking that corpus and conversation metadata was not preserved
        for convo in new_corpus.iter_conversations():
            self.assertEqual(convo.meta, dict())
        self.assertEqual(new_corpus.meta, dict())