def transform(self, corpus: Corpus):
    """Adds metadata about politicization to each utterance.

    For every utterance marked valid, counts how many of its stemmed tokens
    appear in ``self.key_words`` and flags the utterance as political if at
    least one such word is present. Invalid utterances get ``None`` for both
    fields.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the corpus, with 'num_pol_words' and 'political' metadata added
        to each utterance.
    """
    # Utterances must already carry stemmed tokens (e.g. from a prior
    # tokenizing/stemming transformer).
    assert 'stem_tokens' in next(corpus.iter_utterances()).meta
    for utt in corpus.iter_utterances():
        if utt.meta['valid']:
            utt.meta['num_pol_words'] = len(
                self.key_words.intersection(utt.meta['stem_tokens']))
            utt.meta['political'] = int(utt.meta['num_pol_words'] > 0)
        else:
            utt.meta['num_pol_words'] = None
            utt.meta['political'] = None
    return corpus
def transform(
        self,
        corpus: Corpus,
        selector: Optional[Callable[[Conversation], bool]] = lambda convo: True
) -> Corpus:
    """
    Retrieves features from the Corpus Conversations using retrieve_feats()
    and annotates Conversations with this feature set

    :param corpus: Corpus object to retrieve feature information from
    :param selector: a (lambda) function that takes a Conversation and
        returns True / False; selects the conversations to annotate with
        hypergraph features. By default, all conversations are annotated.
    :return: corpus whose selected conversations carry a vector (named
        ``self.vector_name``) holding the stats from retrieve_feats().
    """
    feats_by_convo = self.retrieve_feats(corpus, selector)
    # Rows = conversation ids, columns = feature names.
    feature_table = pd.DataFrame(feats_by_convo).T
    sparse_feats = csr_matrix(feature_table.values.astype('float64'))
    corpus.set_vector_matrix(name=self.vector_name,
                             ids=list(feature_table.index),
                             columns=list(feature_table.columns),
                             matrix=sparse_feats)
    for conversation in corpus.iter_conversations(selector):
        conversation.add_vector(self.vector_name)
    return corpus
def transform(self, corpus: Corpus):
    """For each conversation, computes the fraction of interacting speaker
    pairs whose interaction is reciprocal (each replied to the other at
    least once), and stores it under the conversation's 'reciprocity' meta.

    :param corpus: the corpus to annotate.
    :return: the annotated corpus.
    """
    for convo in corpus.iter_conversations():
        # Map each user to the set of users they replied to in this convo.
        reply_targets = dict()
        for user in convo.iter_users():
            reply_targets[user.name] = {
                corpus.get_utterance(utt.reply_to).user.name
                for utt in user.iter_utterances()
                if utt.reply_to is not None
            }
        mutual = 0
        oneway = 0
        for u1, u2 in combinations(convo.iter_users(), 2):
            forward = u2.name in reply_targets[u1.name]
            backward = u1.name in reply_targets[u2.name]
            if forward and backward:
                mutual += 1
            elif forward or backward:
                oneway += 1
        interacting = mutual + oneway
        # Avoid division by zero when no pair interacted at all.
        convo.add_meta('reciprocity',
                       mutual / interacting if interacting else 0)
    return corpus
def test_add_utterance(self):
    """Adding utterances merges metadata for colliding ids and appends
    genuinely new utterances."""
    corpus1 = Corpus(utterances=[
        Utterance(id=0, text="hello world", user=User(name="alice")),
        Utterance(id=1, text="my name is bob", user=User(name="bob")),
        Utterance(id=2, text="this is a test", user=User(name="charlie"),
                  meta={
                      'hey': 'jude',
                      'hello': 'world'
                  }),
    ])

    utts = [
        # id=1 already exists: should not add a new utterance.
        Utterance(id=1, text="i like pie", user=User(name="delta")),
        # id=2 collides: its metadata should be merged in ('hello'
        # overwritten, 'what' added) rather than replacing the utterance.
        Utterance(id=2, text="this is a test", user=User(name="charlie"),
                  meta={
                      'hello': 'food',
                      'what': 'a mood'
                  }),
        # id=5 is new and should be appended.
        Utterance(id=5, text="goodbye", user=User(name="foxtrot")),
    ]

    added = corpus1.add_utterances(utts)
    # 3 originals + 1 new id (5); duplicate ids 1 and 2 add nothing.
    self.assertEqual(len(list(added.iter_utterances())), 4)
    # meta for id=2 is the union of keys: 'hey', 'hello', 'what'.
    self.assertEqual(len(added.get_utterance(2).meta), 3)
    # colliding key 'hello' takes the incoming value.
    self.assertEqual(added.get_utterance(2).meta['hello'], 'food')
def summarize(self, corpus: Corpus, cv=LeaveOneOut()):
    """
    Run PairedPrediction on the corpus with cross-validation

    :param corpus: target Corpus (must be annotated with pair information
        using PairedPrediction.transform())
    :param cv: optional CV model: default is LOOCV
    :return: cross-validation accuracy score
    :raises ValueError: if the pair metadata from transform() is missing
    """
    # Check if transform() needs to be run first
    sample_obj = next(corpus.iter_objs(self.obj_type))
    meta_keys = set(sample_obj.meta)
    required_keys = {self.pair_orientation_feat_name, self.pair_id_feat_name,
                     self.label_feat_name}
    required_keys -= meta_keys
    if len(required_keys) > 0:
        raise ValueError("Some metadata features required for paired prediction are missing: {}. "
                         "You may need to run transform() first.".format(required_keys))

    # Collect pos/neg objects keyed by pair id; only pairs with both
    # sides present are usable.
    pair_id_to_obj = {'pos': dict(), 'neg': dict()}
    for obj in corpus.iter_objs(self.obj_type, self.selector):
        if obj.meta[self.pair_orientation_feat_name] is None:
            continue
        pair_id_to_obj[obj.meta[self.label_feat_name]][obj.meta[self.pair_id_feat_name]] = obj

    pair_ids = set(pair_id_to_obj['pos'].keys()).intersection(
        set(pair_id_to_obj['neg'].keys()))

    print("Found {} valid pairs.".format(len(pair_ids)))
    pair_id_to_objs = dict()
    for pair_id in pair_ids:
        pair_id_to_objs[pair_id] = (pair_id_to_obj['pos'][pair_id],
                                    pair_id_to_obj['neg'][pair_id])

    X, y = self._generate_paired_X_y(pair_id_to_objs)
    # Fit on the full data so the classifier is usable afterwards; the
    # returned score comes from cross_val_score, which refits internally.
    self.clf.fit(X, y)
    return np.mean(cross_val_score(self.clf, X, y, cv=cv, error_score='raise'))
def test_basic_functions(self):
    """
    Test basic meta functions
    """
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(
            id="bob")),
        Utterance(
            id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])
    first_utt = corpus1.get_utterance("0")
    first_utt.meta['hey'] = 9

    # correct class type stored in the metadata index
    self.assertEqual(corpus1.meta_index.utterances_index['hey'],
                     repr(type(9)))

    # accessing a missing key raises KeyError (dict-like behavior)
    self.assertRaises(KeyError, lambda: first_utt.meta['nonexistent key'])

    # meta.get() with a default behaves like dict.get()
    self.assertEqual(first_utt.meta.get('nonexistent_key', {}), {})
def summarize(self, corpus: Corpus, use_selector=True, exclude_na=True):
    """
    Returns a DataFrame of utterances and their forecasts (and forecast
    probabilities)

    :param corpus: target Corpus
    :param use_selector: whether to use Forecaster's convo and utterance
        selector functions
    :param exclude_na: whether to drop NaN results
    :return: a pandas DataFrame indexed by utterance id, sorted by
        descending forecast probability
    """
    # Pick the utterance stream according to use_selector, then collect
    # (id, forecast, probability) rows from it.
    if use_selector:
        utt_stream = (
            utt
            for convo in corpus.iter_conversations(self.convo_selector_func)
            for utt in convo.iter_utterances(self.utt_selector_func))
    else:
        utt_stream = corpus.iter_utterances()

    rows = [(utt.id,
             utt.meta[self.forecast_feat_name],
             utt.meta[self.forecast_prob_feat_name])
            for utt in utt_stream]

    forecast_df = (pd.DataFrame(rows,
                                columns=["utt_id",
                                         self.forecast_feat_name,
                                         self.forecast_prob_feat_name])
                   .set_index('utt_id')
                   .sort_values(self.forecast_prob_feat_name, ascending=False))
    if exclude_na:
        forecast_df = forecast_df.dropna()
    return forecast_df
def setUp(self) -> None:
    # Download a small real corpus (requires network access) and round-trip
    # it through pandas DataFrames so tests can compare the original corpus
    # against one rebuilt via Corpus.from_pandas().
    self.corpus = Corpus(download('subreddit-hey'))
    self.utt_df = self.corpus.get_utterances_dataframe()
    self.convo_df = self.corpus.get_conversations_dataframe()
    self.speaker_df = self.corpus.get_speakers_dataframe()
    self.new_corpus = Corpus.from_pandas(self.utt_df, self.speaker_df,
                                         self.convo_df)
def transform(self, corpus: Corpus):
    '''
    compiles a list of all utterances by each user, organized by
    conversation; also annotates user with summary statistics.

    :param corpus: the Corpus to transform.
    :type corpus: Corpus
    '''
    # Group (utterance id, timestamp) pairs by user, then by conversation
    # root, considering only utterances that pass the filter.
    user_to_convo_utts = defaultdict(lambda: defaultdict(list))
    for utterance in corpus.iter_utterances():
        if not self.utterance_filter(utterance):
            continue
        user_to_convo_utts[utterance.user.name][utterance.root].append(
            (utterance.id, utterance.timestamp))

    for user, convo_utts in user_to_convo_utts.items():
        user_convos = {}
        for convo, utts in convo_utts.items():
            # Order each conversation's utterances chronologically.
            sorted_utts = sorted(utts, key=lambda x: x[1])
            user_convos[convo] = {
                'utterance_ids': [x[0] for x in sorted_utts],
                'start_time': sorted_utts[0][1],
                'n_utterances': len(sorted_utts)
            }
        corpus.get_user(user).add_meta('conversations', user_convos)

    for user in corpus.iter_users():
        # Users with no filtered utterances carry no 'conversations' meta.
        if 'conversations' not in user.meta:
            continue
        user.add_meta('n_convos', len(user.meta['conversations']))

        # Rank this user's conversations by when they first participated;
        # 'start_time' is the time of their earliest utterance overall.
        sorted_convos = sorted(user.meta['conversations'].items(),
                               key=lambda x: x[1]['start_time'])
        user.add_meta('start_time', sorted_convos[0][1]['start_time'])
        for idx, (convo_id, _) in enumerate(sorted_convos):
            user.meta['conversations'][convo_id]['idx'] = idx
    return corpus
def get_scores(self, corpus: Corpus,
               selector: Optional[Callable[[], bool]] = None):
    """
    Calculates average occurrence per utterance of each politeness marker.
    Used in summarize()

    :param corpus: the corpus used to compute averages
    :param selector: lambda function which takes in meta data and returns
        a boolean; when given, only matching utterances are counted.
    :return: dict mapping marker name to its average count per utterance.
    :raises ValueError: if no utterance matches the selector.
    """
    utts = [corpus.get_utterance(x) for x in corpus.get_utterance_ids()]
    # Annotate with markers on the fly if transform() hasn't been run yet.
    if self.MRKR_NAME not in utts[0].meta:
        corpus = self.transform(corpus, markers=True)

    if selector is not None:
        utts = [x for x in utts if selector(x.meta)]
        if not utts:
            # ValueError is a subclass of Exception, so existing callers
            # catching Exception still work.
            raise ValueError("No query matches")

    # Marker keys carry a fixed wrapper: k[21:len(k) - 2] strips a 21-char
    # prefix and 2-char suffix to recover the bare strategy name.
    # NOTE(review): slice widths assume the marker naming scheme -- confirm.
    counts = {
        k[21:len(k) - 2]: 0
        for k in utts[0].meta[self.MRKR_NAME].keys()
    }
    for utt in utts:
        for k, v in utt.meta[self.MRKR_NAME].items():
            counts[k[21:len(k) - 2]] += len(v)
    return {k: v / len(utts) for k, v in counts.items()}
def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    fit_transform() retrieves features from the corpus conversational
    threads using retrieve_feats()

    :param corpus: Corpus object to retrieve feature information from
    :return: corpus with conversations having a new meta field "hyperconvo"
        containing the stats generated by retrieve_feats(). Each
        conversation's metadata then contains the stats for the thread(s)
        it contains.
    """
    feats = HyperConvo.retrieve_feats(corpus,
                                      prefix_len=self.prefix_len,
                                      min_thread_len=self.min_thread_len,
                                      include_root=self.include_root)
    if self.include_root:  # threads start at root (post)
        for root_id in feats.keys():
            convo = corpus.get_conversation(root_id)
            convo.add_meta("hyperconvo", {root_id: feats[root_id]})
    else:  # threads start at top-level comment (tlc)
        # Group thread stats under the root of the conversation each
        # top-level comment belongs to.
        threads = corpus.utterance_threads(prefix_len=self.prefix_len,
                                           include_root=False)
        root_to_tlc = dict()
        for tlc_id, utts in threads.items():
            # Mirror retrieve_feats' length filter so feats[tlc_id] exists.
            if len(utts) < self.min_thread_len:
                continue
            thread_root = threads[tlc_id][tlc_id].root
            if thread_root in root_to_tlc:
                root_to_tlc[thread_root][tlc_id] = feats[tlc_id]
            else:
                root_to_tlc[thread_root] = {tlc_id: feats[tlc_id]}
        for root_id in root_to_tlc:
            convo = corpus.get_conversation(root_id)
            convo.add_meta("hyperconvo", root_to_tlc[root_id])
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Computes the average number of questions asked in a conversation

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: the corpus, with each conversation's metadata carrying the
        average question count of its utterances under self.ATTR_NAME.
    """
    if self.verbose:
        print("Finding questions per utterance")

    # Map each utterance id to its question count. The original stored
    # counts positionally and then indexed that list by utterance *id*,
    # which is only correct when ids happen to equal list positions;
    # a dict keyed by id is correct for any id type or ordering.
    utt_id_to_questions = {}
    for utt_id in corpus.get_utterance_ids():
        text = corpus.get_utterance(utt_id).text
        # A run of one or more '?' counts as a single question.
        utt_id_to_questions[utt_id] = len(re.findall(r'\?+', text))

    if self.verbose:
        print("Finding questions per conversation")
    for convo_id in corpus.get_conversation_ids():
        convo_utters = corpus.get_conversation(convo_id)._utterance_ids
        avgquestion = np.mean(
            [utt_id_to_questions[uid] for uid in convo_utters])
        # adds average questions per conversation to conversation metadata
        corpus.get_conversation(convo_id)._meta[self.ATTR_NAME] = avgquestion
    return corpus
def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    Groups threads together into communities.

    :param corpus: the Corpus to use

    :return: Modifies and returns Corpus with new meta key:
        "communityEmbedder", value: Dict, containing "pts": an array with
        rows corresponding to embedded communities, and "labels": an array
        whose ith entry is the community of the ith row of X.
    """
    if self.community_key is None:
        raise RuntimeError(
            "Must specify community_key to retrieve label information from utterance"
        )

    corpus_meta = corpus.get_meta()
    # ThreadEmbedder output must already be present on the corpus.
    if "threadEmbedder" not in corpus_meta:
        raise RuntimeError(
            "Missing threadEmbedder metadata: "
            "threadEmbedder.fit_transform() must be run on the Corpus first"
        )

    thread_embed_data = corpus_meta["threadEmbedder"]
    X_mid = thread_embed_data["X"]
    roots = thread_embed_data["roots"]

    # Choose the dimensionality-reduction method; "none" keeps X as-is.
    if self.method.lower() == "svd":
        f = TruncatedSVD
    elif self.method.lower() == "tsne":
        f = TSNE
    elif self.method.lower() == "none":
        f = None
    else:
        raise Exception("Invalid embed_communities embedding method")

    if f is not None:
        X_embedded = f(n_components=self.n_components).fit_transform(X_mid)
    else:
        X_embedded = X_mid

    # Community label of each thread, read from its root utterance.
    # NOTE(review): .get("meta") on an utterance looks unusual -- confirm
    # this matches the Utterance API (vs. accessing .meta directly).
    labels = [
        corpus.get_utterance(root).get("meta")[self.community_key]
        for root in roots
    ]

    # Average the direction-normalized embedded threads per community.
    subs = defaultdict(list)
    for x, label in zip(X_embedded, labels):
        subs[label].append(x / np.linalg.norm(x))
    labels, subs = zip(*subs.items())
    pts = [np.mean(sub, axis=0) for sub in subs]

    retval = {"pts": pts, "labels": labels}
    corpus.add_meta("communityEmbedder", retval)
    return corpus
def test_corpus_dump(self):
    """Dumping a corpus to disk and reloading it preserves every metadata
    index (utterances, speakers, conversations, overall)."""
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(
            id="bob")),
        Utterance(
            id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])
    corpus1.get_utterance("0").meta['foo'] = 'bar'
    corpus1.get_utterance("1").meta['foo'] = 'bar2'
    corpus1.get_utterance("2").meta['hey'] = 'jude'

    # NOTE(review): None appears to be the conversation id assigned when
    # utterances have no reply_to -- confirm against Corpus behavior.
    corpus1.get_conversation(None).meta['convo_meta'] = 1

    corpus1.get_speaker("alice").meta['surname'] = 1.0

    # Round-trip through disk, then compare every index.
    corpus1.dump('test_index_meta_corpus', base_path="./")
    corpus2 = Corpus(filename="test_index_meta_corpus")

    self.assertEqual(corpus1.meta_index.utterances_index,
                     corpus2.meta_index.utterances_index)
    self.assertEqual(corpus1.meta_index.speakers_index,
                     corpus2.meta_index.speakers_index)
    self.assertEqual(corpus1.meta_index.conversations_index,
                     corpus2.meta_index.conversations_index)
    self.assertEqual(corpus1.meta_index.overall_index,
                     corpus2.meta_index.overall_index)
def _get_context_reply_label_dict(self, corpus: Corpus, convo_selector,
                                  utt_excluder, include_label=True):
    """
    Returns a dict mapping reply id to (context, reply, label).

    If self.forecast_mode == 'future': return a dict mapping the leaf utt
    id to the path from root utt to leaf utt
    """
    # Collect dialogs (ordered lists of utterances) from each selected
    # conversation, either one per root-to-leaf path or one linear list.
    dialogs = []
    if self.convo_structure == "branched":
        for convo in corpus.iter_conversations(convo_selector):
            try:
                for path in convo.get_root_to_leaf_paths():
                    path = [utt for utt in path if not utt_excluder(utt)]
                    # A single utterance has no context/reply split.
                    if len(path) == 1:
                        continue
                    dialogs.append(path)
            except ValueError as e:
                # Malformed conversation trees can raise; optionally skip.
                if not self.skip_broken_convos:
                    raise e
    elif self.convo_structure == "linear":
        for convo in corpus.iter_conversations(convo_selector):
            utts = convo.get_chronological_utterance_list(
                selector=lambda x: not utt_excluder(x))
            if len(utts) == 1:
                continue
            dialogs.append(utts)

    id_to_context_reply_label = dict()

    if self.forecast_mode == 'future':
        # Leaf id -> full path; label unknown for future forecasting.
        # NOTE(review): the loop below can overwrite these entries when a
        # leaf id coincides with a reply id -- confirm whether an early
        # return was intended here.
        for dialog in dialogs:
            id_to_context_reply_label[dialog[-1].id] = (dialog, dialog[-1],
                                                        None)

    for dialog in dialogs:
        if self.use_last_only:
            # Only the final reply of each dialog is a prediction target.
            reply = self.text_func(dialog[-1])
            context = [self.text_func(utt) for utt in dialog[:-1]]
            label = self.label_func(dialog[-1]) if include_label else None
            id_to_context_reply_label[dialog[-1].id] = (context, reply, label)
        else:
            # Every utterance after the first becomes a prediction target,
            # with all preceding utterances as its context.
            for idx in range(1, len(dialog)):
                reply = self.text_func(dialog[idx])
                label = self.label_func(
                    dialog[idx]) if include_label else None
                reply_id = dialog[idx].id
                context = [self.text_func(utt) for utt in dialog[:idx]]
                id_to_context_reply_label[reply_id] = (
                    context, reply, label) if include_label else (context,
                                                                  reply, None)

    return id_to_context_reply_label
def test_key_insertion_deletion(self):
    """Metadata keys added to objects are tracked in the corpus index,
    and deleting a key removes it from the index and from all objects of
    the same type."""
    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=Speaker(id="alice")),
        Utterance(id="1", text="my name is bob", speaker=Speaker(
            id="bob")),
        Utterance(
            id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])

    corpus1.get_utterance("0").meta['foo'] = 'bar'
    corpus1.get_utterance("1").meta['foo'] = 'bar2'
    corpus1.get_utterance("2").meta['hey'] = 'jude'

    # NOTE(review): None appears to be the conversation id assigned when
    # utterances have no reply_to -- confirm against Corpus behavior.
    corpus1.get_conversation(None).meta['convo_meta'] = 1

    corpus1.get_speaker("alice").meta['surname'] = 1.0

    # the index records the stringified type of each metadata value
    self.assertEqual(corpus1.meta_index.utterances_index['foo'],
                     str(type('bar')))
    self.assertEqual(corpus1.meta_index.conversations_index['convo_meta'],
                     str(type(1)))
    self.assertEqual(corpus1.meta_index.speakers_index['surname'],
                     str(type(1.0)))

    # test that deleting a key from an utterance removes it from the index
    del corpus1.get_utterance("2").meta['hey']
    self.assertRaises(KeyError,
                      lambda: corpus1.meta_index.utterances_index['hey'])

    # test that deleting a key from an utterance removes it from the index
    # and from all other objects of same type
    del corpus1.get_utterance("1").meta['foo']
    self.assertRaises(KeyError,
                      lambda: corpus1.meta_index.utterances_index['foo'])
    self.assertRaises(KeyError,
                      lambda: corpus1.get_utterance("0").meta["foo"])
def fit_transform(self, corpus: Corpus) -> Corpus:
    """
    :param corpus: the Corpus to use

    :return: Modifies and returns corpus with new meta key:
        "threadEmbedder", value: Dict, containing "X": an array with rows
        corresponding to embedded threads, "roots": an array whose ith
        entry is the thread root id of the ith row of X. If
        return_components is True, then the Dict contains a third key
        "components": the SVD components array
    :raises RuntimeError: if HyperConvo.fit_transform() has not been run
        on the corpus first.
    """
    # Materialize the conversation list. The original peeked with next()
    # on the live iterator, which (a) tested key membership on the
    # Conversation object instead of its meta, and (b) consumed the first
    # conversation so its stats were silently dropped from thread_stats.
    convos = list(corpus.iter_conversations())
    if not convos or "hyperconvo" not in convos[0].meta:
        raise RuntimeError(
            "Missing thread statistics: HyperConvo.fit_transform() must be run on the Corpus first"
        )

    thread_stats = dict()
    for convo in convos:
        thread_stats.update(convo.meta["hyperconvo"])

    X = []
    roots = []
    for root, feats in thread_stats.items():
        roots.append(root)
        # Sort features by name for a stable column order; replace
        # NaN/inf feature values with 0.
        row = np.array([
            v[1] if not (np.isnan(v[1]) or np.isinf(v[1])) else 0
            for v in sorted(feats.items())
        ])
        X.append(row)
    X = np.array(X)

    if self.norm_method.lower() == "standard":
        X = StandardScaler().fit_transform(X)
    elif self.norm_method.lower() == "none":
        pass
    else:
        raise Exception("Invalid embed_feats normalization method")

    if self.method.lower() == "svd":
        f = TruncatedSVD
    elif self.method.lower() == "tsne":
        f = TSNE
    else:
        raise Exception("Invalid embed_feats embedding method")

    emb = f(n_components=self.n_components)
    # NOTE(review): singular_values_ only exists on TruncatedSVD; the
    # "tsne" path would raise AttributeError here -- confirm intent.
    X_mid = emb.fit_transform(X) / emb.singular_values_

    retval = {"X": X_mid, "roots": roots}
    if self.return_components:
        retval["components"] = emb.components_

    corpus.add_meta("threadEmbedder", retval)
    return corpus
def transform(self, corpus: Corpus):
    """Annotates each user with politeness complexity, utterances per
    conversation, first/last-word participation, and raw utterance count.

    NOTE(review): several computations below look suspect (see inline
    notes); documented as-is without changing behavior.
    """
    for character in corpus.get_usernames():
        user1 = corpus.get_user(character)
        utterances = user1.get_utterance_ids()
        utterances_per_conversation = []
        conversations = []
        for uid in utterances:
            utterance = corpus.get_utterance(uid)
            conversation = corpus.get_conversation(utterance.root)
            conversations.append(utterance.root)
            # (conversation id, #participants, #utterances) per utterance.
            utterances_per_conversation.append(
                (utterance.root, len(conversation.get_usernames()),
                 len(conversation.get_utterance_ids())))
            # NOTE(review): first_last is re-initialized on every loop
            # iteration, so only the final utterance's contribution
            # survives -- presumably meant to be set to 0 once before
            # the loop. TODO confirm.
            first_last = 0
            if uid in (utterance.root,
                       list(conversation.get_utterance_ids())[-1]):
                first_last += 1
        # Share of the whole corpus' utterances produced by this user.
        raw_count = len(utterances) / len(list(corpus.utterances.values()))
        total_conversations = len(set(conversations))

        # bootstrapping: average politeness-strategy coverage over 20
        # random samples of 25 utterance ids (sampled with replacement).
        iterations = 0
        for i in range(20):
            samples = random.choices(utterances, k=25)
            # for politeness complexity
            politeness_rows = []
            for uid in samples:
                politeness_rows.append(
                    list(
                        corpus.get_utterance(
                            uid).meta["politeness_strategies"].values()))
            # Fraction of politeness strategies used at least once
            # within this sample.
            politeness_results = np.sum(politeness_rows, 0)
            politeness_results_count = len([
                i / len(politeness_rows) for i in politeness_results
                if i != 0.0
            ]) / len(politeness_rows)
            iterations += politeness_results_count
        # politeness final: mean coverage across the 20 samples.
        politeness_final = iterations / 20

        # first/last participation rate across distinct conversations.
        first_last_count = first_last / total_conversations

        # utterances per conversation, normalized by the conversation's
        # average utterances-per-participant.
        utterances_per_conversations = Counter(utterances_per_conversation)
        upc_final = []
        for k, v in utterances_per_conversations.items():
            average = k[2] / k[1]
            upc_final.append(v / average)
        upc_count = sum(upc_final) / len(utterances_per_conversations)

        user1.add_meta('politeness_complexity', politeness_final)
        user1.add_meta('utterance_per_conversation', upc_count)
        user1.add_meta('first_last_word', first_last_count)
        user1.add_meta('raw_count', raw_count)
    return (corpus)
def transform(self, corpus: Corpus) -> Corpus:
    """Annotates every utterance in the corpus with readability scores.

    :return: corpus, modified with ARI and Flesch-Kincaid grade level
        scores assigned to each utterance
    """
    for utt_id in corpus.get_utterance_ids():
        # add scores to each utterance's metadata
        utterance = corpus.get_utterance(utt_id)
        utterance.meta['ARI'] = ARI(utterance.text)
        utterance.meta['Flesch-Kincaid'] = Flesch_Kincaid(utterance.text)
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Run the parent text-processing step, then optionally swap each
    utterance's text for the cleaned version stored in the output field.

    :param corpus: the corpus to transform.
    :return: the transformed corpus.
    """
    super().transform(corpus)
    if self.replace_text:
        # Only touch utterances the input filter accepts.
        selector = lambda utt_: self.input_filter(utt_, None)
        for utt in corpus.iter_utterances(selector):
            cleaned_text = utt.retrieve_meta(self.output_field)
            if self.save_original:
                # Keep the original text under the output field.
                utt.add_meta(self.output_field, utt.text)
            utt.text = cleaned_text
        if not self.save_original:
            # Cleaned text now lives in utt.text; drop the duplicate
            # metadata field corpus-wide.
            corpus.delete_metadata('utterance', self.output_field)
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """Run the parent text-processing step, then optionally replace each
    utterance's text with the cleaned version stored in the output field
    (legacy variant using get_info/set_info/del_info).

    :param corpus: the corpus to transform.
    :return: the transformed corpus.
    """
    super().transform(corpus)
    if self.replace_text:
        # Only touch utterances the input filter accepts.
        selector = lambda utt_: self.input_filter(utt_, None)
        for utt in corpus.iter_utterances(selector):
            cleaned_text = utt.get_info(self.output_field)
            if self.save_original:
                # Preserve the original text under the output field.
                utt.set_info(self.output_field, utt.text)
            utt.text = cleaned_text
        if not self.save_original:
            next(corpus.iter_utterances(selector)).del_info(
                self.output_field)  # deletes for all
    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Annotate corpus objects with pair information (label, pair_id,
    pair_orientation)

    :param corpus: target Corpus
    :return: annotated Corpus
    """
    pos_objs, neg_objs = self._get_pos_neg_objects(corpus)
    obj_pairs = self._pair_objs(pos_objs, neg_objs)
    orientations = self._assign_pair_orientations(obj_pairs)

    # Tag both members of each pair with their label, shared pair id,
    # and the pair's orientation.
    for pair_id, pair in obj_pairs.items():
        orientation = orientations[pair_id]
        for obj, label in zip(pair, ("pos", "neg")):
            obj.add_meta(self.label_feat_name, label)
            obj.add_meta(self.pair_id_feat_name, pair_id)
            obj.add_meta(self.pair_orientation_feat_name, orientation)

    # unlabelled objects include both objects that did not pass the
    # selector and objects that were not selected in the pairing step
    for obj in corpus.iter_objs(self.obj_type):
        if self.label_feat_name not in obj.meta:
            obj.add_meta(self.label_feat_name, None)
            obj.add_meta(self.pair_id_feat_name, None)
            obj.add_meta(self.pair_orientation_feat_name, None)
    return corpus
def fit(self, corpus: Corpus,
        text_func: Callable[[Utterance], List[str]] = None,
        selector: Callable[[Utterance], bool] = lambda utt: True):
    """
    Fits a model for each group of utterances in a corpus. The group that
    an utterance belongs to is determined by the `model_key_selector`
    parameter in the transformer's constructor.

    :param corpus: corpus to create models from.
    :param text_func: optional function to define how the text a model is
        trained on should be selected. Takes an utterance as input and
        returns a list of strings to train the model corresponding to
        that utterance on. The model corresponding to the utterance is
        determined by `self.model_key_selector`. For every utterance
        corresponding to the same model key, this function should return
        the same result. If `text_func` is `None`, a model will be
        trained on the text from all the utterances that belong to its
        group.
    :param selector: determines which utterances in the corpus to train
        models for.
    """
    self.model_groups = defaultdict(list)
    for utt in tqdm(corpus.iter_utterances(selector=selector), desc='fit1'):
        key = self.model_key_selector(utt)
        if text_func:
            # text_func returns the full training text for the key, so it
            # only needs to run once per key (same result per docstring).
            if key not in self.model_groups:
                self.model_groups[key] = text_func(utt)
        else:
            # Default: accumulate each utterance's raw text for the key.
            self.model_groups[key].append(utt.text)
    for key in tqdm(self.model_groups, desc='fit2'):
        if not text_func:
            # Collapse accumulated texts into a single document.
            self.model_groups[key] = [' '.join(self.model_groups[key])]
        # Tokenize each document in the group.
        self.model_groups[key] = list(
            map(lambda x: self.tokenizer(x), self.model_groups[key]))
    return self
def transform(self, corpus: Corpus,
              selector: Optional[Callable[[Utterance], bool]] = lambda utt: True,
              markers: bool = False):
    """
    Extract politeness strategies from each utterances in the corpus and
    annotate the utterances with the extracted strategies. Requires that
    the corpus has previously been transformed by a Parser, such that each
    utterance has dependency parse info in its metadata table.

    :param corpus: the corpus to compute features for.
    :param selector: a (lambda) function that takes an Utterance and
        returns a bool indicating whether the utterance should be included
        in this annotation step.
    :param markers: whether or not to add politeness occurrence markers
    """
    for utt in corpus.iter_utterances():
        if selector(utt):
            # Normalize tokens before feature extraction: lowercase and
            # strip everything except letters and , . : ;
            # NOTE(review): this mutates the stored parse in place.
            for i, sent in enumerate(utt.meta["parsed"]):
                for p in sent["toks"]:
                    p["tok"] = re.sub("[^a-z,.:;]", "", p["tok"].lower())
            utt.meta[self.ATTR_NAME], marks = get_politeness_strategy_features(utt)
            if markers:
                utt.meta[self.MRKR_NAME] = marks
        else:
            # Unselected utterances get explicit None annotations.
            utt.meta[self.ATTR_NAME] = None
            utt.meta[self.MRKR_NAME] = None
    return corpus
def transform(
        self,
        corpus: Corpus,
        y=None,
        selector: Callable[[CorpusObject], bool] = lambda obj: True) -> Corpus:
    """
    Annotate corpus objects with scores and rankings.

    :param corpus: target corpus
    :param selector: (lambda) function taking in a Corpus object and
        returning True / False; selects for Corpus objects to annotate.
    :return: annotated corpus
    """
    iter_by_type = {
        "conversation": corpus.iter_conversations,
        "user": corpus.iter_users,
        "utterance": corpus.iter_utterances
    }

    # Score the selected objects, then rank them by descending score.
    scored = [(obj.id, self.score_func(obj))
              for obj in iter_by_type[self.obj_type](selector)]
    ranking_df = (pd.DataFrame(scored, columns=["id", self.score_feat_name])
                  .set_index('id')
                  .sort_values(self.score_feat_name, ascending=False))
    ranking_df[self.rank_feat_name] = range(1, len(ranking_df) + 1)

    # Annotate every object of this type: scored objects get their score
    # and rank; unselected ones get explicit None values.
    for obj in corpus.iter_objs(obj_type=self.obj_type):
        if obj.id in ranking_df.index:
            row = ranking_df.loc[obj.id]
            obj.add_meta(self.score_feat_name, row[self.score_feat_name])
            obj.add_meta(self.rank_feat_name, row[self.rank_feat_name])
        else:
            obj.add_meta(self.score_feat_name, None)
            obj.add_meta(self.rank_feat_name, None)
    return corpus
def retrieve_feats(corpus: Corpus, prefix_len: int = 10,
                   min_thread_len: int = 10,
                   include_root: bool = True) -> Dict[Hashable, Dict]:
    """
    Retrieve all hypergraph features for a given corpus (viewed as a set
    of conversation threads). See init() for further documentation.

    :param corpus: corpus whose threads are analyzed
    :param prefix_len: number of utterances per thread to consider
    :param min_thread_len: threads shorter than this are skipped
    :param include_root: whether threads start at the root utterance
    :return: A dictionary from a thread root id to its stats dictionary,
        which is a dictionary from feature names to feature values. For
        degree-related features specifically.
    """
    threads_stats = dict()

    for i, (root, thread) in enumerate(
            corpus.utterance_threads(prefix_len=prefix_len,
                                     include_root=include_root).items()):
        if len(thread) < min_thread_len:
            continue
        stats = {}
        # Full thread hypergraph, and the same graph with the root
        # utterance excluded ("mid-thread" variants).
        G = HyperConvo._make_hypergraph(uts=thread)
        G_mid = HyperConvo._make_hypergraph(uts=thread, exclude_id=root)
        for k, v in HyperConvo._degree_feats(G=G).items():
            stats[k] = v
        for k, v in HyperConvo._motif_feats(G=G).items():
            stats[k] = v
        for k, v in HyperConvo._degree_feats(G=G_mid,
                                             name_ext="mid-thread ").items():
            stats[k] = v
        for k, v in HyperConvo._motif_feats(
                G=G_mid, name_ext=" over mid-thread").items():
            stats[k] = v
        threads_stats[root] = stats

    return threads_stats
def transform(self, corpus: Corpus):
    """Adds metadata about self-reflection to each utterance.

    For each utterance with text, tokenizes it (lowercased) and counts
    occurrences of the transformer's key words, annotating the utterance
    with the raw count ('num_pol_refs'), the count normalized by utterance
    length ('num_pol_refs_incidence'), and the matched tokens ('pol_words').
    Utterances with no text are left unannotated.

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    """
    for conv_id in corpus.conversations:
        conv = corpus.get_conversation(conv_id)
        for utt in conv.iter_utterances():
            if utt.text is not None:
                tokenized = word_tokenize(utt.text.lower())
                # Collect matches once; the count is just its length.
                pol_words = [t for t in tokenized if t in self.key_words]
                utt.meta["num_pol_refs"] = len(pol_words)
                # Guard against empty utterances (ZeroDivisionError).
                utt.meta["num_pol_refs_incidence"] = (
                    len(pol_words) / len(tokenized) if tokenized else 0)
                utt.meta["pol_words"] = pol_words
    return corpus
def _preprocess_utterances(self, corpus: Corpus) -> Tuple[List[Hashable], List[Dict]]:
    """Convert each Utterance in the given Corpus into the representation
    expected by the politeness API. Assumes that the Corpus has already
    been parsed, so that each Utterance contains the `parsed` metadata
    entry

    :param corpus: the corpus to compute features for.
    :type corpus: Corpus
    :return: (utterance ids, per-utterance document dicts), in matching
        order.
    """
    utt_ids = []  # keep track of the order in which we process the utterances, so we can join with the corpus at the end
    documents = []
    for i, utterance in enumerate(corpus.iter_utterances()):
        # Periodic progress output, controlled by self.verbose.
        if self.verbose and i > 0 and (i % self.verbose) == 0:
            print("\t%03d" % i)
        utt_ids.append(utterance.id)
        doc = {"text": utterance.text, "sentences": [], "parses": []}
        # the politeness API goes sentence-by-sentence
        for sent in utterance.meta["parsed"].sents:
            doc["sentences"].append(sent.text)
            sent_parses = []
            pos = sent.start
            for tok in sent:
                if tok.dep_ != "punct":
                    # the politeness API does not know how to handle
                    # punctuation in parses; render each dependency edge
                    # as "rel(head-idx, tok-idx)" with 1-based,
                    # sentence-relative indices.
                    ele = "%s(%s-%d, %s-%d)" % (tok.dep_, tok.head.text,
                                                tok.head.i + 1 - pos,
                                                tok.text, tok.i + 1 - pos)
                    sent_parses.append(ele)
            doc["parses"].append(sent_parses)
        doc["unigrams"], doc["bigrams"] = get_unigrams_and_bigrams(doc)
        documents.append(doc)
    if self.verbose:
        print("Done!")
    return utt_ids, documents
def transform(
        self,
        corpus: Corpus,
        selector: Callable[[Utterance], bool] = lambda x: True) -> Corpus:
    """
    Annotates the corpus utterances with the lists of fighting words that
    the utterance contains. The relevant fighting words to use are
    specified by FightingWords.top_k or FightingWords.threshold, with
    FightingWords.annot_method indicating which criterion to use.

    Lists are stored under metadata keys 'fighting_words_class1',
    'fighting_words_class2'

    :param corpus: corpus to annotate
    :param selector: a (lambda) function that takes an Utterance and
        returns True/False; this selects for utterances that should be
        annotated with the fighting words

    :return: annotated corpus
    """
    class1_ngrams, class2_ngrams = self.get_top_k_ngrams() if self.annot_method == "top_k" \
        else self.get_ngrams_past_threshold()

    for utt in corpus.iter_utterances(
    ):  # improve the efficiency of this; tricky because ngrams #TODO
        if selector(utt):
            # Plain substring containment: note this can also match
            # inside longer words (e.g. 'art' inside 'start').
            utt.meta['fighting_words_class1'] = [
                ngram for ngram in class1_ngrams if ngram in utt.text
            ]
            utt.meta['fighting_words_class2'] = [
                ngram for ngram in class2_ngrams if ngram in utt.text
            ]
        else:
            # Unselected utterances get explicit None annotations.
            utt.meta['fighting_words_class1'] = None
            utt.meta['fighting_words_class2'] = None
    return corpus
def retrieve_feats(self, corpus: Corpus) -> Dict[str, Dict]:
    """
    Retrieve all hypergraph features for a given corpus (viewed as a set
    of conversation threads). See init() for further documentation.

    :return: A dictionary from a thread root id to its stats dictionary,
        which is a dictionary from feature names to feature values. For
        degree-related features specifically.
    """
    threads_stats = dict()

    for convo in corpus.iter_conversations():
        chronological = convo.get_chronological_utterance_list()
        if len(chronological) < self.min_thread_len:
            continue
        prefix = chronological[:self.prefix_len]

        # Full prefix hypergraph, and the same graph with the root
        # utterance excluded ("mid-thread" variants).
        full_graph = Hypergraph.init_from_utterances(utterances=prefix)
        mid_graph = Hypergraph.init_from_utterances(utterances=prefix[1:])

        stats = {}
        stats.update(HyperConvo._degree_feats(graph=full_graph))
        stats.update(HyperConvo._motif_feats(graph=full_graph))
        stats.update(HyperConvo._degree_feats(graph=mid_graph,
                                              name_ext="mid-thread "))
        stats.update(HyperConvo._motif_feats(graph=mid_graph,
                                             name_ext=" over mid-thread"))
        threads_stats[convo.id] = stats

    return threads_stats