def fit(self, corpus: Corpus,
        selector: Callable[[CorpusComponent], bool] = lambda x: True,
        y=None):
    """
    Train the internal classifier on the stored vectors of the selected Corpus objects.

    Every object of type `self.obj_type` accepted by `selector` contributes one row of
    the vector matrix named `self.vector_name` (restricted to `self.columns`) and one
    label produced by `self.labeller`.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or False
        (i.e. include / exclude). By default, the selector includes all objects of the
        specified type in the Corpus.
    :param y: ignored; the labels are always computed with `self.labeller`
    :return: the fitted VectorClassifier
    """
    selected = list(corpus.iter_objs(self.obj_type, selector))
    obj_ids = [obj.id for obj in selected]
    labels = np.array([self.labeller(obj) for obj in selected])

    features = corpus.get_vectors(self.vector_name, ids=obj_ids, columns=self.columns)
    self.clf.fit(features, labels)
    return self
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    """
    Build (input, response) text pairs from a downloaded corpus and split them
    into a leading training part and a trailing evaluation part.

    Pairs are taken from consecutive utterances along every root-to-leaf path of
    every conversation. A pair is dropped when the current, the following, or
    (if there is one) the preceding utterance consists solely of "[deleted]" or
    "[removed]".

    :param corpus_name: name passed to `download` to fetch the corpus
    :param max_samples: stop collecting as soon as this many pairs were gathered
    :param eval_percent: fraction of the pairs (taken from the end) used for evaluation
    :return: tuple (train_inputs, train_outputs, eval_inputs, eval_outputs)
    """
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        # Everything before eval_index is training data, the rest is evaluation data.
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index],
                inputs[eval_index:], outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))
    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')

    inputs, outputs = [], []
    for conversation in corpus.iter_conversations():
        for path in conversation.get_root_to_leaf_paths():
            for i in range(len(path) - 1):
                # Window of previous (when it exists), current and next utterance.
                # Clamping the start at 0 avoids path[-1]: a plain path[i - 1]
                # at i == 0 would wrap around and test the *last* utterance.
                window = path[max(i - 1, 0):i + 2]
                if any(deleted_filter.match(utt.text) for utt in window):
                    continue

                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)

                if len(inputs) >= max_samples:
                    return split_data(inputs, outputs, eval_percent)

    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
def transform(self, corpus: Corpus,
              selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
    """
    Write the classifier's prediction and prediction score into the metadata of the
    Corpus objects of type `self.obj_type`.

    Objects rejected by `selector` receive None for both metadata fields. For the
    accepted objects the prediction comes from `self.clf.predict` and the score is
    column 1 of `self.clf.predict_proba`.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or False
        (i.e. include / exclude). By default, the selector includes all objects of the
        specified type in the Corpus.
    :return: the target Corpus annotated
    """
    selected = []
    for candidate in corpus.iter_objs(self.obj_type):
        if not selector(candidate):
            candidate.add_meta(self.clf_attribute_name, None)
            candidate.add_meta(self.clf_prob_attribute_name, None)
            continue
        selected.append(candidate)

    selected_ids = [candidate.id for candidate in selected]
    vector_matrix = corpus.get_vector_matrix(self.vector_name)
    X = vector_matrix.get_vectors(selected_ids, self.columns)

    predictions = self.clf.predict(X)
    scores = self.clf.predict_proba(X)[:, 1]

    for candidate, prediction, score in zip(selected, predictions, scores):
        candidate.add_meta(self.clf_attribute_name, prediction)
        candidate.add_meta(self.clf_prob_attribute_name, score)

    return corpus
def process_corpus(corpus_name, to_download=TO_DOWNLOAD,
                   min_wc_source=MIN_WC_SOURCE, max_wc_source=MAX_WC_SOURCE,
                   min_wc_target=MIN_WC_TARGET, max_wc_target=MAX_WC_TARGET,
                   source_filter=SOURCE_FILTER, target_filter=TARGET_FILTER,
                   text_cols=TEXT_COLS, data_dir=DATA_DIR):
    """
    Load a corpus, run the text preparation pipeline on it and write the resulting
    source / target training tables as TSV files into `data_dir`.

    The corpus is either downloaded (`to_download` truthy) or read from
    `data_dir/corpus_name`. Output files are named after the 'name' entry of the
    corpus metadata: `<name>.source.tsv` and `<name>.target.tsv`.
    """
    if to_download:
        corpus_location = download(corpus_name, data_dir=data_dir)
    else:
        corpus_location = os.path.join(data_dir, corpus_name)
    corpus = Corpus(corpus_location)

    # From here on the name recorded in the corpus metadata is used.
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()
    print('processing', corpus.get_meta()['name'])

    corpus.load_info('utterance', ['parsed'])
    corpus = text_prep_pipe().transform(corpus)

    source_df, target_df = get_train_subset(
        corpus,
        min_wc_source, max_wc_source,
        min_wc_target, max_wc_target,
        source_filter, target_filter,
        text_cols)

    for suffix, frame in (('.source.tsv', source_df), ('.target.tsv', target_df)):
        frame.to_csv(os.path.join(data_dir, corpus_name + suffix), sep='\t')
def transform(self, corpus: Corpus,
              selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
    """
    Vectorize the text of the selected Corpus objects and store the result as the
    vector matrix `self.vector_name` in the Corpus.

    Only objects of type `self.obj_type` accepted by `selector` are vectorized; each
    of them is additionally tagged via `add_vector(self.vector_name)`. Objects that
    are not selected are left untouched.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns True or False
        (i.e. include / exclude). By default, the selector includes all objects of the
        specified type in the Corpus.
    :return: the target Corpus annotated
    """
    objs = list(corpus.iter_objs(self.obj_type, selector))
    ids = [obj.id for obj in objs]
    docs = [self.text_func(obj) for obj in objs]

    matrix = self.vectorizer.transform(docs)

    # Prefer the current scikit-learn accessor. `get_feature_names` is absent from
    # newer releases, and relying on it alone would silently replace the real
    # feature names with integer column indices.
    try:
        column_names = self.vectorizer.get_feature_names_out()
    except AttributeError:
        try:
            column_names = self.vectorizer.get_feature_names()
        except AttributeError:
            # Vectorizer exposes no feature names at all: number the columns.
            column_names = np.arange(matrix.shape[1])

    corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids,
                             columns=column_names)

    for obj in objs:
        obj.add_vector(self.vector_name)

    return corpus
def _read(self, corpus_split):
    """
    Yield instances built from the conversations of a downloaded corpus.

    `corpus_split` is split on '_': the first piece is the corpus name handed to
    `download`, the second piece (if any) is stored in `self.split`, otherwise
    `self.split` is set to None. When `self.sample` is truthy only that many
    conversations are read.
    """
    pieces = corpus_split.split('_')
    corpus_name = pieces[0]
    self.split = pieces[1] if len(pieces) > 1 else None

    corpus = Corpus(filename=download(corpus_name))
    conversations = corpus.iter_conversations()
    if self.sample:
        conversations = itertools.islice(conversations, self.sample)

    for conv in conversations:
        meta = conv.meta

        # NOTE(review): a conversation is skipped only when BOTH the split differs
        # and the annotation year is not 2018 — confirm this is the intended filter.
        keep = (meta.get('split') == self.split
                or meta.get('annotation_year', 2018) == 2018)
        if not keep:
            continue

        label = str(meta[self.label_field])

        # Collect the parsed form of every utterance that is not a section header.
        turns = []
        for utt in conv.iter_utterances():
            if utt.meta.get('is_section_header'):
                continue
            turns.append(utt.meta.parsed)

        # When forecasting, the final turn is cut off; at most max_turns are kept.
        stop = len(turns) - 1 if self.forecast else None
        turns = turns[-self.max_turns:stop]

        if not turns or not all(turns):
            continue

        inst = self.text_to_instance(turns, label)
        if inst:
            yield inst
def transform(self, corpus: Corpus,
              selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
    """
    Computes the vector matrix for the Corpus component objects and then stores it in a
    ConvoKitMatrix object, which is saved in the Corpus as `vector_name`.

    :param corpus: the target Corpus
    :param selector: a (lambda) function that takes a Corpus component object and returns
        True or False (i.e. include / exclude). By default, the selector includes all
        objects of the specified type in the Corpus.
    :return: the target Corpus annotated
    """
    objs = list(corpus.iter_objs(self.obj_type, selector))
    ids = [obj.id for obj in objs]
    docs = [self.text_func(obj) for obj in objs]

    matrix = self.vectorizer.transform(docs)

    # `get_feature_names_out` is the accessor offered by current scikit-learn;
    # `get_feature_names` is kept as a fallback for older releases. Without the
    # first attempt, newer releases would silently end up with integer columns.
    try:
        column_names = self.vectorizer.get_feature_names_out()
    except AttributeError:
        try:
            column_names = self.vectorizer.get_feature_names()
        except AttributeError:
            # No feature names available: fall back to positional column labels.
            column_names = np.arange(matrix.shape[1])

    corpus.set_vector_matrix(self.vector_name, matrix=matrix, ids=ids,
                             columns=column_names)

    # Mark every vectorized object as owning a vector under this name.
    for obj in objs:
        obj.add_vector(self.vector_name)

    return corpus
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    Blocks are grouped into segments; every completed segment becomes one Utterance
    whose id is the hash of its first block and whose text is the newline-joined text
    of its blocks. The Corpus metadata entry "reverse_block_index" maps each block
    hash to the id of the utterance containing it.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate
    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}

    # Pass 1: register users and find out which segments are complete utterances.
    for block_hash, block in accum.blocks.items():
        if block.user not in users:
            users[block.user] = User(id=block.user)

        segments = accum.segment_contiguous_blocks(block.reply_chain)
        complete_utterances.update(
            helpers.string_of_seg(seg) for seg in segments[:-1])

        last_segment = segments[-1]
        assert (block_hash == last_segment[-1])
        # The trailing segment only counts once nothing follows its last block.
        if not accum.blocks[last_segment[-1]].is_followed:
            complete_utterances.add(helpers.string_of_seg(last_segment))
        block_hashes_to_segments[block_hash] = segments

    # Pass 2: turn every complete segment string into an Utterance.
    utterances = []
    for segment_string in complete_utterances:
        block_hashes = segment_string.split(" ")
        head_hash = block_hashes[0]
        owning_segments = block_hashes_to_segments[head_hash]
        first_block = accum.blocks[head_hash]

        text = "\n".join([accum.blocks[h].text for h in block_hashes])
        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = head_hash

        utterances.append(Utterance(
            id=head_hash,
            user=users[first_block.user],
            root=owning_segments[0][0],
            reply_to=_find_reply_to_from_segment(owning_segments),
            timestamp=first_block.timestamp,
            text=text,
            meta={"constituent_blocks": block_hashes}))

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids
    return corpus
def add_title_to_root(corpus: Corpus):
    """
    Prepend each conversation's 'title' metadata to the text of the utterance whose
    id equals the conversation id.

    A missing title counts as the empty string. If the utterance has no text, its
    text becomes the title alone; otherwise title and text are joined by one space.
    """
    for conversation in corpus.iter_conversations():
        root = corpus.get_utterance(conversation.id)
        fetched = conversation.retrieve_meta('title')
        title = '' if fetched is None else fetched
        root.text = title if root.text is None else title + ' ' + root.text
def test_partial_load_end_idx_specified_only(self):
    """
    Dump a three-utterance corpus, reload it with only `utterance_end_index=0`,
    and check that a single utterance is loaded and that utterance "0" matches.
    """
    alice = User(name="alice",
                 meta={'user_binary_data': bytearray([120, 3, 255, 0, 100])})
    bob = User(name="bob",
               meta={'user_binary_data': bytearray([110, 3, 255, 90])})
    charlie = User(name="charlie")

    utterances = [
        Utterance(id="0", text="hello world", user=alice,
                  meta={'utt_binary_data': bytearray([99, 44, 33])}),
        Utterance(id="1", text="my name is bob", user=bob,
                  meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
        Utterance(id="2", text="this is a test", user=charlie),
    ]
    corpus1 = Corpus(utterances=utterances)
    corpus1.dump('test_corpus', './')

    corpus2 = Corpus(filename="test_corpus", utterance_end_index=0)
    loaded = list(corpus2.iter_utterances())

    self.assertEqual(len(loaded), 1)
    self.assertEqual(corpus1.get_utterance("0"), corpus2.get_utterance("0"))
def transform(self, corpus: Corpus) -> Corpus:
    """
    Return a deep copy of `corpus` in which every conversation carries a 'rank'
    metadata entry equal to the summed text length of its utterances.

    :param corpus: the Corpus to copy and annotate (the argument itself is not modified)
    :return: the annotated copy
    :raises Exception: if a conversation already has a 'rank' metadata key
    """
    corpus = copy.deepcopy(corpus)

    for convo in corpus.iter_conversations():
        if 'rank' in convo.meta.keys():
            raise Exception(
                'rank is already a key in this conversations meta! aborting'
            )
        convo.meta['rank'] = sum(
            len(corpus.get_utterance(utt_id).text)
            for utt_id in convo._utterance_ids)

    return corpus
def rank2(self, corpus: Corpus, score=None):
    """
    Group the conversations of `corpus` by their score.

    :param corpus: the Corpus whose conversations are grouped
    :param score: callable invoked as `score(corpus, conversation)`; when None,
        `self.convo_length` is used
    :return: a defaultdict mapping each score to the list of conversations with that score
    """
    # Identity test: `== None` would dispatch to a custom __eq__ on the argument.
    if score is None:
        score = self.convo_length

    grouped = defaultdict(list)
    for convo in corpus.iter_conversations():
        grouped[score(corpus, convo)].append(convo)
    return grouped
def print_corpus(c: Corpus) -> None:
    """
    Print every reply chain of the corpus, one chain per leaf utterance.

    For each leaf the chain is followed upwards through `reply_to`, then printed
    from the top-most utterance down to the leaf, each level prefixed with one more
    "--> " and with newlines in the text replaced by spaces.
    """
    for leaf_id in get_corpus_leaf_ids(c):
        current = c.get_utterance(leaf_id)
        chain = [current]
        # Climb towards the start of the chain until there is no parent left.
        while current.reply_to:
            current = c.get_utterance(current.reply_to)
            chain.append(current)

        print("this conversation is", len(chain), "utterances long.")
        for level, utterance in enumerate(reversed(chain)):
            print("--> " * level + utterance.text.replace("\n", " "))
        print("\n")
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """
    Helper function that turns the rows of a DataFrame into a Corpus.

    Each row becomes one Utterance that forms its own conversation (the
    conversation id equals the utterance id).

    Arguments:
        df {DataFrame} -- the data, one utterance per row
        id_col {str} -- column holding the utterance ids
        text_col {str} -- column holding the utterance texts
        meta_cols {List[str]} -- columns copied into each utterance's metadata

    Returns:
        Corpus -- the converted corpus
    """
    # Speaker, reply_to and timestamp do not apply to this data: all utterances
    # share one placeholder speaker and timestamp, and reply_to stays None.
    generic_speaker = Speaker(id="speaker")
    time = "NOT_RECORDED"

    utterance_list = []
    for _, row in tqdm(df.iterrows()):
        metadata = {meta_col: row[meta_col] for meta_col in meta_cols}
        utt_id = str(row[id_col])
        utterance_list.append(
            Utterance(id=utt_id,
                      speaker=generic_speaker,
                      conversation_id=utt_id,
                      reply_to=None,
                      timestamp=time,
                      text=row[text_col],
                      meta=metadata))

    return Corpus(utterances=utterance_list)
def fit(self, corpus: Corpus, y=None):
    """
    Fit `self.vectorizer` on the texts of the selected Corpus objects.

    :param corpus: the Corpus providing the objects of type `self.obj_type`
        (filtered by `self.selector`) whose text is extracted with `self.text_func`
    :param y: unused
    :return: self, so that the call can be chained like the other `fit` methods
    """
    # collect texts for vectorization
    docs = [self.text_func(obj)
            for obj in corpus.iter_objs(self.obj_type, self.selector)]
    self.vectorizer.fit(docs)
    return self
def transform(self, corpus: Corpus,
              selector: Callable[[CorpusComponent], bool] = lambda x: True) -> Corpus:
    """
    Annotate corpus objects with pair information (label, pair_id, pair_orientation),
    with an optional selector indicating which objects should be considered for pairing.

    Paired objects get the label "pos" or "neg", their pair id and the orientation
    assigned to the pair; every other object of `self.obj_type` gets None for all
    three fields.

    :param corpus: target Corpus
    :param selector: a (lambda) function that takes a Corpus object and returns a bool
        (True = include)
    :return: annotated Corpus
    """
    label_key = self.label_attribute_name
    pair_id_key = self.pair_id_attribute_name
    orientation_key = self.pair_orientation_attribute_name

    pos_objs, neg_objs = self._get_pos_neg_objects(corpus, selector)
    obj_pairs = self._pair_objs(pos_objs, neg_objs)
    pair_orientations = self._assign_pair_orientations(obj_pairs)

    for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
        members = (pos_obj, neg_obj)
        for member, label in zip(members, ("pos", "neg")):
            member.add_meta(label_key, label)
        for member in members:
            member.add_meta(pair_id_key, pair_id)
        orientation = pair_orientations[pair_id]
        for member in members:
            member.add_meta(orientation_key, orientation)

    # Whatever still lacks a label was either rejected by the selector or left
    # over by the pairing step; such objects get explicit None values.
    for obj in corpus.iter_objs(self.obj_type):
        if label_key in obj.meta:
            continue
        obj.add_meta(label_key, None)
        obj.add_meta(pair_id_key, None)
        obj.add_meta(orientation_key, None)

    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Annotate corpus objects with pair information (label, pair_id, pair_orientation).

    Paired objects get the label "pos" or "neg", their pair id and the orientation
    assigned to the pair; every other object of `self.obj_type` gets None for all
    three fields.

    :param corpus: target Corpus
    :return: annotated Corpus
    """
    label_key = self.label_feat_name
    pair_id_key = self.pair_id_feat_name
    orientation_key = self.pair_orientation_feat_name

    pos_objs, neg_objs = self._get_pos_neg_objects(corpus)
    obj_pairs = self._pair_objs(pos_objs, neg_objs)
    pair_orientations = self._assign_pair_orientations(obj_pairs)

    for pair_id, (pos_obj, neg_obj) in obj_pairs.items():
        members = (pos_obj, neg_obj)
        for member, label in zip(members, ("pos", "neg")):
            member.add_meta(label_key, label)
        for member in members:
            member.add_meta(pair_id_key, pair_id)
        orientation = pair_orientations[pair_id]
        for member in members:
            member.add_meta(orientation_key, orientation)

    # Objects without a label at this point were not picked in the pairing step
    # (or never qualified as positive/negative); give them explicit None values.
    for obj in corpus.iter_objs(self.obj_type):
        if label_key in obj.meta:
            continue
        obj.add_meta(label_key, None)
        obj.add_meta(pair_id_key, None)
        obj.add_meta(orientation_key, None)

    return corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Store a perplexity value under `self.perplexity_feat_name` for every utterance.

    Utterances accepted by `self.utt_selector` get
    `self.model.str_perplexity(self.utt_text_func(utt))`; all others get None.

    :param corpus: the target Corpus
    :return: the same Corpus, annotated
    """
    for utt in corpus.iter_utterances():
        if not self.utt_selector(utt):
            utt.add_meta(self.perplexity_feat_name, None)
            continue
        text = self.utt_text_func(utt)
        utt.add_meta(self.perplexity_feat_name, self.model.str_perplexity(text))
    return corpus
def test_dump_and_load_with_binary(self):
    """
    Dump a corpus whose speakers and utterances carry binary metadata, load it
    again, and check that speaker and utterance metadata are unchanged.
    """
    alice_speaker = Speaker(id="alice", meta={
        'speaker_binary_data': bytearray([120, 3, 255, 0, 100]),
        'index': 99
    })
    bob_speaker = Speaker(
        id="bob", meta={'speaker_binary_data': bytearray([110, 3, 255, 90])})

    corpus1 = Corpus(utterances=[
        Utterance(id="0", text="hello world", speaker=alice_speaker,
                  meta={'utt_binary_data': bytearray([99, 44, 33])}),
        Utterance(id="1", text="my name is bob", speaker=bob_speaker,
                  meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ])

    corpus1.dump('test_corpus', './')
    corpus2 = Corpus(filename="test_corpus")

    # Compare speaker metadata and utterance metadata for utterances "0" and "1".
    for utt_id in ("0", "1"):
        original = corpus1.utterances[utt_id]
        reloaded = corpus2.utterances[utt_id]
        self.assertEqual(original.speaker.meta, reloaded.speaker.meta)
        self.assertEqual(original.meta, reloaded.meta)
def rank(self, corpus: Corpus, score=None):
    """
    Yield the conversations of `corpus` in ascending order of their score.

    :param corpus: the Corpus whose conversations are ranked
    :param score: callable invoked as `score(corpus, conversation)`; when None,
        `self.convo_length` is used
    :return: generator of `(score, insertion_index, conversation)` tuples
    """
    # Identity test: `== None` would dispatch to a custom __eq__ on the argument.
    if score is None:
        score = self.convo_length

    heap = []
    for position, convo in enumerate(corpus.iter_conversations()):
        # The insertion position breaks ties between equal scores, so the
        # conversation objects themselves never have to be compared.
        heappush(heap, (score(corpus, convo), position, convo))

    while heap:
        yield heappop(heap)
def test_partial_load_invalid_start_index(self):
    """
    Dump a three-utterance corpus and reload it with `utterance_start_index=99`;
    the reloaded corpus is expected to contain no utterances.
    """
    alice = Speaker(
        id="alice", meta={'speaker_binary_data': bytearray([120, 3, 255, 0, 100])})
    bob = Speaker(
        id="bob", meta={'speaker_binary_data': bytearray([110, 3, 255, 90])})

    utterances = [
        Utterance(id="0", text="hello world", speaker=alice,
                  meta={'utt_binary_data': bytearray([99, 44, 33])}),
        Utterance(id="1", text="my name is bob", speaker=bob,
                  meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
        Utterance(id="2", text="this is a test", speaker=Speaker(id="charlie")),
    ]
    corpus1 = Corpus(utterances=utterances)
    corpus1.dump('test_corpus', './')

    corpus2 = Corpus(filename="test_corpus", utterance_start_index=99)
    loaded = list(corpus2.iter_utterances())
    self.assertEqual(len(loaded), 0)
def get_manual_corpus() -> Corpus:
    """
    Return the manual corpus, loading it from 'build/manual' when possible.

    If loading fails for any ordinary error, the corpus is rebuilt with
    `build_manual_corpus`, dumped under 'build' as 'manual' and returned.

    :return: the loaded or freshly built Corpus
    """
    try:
        return Corpus(filename='build/manual')
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate instead of triggering a rebuild.
        manual_corpus = build_manual_corpus()
        manual_corpus.dump(name='manual', increment_version=False,
                           base_path='build')
        return manual_corpus
def transform(self, corpus: Corpus) -> Corpus:
    """
    Store a vector under `obj.meta[self.vector_name]` for every object of type
    `self.obj_type`.

    Objects accepted by `self.selector` get the result of vectorizing their text
    (as a one-document batch); all other objects get None.

    :param corpus: the target Corpus
    :return: the same Corpus, annotated
    """
    for obj in corpus.iter_objs(self.obj_type):
        if not self.selector(obj):
            obj.meta[self.vector_name] = None
            continue
        single_doc = [self.text_func(obj)]
        obj.meta[self.vector_name] = self.vectorizer.transform(single_doc)
    return corpus
def get_imessage_corpus() -> Corpus:
    """
    Return the iMessage corpus, loading it from 'build/imessages' when possible.

    If loading fails for any ordinary error, the corpus is rebuilt with
    `build_imessage_corpus`, dumped under 'build' as 'imessages' and returned.

    :return: the loaded or freshly built Corpus
    """
    try:
        return Corpus(filename='build/imessages')
    except Exception:
        # `Exception` instead of a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate instead of triggering a rebuild.
        imessage_corpus = build_imessage_corpus()
        imessage_corpus.dump(name='imessages', increment_version=False,
                             base_path='build')
        return imessage_corpus
def fit(self, corpus: Corpus, y=None):
    """
    Fit `self.clf` on the vectors stored in the metadata of the selected objects.

    :param corpus: the Corpus providing the objects of type `self.obj_type`
        (filtered by `self.selector`); each contributes `obj.meta[self.vector_name]`
        as a row and `self.labeller(obj)` as its label
    :param y: ignored; labels are always computed with `self.labeller`
    :return: self
    """
    rows, labels = [], []
    for obj in corpus.iter_objs(self.obj_type, self.selector):
        rows.append(obj.meta[self.vector_name])
        labels.append(self.labeller(obj))

    # Stack the per-object vectors into one matrix before fitting.
    self.clf.fit(vstack(rows), labels)
    return self
def __init__(self, subReddit=""):
    """
    Set the default utterance index window and date range, and remember the target.

    :param subReddit: corpus name handed to `download`; when it is the empty
        string no corpus is loaded
    """
    self._startIndex = 0
    self._endIndex = 5 * 10 ** 5
    self._startDate, self._endDate = 2007, 2018
    self._target = subReddit

    if self._target == "":
        return

    # NOTE(review): the loaded Corpus is not stored anywhere — confirm whether
    # this call is only meant to trigger the download / a load check.
    Corpus(filename=download(self._target),
           utterance_start_index=self._startIndex,
           utterance_end_index=self._endIndex)
def get_corpus_leaf_ids(c: Corpus) -> set:
    """
    Return the ids of all utterances that no other utterance replies to.

    The result does not depend on the order in which the corpus yields its
    utterances: a reply may be seen before or after the utterance it answers.

    :param c: the Corpus to inspect
    :return: set of leaf utterance ids
    """
    all_ids = set()
    replied_to = set()
    for utt in c.iter_utterances():
        all_ids.add(utt.id)
        # Record every parent, even one not encountered yet; marking a parent
        # only when it was already known made the result order-dependent.
        replied_to.add(utt.reply_to)
    return all_ids - replied_to
def summarize(self, corpus: Corpus, use_selector=True):
    """
    Tabulate the stored classifier prediction and score of the Corpus objects.

    :param corpus: the annotated Corpus
    :param use_selector: when True only objects accepted by `self.selector` are
        listed, otherwise every object of type `self.obj_type`
    :return: DataFrame indexed by object id with the columns `self.clf_feat_name`
        and `self.clf_prob_feat_name`, sorted by the latter in descending order
    """
    if use_selector:
        chosen_selector = self.selector
    else:
        chosen_selector = lambda _: True

    rows = [
        (obj.id, obj.meta[self.clf_feat_name], obj.meta[self.clf_prob_feat_name])
        for obj in corpus.iter_objs(self.obj_type, chosen_selector)
    ]
    table = pd.DataFrame(
        rows, columns=['id', self.clf_feat_name, self.clf_prob_feat_name])
    return table.set_index('id').sort_values(self.clf_prob_feat_name,
                                             ascending=False)
def fit(self, corpus: Corpus, y=None,
        selector: Callable[[CorpusComponent], bool] = lambda x: True):
    """
    Fit the Transformer's internal vectorizer on the texts of the selected Corpus objects.

    :param corpus: the target Corpus
    :param y: unused
    :param selector: a (lambda) function that takes a Corpus object and returns True or False
        (i.e. include / exclude). By default, the selector includes all objects of the
        specified type in the Corpus.
    :return: the fitted BoWTransformer
    """
    # Gather one text per selected object, then fit on the whole collection.
    docs = []
    for obj in corpus.iter_objs(self.obj_type, selector):
        docs.append(self.text_func(obj))

    self.vectorizer.fit(docs)
    return self
def _get_pos_neg_objects(self, corpus: Corpus, selector):
    """
    Split the selected objects into positively and negatively labelled lists.

    An object goes to the positive list when `self.pos_label_func` accepts it;
    otherwise it goes to the negative list when `self.neg_label_func` accepts it;
    objects accepted by neither are dropped.

    :param corpus: target Corpus
    :param selector: filter passed to `corpus.iter_objs`
    :return: list of positive objects, list of negative objects
    """
    pos_objects, neg_objects = [], []
    for obj in corpus.iter_objs(self.obj_type, selector):
        if self.pos_label_func(obj):
            pos_objects.append(obj)
            continue
        if self.neg_label_func(obj):
            neg_objects.append(obj)
    return pos_objects, neg_objects
# # The plots answer these questions: # - Do users on the whole coordinate more to admins or nonadmins? # - Do admins coordinate to other people more than nonadmins do? from convokit import Utterance, Corpus, Coordination, download import matplotlib.pyplot as plt import matplotlib.patches as mpatches import numpy as np # load corpus; split users by whether they are an admin # this means that if a user has spoken in the corpus as both an admin and # a non-admin, then we will split this user into two users, one for each of # these roles corpus = Corpus(filename=download("wiki-corpus")) split = ["is_admin"] # create coordination object coord = Coordination() coord.fit(corpus) # helper function to plot two coordination scores against each other as a chart, # on aggregate and by coordination marker # a is a tuple (speakers, targets) # b is a tuple (speakers, targets) def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"): # get scores by marker and on aggregate _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores) _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
#   one of the steps in the Conversations Gone Awry paper
#   (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
# For code reproducing the full results of the paper, see the example notebook in the
#   `conversations-gone-awry` example subdirectory.

import pandas as pd
from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# Politeness strategies are a hand-engineered feature set, so the transformer
#   is applied directly without any fitting step.
ps = PolitenessStrategies(verbose=100)
print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

# One row of strategy values per utterance, indexed by utterance id.
utterances = list(corpus.iter_utterances())
values = [utterance.meta["politeness_strategies"] for utterance in utterances]
idx = [utterance.id for utterance in utterances]

pd.DataFrame(values, index=idx).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
# The plots answer these questions: # - Do lawyers coordinate more to justices than the other way around? # - Do lawyers coordinate more to unfavorable or favorable justices? # - Do unfavorable justices coordinate to lawyers more than favorable justices, # or vice versa? from convokit import Utterance, Corpus, Coordination, download import matplotlib.pyplot as plt import matplotlib.patches as mpatches import numpy as np # load corpus; split users by case id and split the justices by whether they are # favorable to the current presenting side # this treats the same person across two different cases as two different users corpus = Corpus(filename=download("supreme-corpus")) split = ["case", "justice-is-favorable"] # create coordination object coord = Coordination() coord.fit(corpus) # helper function to plot two coordination scores against each other as a chart, # on aggregate and by coordination marker # a is a tuple (speakers, targets) # b is a tuple (speakers, targets) def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"): # get scores by marker and on aggregate _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores) _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)