def test_partial_load_start_idx_specified_only(self):
    """Reload a dumped corpus giving only a start index.

    A three-utterance corpus (two utterances and their users carry binary
    metadata) is dumped as 'test_corpus' under './' and reloaded with
    ``utterance_start_index=1``. The reloaded corpus must contain exactly
    two utterances, and utterances "1" and "2" must equal the originals.
    """
    alice_blob = bytearray([120, 3, 255, 0, 100])
    bob_blob = bytearray([110, 3, 255, 90])
    first_utt_blob = bytearray([99, 44, 33])
    second_utt_blob = bytearray([110, 200, 220, 28])

    source_utterances = [
        Utterance(
            id="0",
            text="hello world",
            user=User(name="alice", meta={'user_binary_data': alice_blob}),
            meta={'utt_binary_data': first_utt_blob},
        ),
        Utterance(
            id="1",
            text="my name is bob",
            user=User(name="bob", meta={'user_binary_data': bob_blob}),
            meta={'utt_binary_data': second_utt_blob},
        ),
        Utterance(id="2", text="this is a test", user=User(name="charlie")),
    ]
    original = Corpus(utterances=source_utterances)
    original.dump('test_corpus', './')

    reloaded = Corpus(filename="test_corpus", utterance_start_index=1)

    reloaded_utterances = list(reloaded.iter_utterances())
    self.assertEqual(len(reloaded_utterances), 2)
    for utt_id in ("1", "2"):
        self.assertEqual(original.get_utterance(utt_id),
                         reloaded.get_utterance(utt_id))
def test_partial_load_invalid_end_index(self):
    """Reload a dumped corpus with a negative end index.

    A three-utterance corpus is dumped as 'test_corpus' under './' and
    reloaded with ``utterance_end_index=-1``; the reloaded corpus is
    expected to contain no utterances at all.
    """
    alice_blob = bytearray([120, 3, 255, 0, 100])
    bob_blob = bytearray([110, 3, 255, 90])
    first_utt_blob = bytearray([99, 44, 33])
    second_utt_blob = bytearray([110, 200, 220, 28])

    source_utterances = [
        Utterance(
            id="0",
            text="hello world",
            speaker=Speaker(id="alice",
                            meta={'speaker_binary_data': alice_blob}),
            meta={'utt_binary_data': first_utt_blob},
        ),
        Utterance(
            id="1",
            text="my name is bob",
            speaker=Speaker(id="bob",
                            meta={'speaker_binary_data': bob_blob}),
            meta={'utt_binary_data': second_utt_blob},
        ),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie")),
    ]
    original = Corpus(utterances=source_utterances)
    original.dump('test_corpus', './')

    reloaded = Corpus(filename="test_corpus", utterance_end_index=-1)

    reloaded_utterances = list(reloaded.iter_utterances())
    self.assertEqual(len(reloaded_utterances), 0)
def create_utterances(conversations):
    """Create ConvoKit utterances for every conversation.

    Utterance ids are ``'u1'``, ``'u2'``, ... and are handed out in the
    order utterances are kept, across all conversations. The first kept
    utterance of a conversation is its root (``reply_to=None``); every
    later one replies to the utterance kept immediately before it.
    Turns spoken by 'UNKFEMALE', 'UNKMALE' or 'UNKMULTI' are dropped and
    do not consume an id.

    Speakers are looked up in the module-level ``corpus_speakers`` mapping.

    :param conversations: iterable of objects whose ``get_conversation()``
        yields ``(speaker, text, ...)`` sequences
    :return: list of the created Utterance objects, in creation order
    """
    skipped_speakers = ('UNKFEMALE', 'UNKMALE', 'UNKMULTI')
    utterance_corpus = {}
    ut_id = 1
    for conv in tqdm(conversations):
        # The id that the first kept utterance of this conversation receives.
        root = ut_id
        for ut in conv.get_conversation():
            sp = ut[0]
            if sp in skipped_speakers:
                continue
            utt_key = 'u' + str(ut_id)
            # Only the conversation root has no parent.
            reply_to = None if ut_id == root else 'u' + str(ut_id - 1)
            utterance_corpus[utt_key] = Utterance(id=utt_key,
                                                  speaker=corpus_speakers[sp],
                                                  text=ut[1],
                                                  root='u' + str(root),
                                                  reply_to=reply_to)
            ut_id += 1
    # Materialise the view so callers get an actual list.
    return list(utterance_corpus.values())
def test_dump_and_load_with_binary(self):
    """
    Round-trip a corpus whose speakers and utterances carry binary metadata.

    The corpus is dumped as 'test_corpus' under './' and loaded back; the
    metadata of speakers "alice" and "bob" and of utterances '0' and '1'
    must be equal before and after the round trip.
    """
    alice_blob = bytearray([120, 3, 255, 0, 100])
    bob_blob = bytearray([110, 3, 255, 90])
    first_utt_blob = bytearray([99, 44, 33])
    second_utt_blob = bytearray([110, 200, 220, 28])

    source_utterances = [
        Utterance(
            id="0",
            text="hello world",
            speaker=Speaker(id="alice",
                            meta={
                                'speaker_binary_data': alice_blob,
                                'index': 99
                            }),
            meta={'utt_binary_data': first_utt_blob},
        ),
        Utterance(
            id="1",
            text="my name is bob",
            speaker=Speaker(id="bob",
                            meta={'speaker_binary_data': bob_blob}),
            meta={'utt_binary_data': second_utt_blob},
        ),
        Utterance(id="2", text="this is a test",
                  speaker=Speaker(id="charlie")),
    ]
    original = Corpus(utterances=source_utterances)

    # Grab the in-memory speakers before anything is written to disk.
    speakers_before = {
        name: original.get_speaker(name) for name in ("alice", "bob")
    }

    original.dump('test_corpus', './')
    reloaded = Corpus(filename="test_corpus")

    speakers_after = {
        name: reloaded.get_speaker(name) for name in ("alice", "bob")
    }

    for name, utt_id in (("alice", '0'), ("bob", '1')):
        self.assertEqual(speakers_before[name].meta,
                         speakers_after[name].meta)
        self.assertEqual(original.get_utterance(utt_id).meta,
                         reloaded.get_utterance(utt_id).meta)
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """
    Helper function to convert data to Corpus format

    Every row becomes one single-utterance conversation: the utterance id
    and its conversation id are both the stringified value of `id_col`.

    Arguments:
        df {DataFrame} -- Actual data, in a pandas Dataframe
        id_col {str} -- name of the column that corresponds to utterances ids
        text_col {str} -- name of the column that stores texts of the utterances
        meta_cols {List[str]} -- set of columns that stores relevant metadata

    Returns:
        Corpus -- the converted corpus
    """
    # In this particular case, speaker, reply_to, and timestamp information
    # are all not applicable, so we either use a placeholder or leave None.
    generic_speaker = Speaker(id="speaker")
    timestamp_placeholder = "NOT_RECORDED"

    utterance_list = []
    # iterrows() is a generator with no length, so tqdm needs the total
    # spelled out to be able to show progress and an ETA.
    for _, row in tqdm(df.iterrows(), total=len(df)):
        metadata = {meta_col: row[meta_col] for meta_col in meta_cols}
        row_id = str(row[id_col])
        utterance_list.append(
            Utterance(id=row_id,
                      speaker=generic_speaker,
                      conversation_id=row_id,
                      reply_to=None,
                      timestamp=timestamp_placeholder,
                      text=row[text_col],
                      meta=metadata))

    return Corpus(utterances=utterance_list)
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    Each complete run of contiguous blocks becomes one Utterance whose id is
    the hash of the run's first block and whose text is the blocks' texts
    joined by newlines. The mapping from every constituent block hash to its
    utterance id is stored in the corpus metadata under
    "reverse_block_index".

    :param accum: the Intermediate to be converted
    :type accum: Intermediate
    :return: the Corpus generated from accum
    """
    users_by_name = {}
    finished_segments = set()
    segments_by_block = {}
    utt_id_by_block = {}

    # Pass 1: collect the space-joined hash strings of all complete segments.
    for block_hash, block in accum.blocks.items():
        if block.user not in users_by_name:
            users_by_name[block.user] = User(id=block.user)

        chain_segments = accum.segment_contiguous_blocks(block.reply_chain)
        # Every segment before the last one is already closed.
        finished_segments.update(
            helpers.string_of_seg(seg) for seg in chain_segments[:-1])

        final_segment = chain_segments[-1]
        assert (block_hash == final_segment[-1])
        # The last segment is complete only if nothing follows its last block.
        if not accum.blocks[final_segment[-1]].is_followed:
            finished_segments.add(helpers.string_of_seg(final_segment))

        segments_by_block[block_hash] = chain_segments

    # Pass 2: turn each complete segment into an Utterance.
    built_utterances = []
    for segment_key in finished_segments:
        block_hashes = segment_key.split(" ")
        head_hash = block_hashes[0]
        owning_segments = segments_by_block[head_hash]
        head_block = accum.blocks[head_hash]

        author = users_by_name[head_block.user]
        root_id = owning_segments[0][0]
        parent_id = _find_reply_to_from_segment(owning_segments)
        joined_text = "\n".join(accum.blocks[h].text for h in block_hashes)

        for member_hash in block_hashes:
            utt_id_by_block[member_hash] = head_hash

        built_utterances.append(
            Utterance(id=head_hash,
                      user=author,
                      root=root_id,
                      reply_to=parent_id,
                      timestamp=head_block.timestamp,
                      text=joined_text,
                      meta={"constituent_blocks": block_hashes}))

    corpus = Corpus(utterances=built_utterances)
    corpus.meta["reverse_block_index"] = utt_id_by_block
    return corpus
def format_as_corpus(self, conv):
    """Wrap one conversation's utterance records in a Corpus.

    :param conv: iterable of dict-like records with the keys 'user',
        'utt_id', 'text', 'conv_id' and 'indent_depth'
    :return: a Corpus with one Utterance per record; each utterance stores
        the record's 'indent_depth' under the 'reply_depth' metadata key
    """
    distinct_names = np.unique([record['user'] for record in conv])
    speaker_for = {name: Speaker(name=name) for name in distinct_names}

    built = []
    for record in conv:
        utterance = Utterance(id=record['utt_id'],
                              user=speaker_for[record['user']],
                              text=record['text'],
                              root=str(record['conv_id']))
        utterance.add_meta('reply_depth', record['indent_depth'])
        built.append(utterance)

    return Corpus(utterances=built)
def test_dump_and_load_with_binary(self):
    """
    Round-trip a corpus whose users and utterances carry binary metadata.

    The corpus is dumped as 'test_corpus' under './' and loaded back; for
    utterances 0 and 1 both the user's metadata and the utterance's own
    metadata must be equal before and after the round trip.
    """
    alice_blob = bytearray([120, 3, 255, 0, 100])
    bob_blob = bytearray([110, 3, 255, 90])
    first_utt_blob = bytearray([99, 44, 33])
    second_utt_blob = bytearray([110, 200, 220, 28])

    source_utterances = [
        Utterance(
            id=0,
            text="hello world",
            user=User(name="alice", meta={'user_binary_data': alice_blob}),
            meta={'utt_binary_data': first_utt_blob},
        ),
        Utterance(
            id=1,
            text="my name is bob",
            user=User(name="bob", meta={'user_binary_data': bob_blob}),
            meta={'utt_binary_data': second_utt_blob},
        ),
        Utterance(id=2, text="this is a test", user=User(name="charlie")),
    ]
    original = Corpus(utterances=source_utterances)

    checked_ids = (0, 1)
    # Grab the in-memory users before anything is written to disk.
    users_before = {k: original.utterances[k].user for k in checked_ids}

    original.dump('test_corpus', './')
    reloaded = Corpus(filename="test_corpus")

    users_after = {k: reloaded.utterances[k].user for k in checked_ids}

    for k in checked_ids:
        self.assertEqual(users_before[k].meta, users_after[k].meta)
        self.assertEqual(original.utterances[k].meta,
                         reloaded.utterances[k].meta)
def build_manual_corpus() -> Corpus:
    """Build a Corpus from the hand-written YAML files under 'data/manual'.

    Every file found (recursively) must contain a top-level 'conversations'
    list whose entries start with a prompt followed by a response. Each
    pair becomes a two-utterance conversation: the prompt is spoken by
    speaker '_analysis' and is the root, the response is spoken by speaker
    '0' and replies to it. The whole set of pairs is emitted ten times,
    with fresh ids ("M0", "M1", ...) on every repetition.

    :return: the Corpus holding all generated utterances
    """
    print('Building corpus from manually created yml files...')

    manual_files = []
    for root, _dirs, files in os.walk('data/manual'):
        manual_files.extend(os.path.join(root, f) for f in files)

    conversations = []
    for path in manual_files:
        # Binary mode lets PyYAML detect the encoding itself.
        with open(path, 'rb') as f:
            # NOTE(review): this was yaml.load() without a Loader, which is
            # deprecated since PyYAML 5.1 and a TypeError on PyYAML 6, and
            # can construct arbitrary Python objects. safe_load only builds
            # plain YAML types; confirm no file relies on Python-specific tags.
            cs = yaml.safe_load(f)['conversations']
        conversations.extend((c[0], c[1]) for c in cs)

    speakers = {'0': Speaker(id='0'), '_analysis': Speaker(id='_analysis')}

    utterances = []
    i = 0
    # NOTE(review): the pairs are deliberately repeated ten times —
    # presumably to up-weight the manual data; confirm.
    for _ in range(10):
        for prompt, response in conversations:
            id_1 = "M" + str(i)
            id_2 = "M" + str(i + 1)
            utterances.append(
                Utterance(id=id_1,
                          text=prompt,
                          speaker=speakers["_analysis"],
                          root=id_1,
                          reply_to=None))
            utterances.append(
                Utterance(id=id_2,
                          text=response,
                          speaker=speakers["0"],
                          root=id_1,
                          reply_to=id_1))
            i += 2

    return Corpus(utterances=utterances)
def transform_utterance(self, utt):
    """
    Computes representations and statistics for a single utterance, which can be a ConvoKit Utterance or a string.
    Will return an Utterance object and write all of these characterizations (including vectors) to the utterance's
    metadata; attribute names are prefixed with the `output_prefix` constructor argument.

    A plain string is first wrapped in a new Utterance with a default Speaker. The utterance then goes through
    `text_pipe`, `tfidf_model` and `dualmodel`, in that order.

    :param utt: Utterance or string
    :return: the utterance, with per-utterance representation, range and cluster assignments.
    """
    target = Utterance(text=utt, speaker=Speaker()) if isinstance(utt, str) else utt

    # The first two stages are run for their effect on the utterance;
    # only the last stage's return value is handed back.
    self.text_pipe.transform_utterance(target)
    self.tfidf_model.transform_utterance(target)
    result = self.dualmodel.transform_utterance(target)
    return result
def reconstruct_corpus(dataset):
    """Build a Corpus with one Utterance per record of `dataset`.

    Each record supplies the utterance id ('revision_id'), the author
    ('event_user_id') and the text ('event_comment\\n'). Records whose
    'event_user_id' is None are attributed to a placeholder user stored
    under the key 'none'.

    :param dataset: re-iterable collection of dict-like records (it is
        traversed twice)
    :return: the Corpus built from those records
    """
    # None is left out of np.unique: it cannot be ordered against real ids.
    known_ids = [
        utt['event_user_id'] for utt in dataset
        if utt['event_user_id'] is not None
    ]
    users_dict = {user: User(name=user) for user in np.unique(known_ids)}

    utterances = []
    for utt in tqdm(dataset):
        user_id = utt['event_user_id']
        if user_id is None:
            # Create the placeholder lazily; a real user that happens to be
            # called 'none' is reused, which is what the lookup did before.
            if 'none' not in users_dict:
                users_dict['none'] = User(name='none')
            user = users_dict['none']
        else:
            user = users_dict[user_id]
        # NOTE(review): the key really ends in a newline — presumably the
        # unstripped header of the last column of the input file; confirm.
        utterances.append(
            Utterance(id=utt['revision_id'],
                      user=user,
                      text=utt['event_comment\n']))

    corpus = Corpus(utterances=utterances)
    return corpus
def build_imessage_corpus() -> Corpus:
    """Build a Corpus from the local iMessage database.

    Reads the ``handle``, ``chat``, ``message`` and ``chat_message_join``
    tables of ``~/Library/Messages/chat.db``. Every handle becomes a
    Speaker keyed by its stringified ROWID; messages with ``is_from_me``
    set are attributed to the extra speaker ``'0'``. Within a chat the
    messages are ordered by ``date`` and chained through ``reply_to``; a
    gap of more than 3.6e12 date units between two consecutive messages
    starts a new conversation (new root, no ``reply_to``).

    Message rows with any missing value after the join (for instance no
    chat) are dropped.

    :return: a Corpus with one Utterance per remaining (chat, message) row
    """
    print('Building corpus from iMessages...')

    conn = sqlite3.connect(os.path.expanduser('~/Library/Messages/chat.db'))
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()

        # Handles (AKA Speakers)
        cur.execute("select ROWID as handle_id, id as phone_number from handle")
        handles = [dict(x) for x in cur.fetchall()]

        chats = pd.read_sql_query("select * from chat", conn)
        messages = pd.read_sql_query("select * from message", conn)
        chat_message_joins = pd.read_sql_query(
            "select * from chat_message_join", conn)
    finally:
        # Everything needed is in memory now; do not leak the connection.
        conn.close()

    speakers = {
        str(h['handle_id']): Speaker(id=str(h['handle_id']), meta=h)
        for h in handles
    }
    # Speaker '0' stands for the owner of the database (is_from_me rows).
    speakers.update({
        '0': Speaker(id='0', meta={'phone_number': '+12155889243'})
    })

    # Chats
    chats.rename(columns={
        'ROWID': 'chat_id',
        'chat_identifier': 'chat_name'
    },
                 inplace=True)
    chat_cols = list(chats)
    chats[chat_cols] = chats[chat_cols].astype(str)

    # Messages
    messages.rename(columns={'ROWID': 'message_id'}, inplace=True)
    messages = messages[[
        'message_id', 'text', 'handle_id', 'date', 'is_from_me'
    ]]
    messages['sender_id'] = messages.apply(
        lambda r: r['handle_id'] if r['is_from_me'] == 0 else '0', axis=1)

    # Add chat data to messages
    messages = pd.merge(messages,
                        chat_message_joins[['chat_id', 'message_id']],
                        on='message_id',
                        how='left').dropna()
    # The left join turns chat_id into floats; go through int so the
    # string form matches the stringified chat ids above.
    messages['chat_id'] = messages['chat_id'].astype(int)
    cols = list(messages)
    messages[cols] = messages[cols].astype(str)

    # Make a new conversation if more than an hour has passed between
    # messages (assumes `date` is in nanoseconds — TODO confirm).
    new_conversation_gap = 3.6e12

    utterances = []
    for _, chat in chats.iterrows():
        # NOTE(review): `date` is a string column at this point, so this
        # sort is lexicographic; that matches numeric order only while all
        # dates have the same number of digits — confirm.
        chat_messages = messages.loc[messages['chat_id'] ==
                                     chat['chat_id']].sort_values(by=['date'])
        num_messages = len(chat_messages.index)
        if num_messages == 0:
            print("Warning: chat '%s' has no messages" % chat['chat_name'])
            continue

        root_msg = chat_messages.iloc[0]
        prev_msg = None
        for i in range(num_messages):
            msg = chat_messages.iloc[i]
            reply_to = prev_msg['message_id'] if prev_msg is not None else None

            if (prev_msg is not None and
                    int(msg['date']) - int(prev_msg['date']) >
                    new_conversation_gap):
                root_msg = msg
                reply_to = None

            utterances.append(
                Utterance(id=msg['message_id'],
                          text=msg['text'],
                          speaker=speakers[msg['sender_id']],
                          root=root_msg['message_id'],
                          reply_to=reply_to,
                          meta=msg))
            prev_msg = msg

    return Corpus(utterances=utterances)
def rough_convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a rougher approximation of a Corpus from an Intermediate.

    Does not worry about reply_to structure, and instead sorts replies by the
    chronological order in which utterances are posted to discussions: within
    each root, the root utterance comes first and every other utterance
    replies to the one added just before it.

    Blocks that cannot be segmented are logged and skipped, as are roots
    whose own utterance cannot be found and utterances whose root is None.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate
    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    block_hashes_to_segments = {}

    # Pass 1: collect the space-joined hash strings of all complete segments.
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            # Raised explicitly (not asserted) so the check still skips the
            # block when Python runs with -O.
            if block_hash != segments[-1][-1]:
                raise ValueError(
                    'reply chain does not end in its own block')
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                complete_utterances.add(helpers.string_of_seg(seg))
            if block.is_header or not accum.blocks[segments[-1]
                                                   [-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            # Best effort: one bad block must not abort the whole conversion.
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    # Pass 2: build the utterances and group them by root.
    children_of_root = {}
    for utt in complete_utterances:
        block_hashes = utt.split(" ")
        # NOTE(review): the result is unused, but the lookup raises KeyError
        # when the head block was skipped above. Kept so behaviour is
        # unchanged; decide whether such utterances should be skipped.
        block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]
        u_root = accum.find_ultimate_hash(first_block.root_hash)
        last_revision = first_block.revision_ids[-1]
        this_utterance = Utterance(
            id=block_hashes[0],
            user=users[first_block.user],
            root=u_root,
            reply_to=None,
            timestamp=first_block.timestamp,
            text="\n".join([accum.blocks[h].text for h in block_hashes]),
            meta={
                "last_revision":
                    last_revision if last_revision != "unknown" else 0
            })
        children_of_root.setdefault(u_root, []).append(this_utterance)

    # Pass 3: within each root, chain the utterances chronologically.
    utterances = []
    for root, utt_list in children_of_root.items():
        if root is None:
            continue
        utt_list.sort(key=lambda x: x.timestamp)

        ind_of_root = next(
            (k for k, u in enumerate(utt_list) if u.id == root), None)
        if ind_of_root is None:
            logging.warning(
                'Skipping section in conversion to corpus: could not find section header for root %s',
                root)
            continue
        # The root utterance always leads its conversation.
        if ind_of_root > 0:
            utt_list.insert(0, utt_list.pop(ind_of_root))

        last_added = utt_list[0]
        utterances.append(last_added)
        added = {last_added.id}
        for candidate in utt_list[1:]:
            # An id can occur more than once; only its first occurrence is kept.
            if candidate.id in added:
                continue
            candidate.reply_to = last_added.id
            added.add(candidate.id)
            utterances.append(candidate)
            last_added = candidate

    corpus = Corpus(utterances=utterances)
    return corpus
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    Each complete run of contiguous blocks becomes one Utterance whose id is
    the hash of the run's first block and whose text is the blocks' texts
    joined by newlines. Blocks that cannot be segmented are logged and
    skipped. The mapping from every constituent block hash to its utterance
    id is stored in the corpus metadata under "reverse_block_index".

    :param accum: the Intermediate to be converted
    :type accum: Intermediate
    :return: the Corpus generated from accum
    """
    users = {}
    utterances = []
    complete_utterances = set()
    block_hashes_to_segments = {}
    block_hashes_to_utt_ids = {}

    # Pass 1: collect the space-joined hash strings of all complete segments.
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            # Raised explicitly (not asserted) so the check still skips the
            # block when Python runs with -O.
            if block_hash != segments[-1][-1]:
                raise ValueError(
                    'reply chain does not end in its own block')
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                complete_utterances.add(helpers.string_of_seg(seg))
            if block.is_header or not accum.blocks[segments[-1]
                                                   [-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            # Best effort: one bad block must not abort the whole conversion.
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    # Pass 2: turn each complete segment into an Utterance.
    for utt in complete_utterances:
        block_hashes = utt.split(" ")
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]
        u_id = block_hashes[0]
        last_revision = first_block.revision_ids[-1]
        u_meta = {
            "constituent_blocks": block_hashes,
            "last_revision":
                last_revision if last_revision != "unknown" else 0,
        }
        this_utterance = Utterance(
            id=u_id,
            user=users[first_block.user],
            root=accum.find_ultimate_hash(first_block.root_hash),
            reply_to=_find_reply_to_from_segment(belongs_to_segment),
            timestamp=first_block.timestamp,
            text="\n".join([accum.blocks[h].text for h in block_hashes]),
            meta=u_meta)
        for each_hash in block_hashes:
            block_hashes_to_utt_ids[each_hash] = u_id
        utterances.append(this_utterance)

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = block_hashes_to_utt_ids
    return corpus