Пример #1
0
    def test_partial_load_start_idx_specified_only(self):
        """Reload a dumped corpus giving only ``utterance_start_index``.

        A three-utterance corpus (ids "0", "1", "2") is dumped under the name
        'test_corpus' and read back with ``utterance_start_index=1``. The
        reloaded corpus must hold two utterances, and utterances "1" and "2"
        must compare equal to their originals.
        """
        # Binary payloads stored as user / utterance metadata.
        alice_blob = bytearray([120, 3, 255, 0, 100])
        bob_blob = bytearray([110, 3, 255, 90])
        first_utt_blob = bytearray([99, 44, 33])
        second_utt_blob = bytearray([110, 200, 220, 28])

        alice = User(name="alice", meta={'user_binary_data': alice_blob})
        bob = User(name="bob", meta={'user_binary_data': bob_blob})
        charlie = User(name="charlie")

        utterances = [
            Utterance(id="0", text="hello world", user=alice,
                      meta={'utt_binary_data': first_utt_blob}),
            Utterance(id="1", text="my name is bob", user=bob,
                      meta={'utt_binary_data': second_utt_blob}),
            Utterance(id="2", text="this is a test", user=charlie),
        ]
        original = Corpus(utterances=utterances)
        original.dump('test_corpus', './')

        reloaded = Corpus(filename="test_corpus", utterance_start_index=1)

        self.assertEqual(len(list(reloaded.iter_utterances())), 2)
        for utt_id in ("1", "2"):
            self.assertEqual(original.get_utterance(utt_id),
                             reloaded.get_utterance(utt_id))
    def test_partial_load_invalid_end_index(self):
        """Reload a dumped corpus with a negative ``utterance_end_index``.

        A three-utterance corpus is dumped under the name 'test_corpus' and
        read back with ``utterance_end_index=-1``; the reloaded corpus is
        expected to contain no utterances at all.
        """
        # Binary payloads stored as speaker / utterance metadata.
        alice_blob = bytearray([120, 3, 255, 0, 100])
        bob_blob = bytearray([110, 3, 255, 90])
        first_utt_blob = bytearray([99, 44, 33])
        second_utt_blob = bytearray([110, 200, 220, 28])

        alice = Speaker(id="alice", meta={'speaker_binary_data': alice_blob})
        bob = Speaker(id="bob", meta={'speaker_binary_data': bob_blob})
        charlie = Speaker(id="charlie")

        utterances = [
            Utterance(id="0", text="hello world", speaker=alice,
                      meta={'utt_binary_data': first_utt_blob}),
            Utterance(id="1", text="my name is bob", speaker=bob,
                      meta={'utt_binary_data': second_utt_blob}),
            Utterance(id="2", text="this is a test", speaker=charlie),
        ]
        original = Corpus(utterances=utterances)
        original.dump('test_corpus', './')

        reloaded = Corpus(filename="test_corpus", utterance_end_index=-1)

        loaded_utterances = list(reloaded.iter_utterances())
        self.assertEqual(len(loaded_utterances), 0)
def create_utterances(conversations):
    """Build one ConvoKit ``Utterance`` per usable turn of every conversation.

    Turns are read from ``conv.get_conversation()``; each turn is indexed as
    ``turn[0]`` (speaker key) and ``turn[1]`` (text). Turns whose speaker key
    is 'UNKFEMALE', 'UNKMALE' or 'UNKMULTI' are dropped. Kept turns receive
    consecutive ids 'u1', 'u2', ... across all conversations. The first kept
    turn of a conversation is its root (``reply_to=None``); every later kept
    turn replies to the previously kept one.

    Relies on the module-level ``corpus_speakers`` mapping, defined outside
    this function, to turn a speaker key into the value passed as
    ``speaker=``.

    :param conversations: iterable of objects exposing ``get_conversation()``
    :return: list of the created utterances, in creation order
    """
    skipped_speakers = ('UNKFEMALE', 'UNKMALE', 'UNKMULTI')

    utterances = []
    ut_id = 1
    for conv in tqdm(conversations):
        # Id that the first kept utterance of this conversation will get.
        root = ut_id
        for ut in conv.get_conversation():
            sp = ut[0]
            if sp in skipped_speakers:
                continue

            # The root has no parent; any other utterance replies to the
            # previously kept one (ids only advance for kept turns).
            parent = None if ut_id == root else 'u' + str(ut_id - 1)
            utterances.append(
                Utterance(id='u' + str(ut_id),
                          speaker=corpus_speakers[sp],
                          text=ut[1],
                          root='u' + str(root),
                          reply_to=parent))
            ut_id += 1

    return utterances
    def test_dump_and_load_with_binary(self):
        """
        Round-trip a corpus whose speakers and utterances carry binary metadata.

        The corpus is dumped and loaded back; the metadata of speakers "alice"
        and "bob" and of utterances '0' and '1' must be equal before and after.
        """
        alice_meta = {
            'speaker_binary_data': bytearray([120, 3, 255, 0, 100]),
            'index': 99,
        }
        bob_meta = {'speaker_binary_data': bytearray([110, 3, 255, 90])}

        utterances = [
            Utterance(id="0",
                      text="hello world",
                      speaker=Speaker(id="alice", meta=alice_meta),
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id="1",
                      text="my name is bob",
                      speaker=Speaker(id="bob", meta=bob_meta),
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id="2",
                      text="this is a test",
                      speaker=Speaker(id="charlie")),
        ]
        corpus1 = Corpus(utterances=utterances)

        speaker_ids = ("alice", "bob")
        # Fetch the speakers of the in-memory corpus before dumping it.
        speakers_before = [corpus1.get_speaker(sid) for sid in speaker_ids]

        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus")

        speakers_after = [corpus2.get_speaker(sid) for sid in speaker_ids]

        # Speaker "alice" pairs with utterance '0', "bob" with utterance '1'.
        for utt_id, before, after in zip(('0', '1'), speakers_before,
                                         speakers_after):
            self.assertEqual(before.meta, after.meta)
            self.assertEqual(corpus1.get_utterance(utt_id).meta,
                             corpus2.get_utterance(utt_id).meta)
def convert_df_to_corpus(df: DataFrame, id_col: str, text_col: str,
                         meta_cols: List[str]) -> Corpus:
    """Turn every row of a DataFrame into an utterance and wrap them in a Corpus.

    Each row yields one utterance whose id and conversation id are both the
    stringified ``id_col`` value, so every utterance forms its own
    conversation. Speaker, reply and timestamp information is not available
    here: all utterances share one placeholder speaker, ``reply_to`` is None
    and the timestamp is the string "NOT_RECORDED".

    Arguments:
        df {DataFrame} -- the data, one utterance per row
        id_col {str} -- name of the column holding the utterance ids
        text_col {str} -- name of the column holding the utterance texts
        meta_cols {List[str]} -- columns copied into each utterance's metadata

    Returns:
        Corpus -- the converted corpus
    """
    placeholder_speaker = Speaker(id="speaker")
    placeholder_time = "NOT_RECORDED"

    utterances = []
    for _, row in tqdm(df.iterrows()):
        # Metadata is a plain column-name -> cell-value mapping.
        metadata = {col: row[col] for col in meta_cols}
        utt_id = str(row[id_col])
        utterances.append(
            Utterance(id=utt_id,
                      speaker=placeholder_speaker,
                      conversation_id=utt_id,
                      reply_to=None,
                      timestamp=placeholder_time,
                      text=row[text_col],
                      meta=metadata))

    return Corpus(utterances=utterances)
Пример #6
0
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    Works in two passes. The first pass walks every block of ``accum``,
    registers its user and collects the string form of each complete segment
    of its reply chain. The second pass splits each of those strings on
    spaces to recover the block hashes and builds one Utterance per string,
    identified by its first block hash. The corpus metadata entry
    ``reverse_block_index`` maps every constituent block hash to the id of
    the utterance that contains it.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    segments_by_hash = {}
    utt_id_by_hash = {}

    for block_hash, block in accum.blocks.items():
        if block.user not in users:
            users[block.user] = User(id=block.user)

        segments = accum.segment_contiguous_blocks(block.reply_chain)
        # All segments except the last are recorded as complete utterances.
        complete_utterances.update(
            helpers.string_of_seg(seg) for seg in segments[:-1])

        last_segment = segments[-1]
        assert (block_hash == last_segment[-1])
        # The last segment is only recorded when its final block is not
        # flagged as followed.
        if not accum.blocks[last_segment[-1]].is_followed:
            complete_utterances.add(helpers.string_of_seg(last_segment))
        segments_by_hash[block_hash] = segments

    utterances = []
    for utt in complete_utterances:
        block_hashes = utt.split(" ")
        head_hash = block_hashes[0]
        head_segments = segments_by_hash[head_hash]
        first_block = accum.blocks[head_hash]

        utterances.append(
            Utterance(id=head_hash,
                      user=users[first_block.user],
                      root=head_segments[0][0],
                      reply_to=_find_reply_to_from_segment(head_segments),
                      timestamp=first_block.timestamp,
                      text="\n".join(
                          accum.blocks[h].text for h in block_hashes),
                      meta={"constituent_blocks": block_hashes}))

        # Every constituent block points back at the utterance holding it.
        utt_id_by_hash.update(dict.fromkeys(block_hashes, head_hash))

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = utt_id_by_hash

    return corpus
Пример #7
0
    def format_as_corpus(self, conv):
        """Wrap the entries of one conversation in a Corpus.

        ``conv`` is iterated twice (once to collect the distinct users, once
        to build the utterances). Each entry is read through the keys
        'user', 'utt_id', 'text', 'conv_id' and 'indent_depth'; the latter is
        stored as the utterance metadata field 'reply_depth'.

        :param conv: iterable of dict-like utterance records
        :return: a Corpus with one utterance per record
        """
        distinct_names = np.unique([entry['user'] for entry in conv])
        speaker_by_name = {}
        for name in distinct_names:
            speaker_by_name[name] = Speaker(name=name)

        def _to_utterance(entry):
            # One record -> one utterance, rooted at the stringified conv id.
            author = speaker_by_name[entry['user']]
            utterance = Utterance(id=entry['utt_id'],
                                  user=author,
                                  text=entry['text'],
                                  root=str(entry['conv_id']))
            utterance.add_meta('reply_depth', entry['indent_depth'])
            return utterance

        return Corpus(utterances=[_to_utterance(entry) for entry in conv])
Пример #8
0
    def test_dump_and_load_with_binary(self):
        """
        Round-trip a corpus whose users and utterances carry binary metadata.

        The corpus is dumped and loaded back; the metadata of the users of
        utterances 0 and 1, and of those utterances themselves, must be equal
        before and after.
        """
        alice = User(name="alice",
                     meta={'user_binary_data':
                           bytearray([120, 3, 255, 0, 100])})
        bob = User(name="bob",
                   meta={'user_binary_data': bytearray([110, 3, 255, 90])})

        utterances = [
            Utterance(id=0,
                      text="hello world",
                      user=alice,
                      meta={'utt_binary_data': bytearray([99, 44, 33])}),
            Utterance(id=1,
                      text="my name is bob",
                      user=bob,
                      meta={'utt_binary_data': bytearray([110, 200, 220, 28])}),
            Utterance(id=2, text="this is a test", user=User(name="charlie")),
        ]
        corpus1 = Corpus(utterances=utterances)

        checked_ids = (0, 1)
        # Fetch the users of the in-memory corpus before dumping it.
        users_before = [corpus1.utterances[i].user for i in checked_ids]

        corpus1.dump('test_corpus', './')
        corpus2 = Corpus(filename="test_corpus")

        users_after = [corpus2.utterances[i].user for i in checked_ids]

        for utt_id, before, after in zip(checked_ids, users_before,
                                         users_after):
            self.assertEqual(before.meta, after.meta)
            self.assertEqual(corpus1.utterances[utt_id].meta,
                             corpus2.utterances[utt_id].meta)
Пример #9
0
def build_manual_corpus() -> Corpus:
    """Build a Corpus from the hand-written YAML files under ``data/manual``.

    Every file found (recursively) must contain a top-level ``conversations``
    list whose entries hold a prompt at index 0 and a response at index 1.
    Each pair becomes two utterances: the prompt, spoken by the ``_analysis``
    speaker and acting as conversation root, and the response, spoken by
    speaker ``0`` in reply to it. The whole set of pairs is emitted ten
    times, each copy with fresh ids ``M0``, ``M1``, ...

    :return: the Corpus holding all generated utterances
    """
    print('Building corpus from manually created yml files...')

    manual_files = []
    for root, _dirs, files in os.walk('data/manual'):
        manual_files.extend(os.path.join(root, name) for name in files)

    conversations = []
    for path in manual_files:
        with open(path) as f:
            # yaml.load() without an explicit Loader raises TypeError on
            # PyYAML >= 6 and can build arbitrary objects on older releases.
            # safe_load handles plain lists/strings and is deliberately used
            # here instead.
            cs = yaml.safe_load(f)['conversations']
        conversations.extend((c[0], c[1]) for c in cs)

    speakers = {'0': Speaker(id='0'), '_analysis': Speaker(id='_analysis')}

    utterances = []
    i = 0
    # NOTE(review): the pairs are repeated ten times, presumably to up-weight
    # the manual data relative to other sources -- confirm.
    for _ in range(10):
        for prompt, response in conversations:
            prompt_id = "M" + str(i)
            response_id = "M" + str(i + 1)
            utterances.append(
                Utterance(id=prompt_id,
                          text=prompt,
                          speaker=speakers["_analysis"],
                          root=prompt_id,
                          reply_to=None))
            utterances.append(
                Utterance(id=response_id,
                          text=response,
                          speaker=speakers["0"],
                          root=prompt_id,
                          reply_to=prompt_id))
            i += 2

    return Corpus(utterances=utterances)
    def transform_utterance(self, utt):
        """
        Computes representations and statistics for a single utterance, given either as a ConvoKit Utterance or as a string.
        A string is first wrapped in a new Utterance with a default Speaker. The utterance is then passed, in order, through
        `text_pipe`, `tfidf_model` and `dualmodel`; the value returned by `dualmodel.transform_utterance` is returned.

        :param utt: Utterance or string
        :return: whatever `dualmodel.transform_utterance` returns for the (possibly wrapped) utterance.
        """
        target = utt
        if isinstance(target, str):
            target = Utterance(text=target, speaker=Speaker())

        # The first two stages are run for their effect on the utterance;
        # only the final stage's return value is handed back.
        for stage_name in ("text_pipe", "tfidf_model"):
            getattr(self, stage_name).transform_utterance(target)
        return self.dualmodel.transform_utterance(target)
def reconstruct_corpus(dataset):
    """Build a Corpus with one utterance per record of ``dataset``.

    Each record is read through the keys 'event_user_id', 'revision_id' and
    'event_comment\\n'. One User is created per distinct non-None user id.
    Records whose user id is None are attributed to a user stored under the
    key 'none'; that user is created as a placeholder unless the data already
    contains a user with that exact id.

    ``dataset`` is iterated twice, so it must be re-iterable.

    :param dataset: re-iterable collection of dict-like revision records
    :return: the Corpus built from the records
    """
    user_ids = [utt['event_user_id'] for utt in dataset]
    has_missing_user = any(user_id is None for user_id in user_ids)

    # None is filtered out before np.unique: sorting a mix of None and real
    # ids fails on Python 3.
    known_ids = np.unique(
        [user_id for user_id in user_ids if user_id is not None])
    users_dict = {user: User(name=user) for user in known_ids}

    # Guarantee the fallback target exists whenever it will be looked up.
    if has_missing_user and 'none' not in users_dict:
        users_dict['none'] = User(name='none')

    utterances = []
    for utt in tqdm(dataset):
        user_id = utt['event_user_id']
        user = users_dict['none' if user_id is None else user_id]
        # NOTE(review): the trailing newline is part of the key as used
        # here -- presumably a header artifact of the source data; confirm.
        utterances.append(
            Utterance(id=utt['revision_id'],
                      user=user,
                      text=utt['event_comment\n']))

    return Corpus(utterances=utterances)
Пример #12
0
def build_imessage_corpus() -> Corpus:
    """Build a Corpus from the local Messages database.

    Reads the ``handle``, ``chat``, ``message`` and ``chat_message_join``
    tables of ``~/Library/Messages/chat.db`` and closes the connection as
    soon as they are loaded. Every handle becomes a Speaker keyed by its
    stringified ``handle_id``; messages whose ``is_from_me`` is not 0 are
    attributed to the extra speaker '0'. Message rows with a missing value in
    any selected column (including messages not joined to a chat) are
    dropped.

    Within each chat, messages are sorted by ``date`` and chained through
    ``reply_to``. When the ``date`` difference to the previous message
    exceeds 3.6e12, the message starts a new conversation (it becomes the
    root and has no ``reply_to``).

    :return: the Corpus holding one utterance per retained message
    """
    print('Building corpus from iMessages...')
    conn = sqlite3.connect(os.path.expanduser('~/Library/Messages/chat.db'))
    try:
        conn.row_factory = sqlite3.Row
        cur = conn.cursor()

        # Handles (AKA Speakers)
        cur.execute(
            "select ROWID as handle_id, id as phone_number from handle")
        handles = [dict(x) for x in cur.fetchall()]

        chats = pd.read_sql_query("select * from chat", conn)
        messages = pd.read_sql_query("select * from message", conn)
        chat_message_joins = pd.read_sql_query(
            "select * from chat_message_join", conn)
    finally:
        # Everything needed is in memory now; release the database handle
        # even if one of the reads failed.
        conn.close()

    speakers = {
        str(h['handle_id']): Speaker(id=str(h['handle_id']), meta=h)
        for h in handles
    }
    # Speaker '0' stands for the owner of the database (outgoing messages).
    speakers['0'] = Speaker(id='0', meta={'phone_number': '+12155889243'})

    # Chats
    chats.rename(columns={
        'ROWID': 'chat_id',
        'chat_identifier': 'chat_name'
    },
                 inplace=True)
    chat_cols = list(chats)
    chats[chat_cols] = chats[chat_cols].astype(str)

    # Messages
    messages.rename(columns={'ROWID': 'message_id'}, inplace=True)
    messages = messages[[
        'message_id', 'text', 'handle_id', 'date', 'is_from_me'
    ]]
    messages['sender_id'] = messages.apply(lambda r: r['handle_id']
                                           if r['is_from_me'] == 0 else '0',
                                           axis=1)

    # Add chat data to messages
    messages = pd.merge(messages,
                        chat_message_joins[['chat_id', 'message_id']],
                        on='message_id',
                        how='left').dropna()
    messages['chat_id'] = messages['chat_id'].astype(int)
    cols = list(messages)
    # From here on every column (ids, dates, ...) is a string.
    messages[cols] = messages[cols].astype(str)

    utterances = []
    for _, chat in chats.iterrows():
        # NOTE(review): 'date' is a string column at this point, so this sort
        # is lexicographic; it matches numeric order only while all dates
        # have the same number of digits -- confirm.
        chat_messages = messages.loc[messages['chat_id'] ==
                                     chat['chat_id']].sort_values(by=['date'])

        if len(chat_messages.index) == 0:
            print("Warning: chat '%s' has no messages" % chat['chat_name'])
            continue

        root_msg = chat_messages.iloc[0]
        prev_msg = None
        for _, msg in chat_messages.iterrows():
            reply_to = None if prev_msg is None else prev_msg['message_id']

            # Make a new conversation if more than an hour has passed between
            # messages (3.6e12 is one hour if `date` counts nanoseconds --
            # TODO confirm the unit).
            if prev_msg is not None and int(msg['date']) - int(
                    prev_msg['date']) > 3.6e12:
                root_msg = msg
                reply_to = None

            utterances.append(
                Utterance(id=msg['message_id'],
                          text=msg['text'],
                          speaker=speakers[msg['sender_id']],
                          root=root_msg['message_id'],
                          reply_to=reply_to,
                          meta=msg))
            prev_msg = msg

    return Corpus(utterances=utterances)
Пример #13
0
def rough_convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a rougher approximation of a Corpus from an Intermediate.
    Does not worry about reply_to structure, and instead sorts replies by the
    chronological order in which utterances are posted to discussions.

    Works in three steps: (1) collect the string form of every complete
    segment, skipping (with a warning) any block that raises; (2) build one
    Utterance per collected string and group the utterances by root;
    (3) per root, sort by timestamp, move the utterance whose id equals the
    root to the front, and chain each distinct utterance to the previously
    kept one through ``reply_to``. Groups whose root is None, or for which
    no utterance has the root's id, are left out.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    block_hashes_to_segments = {}
    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            assert (block_hash == segments[-1][-1])
            # any complete contiguous block is a complete utterance
            for seg in segments[:-1]:
                complete_utterances.add(helpers.string_of_seg(seg))
            if block.is_header or not accum.blocks[segments[-1]
                                                   [-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(segments[-1]))
            block_hashes_to_segments[block_hash] = segments
        except Exception as e:
            # Best effort: a block that cannot be processed is reported and
            # skipped rather than aborting the whole conversion.
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    children_of_root = {}

    for utt in complete_utterances:
        block_hashes = utt.split(" ")
        # The segments are not needed for the rough conversion, but the
        # lookup is kept so that a head block skipped in step one still
        # raises KeyError here instead of being converted silently.
        # NOTE(review): decide whether such utterances should be skipped.
        belongs_to_segment = block_hashes_to_segments[block_hashes[0]]
        first_block = accum.blocks[block_hashes[0]]

        last_revision = first_block.revision_ids[-1]
        if last_revision == "unknown":
            last_revision = 0

        u_root = accum.find_ultimate_hash(first_block.root_hash)
        this_utterance = Utterance(
            id=block_hashes[0],
            user=users[first_block.user],
            root=u_root,
            reply_to=None,
            timestamp=first_block.timestamp,
            text="\n".join([accum.blocks[h].text for h in block_hashes]),
            meta={"last_revision": last_revision})

        children_of_root.setdefault(u_root, []).append(this_utterance)

    utterances = []
    for root, utt_list in children_of_root.items():
        if root is None:
            continue

        utt_list.sort(key=lambda x: x.timestamp)

        # Locate the utterance that is the root (section header) itself.
        ind_of_root = next(
            (idx for idx, candidate in enumerate(utt_list)
             if candidate.id == root), None)
        if ind_of_root is None:
            logging.warning(
                'Skipping section in conversion to corpus: could not find section header for root %s',
                root)
            continue

        # The root always opens the conversation, whatever its timestamp.
        if ind_of_root > 0:
            utt_list.insert(0, utt_list.pop(ind_of_root))

        previous = utt_list[0]
        utterances.append(previous)
        added = {previous.id}
        for candidate in utt_list[1:]:
            # Utterances sharing an already-used id are dropped so that no
            # utterance ends up replying to itself.
            if candidate.id in added:
                continue
            candidate.reply_to = previous.id
            added.add(candidate.id)
            utterances.append(candidate)
            previous = candidate

    corpus = Corpus(utterances=utterances)
    return corpus
Пример #14
0
def convert_intermediate_to_corpus(accum: Intermediate) -> Corpus:
    """Generates a Corpus from an Intermediate.

    The first pass walks every block of ``accum``, registers its user and
    collects the string form of each complete segment of its reply chain; a
    block that raises is logged and skipped. The second pass splits each
    collected string on spaces to recover the block hashes and builds one
    Utterance per string, identified by its first block hash. The corpus
    metadata entry ``reverse_block_index`` maps every constituent block hash
    to the id of the utterance containing it.

    :param accum: the Intermediate to be converted
    :type accum: Intermediate

    :return: the Corpus generated from accum
    """
    users = {}
    complete_utterances = set()
    segments_by_hash = {}
    utt_id_by_hash = {}

    for block_hash, block in accum.blocks.items():
        try:
            if block.user not in users:
                users[block.user] = User(id=block.user)
            segments = accum.segment_contiguous_blocks(block.reply_chain)
            last_segment = segments[-1]
            assert (block_hash == last_segment[-1])
            # any complete contiguous block is a complete utterance
            complete_utterances.update(
                helpers.string_of_seg(seg) for seg in segments[:-1])
            # The last segment counts for headers, or when its final block
            # is not flagged as followed.
            if block.is_header or not accum.blocks[
                    last_segment[-1]].is_followed:
                complete_utterances.add(helpers.string_of_seg(last_segment))
            segments_by_hash[block_hash] = segments
        except Exception as e:
            # Best effort: report the offending block and carry on.
            logging.debug(e, exc_info=True)
            logging.warning(
                'Issue with conversion to corpus; skipping adding block "%s..."',
                block.text[:32])

    utterances = []
    for utt in complete_utterances:
        block_hashes = utt.split(" ")
        head_hash = block_hashes[0]
        head_segments = segments_by_hash[head_hash]
        first_block = accum.blocks[head_hash]

        author = users[first_block.user]
        root_hash = accum.find_ultimate_hash(first_block.root_hash)
        parent = _find_reply_to_from_segment(head_segments)
        text = "\n".join(accum.blocks[h].text for h in block_hashes)

        # An "unknown" final revision id is stored as 0.
        last_revision = first_block.revision_ids[-1]
        if last_revision == "unknown":
            last_revision = 0

        # Every constituent block points back at the utterance holding it.
        utt_id_by_hash.update(dict.fromkeys(block_hashes, head_hash))

        utterances.append(
            Utterance(id=head_hash,
                      user=author,
                      root=root_hash,
                      reply_to=parent,
                      timestamp=first_block.timestamp,
                      text=text,
                      meta={
                          "constituent_blocks": block_hashes,
                          "last_revision": last_revision,
                      }))

    corpus = Corpus(utterances=utterances)
    corpus.meta["reverse_block_index"] = utt_id_by_hash

    return corpus