示例#1
0
def list_dialogs(client):
    """Collect the processed results of every user-to-user dialog.

    Each item returned by ``client.get_dialogs()`` is inspected through the
    ``peer`` of its ``dialog`` attribute:

    * ``PeerUser`` peers are handed to ``process_dialog_with_user`` and the
      items it returns are appended to the result.
    * ``PeerChannel`` / ``PeerChat`` peers are skipped with a debug message.
    * Anything else is skipped with a warning.

    :param client: object exposing ``get_dialogs()``.
    :return: flat list of everything ``process_dialog_with_user`` produced.
    """
    collected = []
    for entry in client.get_dialogs():
        current = entry.dialog

        if isinstance(current.peer, PeerUser):
            collected.extend(process_dialog_with_user(client, entry))
            continue

        if isinstance(current.peer, (PeerChannel, PeerChat)):
            log.debug('Dialogs in chats/channels are not supported yet')
            continue

        # Neither a user nor a chat/channel peer: report it and move on.
        log.warning('Unknown dialog type %s', current)

    return collected
示例#2
0
def _extract_hangouts_messages(archive, own_name, max_messages):
    """Flatten a parsed Hangouts archive into a list of message rows.

    Only conversations with exactly two current participants are considered.
    For every chat message that has at least one segment, one row
    ``[timestamp, conversation id, partner name, sender name, text]`` is
    produced, where ``text`` is the text of the first segment.

    :param archive: decoded JSON archive; must contain ``"conversation_state"``.
    :param own_name: fallback name of the archive owner, used to tell the
        owner apart from the conversation partner.
    :param max_messages: extraction stops (for the whole archive) as soon as
        at least this many rows were collected.
    :return: list of rows as described above.
    :raises SystemExit: with status 1 when a message was sent by somebody who
        is neither the owner nor the detected conversation partner.
    """
    names = {}

    def remember_name(name, gaia_id):
        # The first name seen for an id wins; later conflicts are only reported.
        known = names.setdefault(gaia_id, name)
        if known != name:
            print('Assuming', name, 'is', known)

    rows = []
    for state in archive["conversation_state"]:
        conversation_state = state["conversation_state"]
        # "conversation" may be absent; treat that as "no participants".
        conversation = conversation_state.get("conversation", {})

        for participant in conversation.get("participant_data", []):
            if "fallback_name" in participant:
                remember_name(participant["fallback_name"],
                              participant["id"]["gaia_id"])

        participants = conversation.get("current_participant", [])
        if len(participants) != 2:
            continue

        # Reset per conversation so a partner id can never leak in from a
        # previously processed conversation.
        conversation_with_id = ''
        for participant in participants:
            if names.get(participant["gaia_id"]) != own_name:
                conversation_with_id = participant["gaia_id"]
        partner_name = names.get(conversation_with_id)

        for event in conversation_state["event"]:
            if "chat_message" not in event:
                continue
            content = event["chat_message"]["message_content"]
            if "segment" not in content:
                continue

            text = content["segment"][0]["text"]
            conversation_id = event["conversation_id"]["id"]
            sender_id = event["sender_id"]["chat_id"]
            sender_name = names.get(sender_id)

            if sender_name is None and partner_name is None:
                # unknown sender
                print("No senderName for either senderId", sender_id,
                      conversation_with_id)
            else:
                if sender_name != own_name and sender_id != conversation_with_id:
                    print('Parsing error, is your ownId correct?')
                    # Fatal error: leave with a non-zero status.
                    raise SystemExit(1)

                # Raw timestamps are scaled down by 1e6
                # (presumably microseconds -> seconds; confirm).
                timestamp = int(event["timestamp"]) / 1000000
                rows.append([timestamp, conversation_id, partner_name,
                             sender_name, text])

            if len(rows) >= max_messages:
                # Returning leaves both loops, so the limit applies to the
                # whole archive and not just to the current conversation.
                return rows

    return rows


def _detect_conversation_languages(df):
    """Fill ``df['language']`` with one detected language per conversation.

    Every row starts as ``'unknown'``. For each ``conversationWithName`` group
    with more than 10 rows free of missing values, up to 100 randomly drawn
    messages (with replacement) are joined and passed to ``detect``; the
    result is stored on all rows of that conversation. ``df`` is modified in
    place.
    """
    df['language'] = 'unknown'
    for name, group in df.groupby(df.conversationWithName):
        texts = group.dropna()['text']
        if len(texts) <= 10:
            continue

        # Join with a space so words of adjacent messages are not fused.
        sample = ' '.join(
            texts.iloc[randint(0, len(texts) - 1)]
            for _ in range(min(len(texts), 100)))

        # Detect once so the printed and the stored language always agree.
        language = detect(sample)
        print('\t', name, language)
        df.loc[df.conversationWithName == name, 'language'] = language


def main():
    """Convert a Hangouts JSON archive into the exported ``hangouts.pkl``.

    Reads the file named by the parsed command-line arguments, extracts the
    messages of two-person conversations, builds a DataFrame with the columns
    from ``config.DATAFRAME_COLUMNS`` plus ``platform``, ``language`` and
    ``datetime``, prints its head and exports it through
    ``utils.export_dataframe``.
    """
    args = parse_arguments()

    print('Parsing JSON file...')
    # JSON text is UTF-8; do not depend on the platform default encoding.
    with open(args.file_path, encoding='utf-8') as f:
        archive = json.load(f)

    print('Extracting messages...')
    data = _extract_hangouts_messages(archive, args.own_name,
                                      args.max_exported_messages)
    log.debug('%d messages parsed.', len(data))

    log.info('Converting to DataFrame...')
    # Passing the columns to the constructor also works for an empty result.
    df = pd.DataFrame(data, columns=config.DATAFRAME_COLUMNS)
    df['platform'] = 'hangouts'

    log.info('Detecting languages...')
    _detect_conversation_languages(df)

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)

    print(df.head())
    utils.export_dataframe(df, 'hangouts.pkl')
    log.info('Done.')