async def _main_loop(client):
    """Run the full Telegram export pipeline for an already-created client.

    Fetches the account's own name, downloads all dialog messages, converts
    them to a DataFrame, tags platform and language, and pickles the result.
    Exits the process cleanly when there is nothing to export.
    """
    own_name = await get_own_name(client)
    messages = await list_dialogs(client, own_name)
    log.info('{:,} messages parsed.'.format(len(messages)))
    if len(messages) == 0:
        log.info('Nothing to save.')
        exit(0)
    log.info('Converting to DataFrame...')
    frame = pd.DataFrame(messages, columns=config['ALL_COLUMNS'])
    frame['platform'] = 'telegram'
    log.info('Detecting languages...')
    frame = detect_language(frame)
    export_dataframe(frame, config['telegram']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')
def main():
    """Export Telegram chat logs to ``telegram.pkl``.

    Connects with the API credentials from ``config``, signs in, downloads
    all dialog messages, and pickles them as a DataFrame with the sender
    name filled in from the signed-in account.
    """
    client = TelegramClient('session_name', config.TELEGRAM_API_ID,
                            config.TELEGRAM_API_HASH)
    client.connect()
    try:
        me = sign_in(client)
        data = list_dialogs(client)
    finally:
        # Fix: the connection was previously never closed; release it even
        # when sign-in or dialog listing fails.
        client.disconnect()
    if len(data) < 1:
        # Fix: previously an empty export crashed on the column assignment;
        # exit cleanly like the other parsers do.
        log.info('Nothing to save.')
        exit(0)
    log.info('Converting to DataFrame...')
    # Pass columns at construction time instead of assigning afterwards.
    df = pd.DataFrame(data, columns=config.ALL_COLUMNS)
    df['platform'] = 'telegram'
    # Fix: last_name may be None on Telegram accounts; the old
    # '{} {}'.format(...) rendered a literal 'None' in that case.
    own_name = ' '.join(filter(None, [me.first_name, me.last_name]))
    df['senderName'] = own_name
    log.info('Detecting languages...')
    # Language detection is not implemented for this exporter.
    df['language'] = 'unknown'
    utils.export_dataframe(df, 'telegram.pkl')
    log.info('Done.')
def main(own_name, file_path, max_exported_messages):
    """Parse a Facebook Messenger JSON export directory into a pickle.

    Scans ``file_path`` for JSON files, infers the account owner's name if
    not supplied, parses all messages, detects per-conversation languages,
    and writes the configured output pickle.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    log.info('Parsing Facebook messenger data...')
    json_files = glob.glob(os.path.join(file_path, '**', '*.json'))
    if len(json_files) == 0:
        log.error(f'No input files found under {file_path}')
        exit(0)
    if own_name is None:
        own_name = infer_own_name(file_path)
    messages = parse_messages(file_path, own_name)
    log.info('{:,} messages parsed.'.format(len(messages)))
    if len(messages) == 0:
        log.info('Nothing to save.')
        exit(0)
    log.info('Converting to DataFrame...')
    frame = pd.DataFrame(messages, columns=config['ALL_COLUMNS'])
    frame['platform'] = 'messenger'
    log.info('Detecting languages...')
    frame = detect_language(frame)
    export_dataframe(frame, config['messenger']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')
def main(own_name, file_path, max_exported_messages, infer_datetime):
    """Parse exported WhatsApp ``.txt`` chat logs into a pickle.

    Collects all text files under ``file_path``, infers the account owner's
    name if not supplied, parses the messages (optionally inferring the
    datetime format), detects languages, and exports the DataFrame.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    log.info('Parsing Whatsapp data...')
    chat_files = glob.glob(os.path.join(file_path, '*.txt'))
    if len(chat_files) == 0:
        log.error(f'No input files found under {file_path}')
        exit(0)
    if own_name is None:
        own_name = infer_own_name(chat_files)
    messages = parse_messages(chat_files, own_name, infer_datetime)
    log.info('{:,} messages parsed.'.format(len(messages)))
    if len(messages) == 0:
        log.info('Nothing to save.')
        exit(0)
    frame = pd.DataFrame(messages, columns=config['ALL_COLUMNS'])
    frame['platform'] = 'whatsapp'
    log.info('Detecting languages...')
    frame = detect_language(frame)
    # Write the pickle to the configured output path.
    export_dataframe(frame, config['whatsapp']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')
def main(own_name, file_path, max_exported_messages):
    """Parse a Google Hangouts takeout archive into a pickle.

    Reads the archive file, infers the account owner's name if not supplied,
    parses all messages, detects per-conversation languages, and writes the
    configured output pickle.
    """
    global MAX_EXPORTED_MESSAGES
    MAX_EXPORTED_MESSAGES = max_exported_messages
    log.info('Parsing Google Hangouts data...')
    if not os.path.isfile(file_path):
        log.error(f'No input file under {file_path}')
        exit(0)
    archive = read_archive(file_path)
    if own_name is None:
        own_name = infer_own_name(archive)
    messages = parse_messages(archive, own_name)
    log.info('{:,} messages parsed.'.format(len(messages)))
    if len(messages) == 0:
        log.info('Nothing to save.')
        exit(0)
    log.info('Converting to DataFrame...')
    frame = pd.DataFrame(messages, columns=config['ALL_COLUMNS'])
    frame['platform'] = 'hangouts'
    log.info('Detecting languages...')
    frame = detect_language(frame)
    export_dataframe(frame, config['hangouts']['OUTPUT_PICKLE_NAME'])
    log.info('Done.')
def main():
    """Parse a legacy Google Hangouts JSON archive into ``hangouts.pkl``.

    Extracts messages from two-person conversations, resolves participant
    ids to display names, guesses each conversation's language from a
    random sample of messages, and exports the resulting DataFrame.
    """
    args = parse_arguments()
    own_name = args.own_name
    print('Parsing JSON file...')
    with open(args.file_path) as f:
        archive = json.loads(f.read())

    # gaia_id -> fallback display name, filled while scanning participants.
    names = {}

    def idToName(id):
        # Return the display name recorded for this id, or None if unseen.
        if id in names:
            return names[id]
        else:
            return None

    def saveNameForId(name, id):
        # Remember the first name seen for an id; report when a later
        # occurrence carries a different name.
        if not id in names:
            names[id] = name
        elif names[id] != name:
            print('Assuming', name, 'is', names[id])

    data = []
    conversationWithId = ''
    conversationWithName = ''
    print('Extracting messages...')
    for state in archive["conversation_state"]:
        # Register all participant names first so idToName() can resolve
        # the senders of this conversation's events.
        if "conversation" in state["conversation_state"]:
            for participant in state["conversation_state"]["conversation"][
                    "participant_data"]:
                if "fallback_name" in participant:
                    saveNameForId(participant["fallback_name"],
                                  participant["id"]["gaia_id"])
        for event in state["conversation_state"]["event"]:
            timestamp = int(event["timestamp"])
            if "chat_message" in event and "segment" in event["chat_message"][
                    "message_content"]:
                content = event["chat_message"]["message_content"]
                text = content["segment"][0]["text"]
                conversationId = event["conversation_id"]["id"]
                senderId = event["sender_id"]["chat_id"]
                participants = state["conversation_state"]["conversation"][
                    "current_participant"]
                # Only two-person conversations are exported.
                if len(participants) == 2:
                    # The participant that is not us is the interlocutor.
                    for participant in participants:
                        if idToName(participant["gaia_id"]) != own_name:
                            conversationWithId = participant["gaia_id"]
                    if idToName(senderId) is not None or idToName(
                            conversationWithId) is not None:
                        if idToName(
                                senderId
                        ) != own_name and senderId != conversationWithId:
                            # The sender is neither us nor the interlocutor:
                            # the supplied own name is probably wrong.
                            print('Parsing error, is your ownId correct?')
                            exit(0)
                        # Save the message; Hangouts timestamps are in
                        # microseconds, convert to seconds.
                        timestamp = timestamp / 1000000
                        data += [[
                            timestamp, conversationId,
                            idToName(conversationWithId),
                            idToName(senderId), text
                        ]]
                    else:
                        # Neither id resolved to a known display name.
                        print("No senderName for either senderId", senderId,
                              conversationWithId)
                # NOTE(review): this break only leaves the inner event loop;
                # the outer conversation loop keeps iterating — confirm the
                # message cap is meant to apply per conversation.
                if len(data) >= args.max_exported_messages:
                    break
    # NOTE(review): log.debug is passed an int as its first (message)
    # argument; stdlib logging expects a format string there — verify this
    # logs as intended.
    log.debug(len(data), 'messages parsed.')
    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'hangouts'
    log.info('Detecting languages...')
    df['language'] = 'unknown'
    # Guess each conversation's language from up to 100 randomly sampled
    # messages; conversations with <= 10 messages keep 'unknown'.
    for name, group in df.groupby(df.conversationWithName):
        sample = ''
        df2 = df[df.conversationWithName == name].dropna()
        if len(df2) > 10:
            for x in range(0, min(len(df2), 100)):
                sample = sample + df2.iloc[randint(0, len(df2) - 1)]['text']
            print('\t', name, detect(sample))
            df.loc[df.conversationWithName == name,
                   'language'] = detect(sample)
    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)
    print(df.head())
    utils.export_dataframe(df, 'hangouts.pkl')
    log.info('Done.')
def main():
    """Parse a legacy Facebook Messenger HTML export into ``messenger.pkl``.

    Walks every ``*.html`` file under ``args.file_path``, extracts
    (timestamp, conversation id, interlocutor, sender, text) rows, guesses
    a language per conversation from a random message sample, and pickles
    the resulting DataFrame.
    """
    args = parse_arguments()
    # Once the primary date format fails, every later date is parsed with
    # the slower infer_datetime_format path.
    fallbackDateParsing = False
    data = []
    warnedNameChanges = []
    nbInvalidSender = 0

    # make sure we don't crash if chat logs contain exotic characters
    etree.set_default_parser(
        etree.XMLParser(encoding='utf-8', ns_clean=True, recover=True))

    for filename in os.listdir(args.file_path):
        if not filename.endswith('.html'):
            continue
        document = os.path.join(args.file_path, filename)
        archive = etree.parse(document)
        conversationId = filename.replace('.html', '')
        groupConversation = False
        timestamp = ''
        senderName = ''
        conversationWithName = None
        for element in archive.iter():
            tag = element.tag
            className = element.get('class')
            content = element.text
            if tag == 'p':
                # <p> carries the message text; sender and timestamp were
                # set by the preceding <span> siblings.
                text = content
                if conversationWithName != '' and senderName != '':
                    # handles when the interlocutor's name changed at some point
                    if (senderName != conversationWithName) and (senderName != args.own_name) and \
                            (senderName not in warnedNameChanges) and (not groupConversation):
                        # The outer condition already guarantees the name has
                        # not been warned about (redundant inner check removed).
                        print('\t', 'Assuming', senderName, 'is',
                              conversationWithName)
                        warnedNameChanges.append(senderName)
                        senderName = conversationWithName
                    data += [[
                        timestamp, conversationId, conversationWithName,
                        senderName, text
                    ]]
                else:
                    nbInvalidSender = nbInvalidSender + 1
            elif tag == 'span':
                if className == 'user':
                    senderName = content
                elif className == 'meta':
                    try:
                        if not fallbackDateParsing:
                            # NOTE(review): '%H:%M%p' mixes 24-hour %H with
                            # AM/PM %p; exact=False makes it tolerant, but it
                            # looks like '%I:%M%p' was intended — confirm.
                            timestamp = time.mktime(
                                pd.to_datetime(
                                    content,
                                    format='%A, %B %d, %Y at %H:%M%p',
                                    exact=False).timetuple())
                        else:
                            timestamp = time.mktime(
                                pd.to_datetime(
                                    content,
                                    infer_datetime_format=True).timetuple())
                    except ValueError:
                        if not fallbackDateParsing:
                            print(
                                'Unexpected date format. '
                                'Falling back to infer_datetime_format, parsing will be slower.'
                            )
                            # Bug fix: this branch previously retried the
                            # exact same format string that just raised,
                            # re-raising instead of falling back. Actually
                            # use inference, as the message announces.
                            timestamp = time.mktime(
                                pd.to_datetime(
                                    content,
                                    infer_datetime_format=True).timetuple())
                            fallbackDateParsing = True
                        else:
                            raise
            elif tag == 'div' and className == 'thread':
                # Count participants in the thread header to spot group chats.
                nbParticipants = str(element.xpath("text()")).count(', ') + 1
                if nbParticipants > 1:
                    groupConversation = True
            elif tag == 'h3':
                if conversationWithName is not None:
                    # (typo fixed in message: 'hearder' -> 'header')
                    print(
                        'Something is wrong. File format changed? (multiple conversation header in a single file)'
                    )
                    exit(0)
                else:
                    content = content.replace('Conversation with ', '')
                    conversationWithName = content
        print(conversationId, conversationWithName, "(group?",
              groupConversation, ")")
        if len(data) >= args.max_exported_messages:
            break

    print(len(data), 'messages parsed.')
    if nbInvalidSender > 0:
        print(nbInvalidSender, 'messages discarded because of bad ID.')
    if len(data) < 1:
        print('Nothing to save.')
        exit(0)

    log.info('Converting to DataFrame...')
    df = pd.DataFrame(data)
    df.columns = config.DATAFRAME_COLUMNS
    df['platform'] = 'messenger'

    log.info('Detecting languages...')
    df['language'] = 'unknown'
    # Guess each conversation's language from up to 100 randomly sampled
    # messages; conversations with <= 10 messages keep 'unknown'.
    for name, group in df.groupby(df.conversationWithName):
        sample = ''
        df2 = df[df.conversationWithName == name].dropna()
        if len(df2) > 10:
            for x in range(0, min(len(df2), 100)):
                sample = sample + df2.iloc[random.randint(
                    0, len(df2) - 1)]['text']
            print('\t', name, detect(sample), "(", len(df2), "msgs)")
            df.loc[df.conversationWithName == name,
                   'language'] = detect(sample)

    log.info('Computing dates...')
    df['datetime'] = df['timestamp'].apply(utils.timestamp_to_ordinal)
    print(df.head())
    utils.export_dataframe(df, 'messenger.pkl')
    log.info('Done.')