def parse_voice_audio_log_body(dic_my_activity_voice_audio, voice_audio_logs):
    list_voice_audio_event_logs = TakeoutHtmlParser.find_log_body(voice_audio_logs)
    if list_voice_audio_event_logs != []:
        idx = 0
        for content in list_voice_audio_event_logs:
            content = str(content).strip()
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                if content.startswith('Said'):
                    dic_my_activity_voice_audio['type'] = 'Search'
                    if content != 'Said':
                        dic_my_activity_voice_audio['keyword'] = content[4:].lstrip()
                else:
                    dic_my_activity_voice_audio['type'] = content
            else:
                if idx == 1 and dic_my_activity_voice_audio['type'] == 'Search':
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_voice_audio['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_voice_audio['keyword_url'] = url
                elif content.endswith('UTC'):
                    dic_my_activity_voice_audio['timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(content)
            idx += 1

def parse_gmail_log_body(dic_my_activity_gmail, gmail_logs):
    list_gmail_search_logs = TakeoutHtmlParser.find_log_body(gmail_logs)
    if list_gmail_search_logs != []:
        idx = 0
        for content in list_gmail_search_logs:
            content = str(content).strip()
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                if content == 'Searched for':
                    dic_my_activity_gmail['type'] = 'Search'
                else:
                    dic_my_activity_gmail['type'] = content
            else:
                if idx == 1:
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_gmail['keyword_url'] = url
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_gmail['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                elif content.endswith('UTC'):
                    dic_my_activity_gmail['timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(content)
            idx += 1

def parse_analytics_log_body(dic_my_activity_google_analytics, analytics_logs):
    list_analytics_event_logs = TakeoutHtmlParser.find_log_body(analytics_logs)
    if list_analytics_event_logs != []:
        idx = 0
        for content in list_analytics_event_logs:
            content = str(content).strip()
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                if content == 'Used':
                    dic_my_activity_google_analytics['type'] = 'Use'
                elif content == 'Visited':
                    dic_my_activity_google_analytics['type'] = 'Visit'
                else:
                    dic_my_activity_google_analytics['type'] = content
            else:
                if idx == 1:
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_google_analytics['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_google_analytics['keyword_url'] = url
                        o = urlparse(url)
                        # If the link is a Google redirect, unwrap the real target from the
                        # 'q=' query parameter (up to the next '&', or to the end of the query).
                        if o.query.startswith('q='):
                            amp_idx = o.query.find('&')
                            real_url = o.query[2:amp_idx] if amp_idx >= 0 else o.query[2:]
                            real_url = unquote(real_url)
                            dic_my_activity_google_analytics['keyword_url'] = real_url
                            o = urlparse(real_url)
                        # A mobile host prefix ('m.') marks the activity as coming from a mobile device.
                        if o.netloc.startswith('m.'):
                            dic_my_activity_google_analytics['used_device'] = 'mobile'
                elif content.endswith('UTC'):
                    dic_my_activity_google_analytics['timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(content)
            idx += 1

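# Illustrative sketch, not part of the original parser: how the 'q=' unwrapping used in
# parse_analytics_log_body behaves on a Google redirect URL. The sample URL and the helper
# name _example_unwrap_redirect are hypothetical; only the standard library is used.
def _example_unwrap_redirect():
    from urllib.parse import urlparse, unquote
    url = 'https://www.google.com/url?q=https%3A%2F%2Fm.example.com%2Fpage&sa=D'
    o = urlparse(url)
    real_url = url
    if o.query.startswith('q='):
        amp_idx = o.query.find('&')
        # Take the value of 'q' up to the next '&' (or to the end) and percent-decode it.
        real_url = unquote(o.query[2:amp_idx] if amp_idx >= 0 else o.query[2:])
    # real_url == 'https://m.example.com/page'; the 'm.' host prefix is what the parser
    # above uses to flag the activity as mobile.
    return real_url
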
def parse_maps(case):
    file_path = case.takeout_my_activity_maps_path
    if not os.path.exists(file_path):
        return False
    with open(file_path, 'r', encoding='utf-8') as f:
        file_contents = f.read()
    soup = BeautifulSoup(file_contents, 'lxml')
    list_maps_logs = TakeoutHtmlParser.find_log(soup)
    if list_maps_logs != []:
        for i in trange(len(list_maps_logs),
                        desc="[Parsing the My Activity -> Maps data...............]",
                        unit="epoch"):
            dic_my_activity_maps = {
                'timestamp': "", 'service': "", 'type': "",
                'keyword': "", 'keyword_url': "",
                'keyword_latitude': "", 'keyword_longitude': "",
                'latitude': "", 'longitude': "",
                'geodata_description': "", 'used_device': ""
            }
            MyActivityMaps.parse_maps_log_title(dic_my_activity_maps, list_maps_logs[i])
            MyActivityMaps.parse_maps_log_body(dic_my_activity_maps, list_maps_logs[i])
            MyActivityMaps.parse_maps_log_caption(dic_my_activity_maps, list_maps_logs[i])
            MyActivityMaps.insert_log_info_to_preprocess_db(dic_my_activity_maps, case.preprocess_db_path)

def parse_maps_log_caption(dic_my_activity_maps, maps_logs):
    list_maps_logs = TakeoutHtmlParser.find_log_caption(maps_logs)
    if list_maps_logs != []:
        for content in list_maps_logs:
            content = str(content).strip()
            if content == '<br/>':
                continue
            elif content.startswith('<a href="https://www.google.com/maps/'):
                idx2 = content.find('">')
                url = content[9:idx2]
                o = urlparse(url)
                list_query_value = o.query.split(';')
                if list_query_value != []:
                    for query_value in list_query_value:
                        # 'center=' and 'query=' both carry "lat,lng" pairs; slice off the parameter
                        # name instead of using lstrip(), which strips characters rather than a prefix.
                        if query_value.startswith('center='):
                            geodata = query_value[len('center='):].rstrip('&')
                            dic_my_activity_maps['latitude'] = geodata.split(',')[0]
                            dic_my_activity_maps['longitude'] = geodata.split(',')[1]
                        elif query_value.startswith('query='):
                            geodata = query_value[len('query='):]
                            dic_my_activity_maps['latitude'] = geodata.split(',')[0]
                            dic_my_activity_maps['longitude'] = geodata.split(',')[1]
                dic_my_activity_maps['geodata_description'] = content[idx2 + 2:content.find('</a>')]
            elif content == '- From your device':
                dic_my_activity_maps['used_device'] = 'mobile'

def parse_gmail(case):
    file_path = case.takeout_my_activity_gmail_path
    if not os.path.exists(file_path):
        return False
    with open(file_path, 'r', encoding='utf-8') as f:
        file_contents = f.read()
    soup = BeautifulSoup(file_contents, 'lxml')
    list_gmail_logs = TakeoutHtmlParser.find_log(soup)
    if list_gmail_logs != []:
        for i in trange(len(list_gmail_logs),
                        desc="[Parsing the My Activity -> Gmail data..............]",
                        unit="epoch"):
            dic_my_activity_gmail = {
                'service': "", 'type': "", 'keyword_url': "",
                'keyword': "", 'timestamp': ""
            }
            MyActivityGmail.parse_gmail_log_title(dic_my_activity_gmail, list_gmail_logs[i])
            MyActivityGmail.parse_gmail_log_body(dic_my_activity_gmail, list_gmail_logs[i])
            MyActivityGmail.insert_log_info_to_preprocess_db(dic_my_activity_gmail, case.preprocess_db_path)

def parse_assistant_log_caption(dic_my_activity_assistant, assistant_logs):
    list_assistant_geodata_logs = TakeoutHtmlParser.find_log_caption(assistant_logs)
    if list_assistant_geodata_logs != []:
        for content in list_assistant_geodata_logs:
            content = str(content).strip()
            if content == '<br/>':
                continue
            if content.startswith('<a href="https://www.google.com/maps/'):
                idx = content.find('">')
                url = content[9:idx]
                o = urlparse(url)
                list_query_value = o.query.split(';')
                if list_query_value != []:
                    for query_value in list_query_value:
                        # Slice off the parameter name ('center=' / 'query=') to keep the "lat,lng" pair.
                        if query_value.startswith('center='):
                            geodata = query_value[len('center='):].rstrip('&')
                            dic_my_activity_assistant['latitude'] = geodata.split(',', 1)[0]
                            dic_my_activity_assistant['longitude'] = geodata.split(',', 1)[1]
                        elif query_value.startswith('query='):
                            geodata = query_value[len('query='):]
                            dic_my_activity_assistant['latitude'] = geodata.split(',', 1)[0]
                            dic_my_activity_assistant['longitude'] = geodata.split(',', 1)[1]
                if dic_my_activity_assistant['geodata_description'] == "":
                    dic_my_activity_assistant['geodata_description'] = content[idx + 2:content.find('</a>')]

def parse_gmail_log_title(dic_my_activity_gmail, gmail_logs):
    list_gmail_title_logs = TakeoutHtmlParser.find_log_title(gmail_logs)
    if list_gmail_title_logs != []:
        for content in list_gmail_title_logs:
            content = str(content).strip()
            dic_my_activity_gmail['service'] = content.split('>')[1].split('<br')[0]

def parse_youtube_log_body(dic_my_activity_youtube, youtube_logs):
    list_youtube_event_logs = TakeoutHtmlParser.find_log_body(youtube_logs)
    if list_youtube_event_logs != []:
        idx = 0
        for content in list_youtube_event_logs:
            content = str(content).strip()
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                if content == 'Searched for':
                    dic_my_activity_youtube['type'] = 'Search'
                elif content.startswith('Watched'):
                    dic_my_activity_youtube['type'] = 'Watch'
                    if len(content) >= 8 and content.find(' ') >= 1:
                        dic_my_activity_youtube['keyword'] = TakeoutHtmlParser.remove_special_char(content)
                elif content.startswith('Visited'):
                    dic_my_activity_youtube['type'] = 'Visit'
                    if len(content) >= 8 and content.find(' ') >= 1:
                        dic_my_activity_youtube['keyword'] = TakeoutHtmlParser.remove_special_char(content)
                else:
                    dic_my_activity_youtube['type'] = content
            else:
                if idx == 1:
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_youtube['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_youtube['keyword_url'] = TakeoutHtmlParser.remove_special_char(url)
                else:
                    if dic_my_activity_youtube['type'] == 'Watch':
                        if content.startswith('<a href="'):
                            idx2 = content.find('">')
                            channel_name = content[idx2 + 2:content.find('</a>')]
                            dic_my_activity_youtube['channel_name'] = TakeoutHtmlParser.remove_special_char(channel_name)
                            url = content[9:idx2]
                            url = unquote(url)
                            dic_my_activity_youtube['channel_url'] = TakeoutHtmlParser.remove_special_char(url)
                if content.endswith('UTC'):
                    dic_my_activity_youtube['timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(content)
            idx += 1

def parse_assistant_log_title(dic_my_activity_assistant, assistant_logs):
    list_assistant_title_logs = TakeoutHtmlParser.find_log_title(assistant_logs)
    if list_assistant_title_logs != []:
        for content in list_assistant_title_logs:
            content = str(content).strip()
            dic_my_activity_assistant['service'] = content.split('>')[1].split('<br')[0]
            dic_my_activity_assistant['used_device'] = 'mobile'

def parse_ganalytics_log_title(dic_my_activity_google_analytics, analytics_logs):
    list_analytics_title_logs = TakeoutHtmlParser.find_log_title(analytics_logs)
    if list_analytics_title_logs != []:
        for content in list_analytics_title_logs:
            content = str(content).strip()
            dic_my_activity_google_analytics['service'] = content.split('>')[1].split('<br')[0]

def parse_assistant_log_body_text(dic_my_activity_assistant, assistant_logs, file_path):
    list_assistant_trained_logs = TakeoutHtmlParser.find_log_body_text(assistant_logs)
    if list_assistant_trained_logs != []:
        for content in list_assistant_trained_logs:
            content = str(content).strip()
            if content.startswith('<audio controls'):
                # The tag text looks like "Audio file: <name>"; remove the literal prefix
                # (lstrip() would strip characters, not the prefix) and keep only the file name.
                attachment = content.split('>')[2].split('<')[0]
                if attachment.startswith('Audio file: '):
                    attachment = attachment[len('Audio file: '):]
                attachment = attachment.split(' ')[0]
                attachment_path = os.path.dirname(file_path) + os.sep + attachment
                if os.path.exists(attachment_path):
                    dic_my_activity_assistant['filepath'] = attachment_path

def parse_history_logs(dic_browser_history, history_logs):
    for k, v in history_logs.items():
        if k == 'time_usec':
            dic_browser_history['timestamp'] = int(v) // 1000000
        elif k == 'page_transition':
            dic_browser_history['page_transition'] = v
        elif k == 'url':
            dic_browser_history['url'] = TakeoutHtmlParser.remove_special_char(unquote(v))
            o = urlparse(v)
            if o.netloc.startswith('m.'):
                dic_browser_history['used_device'] = 'mobile'
        elif k == 'title':
            dic_browser_history['title'] = v.replace("\"", "\'")
        elif k == 'client_id':
            dic_browser_history['client_id'] = v
        elif k == 'favicon_url':
            dic_browser_history['favicon_url'] = v

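# Illustrative sketch, not part of the original parser: the record shape parse_history_logs
# expects, i.e. one entry from the Chrome history JSON in a Takeout export. The helper name
# and all field values below are made up for demonstration; adjust the call if
# parse_history_logs actually lives on a class in this module.
def _example_parse_history_record():
    dic_browser_history = {'timestamp': "", 'page_transition': "", 'url': "",
                           'title': "", 'client_id': "", 'favicon_url': "",
                           'used_device': ""}
    sample_record = {
        'time_usec': '1580000000000000',               # microseconds -> seconds via // 1000000
        'page_transition': 'LINK',
        'url': 'https://m.example.com/search?q=test',  # 'm.' host => used_device = 'mobile'
        'title': 'Example "quoted" title',             # double quotes are rewritten as single quotes
        'client_id': 'abcd1234',
        'favicon_url': 'https://example.com/favicon.ico',
    }
    parse_history_logs(dic_browser_history, sample_record)
    return dic_browser_history
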
def parse_device_info(case):
    # print("input dir: ", case.takeout_android_device_configuration_service_path)
    list_target_files = os.listdir(case.takeout_android_device_configuration_service_path)
    if list_target_files == []:
        logger.error('Takeout data does not exist.')
        return False
    for file_name in list_target_files:
        if not file_name.startswith('Device-'):
            continue
        file_path = case.takeout_android_device_configuration_service_path + os.sep + file_name
        with open(file_path, 'r', encoding='utf-8') as f:
            file_contents = f.read()
        soup = BeautifulSoup(file_contents, 'lxml')
        list_device_conf_logs = TakeoutHtmlParser.find_category_title(soup)
        print(list_device_conf_logs)

def parse_assistant_log_body(dic_my_activity_assistant, assistant_logs):
    list_assistant_search_logs = TakeoutHtmlParser.find_log_body(assistant_logs)
    if list_assistant_search_logs != []:
        idx = 0
        for content in list_assistant_search_logs:
            content = str(content).strip()
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                if content.startswith('Said'):
                    dic_my_activity_assistant['type'] = 'Search'
                    if len(content) >= 5 and content.find(' ') >= 1:
                        keyword = content.split(' ', 1)[1]
                        dic_my_activity_assistant['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                elif content.startswith('Used'):
                    dic_my_activity_assistant['type'] = 'Use'
                    if len(content) >= 5 and content.find(' ') >= 1:
                        keyword = content.split(' ', 1)[1]
                        dic_my_activity_assistant['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                elif content.startswith('Trained'):
                    dic_my_activity_assistant['type'] = 'Train'
                    if len(content) >= 8 and content.find(' ') >= 1:
                        keyword = content.split(' ', 1)[1]
                        dic_my_activity_assistant['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                elif content.startswith('Selected') or content.startswith('Listened'):
                    dic_my_activity_assistant['type'] = 'Use'
                    if len(content) >= 9 and content.find(' ') >= 1:
                        dic_my_activity_assistant['keyword'] = TakeoutHtmlParser.remove_special_char(content)
            else:
                if idx == 1:
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_assistant['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_assistant['keyword_url'] = url
                elif content.endswith('UTC'):
                    dic_my_activity_assistant['timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(content)
                elif idx != 1 and content != '<br/>':
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_assistant['result'] = TakeoutHtmlParser.remove_special_char(keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_assistant['result_url'] = url
                        o = urlparse(url)
                    else:
                        dic_my_activity_assistant['result'] += TakeoutHtmlParser.remove_special_char(content)
            idx += 1

def parse_video_search_log_title(dic_my_activity_video_search, video_search_logs):
    list_video_search_title_logs = TakeoutHtmlParser.find_log_title(video_search_logs)
    if list_video_search_title_logs != []:
        for content in list_video_search_title_logs:
            content = str(content).strip()
            dic_my_activity_video_search['service'] = content.split('>')[1].split('<br')[0]

def parse_chrome_log_title(dic_my_activity_chrome, chrome_logs):
    list_chrome_title_logs = TakeoutHtmlParser.find_log_title(chrome_logs)
    if list_chrome_title_logs != []:
        for content in list_chrome_title_logs:
            content = str(content).strip()
            dic_my_activity_chrome['service'] = content.split('>', 1)[1].split('<br')[0]

def parse_maps_log_body(dic_my_activity_maps, maps_logs):
    list_maps_event_logs = TakeoutHtmlParser.find_log_body(maps_logs)
    if list_maps_event_logs != []:
        idx = 0
        for content in list_maps_event_logs:
            content = str(content).strip()
            content = content.replace(u'\xa0', ' ')
            if idx == 0:
                if content.startswith('<a href="'):
                    url = content[9:content.find('">')]
                    keyword = content.split('>')[1].split('</a')[0]
                    dic_my_activity_maps['keyword'] = keyword.replace("\"", "\'")
                    if keyword.startswith('View'):
                        dic_my_activity_maps['type'] = 'View'
                    else:
                        dic_my_activity_maps['type'] = 'Search'
                    url = unquote(url)
                    dic_my_activity_maps['keyword_url'] = TakeoutHtmlParser.remove_special_char(url)
                    o = urlparse(url)
                    # Coordinates can appear in the path ('/maps/@lat,lng,...'), after any '@'
                    # in the path, or in the 'sll=' query parameter.
                    if o.path.startswith('/maps/@'):
                        list_value = o.path[len('/maps/@'):].split(',')
                        if list_value != []:
                            dic_my_activity_maps['keyword_latitude'] = list_value[0]
                            dic_my_activity_maps['keyword_longitude'] = list_value[1]
                    elif o.path.find('@') >= 1:
                        list_value = o.path.split('@')[1].split(',')
                        if list_value != []:
                            dic_my_activity_maps['keyword_latitude'] = list_value[0]
                            dic_my_activity_maps['keyword_longitude'] = list_value[1]
                    elif o.query.find('sll=') >= 1:
                        list_value = o.query.split('sll=', 1)[1].split(',')
                        if list_value != []:
                            dic_my_activity_maps['keyword_latitude'] = list_value[0]
                            dic_my_activity_maps['keyword_longitude'] = list_value[1].split('&')[0]
                else:
                    if content == 'Searched for':
                        dic_my_activity_maps['type'] = 'Search'
                    elif content.startswith('Shared'):
                        dic_my_activity_maps['type'] = 'Share'
                    elif content.startswith('Viewed'):
                        dic_my_activity_maps['type'] = 'View'
                        if content == 'Viewed For you':
                            dic_my_activity_maps['keyword'] = TakeoutHtmlParser.remove_special_char(content)
                    elif content == 'Used Maps':
                        dic_my_activity_maps['type'] = 'Use'
                        dic_my_activity_maps['keyword'] = TakeoutHtmlParser.remove_special_char(content)
                    elif content.startswith('Answered'):
                        dic_my_activity_maps['type'] = 'Answer'
                        dic_my_activity_maps['keyword'] = TakeoutHtmlParser.remove_special_char(content)
                    else:
                        dic_my_activity_maps['type'] = content
            else:
                if idx == 1:
                    if content.startswith('<a href="'):
                        idx2 = content.find('">')
                        keyword = content[idx2 + 2:content.find('</a>')]
                        dic_my_activity_maps['keyword'] = TakeoutHtmlParser.remove_special_char(keyword)
                        url = content[9:idx2]
                        url = unquote(url)
                        dic_my_activity_maps['keyword_url'] = TakeoutHtmlParser.remove_special_char(url)
                        o = urlparse(url)
                        if o.path.startswith('/maps/') and o.path.find('@') >= 1:
                            list_value = o.path.split('@')[1].split(',')
                            if list_value != []:
                                dic_my_activity_maps['keyword_latitude'] = list_value[0]
                                dic_my_activity_maps['keyword_longitude'] = list_value[1]
                        elif o.query.find('sll=') >= 1:
                            list_value = o.query.split('sll=', 1)[1].split(',')
                            if list_value != []:
                                dic_my_activity_maps['keyword_latitude'] = list_value[0]
                                dic_my_activity_maps['keyword_longitude'] = list_value[1].split('&')[0]
                else:
                    if content.endswith('UTC'):
                        dic_my_activity_maps['timestamp'] = TakeoutHtmlParser.convert_datetime_to_unixtime(content)
                    elif idx == 4 and dic_my_activity_maps['type'] == '1 notification':
                        dic_my_activity_maps['keyword'] = TakeoutHtmlParser.remove_special_char(content)
            idx += 1

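# Illustrative sketch, not part of the original parser: the '@lat,lng' extraction used in
# parse_maps_log_body, applied to a hypothetical Maps URL. The helper name and the sample
# URL are assumptions; only the standard library is used.
def _example_extract_maps_coordinates():
    from urllib.parse import urlparse
    url = 'https://www.google.com/maps/@37.4219983,-122.084,15z'
    o = urlparse(url)
    latitude = longitude = None
    if o.path.startswith('/maps/@'):
        # The path carries "lat,lng,zoom" after '/maps/@'; slice off the prefix and split on ','.
        list_value = o.path[len('/maps/@'):].split(',')
        if len(list_value) >= 2:
            latitude, longitude = list_value[0], list_value[1]
    # latitude == '37.4219983', longitude == '-122.084'
    return latitude, longitude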