def _applePlayDurationMillis(playRows):
    """Return the summed "Play Duration Milliseconds" of playRows.

    Values that are not int/float contribute 0; negative values contribute
    their absolute value.
    """
    total = 0
    for row in playRows:
        duration = row["Play Duration Milliseconds"]
        if isinstance(duration, (int, float)):
            total += abs(duration)
    return total


def _applePlayActivitySummary(data_play_activity):
    """Build the play-activity entries of the parsed Apple dictionary.

    data_play_activity is the value returned by genericParser.csvToDict for
    "Apple Music Play Activity.csv". The returned dict holds the date range,
    listen time, genre/artist/track frequencies and the play-activity map,
    inserted in that order.
    """
    summary = {}

    # ----- activity date range -----
    dates = genericParser.filterByField(data_play_activity,
                                        ("Event Received Timestamp", ))
    summary["activity_date_range"] = genericParser.getDateBounds(dates)

    # ----- total listen time -----
    play_ms = genericParser.filterByField(
        data_play_activity,
        ("Milliseconds Since Play", "Play Duration Milliseconds"))
    totalListenTime = genericParser.convertMillis(
        _applePlayDurationMillis(play_ms))
    summary["listen_time"] = {
        "hours": totalListenTime[0],
        "minutes": totalListenTime[1],
        "seconds": totalListenTime[2]
    }

    # ----- genres -----
    genres = genericParser.filterByField(data_play_activity, ("Genre", ))
    dictGenreFreq = genericParser.countFrequencies(genres, "Genre")
    # top ten is taken from the raw frequencies, before formatting
    dictTopGenres = genericParser.countTopTen(dictGenreFreq)
    summary["genres_piechart"] = genericParser.formatDictionary(dictGenreFreq)
    summary["top_ten_genres_list"] = genericParser.formatDictionary(
        dictTopGenres)

    # ----- artists -----
    artists = genericParser.filterByField(data_play_activity,
                                          ("Artist Name", ))
    dictArtistFreq = genericParser.countFrequencies(artists, "Artist Name")
    dictTopArtists = genericParser.countTopTen(dictArtistFreq)
    summary["artists_barchart"] = genericParser.formatDictionary(
        dictArtistFreq)
    summary["top_ten_artists_list"] = genericParser.formatDictionary(
        dictTopArtists)

    # ----- tracks -----
    tracks = genericParser.filterByField(data_play_activity,
                                         ("Content Name", "Artist Name"))
    dictTrackFreq = genericParser.countFrequencies(tracks, "Content Name")
    # last artist seen for a track name wins
    dictTrackArtist = {d["Content Name"]: d["Artist Name"] for d in tracks}
    # replace track name with track + artist name in key
    dictTrackArtistFreq = {
        track + " - " + dictTrackArtist[track]: freq
        for track, freq in dictTrackFreq.items()
        if track in dictTrackArtist
    }
    dictTopTracks = genericParser.countTopTen(dictTrackArtistFreq)
    summary["tracks_barchart"] = genericParser.formatDictionary(
        dictTrackArtistFreq)
    summary["top_ten_tracks_list"] = genericParser.formatDictionary(
        dictTopTracks)

    # ----- play activity map -----
    summary["play_activity_map"] = genericParser.filterByField(
        data_play_activity,
        ("Content Name", "Artist Name", "Client IP Address"))

    #TODO: infer actual favorite songs that had natural end of track rather than skip
    # (fields of interest: "Artist Name", "Content Name", "End Reason Type",
    # "Milliseconds Since Play")

    return summary


def parseAppleData(appleDataDumpName):
    """Parse an unzipped Apple data dump and write the result as JSON.

    Looks under ./media/unzippedFiles/apple/<appleDataDumpName> for the
    account/device CSVs, the Apple Media Services archive (extracted in
    place), the app re-download history and the Game Center JSON. Every
    input that is missing is reported with print() and skipped. Whatever
    was collected (an empty dict if the root path is missing) is written
    with genericParser.writeToJsonFile to
    ./media/processedData/apple/<appleDataDumpName>/parsedAppleData.json.

    Args:
        appleDataDumpName: directory name of the dump under the apple
            media folders.
    """
    # Define dictionary to map json data to
    Dict = {}

    # Parse through apple media root directory
    rootPathName = "./media/unzippedFiles/apple/" + appleDataDumpName

    if os.path.exists(rootPathName):
        # Get total size
        Dict["total_size_GB"] = genericParser.getDirSizeInGB(rootPathName)
        #TODO: double check, this number doesn't make sense

        # ---------- General Data ----------
        genDirPath = rootPathName + "/Apple ID account and device information"

        # ----- account info -----
        file_account_info = genDirPath + "/Apple ID Account Information.csv"
        if os.path.exists(file_account_info):
            fieldNames = ("Apple ID Number", "First Name", "Last Name",
                          "Official Address")
            data_account_info = genericParser.csvToDict(
                file_account_info, fieldNames)
            # an existing but empty CSV must not abort the whole parse
            if len(data_account_info) > 0:
                account = data_account_info[0]
                apple_id = account["Apple ID Number"]
                name = account["First Name"] + " " + account["Last Name"]
                address = account["Official Address"]
                Dict["account_info_header"] = [apple_id, name, address]
            else:
                print(
                    "/Apple ID account and device information/Apple ID Account Information.csv has no rows"
                )
        else:
            print(
                "/Apple ID account and device information/Apple ID Account Information.csv not found"
            )

        # ----- devices -----
        file_devices = genDirPath + "/Apple ID Device Information.csv"
        if os.path.exists(file_devices):
            fieldNames = ("Device Name", "Device Added Date",
                          "Device Serial Number", "Device Last Heartbeat IP")
            Dict["devices_list"] = genericParser.csvToDict(
                file_devices, fieldNames)
        else:
            print(
                "/Apple ID account and device information/Apple ID Device Information.csv not found"
            )

        # ---------- Music Data ----------
        musicDirPath = rootPathName + "/Apple Media Services information"
        musicActivityDir = musicDirPath + "/Apple_Media_Services/Apple Music Activity"

        if os.path.exists(musicDirPath):
            # Extract the outer archive, then the nested library-tracks
            # archive. Each is guarded like every other input so a missing
            # zip is reported instead of raising FileNotFoundError.
            zipPath = musicDirPath + "/Apple_Media_Services.zip"
            if os.path.exists(zipPath):
                with zipfile.ZipFile(zipPath, "r") as zip_ref:
                    zip_ref.extractall(musicDirPath)
            else:
                print(
                    "/Apple Media Services information/Apple_Media_Services.zip not found"
                )

            zipPath1 = musicActivityDir + "/Apple Music Library Tracks.json.zip"
            if os.path.exists(zipPath1):
                with zipfile.ZipFile(zipPath1, "r") as zip_ref1:
                    zip_ref1.extractall(musicActivityDir)
            else:
                print(
                    "/Apple_Media_Services/Apple Music Activity/Apple Music Library Tracks.json.zip not found"
                )

            # ===== PLAY ACTIVITY =====
            file_play_activity = musicActivityDir + "/Apple Music Play Activity.csv"
            if os.path.exists(file_play_activity):
                fieldNames = ("Artist Name", "Content Name",
                              "Client IP Address", "End Reason Type", "Genre",
                              "Milliseconds Since Play",
                              "Play Duration Milliseconds",
                              "Event Received Timestamp")
                data_play_activity = genericParser.csvToDict(
                    file_play_activity, fieldNames)
                Dict.update(_applePlayActivitySummary(data_play_activity))
            else:
                print(
                    "/Apple_Media_Services/Apple Music Activity/Apple Music Play Activity.csv not found"
                )

            # ===== LIBRARY TRACKS ======
            file_library_tracks = musicActivityDir + "/Apple Music Library Tracks.json"
            if os.path.exists(file_library_tracks):
                fieldNames = ("Title", "Artist", "Album", "Album Artist",
                              "Genre", "Track Year", "Date Added To Library",
                              "Last Played Date", "Skip Count",
                              "Date of Last Skip", "Release Date")
                data_library_tracks = genericParser.jsonToDict(
                    file_library_tracks, fieldNames)

                # ----- track count -----
                Dict["library_track_count"] = len(data_library_tracks)

                # ----- gantt chart -----
                titleArtistDates = genericParser.filterByField(
                    data_library_tracks,
                    ("Title", "Artist", "Genre", "Date Added To Library",
                     "Last Played Date", "Release Date"))
                Dict["library_song_ganttchart"] = genericParser.formatGanttData(
                    titleArtistDates)

                # ----- genre timeline -----
                Dict["genre_timeline"] = genericParser.filterByField(
                    data_library_tracks, ("Genre", "Last Played Date"))
            else:
                print(
                    "/Apple_Media_Services/Apple Music Activity/Apple Music Library Tracks.json not found"
                )

        # ---------- Apps/Games Data ----------
        file_apps = musicDirPath + "/Update and Redownload History/iTunes and App-Book Re-download and Update History.csv"
        if os.path.exists(file_apps):
            fieldNames = ("Activity Date", "Item Description",
                          "Device IP Address")
            data_apps = genericParser.csvToDict(file_apps, fieldNames)

            # ----- timeline -----
            Dict["apps_timeline"] = genericParser.filterByField(
                data_apps, ("Activity Date", "Item Description"))

            # ----- map -----
            app_ip = genericParser.filterByField(
                data_apps, ("Device IP Address", "Item Description"))
            Dict["apps_map"] = genericParser.insertCoordinatesFromIP(
                app_ip, "Device IP Address")
        else:
            print(
                "/Update and Redownload History/iTunes and App-Book Re-download and Update History.csv not found"
            )

        file_game_center = rootPathName + "/Game Center/Game Center Data.json"
        if os.path.exists(file_game_center):
            fieldNames = ("game_name", "last_played_utc")
            Dict["games"] = genericParser.jsonToDict(file_game_center,
                                                     fieldNames)
        else:
            print("/Game Center/Game Center Data.json not found")

    else:
        print("given root path does not exist")

    #write parsed data dictionary to json file
    genericParser.writeToJsonFile(
        Dict, './media/processedData/apple/' + appleDataDumpName +
        '/parsedAppleData.json')
def _googleDayKey(dateText):
    """Convert an activity date string to a "YYYY-MM-DD" day key.

    The string is split on single spaces; the first token is looked up in
    genericParser.monthToNum and the last character of the second and third
    tokens is dropped (presumably the trailing comma of a date such as
    "Mar 3, 2020, ..." -- confirm against a real Takeout export).
    """
    parts = dateText.split(' ')
    month = parts[0]
    day = parts[1][:-1]
    year = parts[2][:-1]
    # zero-pad single-digit days
    day_str = str(int(day)).zfill(2)
    return year + "-" + genericParser.monthToNum[month] + "-" + day_str


def _googleCountPerDay(entries, dateIndex):
    """Count entries per day key, returning [{"day": ..., "value": ...}].

    entries is a sequence of tuples whose element at dateIndex is a date
    string accepted by _googleDayKey. Days keep first-seen order.
    """
    counts = {}
    for entry in entries:
        day = _googleDayKey(entry[dateIndex])
        counts[day] = counts.get(day, 0) + 1
    return [{"day": day, "value": value} for day, value in counts.items()]


def _googleSearchRecord(item):
    """Build (text, link, date[, coords]) for one Search activity cell.

    coords is appended only when the cell two siblings after item has a
    "Locations" entry at index 4; it is a 20-character slice cut out of the
    href found at index 7 of that sibling.
    """
    anchor = item.contents[1]
    record = (anchor.string, anchor['href'], item.contents[3])
    caption = item.next_sibling.next_sibling.contents
    if len(caption) > 4 and "Locations" in str(caption[4]):
        coordLink = caption[7]['href']
        if "center" in str(coordLink):
            coords = coordLink.split("center")[1][1:21]
        else:
            coords = coordLink.split("=")[2][1:21]
        record += (coords, )
    return record


def _googleMapsRecord(item):
    """Classify one Maps activity cell.

    Returns (bucket, record) with bucket one of "views", "searches",
    "calls" or "directions", or None when the cell matches no known shape.
    Raises whatever the underlying indexing raises on a malformed cell; the
    caller reports and skips those.
    """
    contents = item.contents
    if len(contents) == 4:
        action = str(contents[0])
        if "Viewed" in action:
            link = contents[1]['href'].split('/')
            coords = link[4][1:].split(',')[:2]
            return "views", ("Viewed", coords, contents[3])
        if "Searched" in action:
            link = contents[1]['href'].split('/')
            if link[4] == "search":
                query = link[5]
                coords = link[6][1:-4].split(',')
                return "searches", ("Searched", query, coords, contents[3])
            return None
        if "Called" in action:
            link = contents[1]['href'].split('/')
            name = link[3].split('=')[1].split('&')[0]
            return "calls", ("Called", name, contents[3])
    elif len(contents) == 8:
        return "directions", ("Directions", contents[1]['href'], contents[3],
                              contents[5], contents[7])
    return None


def _googleLoadFirstExisting(candidatePaths):
    """Return genericParser.jsonToDict of the first existing path, else None."""
    for candidate in candidatePaths:
        if os.path.exists(candidate):
            return genericParser.jsonToDict(candidate, ())
    return None


def parseGoogleData(googleDataDumpName):
    """Parse an unzipped Google Takeout dump and write the result as JSON.

    Looks under ./media/unzippedFiles/google/<googleDataDumpName>/Takeout
    for profile, Chrome bookmarks, saved places, YouTube playlists and
    subscriptions, and the Ads/Maps/Search/YouTube "My Activity" pages.
    Every missing input is reported with print() and skipped. The collected
    dictionary is written with genericParser.writeToJsonFile to
    ./media/processedData/google/<googleDataDumpName>/parsedGoogleData.json.

    Args:
        googleDataDumpName: directory name of the dump under the google
            media folders.
    """
    # Define dictionary to map json data to
    Dict = {}

    # CSS class string passed to htmlToSoup for every "My Activity" page
    activityCellClass = "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1"

    # Parse through google media root directory
    rootPathName = "./media/unzippedFiles/google/" + googleDataDumpName + "/Takeout"

    if os.path.exists(rootPathName):
        # Get total size
        Dict["total_size_GB"] = genericParser.getDirSizeInGB(rootPathName)
        #TODO: double check, this number doesn't make sense

        # ---------- Profile Data ----------
        file_profile = rootPathName + "/Profile" + "/Profile.json"
        if os.path.exists(file_profile):
            data_profile = genericParser.jsonToDict(file_profile, ())
            Dict["profile_info_header"] = {
                "name": data_profile["displayName"],
                "email": data_profile["emails"][0]["value"]
            }
        else:
            print("/Profile/Profile.json not found")

        # ---------- Bookmarks Data ----------
        file_bookmarks = rootPathName + "/Chrome" + "/Bookmarks.html"
        if os.path.exists(file_bookmarks):
            bookmarks = genericParser.htmlToSoup(file_bookmarks, "dl", "")
            # one non-empty text line per bookmark entry
            val_bookmarks = list(filter(None, bookmarks[0].text.split("\n")))
            Dict["bookmarks_count"] = len(val_bookmarks)
        else:
            # message now names the path that is actually checked
            print("/Chrome/Bookmarks.html not found")

        # ---------- Maps Data ----------
        file_saved_places = rootPathName + "/Maps (your places)" + "/Saved Places.json"
        if os.path.exists(file_saved_places):
            data_saved_places = genericParser.jsonToDict(file_saved_places, ())
            val_saved_places = []
            for data_pt in data_saved_places["features"]:
                name = data_pt["properties"]["Title"]
                locations = data_pt["properties"]["Location"]
                # only places with both coordinates and an address are kept
                if "Geo Coordinates" in locations and "Address" in locations:
                    val_saved_places.append([
                        name, locations["Address"],
                        locations["Geo Coordinates"]
                    ])
            Dict["saved_places_map"] = val_saved_places
        else:
            print("/Maps (your places)/Saved Places.json not found")

        # ---------- YouTube Data ----------
        # Two folder names are tried for each file, in this order.
        data_playlists = _googleLoadFirstExisting(
            (rootPathName + "/YouTube and YouTube Music/playlists/all-playlists.json",
             rootPathName + "/YouTube/playlists/all-playlists.json"))
        if data_playlists is not None:
            Dict["youtube_playlists"] = data_playlists
            Dict["youtube_playlists_count"] = len(data_playlists)
        else:
            print("/playlists/all-playlists.json not found")

        data_subscriptions = _googleLoadFirstExisting(
            (rootPathName + "/YouTube and YouTube Music/subscriptions/subscriptions.json",
             rootPathName + "/YouTube/subscriptions/subscriptions.json"))
        if data_subscriptions is not None:
            Dict["youtube_subscriptions"] = data_subscriptions
            Dict["youtube_subscriptions_count"] = len(data_subscriptions)
        else:
            print("/subscriptions/subscriptions.json not found")

        # ---------- Activity Data ----------
        activityDirPath = rootPathName + "/My Activity"

        # ----- ads -----
        file_ads = activityDirPath + "/Ads/MyActivity.html"
        if os.path.exists(file_ads):
            ads = genericParser.htmlToSoup(file_ads, "div", activityCellClass)
            # list of (link, date) tuples
            val_ads = [(ad.contents[1]['href'], ad.contents[3]) for ad in ads
                       if len(ad.contents) == 4]
            Dict["ads_count"] = len(val_ads)
            Dict["ads_list"] = val_ads
            Dict["ads_waffle"] = _googleCountPerDay(val_ads, 1)
        else:
            print("/My Activity/Ads/MyActivity.html not found")

        # ----- maps -----
        file_maps = activityDirPath + "/Maps/MyActivity.html"
        if os.path.exists(file_maps):
            mapsBuckets = {
                "views": [],
                "searches": [],
                "calls": [],
                "directions": []
            }
            maps = genericParser.htmlToSoup(file_maps, "div",
                                            activityCellClass)
            for item in maps:
                # Best-effort: a malformed cell is reported and skipped.
                # Exception (not a bare except) so Ctrl-C / SystemExit
                # still propagate.
                try:
                    parsed = _googleMapsRecord(item)
                except Exception:
                    print("Beautiful Soup can't parse this")
                    continue
                if parsed is not None:
                    bucket, record = parsed
                    mapsBuckets[bucket].append(record)

            Dict["maps_activity"] = {
                "views": mapsBuckets["views"],
                "searches": mapsBuckets["searches"]
            }
            Dict["maps_routes_count"] = len(mapsBuckets["directions"])
            Dict["maps_call_list"] = mapsBuckets["calls"]
        else:
            print("/My Activity/Maps/MyActivity.html not found")

        # ----- search -----
        file_search = activityDirPath + "/Search/MyActivity.html"
        if os.path.exists(file_search):
            searchLabels = ("Viewed", "Visited", "Searched")
            searchBuckets = {label: [] for label in searchLabels}
            engineSearches = genericParser.htmlToSoup(file_search, "div",
                                                      activityCellClass)
            for item in engineSearches:
                if len(item.contents) != 4:
                    continue
                action = str(item.contents[0])
                for label in searchLabels:
                    if label in action:
                        searchBuckets[label].append(_googleSearchRecord(item))
                        break

            # NOTE: only the "Searched" entries feed the output at present;
            # "Viewed" and "Visited" records are parsed but not exported.
            searches = searchBuckets["Searched"]
            Dict["search_count"] = len(searches)
            Dict["search_waffle"] = _googleCountPerDay(searches, 2)
        else:
            print("/My Activity/Search/MyActivity.html not found")

        # ----- youtube -----
        file_youtube = activityDirPath + "/YouTube/MyActivity.html"
        if os.path.exists(file_youtube):
            watches = []
            searches = []
            youtubeActions = genericParser.htmlToSoup(file_youtube, "div",
                                                      activityCellClass)
            for item in youtubeActions:
                contents = item.contents
                if len(contents) == 6 and "Watched" in str(contents[0]):
                    # (video, link, uploader, date)
                    watches.append((contents[1].string, contents[1]['href'],
                                    contents[3].string, contents[5]))
                elif len(contents) == 4 and "Searched" in str(contents[0]):
                    # (query, link, date)
                    searches.append((contents[1].string, contents[1]['href'],
                                     contents[3]))

            # ----- ten most watched channels -----
            channels = {}
            for video in watches:
                channels[video[2]] = channels.get(video[2], 0) + 1
            sorted_chan = sorted(channels.items(),
                                 reverse=True,
                                 key=operator.itemgetter(1))
            Dict["youtube_piechart"] = [{
                "id": rank,
                "label": channel,
                "value": views
            } for rank, (channel, views) in enumerate(sorted_chan[:10])]

            # ----- searches per day -----
            Dict["youtube_search_waffle"] = _googleCountPerDay(searches, 2)
        else:
            # message now names the path that is actually checked
            print("/My Activity/YouTube/MyActivity.html not found")

        # ---------- Contacts Data ----------
        # Not parsed yet; the export keeps contacts in
        # <root>/Contacts/All Contacts/All Contacts.vcf

    else:
        print("Google data dump path does not exist")

    #write parsed data dictionary to json file
    genericParser.writeToJsonFile(
        Dict, './media/processedData/google/' + googleDataDumpName +
        '/parsedGoogleData.json')
def _loadFacebookJson(dirPath, fileName):
    """Return the parsed JSON of dirPath/fileName, or None if it is absent.

    A missing file is reported with print() so that one absent file does
    not abort the parse of the remaining categories.
    """
    filePath = dirPath + "/" + fileName
    if not os.path.exists(filePath):
        print("/" + os.path.basename(dirPath) + "/" + fileName + " not found")
        return None
    return genericParser.jsonToDict(filePath, ())


def parseFacebookData(facebookDataDumpName):
    """Parse an unzipped Facebook data dump and write the result as JSON.

    Walks the first level of ./media/unzippedFiles/facebook/<name> via
    genericParser.walklevel and, for each directory whose name is one of
    the categories of interest, copies selected fields of that category's
    JSON files into the result. Other paths print "path not interesting";
    missing JSON files are reported and skipped. The result is written with
    genericParser.writeToJsonFile to
    media/processedData/facebook/<name>/parsedFacebookData.json.

    Args:
        facebookDataDumpName: directory name of the dump under the facebook
            media folders.
    """
    # Define dictionary to map json data to
    Dict = {}

    # Define tuple to store root directory names of interest
    rootCategoriesOfInterest = ("about_you", "ads_and_businesses",
                                "apps_and_websites", "friends",
                                "likes_and_reactions", "other_activity",
                                "posts", "profile_information",
                                "security_and_login_information")

    # Parse through facebook media root directory
    #TODO: uncomment first if running through django and second if through python
    rootPathName = "./media/unzippedFiles/facebook/" + facebookDataDumpName
    #rootPathName = "../media/unzippedFiles/facebook/" + facebookDataDumpName

    if os.path.exists(rootPathName):
        # Get total size
        Dict["total_size_GB"] = genericParser.getDirSizeInGB(rootPathName)

        # Extract json data
        for root, dirs, files in genericParser.walklevel(rootPathName, level=1):
            # basename handles either path separator
            categoryDirName = os.path.basename(root)
            dirPath = rootPathName + "/" + categoryDirName

            # Exact membership: a substring test against each category name
            # would also accept fragments such as "friend" or "".
            if categoryDirName not in rootCategoriesOfInterest or not os.path.exists(dirPath):
                print("path not interesting")
                continue

            if categoryDirName == "about_you":
                # ----- US 6.1 -----
                data_peer_group = _loadFacebookJson(dirPath, "friend_peer_group.json")
                if data_peer_group is not None:
                    # user friend group category
                    Dict["friend_peer_group"] = data_peer_group["friend_peer_group"]

            elif categoryDirName == "ads_and_businesses":
                # ----- US 6.8 -----
                data_off_facebook_activity = _loadFacebookJson(dirPath, "your_off-facebook_activity.json")
                if data_off_facebook_activity is not None:
                    offFacebook = data_off_facebook_activity["off_facebook_activity"]
                    # overall json superset of off facebook activity
                    Dict["off_facebook_activity"] = offFacebook
                    # count of off facebook business with data
                    Dict["num_businesses_off_facebook"] = len(offFacebook)
                    # list of off facebook businesses with data
                    Dict["businesses_off_facebook"] = [item.get("name") for item in offFacebook]

                # ----- US 6.9 -----
                file_advs = "advertisers_who_uploaded_a_contact_list_with_your_information.json"
                data_advs = _loadFacebookJson(dirPath, file_advs)
                if data_advs is not None:
                    # list of advertisers with your contact info,
                    # keyed by the file name without ".json"
                    val_advs = data_advs["custom_audiences"]
                    Dict[file_advs[:-5]] = val_advs
                    # count of advertisers with your contact info
                    Dict["num_advertisers"] = len(val_advs)

            elif categoryDirName == "apps_and_websites":
                # ----- US 6.5 & 6.6 -----
                data_apps_websites = _loadFacebookJson(dirPath, "apps_and_websites.json")
                if data_apps_websites is not None:
                    installedApps = data_apps_websites["installed_apps"]
                    # count of apps/websites that you used facebook to login
                    Dict["num_apps_and_websites_logged_into_with_facebook"] = len(installedApps)
                    # list of apps/websites that you used facebook to login
                    Dict["apps_and_websites_logged_into_with_facebook"] = installedApps

            elif categoryDirName == "friends":
                # ----- US 6.10 -----
                data_friends = _loadFacebookJson(dirPath, "friends.json")
                if data_friends is not None:
                    # count of facebook friends
                    Dict["num_friends"] = len(data_friends["friends"])
                    # list of facebook friends
                    Dict["friends"] = [item.get("name") for item in data_friends["friends"]]

            elif categoryDirName == "likes_and_reactions":
                # ----- US 6.4 -----
                data_reactions = _loadFacebookJson(dirPath, "posts_and_comments.json")
                if data_reactions is not None:
                    # overall json superset of reactions
                    Dict["reactions"] = data_reactions["reactions"]

            elif categoryDirName == "other_activity":
                # ----- US 6.10 -----
                data_pokes = _loadFacebookJson(dirPath, "pokes.json")
                if data_pokes is not None:
                    pokesSection = data_pokes["pokes"]
                    if "activity_log_data" in pokesSection:
                        val_pokes = pokesSection["activity_log_data"]
                        val_ct_pokes = len(val_pokes)
                    else:
                        val_pokes = 'no pokes'
                        val_ct_pokes = 0
                    # count of pokes
                    Dict["num_pokes"] = val_ct_pokes
                    # overall json superset of pokes
                    Dict["pokes"] = val_pokes

            elif categoryDirName == "posts":
                # ----- US 6.3 -----
                file_others_posts = "other_people's_posts_to_your_timeline.json"
                data_others_posts = _loadFacebookJson(dirPath, file_others_posts)
                if data_others_posts is not None:
                    # overall json superset of others posts,
                    # keyed by the file name without ".json"
                    Dict[file_others_posts[:-5]] = data_others_posts["wall_posts_sent_to_you"]["activity_log_data"]

                # ----- US 6.3 -----
                data_your_posts = _loadFacebookJson(dirPath, "your_posts_1.json")
                if data_your_posts is not None:
                    # overall json superset of your posts
                    Dict["your_posts"] = data_your_posts

            elif categoryDirName == "profile_information":
                # ----- US 6.1 -----
                data_profile_info = _loadFacebookJson(dirPath, "profile_information.json")
                if data_profile_info is not None:
                    # overall json superset of your profile info
                    Dict["profile_information"] = data_profile_info["profile"]
                    # your name
                    Dict["name"] = data_profile_info["profile"]["name"]["full_name"]

                # ----- US 6.10 -----
                data_profile_update_history = _loadFacebookJson(dirPath, "profile_update_history.json")
                if data_profile_update_history is not None:
                    # overall json superset of your profile update history
                    Dict["profile_update_history"] = data_profile_update_history["profile_updates"]

            elif categoryDirName == "security_and_login_information":
                # ----- US 6.2 -----
                data_logins_logouts = _loadFacebookJson(dirPath, "logins_and_logouts.json")
                if data_logins_logouts is not None:
                    # overall json superset of login and logouts
                    Dict["logins_and_logouts"] = data_logins_logouts["account_accesses"]

            else:
                # only reachable if a category is added to the tuple above
                # without a matching branch here
                print("category not found")

    else:
        print("root path not found")

    #write parsed data dictionary to json file
    genericParser.writeToJsonFile(Dict, 'media/processedData/facebook/' + facebookDataDumpName + '/parsedFacebookData.json')
def analyzeFacebookData(facebookUserFileName):
    """Turn parsed Facebook data into chart-ready structures and write them.

    Reads media/processedData/facebook/<name>/parsedFacebookData.json via
    genericParser.getParsedJson, derives one output entry per user story
    whose input keys are present, and writes the result with
    genericParser.writeToJsonFile to
    media/processedData/facebook/<name>/analyzedFacebookData.json.

    Args:
        facebookUserFileName: directory name of the user's processed data.
    """
    data = genericParser.getParsedJson("media/processedData/facebook/" + facebookUserFileName + "/parsedFacebookData.json")
    #data = genericParser.getParsedJson("../media/processedData/facebook/" + facebookUserFileName + "/parsedFacebookData.json")

    Dict = {}

    # ----- US 6.1 -----
    if "name" in data and "friend_peer_group" in data:
        Dict["name_category_header"] = [data["name"], data["friend_peer_group"]]

    # ----- US 6.2 -----
    if "logins_and_logouts" in data:
        accesses = data["logins_and_logouts"]
        Dict["locations_piechart"] = accesses

        # ip address bar chart: number of events per ip, first-seen order
        locationsMap = {}
        for action in accesses:
            ip = action["ip_address"]
            locationsMap[ip] = locationsMap.get(ip, 0) + 1
        Dict["locations_barchart"] = [
            {"id": idx, "label": loc, "value": amt}
            for idx, (loc, amt) in enumerate(locationsMap.items())
        ]

        # ip address map
        # Each distinct ip is resolved once; the same address usually
        # appears in many events.
        locationByIp = {}
        loginLogoutList = []
        for action in accesses:
            ip = action["ip_address"]
            if ip not in locationByIp:
                locationByIp[ip] = genericParser.getLocation(ip)
            loginLogoutList.append({
                "action": action["action"],
                "timestamp": genericParser.getDatetimeFromEpoch(action["timestamp"]),
                "ip_address": ip,
                "location": locationByIp[ip],
            })
        Dict["login_logout_map"] = loginLogoutList

    # ----- US 6.3 -----
    if "your_posts" in data and "other_people's_posts_to_your_timeline" in data:
        yourPosts = data["your_posts"]
        othersPosts = data["other_people's_posts_to_your_timeline"]
        Dict["posts_linegraph"] = [yourPosts, othersPosts]

        # posts pie chart
        Dict["posts_piechart"] = [
            {"id": 1, "label": "Your Posts", "value": len(yourPosts)},
            {"id": 2, "label": "Friend's Posts", "value": len(othersPosts)},
        ]

    # ----- US 6.4 -----
    if "reactions" in data:
        Dict["reactions_pictograph"] = data["reactions"]

        # number of reactions per reaction type, first-seen order
        reactionsMap = {}
        for action in data["reactions"]:
            reaction = action["data"][0]["reaction"]["reaction"]
            reactionsMap[reaction] = reactionsMap.get(reaction, 0) + 1
        Dict["reactions_barchart"] = [
            {"id": idx, "label": reaction, "value": amt}
            for idx, (reaction, amt) in enumerate(reactionsMap.items())
        ]

    # ----- US 6.5 -----
    if "num_apps_and_websites_logged_into_with_facebook" in data:
        Dict["websites_count"] = data["num_apps_and_websites_logged_into_with_facebook"]

    # ----- US 6.6/6.7 -----
    if "apps_and_websites_logged_into_with_facebook" in data:
        Dict["websites_list"] = data["apps_and_websites_logged_into_with_facebook"]

    # ----- US 6.8 -----
    if "num_businesses_off_facebook" in data:
        Dict["off_facebook_activity_count"] = data["num_businesses_off_facebook"]

    if "businesses_off_facebook" in data:
        Dict["off_facebook_activity_list"] = data["businesses_off_facebook"]

    # ----- US 6.9 -----
    if "advertisers_who_uploaded_a_contact_list_with_your_information" in data:
        Dict["advertisers_list"] = data["advertisers_who_uploaded_a_contact_list_with_your_information"]

    if "num_advertisers" in data:
        Dict["advertisers_count"] = data["num_advertisers"]

    # ----- US 6.10 -----
    masterKeys = ("friends", "num_friends", "pokes", "num_pokes", "profile_update_history")
    if all(key in data for key in masterKeys):
        Dict["master_linegraph"] = [data[key] for key in masterKeys]

    #write analyzed data dictionary to json file
    genericParser.writeToJsonFile(Dict, "media/processedData/facebook/" + facebookUserFileName + "/analyzedFacebookData.json")
def parseNetflixData(netflixDataDumpName):
    """Summarise a Netflix viewing-history CSV and write the result as JSON.

    Reads ./media/unzippedFiles/netflix/<netflixDataDumpName> with
    genericParser.csvToDict (columns "Title" and "Date"). Rows for which
    isShow() is true are grouped by getShowTitle() with an episode count
    and first/last watch date; all other rows are listed as movies. The
    output holds gantt rows and a top-ten pie chart for shows, the total
    row count, and the full show and movie lists. It is written with
    genericParser.writeToJsonFile to
    media/processedData/netflix/<netflixDataDumpName>.

    A missing input path is reported with print() and nothing is written.

    Args:
        netflixDataDumpName: file name of the history under the netflix
            media folders.
    """
    rootPathName = "./media/unzippedFiles/netflix/" + netflixDataDumpName

    if not os.path.exists(rootPathName):
        print("Netflix data dump path does not exist")
        return

    netflixData = genericParser.csvToDict(rootPathName, ("Title", "Date"))

    # show title -> {"count", "firstDate", "lastDate"}
    shows = {}
    movies = []
    for item in netflixData:
        if not isShow(item["Title"]):
            movies.append({"title": item["Title"], "date": item["Date"]})
            continue

        title = getShowTitle(item["Title"])
        watchedOn = parseDate(item["Date"])
        if title in shows:
            entry = shows[title]
            entry["count"] += 1
            entry["firstDate"] = earlierDate(watchedOn, entry["firstDate"])
            entry["lastDate"] = laterDate(watchedOn, entry["lastDate"])
        else:
            shows[title] = {"count": 1, "firstDate": watchedOn, "lastDate": watchedOn}

    # format for gantt chart; row ids are 1-based strings and the three
    # trailing cells are fixed placeholders
    showsGantt = [
        [
            str(rowId),
            title,
            str(info["count"]) + " episodes watched",
            info["firstDate"],
            info["lastDate"],
            "null",
            0,
            "null",
        ]
        for rowId, (title, info) in enumerate(shows.items(), start=1)
    ]

    # all shows list
    allShows = [
        {"id": idx, "label": title, "value": info["count"]}
        for idx, (title, info) in enumerate(shows.items())
    ]

    # all movies list
    allMovies = [
        {"id": idx, "label": movie["title"], "value": movie["date"]}
        for idx, movie in enumerate(movies)
    ]

    # find top 10 shows; nlargest accepts any iterable, so no heap is
    # built first. Pairs sort by count, then by title.
    topTenShows = heapq.nlargest(
        10, ((info["count"], title) for title, info in shows.items()))
    pieChartTopTenShows = [
        {"id": idx, "label": title, "value": watchCount}
        for idx, (watchCount, title) in enumerate(topTenShows)
    ]

    analyzedData = {}
    analyzedData["shows_ganttchart"] = showsGantt
    analyzedData["shows_piechart"] = pieChartTopTenShows
    analyzedData["totalCount"] = len(netflixData)
    analyzedData["movies"] = allMovies
    analyzedData["shows"] = allShows

    genericParser.writeToJsonFile(analyzedData, "media/processedData/netflix/" + netflixDataDumpName)