# ---------- Example #1 ----------
def parseAppleData(appleDataDumpName):
    """Parse an unzipped Apple data dump into a summary dictionary.

    Walks ``./media/unzippedFiles/apple/<appleDataDumpName>`` and collects
    account/device info, Apple Music play activity, library tracks,
    app/book re-download history, and Game Center data using the
    module-level ``genericParser`` helpers (defined elsewhere in the
    project; their exact return shapes are not visible here).

    Missing input files are reported via ``print()`` and skipped. The
    result is always written to
    ``./media/processedData/apple/<appleDataDumpName>/parsedAppleData.json``,
    even when the root path does not exist (the dictionary stays empty).

    Args:
        appleDataDumpName: directory name of the unzipped Apple dump.

    Returns:
        None. Output is written to the JSON file above as a side effect.
    """
    # Define dictionary to map json data to
    Dict = {}

    # Parse through apple media root directory
    rootPathName = "./media/unzippedFiles/apple/" + appleDataDumpName

    if os.path.exists(rootPathName):

        # Get total size
        Dict["total_size_GB"] = genericParser.getDirSizeInGB(rootPathName)
        #TODO: double check, this number doesn't make sense

        # ---------- General Data ----------
        genDirPath = rootPathName + "/Apple ID account and device information"

        # ----- Account header: Apple ID, full name, address -----
        file_account_info = genDirPath + "/Apple ID Account Information.csv"
        if os.path.exists(file_account_info):

            fieldNames = ("Apple ID Number", "First Name", "Last Name",
                          "Official Address")
            data_account_info = genericParser.csvToDict(
                file_account_info, fieldNames)

            # Only the first CSV row is used — assumes the account file
            # has exactly one data row. TODO confirm against a real dump.
            apple_id = data_account_info[0]["Apple ID Number"]
            name = data_account_info[0][
                "First Name"] + " " + data_account_info[0]["Last Name"]
            address = data_account_info[0]["Official Address"]

            Dict["account_info_header"] = [apple_id, name, address]
        else:
            print(
                "/Apple ID account and device information/Apple ID Account Information.csv not found"
            )

        # ----- Registered devices list -----
        file_devices = genDirPath + "/Apple ID Device Information.csv"
        if os.path.exists(file_devices):

            fieldNames = ("Device Name", "Device Added Date",
                          "Device Serial Number", "Device Last Heartbeat IP")
            data_devices = genericParser.csvToDict(file_devices, fieldNames)

            Dict["devices_list"] = data_devices

        else:
            print(
                "/Apple ID account and device information/Apple ID Device Information.csv not found"
            )

        # ---------- Music Data ----------
        musicDirPath = rootPathName + "/Apple Media Services information"

        # NOTE(review): only the directory is checked — if either zip file
        # is missing inside it, ZipFile raises FileNotFoundError and the
        # whole parse aborts. Consider guarding each zip path.
        if os.path.exists(musicDirPath):
            zipPath = musicDirPath + "/Apple_Media_Services.zip"
            with zipfile.ZipFile(zipPath, "r") as zip_ref:
                zip_ref.extractall(musicDirPath)

            # Inner archive containing the library-tracks JSON.
            zipPath1 = musicDirPath + "/Apple_Media_Services/Apple Music Activity/Apple Music Library Tracks.json.zip"
            with zipfile.ZipFile(zipPath1, "r") as zip_ref1:
                zip_ref1.extractall(
                    musicDirPath +
                    "/Apple_Media_Services/Apple Music Activity")

        # ===== PLAY ACTIVITY =====
        file_play_activity = musicDirPath + "/Apple_Media_Services/Apple Music Activity/Apple Music Play Activity.csv"
        if os.path.exists(file_play_activity):

            fieldNames = ("Artist Name", "Content Name", "Client IP Address",
                          "End Reason Type", "Genre",
                          "Milliseconds Since Play",
                          "Play Duration Milliseconds",
                          "Event Received Timestamp")
            data_play_activity = genericParser.csvToDict(
                file_play_activity, fieldNames)

            # ----- Activity date range (min/max event timestamp) -----
            dates = genericParser.filterByField(data_play_activity,
                                                ("Event Received Timestamp", ))
            dateBounds = genericParser.getDateBounds(dates)
            Dict["activity_date_range"] = dateBounds

            # ----- Total listen time -----
            play_ms = genericParser.filterByField(
                data_play_activity,
                ("Milliseconds Since Play", "Play Duration Milliseconds"))
            msSincePlay = [s["Milliseconds Since Play"] for s in play_ms]
            msPlayDuration = [d["Play Duration Milliseconds"] for d in play_ms]

            # Coerce non-numeric durations to 0 and flip negatives.
            # NOTE(review): this assumes csvToDict yields numeric cells for
            # valid rows; if it returns strings, every duration becomes 0
            # — TODO confirm csvToDict's conversion behavior.
            for i in range(0, len(msPlayDuration)):
                if not isinstance(msPlayDuration[i], float) and not isinstance(
                        msPlayDuration[i], int):
                    msPlayDuration[i] = 0

                if msPlayDuration[i] < 0:
                    msPlayDuration[i] = -msPlayDuration[i]

            totalListenTime = sum(msPlayDuration)
            # convertMillis presumably returns (hours, minutes, seconds).
            totalListenTime = genericParser.convertMillis(totalListenTime)

            Dict["listen_time"] = {
                "hours": totalListenTime[0],
                "minutes": totalListenTime[1],
                "seconds": totalListenTime[2]
            }

            # ----- Genre frequencies and top ten -----
            genres = genericParser.filterByField(data_play_activity,
                                                 ("Genre", ))
            dictGenreFreq = genericParser.countFrequencies(genres, "Genre")
            dictTopGenres = genericParser.countTopTen(dictGenreFreq)

            dictGenreFreq = genericParser.formatDictionary(dictGenreFreq)
            dictTopGenres = genericParser.formatDictionary(dictTopGenres)

            Dict["genres_piechart"] = dictGenreFreq
            Dict["top_ten_genres_list"] = dictTopGenres

            # ----- Artist frequencies and top ten -----
            artists = genericParser.filterByField(data_play_activity,
                                                  ("Artist Name", ))
            dictArtistFreq = genericParser.countFrequencies(
                artists, "Artist Name")
            dictTopArtists = genericParser.countTopTen(dictArtistFreq)

            dictArtistFreq = genericParser.formatDictionary(dictArtistFreq)
            dictTopArtists = genericParser.formatDictionary(dictTopArtists)

            Dict["artists_barchart"] = dictArtistFreq
            Dict["top_ten_artists_list"] = dictTopArtists

            # ----- Track frequencies and top ten -----
            tracks = genericParser.filterByField(
                data_play_activity, ("Content Name", "Artist Name"))
            dictTrackFreq = genericParser.countFrequencies(
                tracks, "Content Name")

            # Map each track title to (one of) its artists. NOTE(review):
            # a title played by multiple artists keeps only the last one
            # encountered.
            dictTrackArtist = {}
            for d in tracks:
                dictTrackArtist[d["Content Name"]] = d["Artist Name"]

            #replace track name with track + artist name in key
            dictTrackArtistFreq = {}
            for track in dictTrackFreq:
                if track in dictTrackArtist:
                    dictTrackArtistFreq[
                        track + " - " +
                        dictTrackArtist[track]] = dictTrackFreq[track]

            dictTopTracks = genericParser.countTopTen(dictTrackArtistFreq)

            dictTrackArtistFreq = genericParser.formatDictionary(
                dictTrackArtistFreq)
            dictTopTracks = genericParser.formatDictionary(dictTopTracks)

            Dict["tracks_barchart"] = dictTrackArtistFreq
            Dict["top_ten_tracks_list"] = dictTopTracks

            # ----- Per-play client IPs (map visualization data) -----
            trackIP = genericParser.filterByField(
                data_play_activity,
                ("Content Name", "Artist Name", "Client IP Address"))
            Dict["play_activity_map"] = trackIP

            # ----- End-reason data (collected but not yet used) -----
            artist_song_endtype = genericParser.filterByField(
                data_play_activity,
                ("Artist Name", "Content Name", "End Reason Type",
                 "Milliseconds Since Play"))

            #TODO: infer actual favorite songs that had natural end of track rather than skip

        else:
            print(
                "/Apple_Media_Services/Apple Music Activity/Apple Music Play Activity.csv not found"
            )

        # ===== LIBRARY TRACKS ======
        # This JSON is produced by the inner-zip extraction above.
        file_library_tracks = musicDirPath + "/Apple_Media_Services/Apple Music Activity/Apple Music Library Tracks.json"
        if os.path.exists(file_library_tracks):

            fieldNames = ("Title", "Artist", "Album", "Album Artist", "Genre",
                          "Track Year", "Date Added To Library",
                          "Last Played Date", "Skip Count",
                          "Date of Last Skip", "Release Date")
            data_library_tracks = genericParser.jsonToDict(
                file_library_tracks, fieldNames)

            # ----- Library track count -----
            totalNumTracks = len(data_library_tracks)
            Dict["library_track_count"] = totalNumTracks

            # ----- Gantt-chart data (added / played / released dates) -----
            titleArtistDates = genericParser.filterByField(
                data_library_tracks,
                ("Title", "Artist", "Genre", "Date Added To Library",
                 "Last Played Date", "Release Date"))
            titleArtistDatesList = genericParser.formatGanttData(
                titleArtistDates)
            Dict["library_song_ganttchart"] = titleArtistDatesList

            # ----- Genre timeline -----
            genre_dates = genericParser.filterByField(
                data_library_tracks, ("Genre", "Last Played Date"))
            Dict["genre_timeline"] = genre_dates

        else:
            print(
                "/Apple_Media_Services/Apple Music Activity/Apple Music Library Tracks.json not found"
            )

        # ---------- Apps/Games Data ----------
        # NOTE(review): this history lives under musicDirPath
        # ("Apple Media Services information"), not the dump root —
        # looks intentional but worth confirming against a real dump.
        file_apps = musicDirPath + "/Update and Redownload History/iTunes and App-Book Re-download and Update History.csv"
        if os.path.exists(file_apps):
            # ----- Re-download/update history -----
            fieldNames = ("Activity Date", "Item Description",
                          "Device IP Address")
            data_apps = genericParser.csvToDict(file_apps, fieldNames)

            # ----- Timeline of app activity -----
            app_date = genericParser.filterByField(
                data_apps, ("Activity Date", "Item Description"))
            Dict["apps_timeline"] = app_date

            # ----- Geolocated app activity (IP -> coordinates) -----
            app_ip = genericParser.filterByField(
                data_apps, ("Device IP Address", "Item Description"))
            app_ip_coord = genericParser.insertCoordinatesFromIP(
                app_ip, "Device IP Address")
            Dict["apps_map"] = app_ip_coord

        else:
            print(
                "/Update and Redownload History/iTunes and App-Book Re-download and Update History.csv not found"
            )

        file_game_center = rootPathName + "/Game Center/Game Center Data.json"
        if os.path.exists(file_game_center):
            # ----- Game Center play history -----
            fieldNames = ("game_name", "last_played_utc")
            data_game_center = genericParser.jsonToDict(
                file_game_center, fieldNames)

            key_games = "games"
            val_games = data_game_center
            Dict[key_games] = val_games
        else:
            print("/Game Center/Game Center Data.json not found")

    else:
        print("given root path does not exist")

    #write parsed data dictionary to json file
    genericParser.writeToJsonFile(
        Dict, './media/processedData/apple/' + appleDataDumpName +
        '/parsedAppleData.json')
# ---------- Example #2 ----------
def parseGoogleData(googleDataDumpName):
    """Parse an unzipped Google Takeout dump into a summary dictionary.

    Walks ``./media/unzippedFiles/google/<googleDataDumpName>/Takeout`` and
    collects profile info, Chrome bookmarks, Maps saved places, YouTube
    playlists/subscriptions, and "My Activity" data (ads, maps, search,
    YouTube) using ``genericParser`` helpers plus BeautifulSoup-based HTML
    scraping. The scraping relies on Takeout's exported HTML structure
    (``content-cell`` divs, sibling caption cells), which Google may change
    over time.

    Missing input files are reported via ``print()`` and skipped. The
    result is always written to
    ``./media/processedData/google/<googleDataDumpName>/parsedGoogleData.json``,
    even when the root path does not exist (the dictionary stays empty).

    Args:
        googleDataDumpName: directory name of the unzipped Takeout dump.

    Returns:
        None. Output is written to the JSON file above as a side effect.
    """
    # Define dictionary to map json data to
    Dict = {}

    # Parse through apple media root directory
    rootPathName = "./media/unzippedFiles/google/" + googleDataDumpName + "/Takeout"

    if os.path.exists(rootPathName):
        # Get total size
        Dict["total_size_GB"] = genericParser.getDirSizeInGB(rootPathName)
        #TODO: double check, this number doesn't make sense

        # ---------- Profile Data ----------
        profileDirPath = rootPathName + "/Profile"
        file_profile = profileDirPath + "/Profile.json"
        if os.path.exists(file_profile):

            data_profile = genericParser.jsonToDict(file_profile, ())

            # Header shown in the UI: display name + primary email.
            val_profile = {}
            val_profile["name"] = data_profile["displayName"]
            val_profile["email"] = data_profile["emails"][0]["value"]

            Dict["profile_info_header"] = val_profile

        else:
            print("/Profile/Profile.json not found")

        # ---------- Bookmarks Data ----------
        bookmarksDirPath = rootPathName + "/Chrome"
        file_bookmarks = bookmarksDirPath + "/Bookmarks.html"
        if os.path.exists(file_bookmarks):

            # Count non-empty text lines inside the first <dl> element as
            # a rough bookmark count.
            bookmarks = genericParser.htmlToSoup(file_bookmarks, "dl", "")
            val_bookmarks = list(filter(None, bookmarks[0].text.split("\n")))

            Dict["bookmarks_count"] = len(val_bookmarks)

        else:
            # NOTE(review): message says "/Bookmarks/..." but the path
            # checked is "/Chrome/Bookmarks.html" — misleading log text.
            print("/Bookmarks/Bookmarks.html not found")

        # ---------- Maps Data ----------
        mapsDirPath = rootPathName + "/Maps (your places)"
        file_saved_places = mapsDirPath + "/Saved Places.json"
        if os.path.exists(file_saved_places):

            # GeoJSON-style structure: features[].properties.{Title,Location}
            data_saved_places = genericParser.jsonToDict(file_saved_places, ())
            data_saved_places = data_saved_places["features"]

            # Keep only places that have both an address and coordinates.
            val_saved_places = []
            for data_pt in data_saved_places:
                place = []

                name = data_pt["properties"]["Title"]
                locations = data_pt["properties"]["Location"]

                if "Geo Coordinates" in locations.keys(
                ) and "Address" in locations.keys():
                    address = locations["Address"]
                    coords = locations["Geo Coordinates"]

                    place.append(name)
                    place.append(address)
                    place.append(coords)

                    val_saved_places.append(place)

            Dict["saved_places_map"] = val_saved_places

        else:
            print("/Maps (your places)/Saved Places.json not found")

        # ---------- YouTube Data ----------
        # Takeout has used two different export directory names over time;
        # try "YouTube and YouTube Music" first, then the older "YouTube".
        file_playlists = rootPathName + "/YouTube and YouTube Music/playlists/all-playlists.json"
        file_playlists2 = rootPathName + "/YouTube/playlists/all-playlists.json"
        if os.path.exists(file_playlists):
            data_playlists = genericParser.jsonToDict(file_playlists, ())

            # ----- Raw playlists data -----
            Dict["youtube_playlists"] = data_playlists

            # ----- Playlist count -----
            Dict["youtube_playlists_count"] = len(data_playlists)

        elif os.path.exists(file_playlists2):
            data_playlists = genericParser.jsonToDict(file_playlists2, ())

            # ----- Raw playlists data -----
            Dict["youtube_playlists"] = data_playlists

            # ----- Playlist count -----
            Dict["youtube_playlists_count"] = len(data_playlists)

        else:
            print("/playlists/all-playlists.json not found")

        file_subscriptions = rootPathName + "/YouTube and YouTube Music/subscriptions/subscriptions.json"
        file_subscriptions2 = rootPathName + "/YouTube/subscriptions/subscriptions.json"
        if os.path.exists(file_subscriptions):
            data_subscriptions = genericParser.jsonToDict(
                file_subscriptions, ())

            # ----- Raw subscriptions data -----
            Dict["youtube_subscriptions"] = data_subscriptions

            # ----- Subscription count -----
            Dict["youtube_subscriptions_count"] = len(data_subscriptions)

        elif os.path.exists(file_subscriptions2):
            data_subscriptions = genericParser.jsonToDict(
                file_subscriptions2, ())

            # ----- Raw subscriptions data -----
            Dict["youtube_subscriptions"] = data_subscriptions

            # ----- Subscription count -----
            Dict["youtube_subscriptions_count"] = len(data_subscriptions)

        else:
            print("/subscriptions/subscriptions.json not found")

        # ---------- Activity Data ----------
        activityDirPath = rootPathName + "/My Activity"

        # ----- Ads activity -----
        file_ads = activityDirPath + "/Ads/MyActivity.html"
        if os.path.exists(file_ads):

            # list of (link, date) tuples
            ads = genericParser.htmlToSoup(
                file_ads, "div",
                "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1")

            # Cells with exactly 4 children are assumed to be
            # <text><a><br><date> — TODO confirm against Takeout HTML.
            val_ads = []
            for ad in ads:
                if len(ad.contents) == 4:
                    link = ad.contents[1]['href']
                    date = ad.contents[3]

                    val_ads.append((link, date))

            # ----- Ad count -----
            Dict["ads_count"] = len(val_ads)

            # ----- Raw ad list -----
            Dict["ads_list"] = val_ads

            # ----- Waffle-chart data: ads per day -----
            ad_waffle_data = []
            ad_values = {}

            # Dates look like "Month D, YYYY, ..."; the trailing comma is
            # stripped from day and year, then rebuilt as YYYY-MM-DD using
            # genericParser.monthToNum.
            for ad in val_ads:
                date = ad[1].split(' ')

                month = date[0]
                day = date[1][:-1]
                year = date[2][:-1]

                # Zero-pad single-digit days for ISO-style sorting.
                day_str = day
                if int(day) < 10:
                    day_str = "0" + str(day)

                date = year + "-" + genericParser.monthToNum[
                    month] + "-" + day_str
                if date in ad_values:
                    ad_values[date] += 1
                else:
                    ad_values[date] = 1

            for item in ad_values:
                ad_waffle_data.append({"day": item, "value": ad_values[item]})

            Dict["ads_waffle"] = ad_waffle_data

        else:
            print("/My Activity/Ads/MyActivity.html not found")

        # ----- Maps activity (views / searches / calls / directions) -----
        file_maps = activityDirPath + "/Maps/MyActivity.html"
        if os.path.exists(file_maps):

            # dict of lists of (link, date) tuples
            val_maps = {}

            views = []
            searches = []
            calls = []
            directions = []

            maps = genericParser.htmlToSoup(
                file_maps, "div",
                "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1")

            # The slice offsets below (link[4][1:], [:2], [1:-4], ...)
            # encode the structure of Google Maps URLs at export time —
            # fragile; NOTE(review): bare except clauses swallow ALL
            # errors, not just parse failures.
            for item in maps:

                if len(item.contents) == 4:
                    if "Viewed" in str(item.contents[0]):
                        try:
                            # URL path segment 4 is "@lat,lng,zoom"; drop
                            # the "@" and keep lat/lng.
                            link = item.contents[1]['href']
                            link = link.split('/')
                            coords = link[4][1:]
                            coords = coords.split(',')[:2]

                            date = item.contents[3]

                            views.append(("Viewed", coords, date))

                        except:
                            print("Beautiful Soup can't parse this")

                    elif "Searched" in str(item.contents[0]):
                        try:
                            link = item.contents[1]['href']
                            link = link.split('/')

                            date = item.contents[3]

                            # Only "/search/<query>/@lat,lng,..." URLs.
                            if link[4] == "search":
                                query = link[5]
                                coords = link[6][1:-4].split(',')

                                searches.append(
                                    ("Searched", query, coords, date))
                        except:
                            print("Beautiful Soup can't parse this")

                    elif "Called" in str(item.contents[0]):
                        try:
                            # Business name is a query parameter in the
                            # call link.
                            link = item.contents[1]['href']
                            link = link.split('/')
                            name = link[3].split('=')[1].split('&')[0]

                            date = item.contents[3]

                            calls.append(("Called", name, date))
                        except:
                            print("Beautiful Soup can't parse this")

                elif len(item.contents) == 8:
                    # 8-child cells are direction entries:
                    # link, origin, destination, date at odd indices.
                    try:
                        link = item.contents[1]['href']

                        origin = item.contents[3]
                        dest = item.contents[5]
                        date = item.contents[7]

                        directions.append(
                            ("Directions", link, origin, dest, date))
                    except:
                        print("Beautiful Soup can't parse this")

            val_maps["views"] = views
            val_maps["searches"] = searches

            Dict["maps_activity"] = val_maps

            # ----- Route count -----
            Dict["maps_routes_count"] = len(directions)

            # ----- Calls made via Maps -----
            Dict["maps_call_list"] = calls

        else:
            print("/My Activity/Maps/MyActivity.html not found")

        # ----- Google Search activity -----
        file_search = activityDirPath + "/Search/MyActivity.html"

        if os.path.exists(file_search):

            # dict of lists of (link, date) tuples
            val_searches = {}

            views = []
            visits = []
            searches = []

            engineSearches = genericParser.htmlToSoup(
                file_search, "div",
                "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1")

            # Each entry may carry a sibling "caption" cell with location
            # info; coords are sliced out of the caption's map link. The
            # fixed [1:21] window assumes a ~20-char "lat,lng" substring —
            # TODO confirm; NOTE(review): unlike the Maps section, these
            # lookups are unguarded and a malformed cell raises.
            for item in engineSearches:

                if len(item.contents) == 4:

                    if "Viewed" in str(item.contents[0]):
                        view = item.contents[1].string
                        link = item.contents[1]['href']
                        date = item.contents[3]

                        caption = item.next_sibling.next_sibling.contents
                        if len(caption) > 4 and "Locations" in str(caption[4]):
                            coordLink = caption[7]['href']

                            if "center" in str(coordLink):
                                coords = coordLink.split("center")[1][1:21]
                            else:
                                coords = coordLink.split("=")[2][1:21]

                            views.append((view, link, date, coords))

                        else:
                            views.append((view, link, date))

                    elif "Visited" in str(item.contents[0]):
                        visit = item.contents[1].string
                        link = item.contents[1]['href']
                        date = item.contents[3]

                        caption = item.next_sibling.next_sibling.contents
                        if len(caption) > 4 and "Locations" in str(caption[4]):
                            coordLink = caption[7]['href']

                            if "center" in str(coordLink):
                                coords = coordLink.split("center")[1][1:21]
                            else:
                                coords = coordLink.split("=")[2][1:21]

                            visits.append((visit, link, date, coords))

                        else:
                            visits.append((visit, link, date))

                    elif "Searched" in str(item.contents[0]):
                        search = item.contents[1].string
                        link = item.contents[1]['href']
                        date = item.contents[3]

                        caption = item.next_sibling.next_sibling.contents
                        if len(caption) > 4 and "Locations" in str(caption[4]):
                            coordLink = caption[7]['href']

                            if "center" in str(coordLink):
                                coords = coordLink.split("center")[1][1:21]
                            else:
                                coords = coordLink.split("=")[2][1:21]

                            searches.append((search, link, date, coords))

                        else:
                            searches.append((search, link, date))

            # ----- Search count -----
            # NOTE(review): views/visits are collected above but never
            # stored in Dict.
            Dict["search_count"] = len(searches)

            # ----- Waffle-chart data: searches per day -----
            google_search_waffle_data = []
            google_search_values = {}

            # Same "Month D, YYYY" -> "YYYY-MM-DD" rebuild as the Ads
            # section.
            for search in searches:
                date = search[2].split(' ')

                month = date[0]
                day = date[1][:-1]
                year = date[2][:-1]

                day_str = day
                if int(day) < 10:
                    day_str = "0" + str(day)

                date = year + "-" + genericParser.monthToNum[
                    month] + "-" + day_str
                #print("gg_search_date: " + date)
                if date in google_search_values:
                    google_search_values[date] += 1
                else:
                    google_search_values[date] = 1

            for item in google_search_values:
                google_search_waffle_data.append({
                    "day":
                    item,
                    "value":
                    google_search_values[item]
                })

            Dict["search_waffle"] = google_search_waffle_data

        else:
            print("/My Activity/Search/MyActivity.html not found")

        # ----- YouTube activity (watches and searches) -----
        file_youtube = activityDirPath + "/YouTube/MyActivity.html"
        if os.path.exists(file_youtube):

            watches = []
            searches = []

            youtubeActions = genericParser.htmlToSoup(
                file_youtube, "div",
                "content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1")

            # 6-child cells are "Watched <video> by <uploader> on <date>";
            # 4-child cells are "Searched for <query> on <date>".
            for item in youtubeActions:

                if len(item.contents) == 6 and "Watched" in str(
                        item.contents[0]):
                    video = item.contents[1].string
                    link = item.contents[1]['href']
                    uploader = item.contents[3].string
                    date = item.contents[5]

                    watches.append((video, link, uploader, date))

                elif len(item.contents) == 4 and "Searched" in str(
                        item.contents[0]):
                    query = item.contents[1].string
                    link = item.contents[1]['href']
                    date = item.contents[3]

                    searches.append((query, link, date))

            # ----- Top-ten channels pie chart (by watch count) -----
            channels = {}
            for video in watches:
                if video[2] in channels:
                    channels[video[2]] += 1
                else:
                    channels[video[2]] = 1

            sorted_chan = sorted(channels.items(),
                                 reverse=True,
                                 key=operator.itemgetter(1))

            pie_format_chan = []

            # Keep at most the 10 most-watched channels.
            count = 0
            for item in sorted_chan:
                if count == 10:
                    break
                else:
                    pie_format_chan.append({
                        "id": count,
                        "label": item[0],
                        "value": item[1]
                    })
                    count += 1

            Dict["youtube_piechart"] = pie_format_chan

            # ----- Waffle-chart data: YouTube searches per day -----
            youtube_search_waffle_data = []
            youtube_search_values = {}

            # Same "Month D, YYYY" -> "YYYY-MM-DD" rebuild as above.
            for search in searches:
                date = search[2].split(' ')

                month = date[0]
                day = date[1][:-1]
                year = date[2][:-1]

                day_str = day
                if int(day) < 10:
                    day_str = "0" + str(day)

                date = year + "-" + genericParser.monthToNum[
                    month] + "-" + day_str
                #print("yt_search_date: " + date)
                if date in youtube_search_values:
                    youtube_search_values[date] += 1
                else:
                    youtube_search_values[date] = 1

            for item in youtube_search_values:
                youtube_search_waffle_data.append({
                    "day":
                    item,
                    "value":
                    youtube_search_values[item]
                })

            Dict["youtube_search_waffle"] = youtube_search_waffle_data

        else:
            print("/My Activity/Youtube/MyActivity.html not found")

        # ---------- Contacts Data ----------
        # Paths are built but parsing is not implemented yet.
        contactsDirPath = rootPathName + "/Contacts"
        file_contacts = contactsDirPath + "/All Contacts/All Contacts.vcf"

        # ----- (contacts parsing TODO) -----

    else:
        print("Google data dump path does not exist")

    #write parsed data dictionary to json file
    genericParser.writeToJsonFile(
        Dict, './media/processedData/google/' + googleDataDumpName +
        '/parsedGoogleData.json')
# ---------- Example #3 ----------
def parseFacebookData(facebookDataDumpName): 
    """Parse an unzipped Facebook data dump into a flat dictionary and
    write it to media/processedData/facebook/<dumpName>/parsedFacebookData.json.

    Only the category sub-directories listed in rootCategoriesOfInterest are
    inspected; each contributes a handful of keys to the output dictionary.
    Missing paths / unknown categories are reported via print() and skipped.
    """
    # Define dictionary to map json data to
    Dict = {}

    # Define tuple to store root directory names of interest
    rootCategoriesOfInterest = ("about_you", "ads_and_businesses", "apps_and_websites", 
                                "friends", "likes_and_reactions", "other_activity", "posts", 
                                "profile_information", "security_and_login_information")

    # Parse through facebook media root directory
    #TODO: uncomment first if running through django and second if through python
    rootPathName = "./media/unzippedFiles/facebook/" + facebookDataDumpName
    #rootPathName = "../media/unzippedFiles/facebook/" + facebookDataDumpName

    if os.path.exists(rootPathName):

        # Get total size
        Dict["total_size_GB"] = genericParser.getDirSizeInGB(rootPathName)

        # Extract json data: walk only the immediate children of the dump root
        for root, dirs, files in genericParser.walklevel(rootPathName, level=1):
            # from https://stackoverflow.com/a/7253830
            categoryDirName = root.rsplit('/', 1)[-1]

            # if category is valid, add data of interest to dictionary.
            # NOTE(review): the substring test (`categoryDirName in category`)
            # also accepts partial names; kept as-is to preserve behavior.
            dirPath = rootPathName + "/" + categoryDirName
            if any(categoryDirName in category for category in rootCategoriesOfInterest) and os.path.exists(dirPath):
                if categoryDirName == "about_you":
                    # ----- US 6.1 -----
                    file_peer_group = "friend_peer_group.json"
                    data_peer_group = genericParser.jsonToDict(dirPath + "/" + file_peer_group, ())

                    # user friend group category
                    Dict["friend_peer_group"] = data_peer_group["friend_peer_group"]

                elif categoryDirName == "ads_and_businesses":
                    # ----- US 6.8 -----
                    file_off_facebook_activity = "your_off-facebook_activity.json"
                    data_off_facebook_activity = genericParser.jsonToDict(dirPath + "/" + file_off_facebook_activity, ())

                    # overall json superset of off facebook activity
                    Dict["off_facebook_activity"] = data_off_facebook_activity["off_facebook_activity"]

                    # count of off facebook business with data
                    Dict["num_businesses_off_facebook"] = len(data_off_facebook_activity["off_facebook_activity"])

                    # list of off facebook businesses with data
                    # from https://stackoverflow.com/a/56163468
                    list_businesses_off_facebook = [item.get("name") for item in data_off_facebook_activity["off_facebook_activity"]]
                    Dict["businesses_off_facebook"] = list_businesses_off_facebook

                    # ----- US 6.9 -----
                    file_advs = "advertisers_who_uploaded_a_contact_list_with_your_information.json"
                    data_advs = genericParser.jsonToDict(dirPath + "/" + file_advs, ())

                    # list of advertisers with your contact info
                    # (key is the file name with the ".json" suffix stripped)
                    key_advs = file_advs[:-5]
                    val_advs = data_advs["custom_audiences"]
                    Dict[key_advs] = val_advs

                    # count of advertisers with your contact info
                    Dict["num_advertisers"] = len(val_advs)

                elif categoryDirName == "apps_and_websites":
                    # ----- US 6.5 & 6.6 -----
                    file_apps_websites = "apps_and_websites.json"
                    data_apps_websites = genericParser.jsonToDict(dirPath + "/" + file_apps_websites, ())

                    # count of apps/websites that you used facebook to login
                    Dict["num_apps_and_websites_logged_into_with_facebook"] = len(data_apps_websites["installed_apps"])

                    # list of apps/websites that you used facebook to login 
                    Dict["apps_and_websites_logged_into_with_facebook"] = data_apps_websites["installed_apps"]

                elif categoryDirName == "friends":
                    # ----- US 6.10 -----
                    file_friends = "friends.json"
                    data_friends = genericParser.jsonToDict(dirPath + "/" + file_friends, ())

                    # count of facebook friends
                    Dict["num_friends"] = len(data_friends["friends"])

                    # list of facebook friends (names only)
                    list_friends = [item.get("name") for item in data_friends["friends"]]
                    Dict["friends"] = list_friends

                elif categoryDirName == "likes_and_reactions":
                    # ----- US 6.4 -----
                    file_reactions = "posts_and_comments.json"
                    data_reactions = genericParser.jsonToDict(dirPath + "/" + file_reactions, ())

                    # overall json superset of reactions
                    Dict["reactions"] = data_reactions["reactions"]

                elif categoryDirName == "other_activity":
                    # ----- US 6.10 -----
                    file_pokes = "pokes.json"
                    data_pokes = genericParser.jsonToDict(dirPath + "/" + file_pokes, ())

                    # Fix: the original tested "activity_log_data" twice and
                    # assigned an unused `pokes` path variable; test once and
                    # derive both the count and the payload from it.
                    if "activity_log_data" in data_pokes["pokes"]:
                        val_pokes = data_pokes["pokes"]["activity_log_data"]
                        # count of pokes
                        Dict["num_pokes"] = len(val_pokes)
                        # overall json superset of pokes
                        Dict["pokes"] = val_pokes
                    else:
                        Dict["num_pokes"] = 0
                        Dict["pokes"] = 'no pokes'

                elif categoryDirName == "posts":
                    # ----- US 6.3 -----
                    file_others_posts = "other_people's_posts_to_your_timeline.json"
                    data_others_posts = genericParser.jsonToDict(dirPath + "/" + file_others_posts, ())

                    # overall json superset of others posts
                    # (key is the file name with the ".json" suffix stripped)
                    key_others_posts = file_others_posts[:-5]
                    val_others_posts = data_others_posts["wall_posts_sent_to_you"]["activity_log_data"]
                    Dict[key_others_posts] = val_others_posts

                    # ----- US 6.3 -----
                    file_your_posts = "your_posts_1.json"
                    data_your_posts = genericParser.jsonToDict(dirPath + "/" + file_your_posts, ())

                    # overall json superset of your posts
                    Dict["your_posts"] = data_your_posts

                elif categoryDirName == "profile_information":
                    # ----- US 6.1 -----
                    file_profile_info = "profile_information.json"
                    data_profile_info = genericParser.jsonToDict(dirPath + "/" + file_profile_info, ())

                    # overall json superset of your profile info
                    Dict["profile_information"] = data_profile_info["profile"]

                    # your name
                    Dict["name"] = data_profile_info["profile"]["name"]["full_name"]

                    # ----- US 6.10 -----
                    file_profile_update_history = "profile_update_history.json"
                    data_profile_update_history = genericParser.jsonToDict(dirPath + "/" + file_profile_update_history, ())

                    # overall json superset of your profile update history
                    Dict["profile_update_history"] = data_profile_update_history["profile_updates"]

                elif categoryDirName == "security_and_login_information":
                    # ----- US 6.2 -----
                    file_logins_logouts = "logins_and_logouts.json"
                    data_logins_logouts = genericParser.jsonToDict(dirPath + "/" + file_logins_logouts, ())

                    # overall json superset of login and logouts
                    Dict["logins_and_logouts"] = data_logins_logouts["account_accesses"]

                else:
                    print("category not found")

            else:
                print("path not interesting")

    else:
        print("root path not found")

    #write parsed data dictionary to json file
    genericParser.writeToJsonFile(Dict, 'media/processedData/facebook/' + facebookDataDumpName + '/parsedFacebookData.json')
# 예제 #4 ("Example #4" — scraped-snippet separator; commented out so the file parses)
# 0
def analyzeFacebookData(facebookUserFileName):
    """Turn the parsed Facebook JSON into chart-ready structures and write
    them to media/processedData/facebook/<name>/analyzedFacebookData.json.

    Every section is guarded by a key-presence check, so a partially parsed
    dump still produces whatever charts its data supports.
    """
    data = genericParser.getParsedJson("media/processedData/facebook/" + facebookUserFileName + "/parsedFacebookData.json")
    #data = genericParser.getParsedJson("../media/processedData/facebook/" + facebookUserFileName + "/parsedFacebookData.json")

    Dict = {}

    # ----- US 6.1 -----
    if "name" in data and "friend_peer_group" in data:
        Dict["name_category_header"] = [data["name"], data["friend_peer_group"]]

    # ----- US 6.2 -----
    if "logins_and_logouts" in data:
        Dict["locations_piechart"] = data["logins_and_logouts"]

        # ip address bar chart: count accesses per IP
        # (dict.get-accumulation; first-seen order is preserved)
        locationsMap = {}
        for action in data["logins_and_logouts"]:
            ip = action["ip_address"]
            locationsMap[ip] = locationsMap.get(ip, 0) + 1

        Dict["locations_barchart"] = [
            {"id": idx, "label": loc, "value": amt}
            for idx, (loc, amt) in enumerate(locationsMap.items())
        ]

        # ip address map: resolve each access to a timestamp and location
        List = []
        for action in data["logins_and_logouts"]:
            List.append({
                "action": action["action"],
                "timestamp": genericParser.getDatetimeFromEpoch(action["timestamp"]),
                "ip_address": action["ip_address"],
                "location": genericParser.getLocation(action["ip_address"]),
            })

        Dict["login_logout_map"] = List

    # ----- US 6.3 -----
    if "your_posts" in data and "other_people's_posts_to_your_timeline" in data:
        Dict["posts_linegraph"] = [data["your_posts"], data["other_people's_posts_to_your_timeline"]]

        # posts pie chart: your posts vs. posts by others on your timeline
        Dict["posts_piechart"] = [
            {"id": 1, "label": "Your Posts", "value": len(data["your_posts"])},
            {"id": 2, "label": "Friend's Posts", "value": len(data["other_people's_posts_to_your_timeline"])},
        ]

    # ----- US 6.4 -----
    if "reactions" in data:
        Dict["reactions_pictograph"] = data["reactions"]

        # count occurrences of each reaction type
        reactionsMap = {}
        for action in data["reactions"]:
            reaction = action["data"][0]["reaction"]["reaction"]
            reactionsMap[reaction] = reactionsMap.get(reaction, 0) + 1

        Dict["reactions_barchart"] = [
            {"id": idx, "label": reaction, "value": amt}
            for idx, (reaction, amt) in enumerate(reactionsMap.items())
        ]

    # ----- US 6.5 -----
    if "num_apps_and_websites_logged_into_with_facebook" in data:
        Dict["websites_count"] = data["num_apps_and_websites_logged_into_with_facebook"]

    # ----- US 6.6/6.7 -----
    if "apps_and_websites_logged_into_with_facebook" in data:
        Dict["websites_list"] = data["apps_and_websites_logged_into_with_facebook"]

    # ----- US 6.8 -----
    if "num_businesses_off_facebook" in data:
        Dict["off_facebook_activity_count"] = data["num_businesses_off_facebook"]

    if "businesses_off_facebook" in data:
        Dict["off_facebook_activity_list"] = data["businesses_off_facebook"]

    # ----- US 6.9 -----
    if "advertisers_who_uploaded_a_contact_list_with_your_information" in data:
        Dict["advertisers_list"] = data["advertisers_who_uploaded_a_contact_list_with_your_information"]

    if "num_advertisers" in data:
        Dict["advertisers_count"] = data["num_advertisers"]

    # ----- US 6.10 -----
    if "friends" in data and "num_friends" in data and "pokes" in data and "num_pokes" in data and "profile_update_history" in data:
        Dict["master_linegraph"] = [data["friends"], data["num_friends"], data["pokes"], data["num_pokes"], data["profile_update_history"]]

    #write analyzed data dictionary to json file
    genericParser.writeToJsonFile(Dict, "media/processedData/facebook/" + facebookUserFileName + "/analyzedFacebookData.json")
# 예제 #5 ("Example #5" — scraped-snippet separator; commented out so the file parses)
# 0
def parseNetflixData(netflixDataDumpName):

    rootPathName = "./media/unzippedFiles/netflix/" + netflixDataDumpName

    if os.path.exists(rootPathName):

        netflixData = genericParser.csvToDict(rootPathName, ("Title", "Date"))
        #print("Uploading netflix data")
        #print(netflixData)
        #print("End of raw Netflix data")

        shows = {}
        movies = []

        for item in netflixData:
            if isShow(item["Title"]):
                title = getShowTitle(item["Title"])
                if title in shows:
                    shows[title]["count"] = shows[title]["count"] + 1
                    shows[title]["firstDate"] = earlierDate(parseDate(item["Date"]), shows[title]["firstDate"])
                    shows[title]["lastDate"] = laterDate(parseDate(item["Date"]), shows[title]["lastDate"])
                else:
                    shows[title] = {"count": 1, "firstDate": parseDate(item["Date"]), "lastDate": parseDate(item["Date"])}
                    
                    
            else:
                movies.append({"title": item["Title"], "date": item["Date"]})

        # for item in netflixData:

        #pp = pprint.PrettyPrinter(indent=4)
        #print("Shows")
        #pp.pprint(shows)
        # print("Movies")
        # print(movies)

        analyzedData = {}

        totalWatchCount = len(netflixData)

        # print(totalWatchCount)

        # format for gantt chart
        showsGantt = []
        count = 1
        for show in shows:
            formattedShow = []
            formattedShow.append(str(count))
            formattedShow.append(show)
            formattedShow.append(str(shows[show]["count"]) + " episodes watched")
            formattedShow.append(shows[show]["firstDate"])
            formattedShow.append(shows[show]["lastDate"])
            formattedShow.append("null")
            formattedShow.append(0)
            formattedShow.append("null")
            showsGantt.append(formattedShow)
            count += 1
        #pp.pprint(showsGantt)

        # all shows list
        allShows = []
        count = 0
        for show in shows:
            allShows.append({"id": count, "label": show, "value": shows[show]["count"]})
            count += 1

        # all movies list
        allMovies = []
        count = 0
        for movie in movies:
            allMovies.append({"id": count, "label": movie["title"], "value": movie["date"]})
            count += 1

        # find top 10 shows
        topTenShows = []

        for show in shows:
            heapq.heappush(topTenShows, (shows[show]["count"], show))
        
        topTenShows = heapq.nlargest(10, topTenShows)

        # print("Top 10 shows")
        # print(topTenShows)

        pieChartTopTenShows = []
        count = 0
        for show in topTenShows:
            pieChartTopTenShows.append({"id": count, "label": show[1], "value": show[0]})
            count += 1


        analyzedData["shows_ganttchart"] = showsGantt
        analyzedData["shows_piechart"] = pieChartTopTenShows
        analyzedData["totalCount"] = totalWatchCount
        analyzedData["movies"] = allMovies
        analyzedData["shows"] = allShows
        #print(analyzedData)

        genericParser.writeToJsonFile(analyzedData, "media/processedData/netflix/" + netflixDataDumpName)