def test_add_time_related_columns(self):
    """Check which columns Utility.add_time_related_columns adds to a dataframe.

    A one-row dataframe holding the timestamp '2020-01-01' is parsed and
    enriched using the prefix 'pref_' and the suffix '_suff'. Only the
    resulting shape and the ordered column labels are compared with the
    expected frame; the cell values themselves are not asserted.
    """
    df = pd.DataFrame(pd.Series('2020-01-01'), columns=['Timestamp'])
    parsed_timestamps = Utility.parse_date_time_column(df, 'Timestamp')
    Utility.add_time_related_columns(
        df,
        parsed_timestamps,
        col_name_prefix='pref_',
        col_name_suffix='_suff',
    )

    # Reference frame: the original column followed by the six added ones.
    expected_output = pd.DataFrame.from_dict({
        'Timestamp': ['2020-01-01'],
        'pref_date time_suff': ['2020-01-01'],
        'pref_Year_suff': [2020],
        'pref_Month_suff': [1],
        'pref_DOM_suff': [1],
        'pref_DOW_suff': ['Wednesday'],
        'pref_HOD_suff': [0],
    })

    self.assertEqual(df.shape, expected_output.shape)
    self.assertEqual(df.columns.tolist(), expected_output.columns.tolist())
Пример #2
0
    def parse_library_activity_df(library_activity_df):
        '''
            Parse the library activity dataframe and return an enriched copy of it
            (the dataframe passed in is copied first and is not modified).

            Two groups of columns are added:
            - time columns prefixed with 'Transaction ', derived from the 'Transaction Date'
              column through Utility.parse_date_time_column and Utility.add_time_related_columns
            - 'Transaction Agent': the part of 'UserAgent' before the first '/', where
              'itunescloudd' is relabelled 'iPhone' and 'iTunes' is relabelled 'Macintosh'
            - 'Transaction Agent Model': for 'iPhone' rows, the fourth '/'-separated part of
              'UserAgent' cut at its first comma; for 'Macintosh' rows, the literal 'Macintosh';
              left missing for every other row

        '''
        parsed_df = library_activity_df.copy()

        # time related columns
        transaction_datetimes = Utility.parse_date_time_column(parsed_df, 'Transaction Date')
        Utility.add_time_related_columns(parsed_df, transaction_datetimes, col_name_prefix='Transaction ')

        # agent: leading token of the user agent string, mapped to a device family
        agent_aliases = {'itunescloudd' : 'iPhone', 'iTunes' : 'Macintosh'}
        parsed_df['Transaction Agent'] = parsed_df['UserAgent'].str.split('/').str.get(0)
        parsed_df.replace({'Transaction Agent' : agent_aliases}, inplace=True)

        # agent model: only iPhone rows carry a model inside the user agent string;
        # the assignment aligns on the index, so the other rows stay missing
        is_iphone = parsed_df['Transaction Agent'] == 'iPhone'
        iphone_user_agents = parsed_df.loc[is_iphone, 'UserAgent']
        model_token = iphone_user_agents.str.split('/').str.get(3)
        parsed_df['Transaction Agent Model'] = model_token.str.split(',').str.get(0)

        is_macintosh = parsed_df['Transaction Agent'] == 'Macintosh'
        parsed_df.loc[is_macintosh, 'Transaction Agent Model'] = 'Macintosh'

        return parsed_df
Пример #3
0
    def parse_play_activity_df(play_activity_df, convert_to_local_time = True, drop_columns=True):
        '''
            Method in charge of parsing the play activity dataframe. The input dataframe is copied first
            and is not modified; the parsed copy is returned. The parsing is performed in multiple steps:
            1. Rename the columns containing song title and artist ('Content Name' -> 'Title', 'Artist Name' -> 'Artist')
            2. Time columns: build 'Activity date time' from Event Start Timestamp, falling back to
            Event End Timestamp where the start timestamp is missing
            3. Time columns: add the 'Play '-prefixed time columns from that timestamp, with or without
            conversion to local time (args)
            4. Remove outlier rows (Apple Music service started in 2015, so we drop rows with a year before 2015)
            5. Add the partial vs complete listening information (Parser.set_partial_listening)
            6. Add a 'Track origin' column computed from 'Feature Name' (Parser.get_track_origin)
            7. Add the listening duration (Parser.compute_play_duration)
            8. Handle outliers of listening duration (Parser.remove_play_duration_outliers, called with a threshold of 90)
            9. Drop unused columns (args)

            Args:
                play_activity_df: dataframe of play activity; it must provide the columns read below
                    (among others 'Event Start Timestamp', 'Event End Timestamp', 'UTC Offset In Seconds',
                    'Play Duration Milliseconds', 'Media Duration In Milliseconds', 'End Reason Type', 'Feature Name')
                convert_to_local_time: when truthy, 'Activity date time' is passed through
                    Utility.convert_to_local_time together with 'UTC Offset In Seconds'
                drop_columns: when truthy, the columns listed in columns_to_drop are removed from the result
                    (columns that are absent are ignored)

            Returns:
                the parsed dataframe

        '''

        columns_to_drop = [
        'Apple Id Number', 'Apple Music Subscription', 'Build Version', 'Client IP Address',
        'Content Specific Type', 'Device Identifier', 'Event Reason Hint Type', 'Activity date time',
        'End Position In Milliseconds', 'Event Received Timestamp', 'Media Type', 'Metrics Bucket Id', 
        'Metrics Client Id','Original Title', 'Source Type', 'Start Position In Milliseconds',
        'Store Country Name', 'Milliseconds Since Play', 'Event End Timestamp', 'Event Start Timestamp',
        'UTC Offset In Seconds','Play Duration Milliseconds', 'Media Duration In Milliseconds', 'Feature Name'
        ]
        # Rename columns for merges later
        parsed_df = play_activity_df.copy()
        parsed_df.rename(columns={'Content Name':'Title', 'Artist Name':'Artist'}, inplace=True)
        
        # Add time related columns
        # The filled series is assigned back explicitly: calling fillna(inplace=True) on a column
        # selection is a chained assignment, which does not update the dataframe under pandas
        # Copy-on-Write and raises a warning on recent pandas versions.
        activity_datetime = pd.to_datetime(parsed_df['Event Start Timestamp'])
        parsed_df['Activity date time'] = activity_datetime.fillna(pd.to_datetime(parsed_df['Event End Timestamp']))
        # Truthiness test (not "is True") so that truthy flags such as numpy booleans are honoured,
        # consistently with the drop_columns check below
        if convert_to_local_time:
            parsed_df['Activity date time'] = Utility.convert_to_local_time(parsed_df['Activity date time'], parsed_df['UTC Offset In Seconds'])
        parsed_datetime_series = Utility.parse_date_time_column(parsed_df, 'Activity date time')
        Utility.add_time_related_columns(parsed_df, parsed_datetime_series, col_name_prefix='Play ')

        # We remove year outliers (Apple Music started in 2015, whatever is reported before is a mistake)
        parsed_df = parsed_df.drop(parsed_df[parsed_df['Play Year']< 2015].index)

        # Add partial listening column 
        # (the duration series are taken after the year filter so they line up with the remaining rows)
        play_duration = parsed_df['Play Duration Milliseconds']
        media_duration = parsed_df['Media Duration In Milliseconds']
        Parser.set_partial_listening(parsed_df, parsed_df['End Reason Type'], play_duration, media_duration)

        # Add track origin column
        parsed_df['Track origin'] = parsed_df['Feature Name'].apply(Parser.get_track_origin)

        # Add play duration column
        activity_start = pd.to_datetime(parsed_df['Event Start Timestamp'])
        activity_end = pd.to_datetime(parsed_df['Event End Timestamp'])
        # NOTE(review): 'Played completely' is presumably the column added by Parser.set_partial_listening above - confirm
        played_completely = parsed_df['Played completely']
        Parser.compute_play_duration(parsed_df, activity_start, activity_end, played_completely, play_duration, media_duration)

        # we remove outliers from this play duration column, saying that if a value is above 1h30
        # (the 90 passed below), we drop it, and replace it by the duration of the media
        # NOTE(review): 'Play duration in minutes' is presumably the column added by Parser.compute_play_duration - confirm
        Parser.remove_play_duration_outliers(parsed_df, parsed_df['Play duration in minutes'], media_duration, 90)

        #we can then remove the columns we do not need anymore!
        if drop_columns:
            parsed_df = parsed_df.drop(columns_to_drop, axis=1, errors='ignore')

        return parsed_df