def get_listening_activity_month() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Build the per-day listening activity messages for the 'month' stats range.

        The window ends at midnight of the day of the latest listen and starts at
        ``offset_months(<first day of that month>, 1)``.
        NOTE(review): presumably that is the first day of the previous month, i.e.
        offset_months shifts backwards by default -- confirm against its definition.

        Side effects visible here: a 'time_range' temp view is registered with one
        row per day of the window, and _get_listens is invoked for the same window.

        Returns:
            messages: whatever create_messages yields for stats_range='month'.
    """
    current_app.logger.debug("Calculating listening_activity_month")

    latest_listen = get_latest_listen_ts()
    # Drop the time-of-day part so that the window ends exactly at 00:00
    to_date = datetime(latest_listen.year, latest_listen.month, latest_listen.day)
    from_date = offset_months(replace_days(to_date, 1), 1)

    # One row per day in [from_date, to_date): display label, day start, day end
    time_range = []
    day = from_date
    while day < to_date:
        label = day.strftime('%d %B %Y')
        time_range.append([label, day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    listenbrainz_spark.session \
        .createDataFrame(time_range, time_range_schema) \
        .createOrReplaceTempView('time_range')

    _get_listens(from_date, to_date)

    messages = create_messages(
        data=get_listening_activity(),
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return messages
def test_get_dates_to_train_data(self):
    """ from_date must equal to_date offset by the window and moved to day 1 of that month. """
    window = 20
    to_date, from_date = create_dataframes.get_dates_to_train_data(window)

    # Recompute the expected start of the window with the same stats helpers
    expected_from_date = stats.replace_days(stats.offset_days(to_date, window), 1)

    self.assertEqual(from_date, expected_from_date)
def get_listens(from_date, to_date, dest_path):
    """ Prepare dataframe of months falling between from_date and to_date (both inclusive).

        Monthly files are read from '<dest_path>/<year>/<month>.parquet'. Months whose
        file is not found are logged and skipped.

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): Base path under which the monthly parquet files live.

        Returns:
            df (dataframe): Union of all monthly dataframes that could be read, or None
                if no file was found in the window. Columns can be depicted as:
                [
                    'artist_mbids', 'artist_msid', 'artist_name', 'listened_at', 'recording_mbid',
                    'recording_msid', 'release_mbid', 'release_msid', 'release_name', 'tags',
                    'track_name', 'user_name'
                ]

        Raises:
            ValueError: If to_date is earlier than from_date.
    """
    if to_date < from_date:
        # No exception object exists at this point, so the prefix is taken from the
        # exception class being raised (the previous code referenced an unbound `err`).
        raise ValueError('{}: Data generation window is negative i.e. from_date (date from which start fetching listens)'
                         ' is greater than to_date (date upto which fetch listens).\nAborting...'
                         .format(ValueError.__name__))

    df = None
    while from_date <= to_date:
        try:
            month = read_files_from_HDFS('{}/{}/{}.parquet'.format(dest_path, from_date.year, from_date.month))
            # Explicit None check: do not depend on the truthiness of a dataframe object
            df = month if df is None else df.union(month)
        except PathNotFoundException as err:
            current_app.logger.warning('{}\nFetching file for next date...'.format(err))
        # go to the next month of from_date
        from_date = stats.adjust_days(from_date, config.STEPS_TO_REACH_NEXT_MONTH, shift_backwards=False)
        # shift to the first of the month
        from_date = stats.replace_days(from_date, 1)
    return df
def get_listens(from_date, to_date, dest_path):
    """ Prepare dataframe of months falling between from_date and to_date (both inclusive).

        Monthly files are read from '<dest_path>/<year>/<month>.parquet'. Months whose
        file is not found are logged at debug level and skipped.

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): HDFS path to fetch listens from.

        Returns:
            df: Dataframe of listens.

        Raises:
            ValueError: If to_date is earlier than from_date.
            HDFSException: If no monthly file could be read for the whole window.
    """
    if to_date < from_date:
        # ValueError.__name__ gives 'ValueError'; type(ValueError).__name__ would give 'type'.
        raise ValueError(
            '{}: Data generation window is negative i.e. from_date (date from which start fetching listens)'
            ' is greater than to_date (date upto which fetch listens).'.format(ValueError.__name__))

    df = None
    while from_date <= to_date:
        try:
            month = read_files_from_HDFS('{}/{}/{}.parquet'.format(dest_path, from_date.year, from_date.month))
            # Explicit None check: do not depend on the truthiness of a dataframe object
            df = month if df is None else df.union(month)
        except PathNotFoundException as err:
            current_app.logger.debug('{}\nFetching file for next date...'.format(err))
        # go to the next month of from_date
        from_date = stats.offset_months(date=from_date, months=1, shift_backwards=False)
        # shift to the first of the month
        from_date = stats.replace_days(from_date, 1)

    if df is None:
        current_app.logger.error('Listening history missing from HDFS')
        raise HDFSException("Listening history missing from HDFS")

    return df
def get_listens_for_training_model_window(metadata):
    """ Fetch the listens used to train the model.

        The window ends now (UTC) and starts on the first day of the month that
        stats.adjust_days yields for config.TRAIN_MODEL_WINDOW. Both bounds are
        recorded in `metadata` under 'to_date' and 'from_date'.

        Args:
            metadata: mapping that receives the 'to_date' and 'from_date' entries.

        Returns:
            training_df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]

        The process exits with status -1 if fetching raises ValueError or
        FileNotFetchedException; the error is logged first.
    """
    to_date = datetime.utcnow()
    # apply the configured window, then shift to the first of the month
    from_date = stats.replace_days(stats.adjust_days(to_date, config.TRAIN_MODEL_WINDOW), 1)

    metadata['to_date'] = to_date
    metadata['from_date'] = from_date

    try:
        return utils.get_listens(from_date, to_date, config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY)
    except (ValueError, FileNotFetchedException) as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
def get_listens_for_rec_generation_window():
    """ Fetch the listens used to generate recommendations.

        The window ends now (UTC) and starts on the first day of the month that
        stats.adjust_days yields for config.RECOMMENDATION_GENERATION_WINDOW.

        Returns:
            df (dataframe): Columns can be depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags, track_name, user_name
                ]

        The process exits with status -1 if fetching raises ValueError or
        FileNotFetchedException; the error is logged first.
    """
    to_date = datetime.utcnow()
    # apply the configured window, then shift to the first of the month
    from_date = stats.replace_days(stats.adjust_days(to_date, config.RECOMMENDATION_GENERATION_WINDOW), 1)

    try:
        return utils.get_listens(from_date, to_date, config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY)
    except (ValueError, FileNotFetchedException) as err:
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)
def test_get_dates_to_train_data(self):
    """ Check both ends of the training window for a 12 day window. """
    window = 12
    to_date, from_date = dataframe_utils.get_dates_to_train_data(window)

    expected_from_date = stats.replace_days(stats.offset_days(to_date, window), 1)
    # refer to testdata/listens.json
    expected_to_date = datetime(2019, 1, 21, 0, 0)

    self.assertEqual(to_date, expected_to_date)
    self.assertEqual(from_date, expected_from_date)
def get_dates_to_train_data():
    """ Get window to fetch listens to train data.

        The window ends now (UTC); its start is what stats.adjust_days yields for
        config.TRAIN_MODEL_WINDOW, moved to the first day of that month.

        Returns:
            A (to_date, from_date) tuple, in that order:
            to_date (datetime): Date upto which fetch listens.
            from_date (datetime): Date from which start fetching listens.
    """
    to_date = datetime.utcnow()
    window_start = stats.adjust_days(to_date, config.TRAIN_MODEL_WINDOW)
    # shift to the first of the month
    return to_date, stats.replace_days(window_start, 1)
def get_dates_to_train_data(train_model_window):
    """ Get window to fetch listens to train data.

        The window ends at the timestamp returned by get_latest_listen_ts; its start
        is what offset_days yields for train_model_window, moved to the first day of
        that month.

        Args:
            train_model_window (int): model to be trained on data of given number of days.

        Returns:
            A (to_date, from_date) tuple, in that order:
            to_date (datetime): Date upto which fetch listens.
            from_date (datetime): Date from which start fetching listens.
    """
    to_date = get_latest_listen_ts()
    window_start = offset_days(to_date, train_model_window)
    # shift to the first of the month
    return to_date, replace_days(window_start, 1)
def get_daily_activity_month() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Build the daily activity messages for the 'month' stats range.

        The window runs from 00:00 on the first day of the month of the latest
        listen up to the latest listen timestamp itself. _get_listens is invoked
        for that window before the activity data is computed.

        Returns:
            messages: whatever create_messages yields for stats_range='month'.
    """
    logger.debug("Calculating daily_activity_month")

    to_date = get_latest_listen_ts()
    month_start = replace_days(to_date, 1)
    # Rebuild from the date parts only, which sets the time to 00:00
    from_date = datetime(month_start.year, month_start.month, month_start.day)

    _get_listens(from_date, to_date)

    messages = create_messages(
        data=get_daily_activity(),
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    logger.debug("Done!")

    return messages
def get_entity_month(
    entity: str, use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the monthly sitewide top entity.

        The window ends at midnight of the day of the latest listen and starts on
        the first day of the month before it. A 'time_range' temp view with one row
        per day of the window is registered, and the listens of the window are
        exposed as the 'sitewide_<entity>_month' temp view for the entity handler.

        Args:
            entity: key into entity_handler_map selecting the handler to run.
            use_mapping: forwarded unchanged to the handler.

        Returns:
            message: whatever create_message yields for stats_range='month'.
    """
    current_app.logger.debug("Calculating sitewide_{}_month...".format(entity))

    latest_listen = get_latest_listen_ts()
    # Drop the time-of-day part so that the window ends exactly at 00:00
    to_date = datetime(latest_listen.year, latest_listen.month, latest_listen.day)
    previous_month = offset_months(to_date, 1, shift_backwards=True)
    from_date = replace_days(previous_month, 1)

    # One row per day in [from_date, to_date): display label, start and end as epoch seconds
    time_range = []
    day = from_date
    while day < to_date:
        day_start_ts = int(day.timestamp())
        day_end_ts = int(get_day_end(day).timestamp())
        time_range.append([day.strftime('%d %B %Y'), day_start_ts, day_end_ts])
        day = offset_days(day, 1, shift_backwards=False)

    listenbrainz_spark.session \
        .createDataFrame(time_range, schema=time_range_schema) \
        .createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_month'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "dd MMMM yyyy", use_mapping)
    message = create_message(
        data=data,
        entity=entity,
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return message
def get_entity_year(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the year top entity for all users.

        The window runs from the date produced by replace_days(replace_months(latest, 1), 1)
        up to the latest listen timestamp. The listens of that window are exposed as
        the 'user_<entity>_year' temp view for the entity handler.
        NOTE(review): presumably the start is 1st January of the latest listen's year;
        the time-of-day part is not reset here -- confirm whether that is intended.

        Args:
            entity: key into entity_handler_map selecting the handler to run.

        Returns:
            messages: whatever create_messages yields for stats_range='year'.
    """
    logger.debug("Calculating {}_year...".format(entity))

    to_date = get_latest_listen_ts()
    first_month = replace_months(to_date, 1)
    from_date = replace_days(first_month, 1)

    table_name = 'user_{}_year'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(
        data=data,
        entity=entity,
        stats_range='year',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    logger.debug("Done!")

    return messages
def test_replace_days(self):
    """ replace_days swaps the day of month and leaves year and month untouched. """
    replaced = stats.replace_days(datetime.datetime(2019, 5, 12), 13)
    expected = datetime.datetime(2019, 5, 13)
    self.assertEqual(replaced, expected)
def test_get_dates_to_train_data(self):
    """ from_date must equal to_date adjusted by the configured window, moved to day 1. """
    to_date, from_date = create_dataframes.get_dates_to_train_data()

    # Recompute the expected start of the window with the same stats helpers
    adjusted = stats.adjust_days(to_date, config.TRAIN_MODEL_WINDOW)
    expected_from_date = stats.replace_days(adjusted, 1)

    self.assertEqual(from_date, expected_from_date)