def get_listens_for_training_model_window(to_date, from_date, metadata, dest_path):
    """ Prepare dataframe of listens of X days to train.

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): HDFS path.

        Returns:
            partial_listens_df (dataframe): listens without artist mbid and recording mbid.
    """
    metadata['from_date'] = from_date
    metadata['to_date'] = to_date

    try:
        training_df = utils.get_listens(from_date, to_date, dest_path)
    except (ValueError, FileNotFetchedException) as err:
        # Log with traceback, then propagate to the caller unchanged.
        current_app.logger.error(str(err), exc_info=True)
        raise

    without_mbids_df = utils.get_listens_without_artist_and_recording_mbids(training_df)
    return convert_text_fields_to_matchable(without_mbids_df)
def get_listens_for_rec_generation_window():
    """ Prepare dataframe of listens of X days to generate recommendations.
        Here X is a config value.

        Returns:
            df (dataframe): Columns can de depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags,
                    track_name, user_name
                ]
    """
    to_date = datetime.utcnow()
    # Go back RECOMMENDATION_GENERATION_WINDOW days, then snap to the
    # first of that month so whole monthly partitions are fetched.
    from_date = stats.replace_days(
        stats.adjust_days(to_date, config.RECOMMENDATION_GENERATION_WINDOW), 1)

    try:
        df = utils.get_listens(from_date, to_date,
                               config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY)
    except (ValueError, FileNotFetchedException) as err:
        # Fatal for this job: log with traceback and abort.
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    return df
def get_entity_all_time(entity: str, use_mapping: bool = False) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the all_time sitewide top entity """
    logger.debug("Calculating sitewide_{}_all_time...".format(entity))

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # One row per calendar year from from_date through to_date, carrying the
    # display label and the year's start/end timestamps.
    time_range = []
    for year in range(from_date.year, to_date.year + 1):
        time_range.append([
            str(year),
            int(datetime(year, 1, 1).timestamp()),
            int(get_year_end(year).timestamp())
        ])

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_all_time'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='all_time',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return message
def get_listens_for_training_model_window(to_date, from_date, metadata, dest_path):
    """ Prepare dataframe of listens of X days to train. Here X is a config value.

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): HDFS path.

        Returns:
            A dataframe with columns as:
                [
                    artist_msid, artist_name, listened_at, recording_msid, release_mbid,
                    release_msid, release_name, tags, track_name, user_name
                ]
    """
    metadata['from_date'] = from_date
    metadata['to_date'] = to_date

    try:
        training_df = utils.get_listens(from_date, to_date, dest_path)
    except (ValueError, FileNotFetchedException) as err:
        # Fatal for this job: log with traceback and abort.
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    return utils.get_listens_without_artist_and_recording_mbids(training_df)
def get_listens_for_training_model_window(metadata):
    """ Prepare dataframe of listens of X days to train. Here X is a config value.

        Returns:
            training_df (dataframe): Columns can de depicted as:
                [
                    artist_mbids, artist_msid, artist_name, listened_at, recording_mbid,
                    recording_msid, release_mbid, release_msid, release_name, tags,
                    track_name, user_name
                ]
    """
    to_date = datetime.utcnow()
    # Go back TRAIN_MODEL_WINDOW days, then snap to the first of that month
    # so whole monthly partitions are fetched.
    from_date = stats.replace_days(
        stats.adjust_days(to_date, config.TRAIN_MODEL_WINDOW), 1)

    metadata['from_date'] = from_date
    metadata['to_date'] = to_date

    try:
        training_df = utils.get_listens(from_date, to_date,
                                        config.HDFS_CLUSTER_URI + path.LISTENBRAINZ_DATA_DIRECTORY)
    except (ValueError, FileNotFetchedException) as err:
        # Fatal for this job: log with traceback and abort.
        current_app.logger.error(str(err), exc_info=True)
        sys.exit(-1)

    return training_df
def test_get_releases_empty(self):
    """ get_releases should yield no rows when the saved listens have no releases. """
    self.save_dataframe('user_top_releases_empty.json')
    df = utils.get_listens(datetime.now(), datetime.now(), self.path_)
    df.createOrReplaceTempView('test_view')

    # Bug fix: the original opened and json.load()-ed 'user_top_releases.json'
    # into `data` and then immediately overwrote `data` with the get_releases()
    # result — the file read was dead I/O and is removed.
    received = defaultdict(list)
    data = release_stats.get_releases('test_view')
    for entry in data:
        _dict = entry.asDict(recursive=True)
        received[_dict['user_name']] = _dict['releases']

    self.assertDictEqual(received, {})
def get_latest_listen_ts():
    """ Get the timestamp of the latest timestamp present in spark cluster """
    now = datetime.now()
    df = None
    # Walk backwards one month at a time until a month with listens is found.
    while df is None:
        try:
            df = utils.get_listens(now, now, LISTENBRAINZ_DATA_DIRECTORY)
        except HDFSException:
            now = offset_months(now, 1)

    df.createOrReplaceTempView('latest_listen_ts')
    result = run_query("SELECT MAX(listened_at) as max_timestamp FROM latest_listen_ts")
    return result.collect()[0]['max_timestamp']
def test_get_listens(self):
    """ Listens saved in two monthly partitions are both returned by get_listens. """
    from_date = datetime(2019, 10, 1)
    to_date = datetime(2019, 11, 1)

    # Save one single-row parquet file per month in the window.
    for date, row in ((from_date, Row(column1=1, column2=2)),
                      (to_date, Row(column1=3, column2=4))):
        df = utils.create_dataframe([row], schema=None)
        dest_path = self.path_ + '/{}/{}.parquet'.format(date.year, date.month)
        utils.save_parquet(df, dest_path)

    received_df = utils.get_listens(from_date, to_date, self.path_)
    self.assertEqual(received_df.count(), 2)
def test_get_listens(self):
    """ Listens saved in two monthly partitions are both returned by get_listens. """
    from_date = datetime(2019, 10, 1)
    to_date = datetime(2019, 11, 1)
    hdfs_path = os.path.join(config.HDFS_CLUSTER_URI, 'test_df')

    # Save one single-row parquet file per month in the window.
    for date, row in ((from_date, Row(column1=1, column2=2)),
                      (to_date, Row(column1=3, column2=4))):
        df = utils.create_dataframe(row, schema=None)
        dest_path = hdfs_path + '/{}/{}.parquet'.format(date.year, date.month)
        utils.save_parquet(df, dest_path)

    received_df = utils.get_listens(from_date, to_date, hdfs_path)
    self.assertEqual(received_df.count(), 2)
def get_entity_week(entity: str, use_mapping: bool = False) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the weekly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_week...".format(entity))

    latest_ts = get_latest_listen_ts()
    to_date = get_last_monday(latest_ts)
    # Truncate the time component so the window starts at midnight.
    to_date = datetime(to_date.year, to_date.month, to_date.day)
    from_date = offset_days(to_date, 14)

    # One row per day of the last and current week, carrying the display
    # label and the day's start/end timestamps.
    time_range = []
    current = from_date
    while current < to_date:
        time_range.append([
            current.strftime('%A %d %B %Y'),
            int(current.timestamp()),
            int(get_day_end(current).timestamp())
        ])
        current = offset_days(current, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    filtered_df = filter_listens(listens_df, from_date, to_date)
    table_name = 'sitewide_{}_week'.format(entity)
    filtered_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "EEEE dd MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='week',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
def test_get_artists(self):
    """ get_artists output for the saved listens matches the expected fixture. """
    self.save_dataframe()
    df = utils.get_listens(datetime.now(), datetime.now(), self.path_)
    df.createOrReplaceTempView('test_view')

    # Bug fix: the original also opened and json.load()-ed
    # 'user_top_artists.json' into `data`, then immediately overwrote `data`
    # with the get_artists() result — that file read was dead I/O and is
    # removed. Only the expected-output fixture is actually needed.
    with open(self.path_to_data_file('user_top_artists_output.json')) as f:
        expected = json.load(f)

    data = artist_stats.get_artists('test_view')
    received = defaultdict(list)
    for entry in data:
        _dict = entry.asDict(recursive=True)
        received[_dict['user_name']] = _dict['artists']

    self.assertDictEqual(received, expected)
def get_entity_year(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the year top entity for all users """
    logger.debug("Calculating {}_year...".format(entity))

    to_date = get_latest_listen_ts()
    # January 1st of the year containing the latest listen.
    from_date = replace_days(replace_months(to_date, 1), 1)

    table_name = 'user_{}_year'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='year',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_entity_all_time(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the all_time top entity for all users """
    logger.debug("Calculating {}_all_time...".format(entity))

    to_date = get_latest_listen_ts()
    # Nothing can have been scrobbled before Last.fm existed.
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    table_name = 'user_{}_all_time'.format(entity)
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='all_time',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def calculate():
    """ Compute per-user artist statistics over all listens ever recorded. """
    now = datetime.utcnow()
    listens_df = get_listens(from_date=datetime(LAST_FM_FOUNDING_YEAR, 1, 1),
                             to_date=now, dest_path=path.LISTENBRAINZ_DATA_DIRECTORY)
    table_name = 'stats_user_all'
    listens_df.createOrReplaceTempView(table_name)

    # Fold artist stats into the per-user result mapping.
    data = defaultdict(dict)
    for user_name, user_artists in get_artists(table_name).items():
        data[user_name]['artists'] = {
            'artist_stats': user_artists,
            'artist_count': len(user_artists),
        }
    return data
def calculate():
    """ Compute per-user artist statistics and wrap them as messages. """
    now = datetime.utcnow()
    listens_df = get_listens(from_date=datetime(LAST_FM_FOUNDING_YEAR, 1, 1),
                             to_date=now, dest_path=path.LISTENBRAINZ_DATA_DIRECTORY)
    table_name = 'stats_user_all'
    listens_df.createOrReplaceTempView(table_name)

    # One message per user carrying that user's artist stats.
    return [
        {
            'musicbrainz_id': user_name,
            'type': 'user_artist',
            'artist_stats': user_artists,
            'artist_count': len(user_artists),
        }
        for user_name, user_artists in get_artists(table_name).items()
    ]
def get_entity_year(entity: str, use_mapping: bool = False) -> Optional[List[SitewideEntityStatMessage]]:
    """ Get the yearly sitewide top entity """
    current_app.logger.debug("Calculating sitewide_{}_year...".format(entity))

    to_date = get_latest_listen_ts()
    from_date = datetime(to_date.year - 1, 1, 1)

    # One row per month of the last and current year, carrying the display
    # label and the month's start/end timestamps.
    time_range = []
    current = from_date
    while current < to_date:
        time_range.append([
            current.strftime('%B %Y'),
            int(current.timestamp()),
            int(get_month_end(current).timestamp())
        ])
        current = offset_months(current, 1, shift_backwards=False)

    time_range_df = listenbrainz_spark.session.createDataFrame(time_range, schema=time_range_schema)
    time_range_df.createOrReplaceTempView('time_range')

    table_name = 'sitewide_{}_year'.format(entity)
    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    listens_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "MMMM yyyy", use_mapping)
    message = create_message(data=data, entity=entity, stats_range='year',
                             from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    current_app.logger.debug("Done!")
    return message
def get_entity_week(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Get the weekly top entity for all users """
    logger.debug("Calculating {}_week...".format(entity))

    latest_ts = get_latest_listen_ts()
    to_date = get_last_monday(latest_ts)
    from_date = offset_days(to_date, 7)

    listens_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    filtered_df = filter_listens(listens_df, from_date, to_date)
    table_name = 'user_{}_week'.format(entity)
    filtered_df.createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name)
    messages = create_messages(data=data, entity=entity, stats_range='week',
                               from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")
    return messages
def get_listens_for_training_model_window(to_date, from_date, dest_path):
    """ Prepare dataframe of listens to train.

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): HDFS path.

        Returns:
            partial_listens_df (dataframe): dataframe of listens.
    """
    try:
        training_df = get_listens(from_date, to_date, dest_path)
    except (ValueError, FileNotFetchedException) as err:
        # Log with traceback, then propagate to the caller unchanged.
        current_app.logger.error(str(err), exc_info=True)
        raise

    return mapping_utils.convert_text_fields_to_matchable(training_df)
def _get_listens(from_date: datetime, to_date: datetime):
    """ Fetch listens in the window and expose them as the 'listens' temp view. """
    get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView('listens')
def _get_listens(from_date: datetime, to_date: datetime):
    """ Fetch listens in the window, filter them to it, and expose the result
        as the 'listens' temp view.
    """
    raw_df = get_listens(from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
    filter_listens(raw_df, from_date, to_date).createOrReplaceTempView('listens')