예제 #1
0
def get_entity_all_time(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Compute the sitewide top-entity statistics for the 'all_time' range.

        Args:
            entity: key into entity_handler_map selecting the handler to run.
            use_mapping: passed through unchanged to the selected handler.

        Returns:
            The value built by create_message from the handler output.
    """
    logger.debug("Calculating sitewide_{}_all_time...".format(entity))

    range_end = get_latest_listen_ts()
    range_start = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # One row per calendar year between the two dates (inclusive):
    # [label, start timestamp, end timestamp], timestamps truncated to whole seconds
    yearly_rows = []
    for year in range(range_start.year, range_end.year + 1):
        year_begin_ts = int(datetime(year, 1, 1).timestamp())
        year_end_ts = int(get_year_end(year).timestamp())
        yearly_rows.append([str(year), year_begin_ts, year_end_ts])

    # Expose the year buckets as the 'time_range' temp view
    listenbrainz_spark.session.createDataFrame(
        yearly_rows, schema=time_range_schema
    ).createOrReplaceTempView('time_range')

    # Expose the listens under an entity-specific view name for the handler
    view_name = 'sitewide_{}_all_time'.format(entity)
    all_listens = get_listens(range_start, range_end, LISTENBRAINZ_DATA_DIRECTORY)
    all_listens.createOrReplaceTempView(view_name)

    compute_stats = entity_handler_map[entity]
    stats = compute_stats(view_name, "yyyy", use_mapping)
    result = create_message(
        data=stats,
        entity=entity,
        stats_range='all_time',
        from_ts=range_start.timestamp(),
        to_ts=range_end.timestamp(),
    )

    logger.debug("Done!")

    return result
    def test_get_mapped_artist_and_recording_mbids(self):
        """ Check row count, column set and output path of the mapped listens dataframe. """
        latest_ts = get_latest_listen_ts()
        # Both window bounds are the latest listen timestamp
        listens_in_window = dataframe_utils.get_listens_for_training_model_window(latest_ts, latest_ts, self.listens_path)

        raw_mapping = utils.read_files_from_HDFS(self.mapping_path)
        unique_mapping = mapping_utils.get_unique_rows_from_mapping(raw_mapping)
        save_path = '/mapped_listens.parquet'

        result_df = dataframe_utils.get_mapped_artist_and_recording_mbids(listens_in_window, unique_mapping, save_path)
        self.assertEqual(result_df.count(), 8)

        expected_columns = [
            'listened_at',
            'mb_artist_credit_id',
            'mb_artist_credit_mbids',
            'mb_recording_mbid',
            'mb_release_mbid',
            'msb_artist_credit_name_matchable',
            'msb_recording_name_matchable',
            'user_name'
        ]

        # Column order is irrelevant, so compare sorted lists
        self.assertListEqual(sorted(expected_columns), sorted(result_df.columns))
        # The path handed to the mapping function must exist afterwards
        self.assertTrue(utils.path_exists(save_path))
예제 #3
0
def get_listening_activity_year() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user in each month of the past and current year.

        Returns:
            The value produced by create_messages for the 'year' range.
    """
    logger.debug("Calculating listening_activity_year")

    latest_ts = get_latest_listen_ts()
    period_start = datetime(latest_ts.year - 1, 1, 1)

    # Generate one row per month from January of last year up to the latest listen:
    # [label, month start, month end]
    monthly_rows = []
    cursor = period_start
    while cursor < latest_ts:
        label = cursor.strftime('%B %Y')
        monthly_rows.append([label, cursor, get_month_end(cursor)])
        cursor = offset_months(cursor, 1, shift_backwards=False)

    # Expose the month buckets as the 'time_range' temp view
    buckets_df = listenbrainz_spark.session.createDataFrame(monthly_rows, time_range_schema)
    buckets_df.createOrReplaceTempView('time_range')

    _get_listens(period_start, latest_ts)

    activity = get_listening_activity()
    result = create_messages(
        data=activity,
        stats_range='year',
        from_ts=period_start.timestamp(),
        to_ts=latest_ts.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #4
0
def get_listening_activity_month() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user on each day of the past and current month.

        Returns:
            The value produced by create_messages for the 'month' range.
    """
    logger.debug("Calculating listening_activity_month")

    latest_ts = get_latest_listen_ts()
    first_of_month = replace_days(latest_ts, 1)
    shifted = offset_months(first_of_month, 1)
    # Drop the time-of-day component so the window starts at 00:00
    period_start = datetime(shifted.year, shifted.month, shifted.day)

    # Generate one row per day from the window start up to the latest listen:
    # [label, day start, day end]
    daily_rows = []
    cursor = period_start
    while cursor < latest_ts:
        label = cursor.strftime('%d %B %Y')
        daily_rows.append([label, cursor, get_day_end(cursor)])
        cursor = offset_days(cursor, 1, shift_backwards=False)

    # Expose the day buckets as the 'time_range' temp view
    buckets_df = listenbrainz_spark.session.createDataFrame(daily_rows, time_range_schema)
    buckets_df.createOrReplaceTempView('time_range')

    _get_listens(period_start, latest_ts)

    activity = get_listening_activity()
    result = create_messages(
        data=activity,
        stats_range='month',
        from_ts=period_start.timestamp(),
        to_ts=latest_ts.timestamp(),
    )

    logger.debug("Done!")

    return result
 def test_get_listens_for_training_model_window(self):
     """ Check the matchable name columns and the row count of the training-window listens. """
     to_date = get_latest_listen_ts()
     from_date = stats.offset_days(to_date, 2)
     test_df = dataframe_utils.get_listens_for_training_model_window(to_date, from_date, self.listens_path)
     # Both matchable name columns must be present in the result
     self.assertIn('artist_name_matchable', test_df.columns)
     self.assertIn('track_name_matchable', test_df.columns)
     self.assertEqual(test_df.count(), 11)
    def test_get_latest_listen_ts(self):
        """ get_latest_listen_ts must return the unshifted date out of the two saved listens. """
        expected = datetime(2020, 5, 18)
        # presumably offset_days shifts backwards by default, so this row is older - confirm
        shifted = offset_days(expected, 7)

        first_row_df = utils.create_dataframe(Row(listened_at=expected), schema=None)
        second_row_df = utils.create_dataframe(Row(listened_at=shifted), schema=None)
        listens_df = first_row_df.union(second_row_df)

        # Save both listens under the 2020/5 parquet path
        target_path = '{}/2020/5.parquet'.format(self.path_)
        utils.save_parquet(listens_df, target_path)

        actual = stats_utils.get_latest_listen_ts()
        self.assertEqual(expected, actual)
 def test_get_listens_for_training_model_window(self):
     """ Check that the window dates are recorded in metadata and the mbid columns are absent. """
     window_end = get_latest_listen_ts()
     window_start = stats.offset_days(window_end, 2)
     recorded = {}
     listens = create_dataframes.get_listens_for_training_model_window(
         window_end, window_start, recorded, self.listens_path)

     # The call is expected to fill the metadata dict with the window bounds
     self.assertEqual(recorded['to_date'], window_end)
     self.assertEqual(recorded['from_date'], window_start)

     for unwanted in ('artist_mbids', 'recording_mbid'):
         self.assertNotIn(unwanted, listens.columns)
예제 #8
0
def get_daily_activity_all_time() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of week, over all time.

        Returns:
            The value produced by create_messages for the 'all_time' range.
    """
    logger.debug("Calculating daily_activity_all_time")

    window_end = get_latest_listen_ts()
    window_start = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # get_daily_activity() takes no arguments; presumably it reads what this call prepares - confirm
    _get_listens(window_start, window_end)

    activity = get_daily_activity()
    result = create_messages(
        data=activity,
        stats_range='all_time',
        from_ts=window_start.timestamp(),
        to_ts=window_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #9
0
def get_dates_to_train_data(train_model_window):
    """ Get window to fetch listens to train data.

        Args:
            train_model_window (int): model to be trained on data of given number of days.

        Returns:
            to_date (datetime): Date up to which listens are fetched (the latest listen timestamp).
            from_date (datetime): Date from which listens are fetched; to_date offset by
                train_model_window days via offset_days, then moved to day 1 of that month.

        Note that the tuple is returned as (to_date, from_date), in that order.
    """
    to_date = get_latest_listen_ts()
    from_date = offset_days(to_date, train_model_window)
    # shift to the first of the month
    from_date = replace_days(from_date, 1)
    return to_date, from_date
예제 #10
0
def get_daily_activity_year() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of week of the current year.

        Returns:
            The value produced by create_messages for the 'year' range.
    """
    logger.debug("Calculating daily_activity_year")

    window_end = get_latest_listen_ts()
    # The window opens on January 1st of the year of the latest listen
    window_start = datetime(window_end.year, 1, 1)

    _get_listens(window_start, window_end)

    activity = get_daily_activity()
    result = create_messages(
        data=activity,
        stats_range='year',
        from_ts=window_start.timestamp(),
        to_ts=window_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #11
0
def get_daily_activity_month() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of week of the current month.

        Returns:
            The value produced by create_messages for the 'month' range.
    """
    logger.debug("Calculating daily_activity_month")

    window_end = get_latest_listen_ts()
    first_of_month = replace_days(window_end, 1)
    # Rebuild from year/month/day only, which sets the time to 00:00
    window_start = datetime(first_of_month.year, first_of_month.month, first_of_month.day)

    _get_listens(window_start, window_end)

    activity = get_daily_activity()
    result = create_messages(
        data=activity,
        stats_range='month',
        from_ts=window_start.timestamp(),
        to_ts=window_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #12
0
def get_daily_activity_week() -> Iterator[Optional[UserDailyActivityStatMessage]]:
    """ Calculate number of listens for a user per hour on each day of the past week.

        Returns:
            The value produced by create_messages for the 'week' range.
    """
    logger.debug("Calculating daily_activity_week")

    latest_ts = get_latest_listen_ts()
    # The window ends at the Monday returned by get_last_monday and spans 7 days
    week_end = get_last_monday(latest_ts)
    week_start = offset_days(week_end, 7)

    _get_listens(week_start, week_end)

    activity = get_daily_activity()
    result = create_messages(
        data=activity,
        stats_range='week',
        from_ts=week_start.timestamp(),
        to_ts=week_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #13
0
def get_entity_week(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Compute the sitewide top-entity statistics for the 'week' range.

        Args:
            entity: key into entity_handler_map selecting the handler to run.
            use_mapping: passed through unchanged to the selected handler.

        Returns:
            The value built by create_message from the handler output.
    """
    current_app.logger.debug("Calculating sitewide_{}_week...".format(entity))

    latest_ts = get_latest_listen_ts()

    monday = get_last_monday(latest_ts)
    # Rebuild from year/month/day only, which sets the time to 00:00
    window_end = datetime(monday.year, monday.month, monday.day)
    window_start = offset_days(window_end, 14)

    # Generate one row per day of the 14-day window:
    # [label, start timestamp, end timestamp], timestamps truncated to whole seconds
    daily_rows = []
    cursor = window_start
    while cursor < window_end:
        label = cursor.strftime('%A %d %B %Y')
        day_begin_ts = int(cursor.timestamp())
        day_end_ts = int(get_day_end(cursor).timestamp())
        daily_rows.append([label, day_begin_ts, day_end_ts])
        cursor = offset_days(cursor, 1, shift_backwards=False)

    # Expose the day buckets as the 'time_range' temp view
    listenbrainz_spark.session.createDataFrame(
        daily_rows, schema=time_range_schema
    ).createOrReplaceTempView('time_range')

    # Load, filter to the window, and expose under an entity-specific view name
    view_name = 'sitewide_{}_week'.format(entity)
    loaded = get_listens(window_start, window_end, LISTENBRAINZ_DATA_DIRECTORY)
    filter_listens(loaded, window_start, window_end).createOrReplaceTempView(view_name)

    compute_stats = entity_handler_map[entity]
    stats = compute_stats(view_name, "EEEE dd MMMM yyyy", use_mapping)
    result = create_message(
        data=stats,
        entity=entity,
        stats_range='week',
        from_ts=window_start.timestamp(),
        to_ts=window_end.timestamp(),
    )

    current_app.logger.debug("Done!")

    return result
예제 #14
0
def get_entity_year(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Compute the per-user top-entity statistics for the 'year' range.

        Args:
            entity: key into entity_handler_map selecting the handler to run.

        Returns:
            The value produced by create_messages from the handler output.
    """
    logger.debug("Calculating {}_year...".format(entity))

    period_end = get_latest_listen_ts()
    # Move the latest timestamp to month 1, then to day 1
    # NOTE(review): the time of day is not reset here, unlike the month variants - confirm intended
    in_first_month = replace_months(period_end, 1)
    period_start = replace_days(in_first_month, 1)

    # Expose the listens under an entity-specific view name for the handler
    view_name = 'user_{}_year'.format(entity)
    loaded = get_listens(period_start, period_end, LISTENBRAINZ_DATA_DIRECTORY)
    loaded.createOrReplaceTempView(view_name)

    compute_stats = entity_handler_map[entity]
    stats = compute_stats(view_name)
    result = create_messages(
        data=stats,
        entity=entity,
        stats_range='year',
        from_ts=period_start.timestamp(),
        to_ts=period_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #15
0
def get_entity_all_time(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Compute the per-user top-entity statistics for the 'all_time' range.

        Args:
            entity: key into entity_handler_map selecting the handler to run.

        Returns:
            The value produced by create_messages from the handler output.
    """
    logger.debug("Calculating {}_all_time...".format(entity))

    period_end = get_latest_listen_ts()
    period_start = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    # Expose the listens under an entity-specific view name for the handler
    view_name = 'user_{}_all_time'.format(entity)
    loaded = get_listens(period_start, period_end, LISTENBRAINZ_DATA_DIRECTORY)
    loaded.createOrReplaceTempView(view_name)

    compute_stats = entity_handler_map[entity]
    stats = compute_stats(view_name)
    result = create_messages(
        data=stats,
        entity=entity,
        stats_range='all_time',
        from_ts=period_start.timestamp(),
        to_ts=period_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #16
0
def get_entity_year(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Compute the sitewide top-entity statistics for the 'year' range.

        Args:
            entity: key into entity_handler_map selecting the handler to run.
            use_mapping: passed through unchanged to the selected handler.

        Returns:
            The value built by create_message from the handler output.
    """
    current_app.logger.debug("Calculating sitewide_{}_year...".format(entity))

    period_end = get_latest_listen_ts()
    period_start = datetime(period_end.year - 1, 1, 1)

    # Generate one row per month from January of last year up to the latest listen:
    # [label, start timestamp, end timestamp], timestamps truncated to whole seconds
    monthly_rows = []
    cursor = period_start
    while cursor < period_end:
        label = cursor.strftime('%B %Y')
        month_begin_ts = int(cursor.timestamp())
        month_end_ts = int(get_month_end(cursor).timestamp())
        monthly_rows.append([label, month_begin_ts, month_end_ts])
        cursor = offset_months(cursor, 1, shift_backwards=False)

    # Expose the month buckets as the 'time_range' temp view
    listenbrainz_spark.session.createDataFrame(
        monthly_rows, schema=time_range_schema
    ).createOrReplaceTempView('time_range')

    # Expose the listens under an entity-specific view name for the handler
    view_name = 'sitewide_{}_year'.format(entity)
    loaded = get_listens(period_start, period_end, LISTENBRAINZ_DATA_DIRECTORY)
    loaded.createOrReplaceTempView(view_name)

    compute_stats = entity_handler_map[entity]
    stats = compute_stats(view_name, "MMMM yyyy", use_mapping)
    result = create_message(
        data=stats,
        entity=entity,
        stats_range='year',
        from_ts=period_start.timestamp(),
        to_ts=period_end.timestamp(),
    )

    current_app.logger.debug("Done!")

    return result
예제 #17
0
def get_entity_week(entity: str) -> Iterator[Optional[UserEntityStatMessage]]:
    """ Compute the per-user top-entity statistics for the 'week' range.

        Args:
            entity: key into entity_handler_map selecting the handler to run.

        Returns:
            The value produced by create_messages from the handler output.
    """
    logger.debug("Calculating {}_week...".format(entity))

    latest_ts = get_latest_listen_ts()

    # The window ends at the Monday returned by get_last_monday and spans 7 days
    week_end = get_last_monday(latest_ts)
    week_start = offset_days(week_end, 7)

    # Load, filter to the window, and expose under an entity-specific view name
    view_name = 'user_{}_week'.format(entity)
    loaded = get_listens(week_start, week_end, LISTENBRAINZ_DATA_DIRECTORY)
    filter_listens(loaded, week_start, week_end).createOrReplaceTempView(view_name)

    compute_stats = entity_handler_map[entity]
    stats = compute_stats(view_name)
    result = create_messages(
        data=stats,
        entity=entity,
        stats_range='week',
        from_ts=week_start.timestamp(),
        to_ts=week_end.timestamp(),
    )

    logger.debug("Done!")

    return result
예제 #18
0
def get_listening_activity_all_time() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Calculate the number of listens for a user in each year starting from LAST_FM_FOUNDING_YEAR.

        Years are processed one at a time; a year whose listens cannot be loaded
        (HDFSException) is skipped, so users get no entry for such years.

        Returns:
            The value produced by create_messages for the 'all_time' range, or an empty
            iterator when no year yielded any listens.
    """
    logger.debug("Calculating listening_activity_all_time")

    to_date = get_latest_listen_ts()
    from_date = datetime(LAST_FM_FOUNDING_YEAR, 1, 1)

    result_without_zero_years = None
    for year in range(from_date.year, to_date.year + 1):
        year_start = datetime(year, 1, 1)
        year_end = get_year_end(year)
        try:
            _get_listens(year_start, year_end)
        except HDFSException:
            # Skip if no listens present in df
            continue
        year_df = run_query("""
                    SELECT user_name,
                           count(user_name) as listen_count
                      FROM listens
                  GROUP BY user_name
                  """)
        # Tag every row with the label and bounds of the year it was counted for
        year_df = year_df.withColumn('time_range', lit(str(year))).withColumn(
            'from_ts', lit(year_start.timestamp())).withColumn('to_ts', lit(year_end.timestamp()))
        # Compare against None explicitly instead of relying on DataFrame truthiness
        if result_without_zero_years is None:
            result_without_zero_years = year_df
        else:
            result_without_zero_years = result_without_zero_years.union(year_df)

    if result_without_zero_years is None:
        # Every year was skipped: there is nothing to aggregate, and calling
        # .withColumn on None below would raise AttributeError
        logger.debug("No listens found for any year, no messages to create")
        return iter(())

    # Create a table with a list of time ranges and corresponding listen count for each user
    data = result_without_zero_years \
        .withColumn("listening_activity", struct("from_ts", "to_ts", "listen_count", "time_range")) \
        .groupBy("user_name") \
        .agg(sort_array(collect_list("listening_activity")).alias("listening_activity")) \
        .toLocalIterator()

    messages = create_messages(data=data, stats_range='all_time', from_ts=from_date.timestamp(), to_ts=to_date.timestamp())

    logger.debug("Done!")

    return messages