Exemplo n.º 1
0
def get_listening_activity_month() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Compute the number of listens per user for each day of the previous and current month.

    The window ends (exclusive) at midnight of the day of the latest listen and starts
    one month before the first day of that month.

    Returns:
        Whatever ``create_messages`` produces for the 'month' stats range.
    """
    current_app.logger.debug("Calculating listening_activity_month")

    latest_listen = get_latest_listen_ts()
    # Drop the time-of-day part so the window ends on a day boundary
    to_date = datetime(latest_listen.year, latest_listen.month, latest_listen.day)
    # Day replaced with 1, then shifted one month back (offset_months defaults to backwards)
    from_date = offset_months(replace_days(to_date, 1), 1)

    # Build one row per day: [label, start of day, end of day]
    time_range = []
    day = from_date
    while day < to_date:
        label = day.strftime('%d %B %Y')
        time_range.append([label, day, get_day_end(day)])
        day = offset_days(day, 1, shift_backwards=False)

    # Register the rows as the 'time_range' temp view
    (listenbrainz_spark.session
     .createDataFrame(time_range, time_range_schema)
     .createOrReplaceTempView('time_range'))

    _get_listens(from_date, to_date)

    messages = create_messages(
        data=get_listening_activity(),
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return messages
Exemplo n.º 2
0
 def test_offset_months(self):
     """ offset_months shifts forward with shift_backwards=False and backwards by default. """
     start = datetime.datetime(2019, 5, 12)

     shifted_forward = stats.offset_months(start, 3, shift_backwards=False)
     self.assertEqual(shifted_forward, datetime.datetime(2019, 8, 12))

     shifted_backward = stats.offset_months(start, 3)
     self.assertEqual(shifted_backward, datetime.datetime(2019, 2, 12))
Exemplo n.º 3
0
def get_listening_activity_year() -> Iterator[Optional[UserListeningActivityStatMessage]]:
    """ Compute the number of listens per user for each month of the previous and current year.

    The window starts on 1 January of the year before the latest listen and ends
    at the latest listen timestamp.

    Returns:
        Whatever ``create_messages`` produces for the 'year' stats range.
    """
    logger.debug("Calculating listening_activity_year")

    to_date = get_latest_listen_ts()
    from_date = datetime(to_date.year - 1, 1, 1)

    # Build one row per month: [label, start of month, end of month]
    time_range = []
    month = from_date
    while month < to_date:
        label = month.strftime('%B %Y')
        time_range.append([label, month, get_month_end(month)])
        month = offset_months(month, 1, shift_backwards=False)

    # Register the rows as the 'time_range' temp view
    (listenbrainz_spark.session
     .createDataFrame(time_range, time_range_schema)
     .createOrReplaceTempView('time_range'))

    _get_listens(from_date, to_date)

    messages = create_messages(
        data=get_listening_activity(),
        stats_range='year',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    logger.debug("Done!")

    return messages
    def test_get_listening_activity_year(self, mock_create_messages,
                                         mock_get_listening_activity,
                                         mock_get_listens,
                                         mock_get_latest_listen_ts):
        """ Check the 'time_range' view and the collaborator calls made by get_listening_activity_year.

        NOTE(review): the expected window assumes the patched latest-listen timestamp
        is 2020-06-19; the patch decorators are outside this view - confirm.
        """
        listens_mock = MagicMock()
        mock_get_listens.return_value = listens_mock

        listening_activity_stats.get_listening_activity_year()

        from_date = datetime(2019, 1, 1)
        to_date = datetime(2020, 6, 19)

        # Rebuild the month rows the function is expected to have registered
        expected_rows = []
        cursor = from_date
        while cursor < to_date:
            expected_rows.append([
                cursor.strftime('%B %Y'),
                cursor,
                get_month_end(cursor),
            ])
            cursor = offset_months(cursor, 1, shift_backwards=False)

        actual_rows = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_rows, expected_rows)

        mock_get_latest_listen_ts.assert_called_once()
        mock_get_listens.assert_called_with(
            from_date, to_date, LISTENBRAINZ_DATA_DIRECTORY)
        listens_mock.createOrReplaceTempView.assert_called_with('listens')
        mock_create_messages.assert_called_with(
            data='listening_activity_table',
            stats_range='year',
            from_ts=from_date.timestamp(),
            to_ts=to_date.timestamp(),
        )
Exemplo n.º 5
0
def get_listens(from_date, to_date, dest_path):
    """ Prepare dataframe of months falling between from_date and to_date (both inclusive).

        One parquet file is read per month from ``<dest_path>/<year>/<month>.parquet``
        and the monthly dataframes are unioned. Months whose file is missing are
        skipped and logged at debug level.

        Args:
            from_date (datetime): Date from which start fetching listens.
            to_date (datetime): Date upto which fetch listens.
            dest_path (str): HDFS path to fetch listens from.

        Returns:
            df: Dataframe of listens.

        Raises:
            ValueError: If to_date is earlier than from_date.
            HDFSException: If no monthly file could be read for the whole window.
    """
    if to_date < from_date:
        # ValueError.__name__ gives 'ValueError'; type(ValueError).__name__ would give 'type'
        raise ValueError(
            '{}: Data generation window is negative i.e. from_date (date from which start fetching listens)'
            ' is greater than to_date (date upto which fetch listens).'.format(
                ValueError.__name__))
    df = None
    while from_date <= to_date:
        try:
            month = read_files_from_HDFS('{}/{}/{}.parquet'.format(
                dest_path, from_date.year, from_date.month))
        except PathNotFoundException as err:
            current_app.logger.debug(
                '{}\nFetching file for next date...'.format(err))
        else:
            # Compare against None explicitly instead of relying on DataFrame truthiness
            df = month if df is None else df.union(month)
        # go to the next month of from_date
        from_date = stats.offset_months(date=from_date,
                                        months=1,
                                        shift_backwards=False)
        # shift to the first of the month
        from_date = stats.replace_days(from_date, 1)
    if df is None:
        current_app.logger.error('Listening history missing from HDFS')
        raise HDFSException("Listening history missing from HDFS")
    return df
Exemplo n.º 6
0
def get_latest_listen_ts():
    """ Return the largest ``listened_at`` value found in the most recent month that has listens.

    Starts at the current month and steps back one month at a time for as long as
    ``utils.get_listens`` raises HDFSException.
    """
    probe_date = datetime.now()
    # NOTE(review): this never terminates if every month raises HDFSException - confirm
    while True:
        try:
            listens_df = utils.get_listens(probe_date, probe_date,
                                           LISTENBRAINZ_DATA_DIRECTORY)
        except HDFSException:
            # offset_months shifts backwards by default
            probe_date = offset_months(probe_date, 1)
        else:
            break

    listens_df.createOrReplaceTempView('latest_listen_ts')
    max_ts_rows = run_query(
        "SELECT MAX(listened_at) as max_timestamp FROM latest_listen_ts"
    ).collect()
    return max_ts_rows[0]['max_timestamp']
    def test_filter_listens(self):
        """ filter_listens should return only the listens inside [from_date, to_date].

        Two listens lie outside the window (a month before the start, a month after
        the end) and two lie inside it (five days from either edge).
        """
        from_date = datetime(2020, 5, 1)
        to_date = datetime(2020, 5, 31)

        month_before_start = offset_months(from_date, 1)
        month_after_end = offset_months(to_date, 1, shift_backwards=False)
        five_days_after_start = offset_days(from_date, 5, shift_backwards=False)
        five_days_before_end = offset_days(to_date, 5)

        # Union one single-row dataframe per listen, in the same order as before
        df = None
        for listened_at in (month_before_start, month_after_end,
                            five_days_after_start, five_days_before_end):
            single_row_df = utils.create_dataframe(
                Row(listened_at=listened_at), None)
            df = single_row_df if df is None else df.union(single_row_df)

        rows = stats_utils.filter_listens(df, from_date, to_date).collect()

        self.assertEqual(rows[0]['listened_at'], five_days_after_start)
        self.assertEqual(rows[1]['listened_at'], five_days_before_end)
Exemplo n.º 8
0
def get_entity_month(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Compute the sitewide top entities for each day of the previous and current month.

    Args:
        entity: Key into ``entity_handler_map`` selecting the handler to run.
        use_mapping: Passed through unchanged to the entity handler.

    Returns:
        The message built by ``create_message`` for the 'month' stats range.
    """
    current_app.logger.debug("Calculating sitewide_{}_month...".format(entity))

    latest_listen = get_latest_listen_ts()
    # Drop the time-of-day part so the window ends on a day boundary
    to_date = datetime(latest_listen.year, latest_listen.month, latest_listen.day)
    one_month_back = offset_months(to_date, 1, shift_backwards=True)
    from_date = replace_days(one_month_back, 1)

    # Build one row per day: [label, start epoch seconds, end epoch seconds]
    time_range = []
    current_day = from_date
    while current_day < to_date:
        label = current_day.strftime('%d %B %Y')
        start_ts = int(current_day.timestamp())
        end_ts = int(get_day_end(current_day).timestamp())
        time_range.append([label, start_ts, end_ts])
        current_day = offset_days(current_day, 1, shift_backwards=False)

    # Register the rows as the 'time_range' temp view
    (listenbrainz_spark.session
     .createDataFrame(time_range, schema=time_range_schema)
     .createOrReplaceTempView('time_range'))

    # Register the listens of the window under an entity-specific view name
    table_name = 'sitewide_{}_month'.format(entity)
    get_listens(from_date, to_date,
                LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "dd MMMM yyyy", use_mapping)

    message = create_message(
        data=data,
        entity=entity,
        stats_range='month',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return message
    def test_get_listening_activity_year(self, mock_create_messages, _,
                                         mock_get_listens):
        """ Check the 'time_range' view and the collaborator calls for the 'year' range.

        NOTE(review): the expected window (2019-01-01 to 2021-01-01) presumably
        follows from a patched clock outside this view - confirm.
        """
        listening_activity_stats.get_listening_activity('year')

        from_date = datetime(2019, 1, 1)
        to_date = datetime(2021, 1, 1)

        # Rebuild the month rows the function is expected to have registered
        expected_rows = []
        cursor = from_date
        while cursor < to_date:
            expected_rows.append([
                cursor.strftime('%B %Y'),
                cursor,
                get_month_end(cursor),
            ])
            cursor = offset_months(cursor, 1, shift_backwards=False)

        actual_rows = run_query("SELECT * FROM time_range").rdd.map(list).collect()
        self.assertListEqual(actual_rows, expected_rows)

        mock_get_listens.assert_called_with(from_date, to_date)
        mock_create_messages.assert_called_with(
            data='activity_table',
            stats_range='year',
            from_date=from_date,
            to_date=to_date,
        )
Exemplo n.º 10
0
def get_entity_year(
        entity: str,
        use_mapping: bool = False
) -> Optional[List[SitewideEntityStatMessage]]:
    """ Compute the sitewide top entities for each month of the previous and current year.

    Args:
        entity: Key into ``entity_handler_map`` selecting the handler to run.
        use_mapping: Passed through unchanged to the entity handler.

    Returns:
        The message built by ``create_message`` for the 'year' stats range.
    """
    current_app.logger.debug("Calculating sitewide_{}_year...".format(entity))

    to_date = get_latest_listen_ts()
    from_date = datetime(to_date.year - 1, 1, 1)

    # Build one row per month: [label, start epoch seconds, end epoch seconds]
    time_range = []
    current_month = from_date
    while current_month < to_date:
        label = current_month.strftime('%B %Y')
        start_ts = int(current_month.timestamp())
        end_ts = int(get_month_end(current_month).timestamp())
        time_range.append([label, start_ts, end_ts])
        current_month = offset_months(current_month, 1, shift_backwards=False)

    # Register the rows as the 'time_range' temp view
    (listenbrainz_spark.session
     .createDataFrame(time_range, schema=time_range_schema)
     .createOrReplaceTempView('time_range'))

    # Register the listens of the window under an entity-specific view name
    table_name = 'sitewide_{}_year'.format(entity)
    get_listens(from_date, to_date,
                LISTENBRAINZ_DATA_DIRECTORY).createOrReplaceTempView(table_name)

    data = entity_handler_map[entity](table_name, "MMMM yyyy", use_mapping)

    message = create_message(
        data=data,
        entity=entity,
        stats_range='year',
        from_ts=from_date.timestamp(),
        to_ts=to_date.timestamp(),
    )

    current_app.logger.debug("Done!")

    return message