Example #1
def setUp(self):
    DatabaseTestCase.setUp(self)
    TimescaleTestCase.setUp(self)
    self.app = create_app()
    self.logstore = TimescaleListenStore(self.app.logger)
    self.dumpstore = DumpListenStore(self.app)
    self.testuser = db_user.get_or_create(1, "test")
    self.testuser_name = self.testuser["musicbrainz_id"]
    self.testuser_id = self.testuser["id"]
Example #2
def setUp(self):
    super(TimescaleWriterTestCase, self).setUp()
    self.ls = TimescaleListenStore(
        {
            'REDIS_HOST': config.REDIS_HOST,
            'REDIS_PORT': config.REDIS_PORT,
            'REDIS_NAMESPACE': config.REDIS_NAMESPACE,
            'SQLALCHEMY_TIMESCALE_URI': config.SQLALCHEMY_TIMESCALE_URI
        }, self.app.logger)
    self.rs = RedisListenStore(
        self.app.logger, {
            'REDIS_HOST': config.REDIS_HOST,
            'REDIS_PORT': config.REDIS_PORT,
            'REDIS_NAMESPACE': config.REDIS_NAMESPACE
        })
Example #3
def get_listen_events(
    db_conn: TimescaleListenStore,
    musicbrainz_ids: List[str],
    min_ts: int,
    max_ts: int,
    count: int,
    time_range: int,
) -> List[APITimelineEvent]:
    """ Gets all listen events in the feed.
    """

    # NOTE: For now, we fetch a batch of listens for the users the current
    # user is following and keep at most two of them per user. This could be
    # done better by writing a more complex query that returns exactly 2
    # listens per user, but I'm happy with this heuristic for now and we can
    # change it later.
    listens = db_conn.fetch_listens_for_multiple_users_from_storage(
        musicbrainz_ids,
        limit=count,
        from_ts=min_ts,
        to_ts=max_ts,
        time_range=time_range,
        order=0,  # descending
    )

    user_listens_map = defaultdict(list)
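    # bucket listens by user, capping each bucket at MAX_LISTEN_EVENTS_PER_USER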
    for listen in listens:
        if len(user_listens_map[listen.user_name]) < MAX_LISTEN_EVENTS_PER_USER:
            user_listens_map[listen.user_name].append(listen)

    events = []
    for user in user_listens_map:
        for listen in user_listens_map[user]:
            try:
                listen_dict = listen.to_api()
                listen_dict['inserted_at'] = listen_dict['inserted_at'].timestamp()
                api_listen = APIListen(**listen_dict)
                events.append(
                    APITimelineEvent(
                        event_type=UserTimelineEventType.LISTEN,
                        user_name=api_listen.user_name,
                        created=api_listen.listened_at,
                        metadata=api_listen,
                    ))
            except pydantic.ValidationError as e:
                current_app.logger.error('Validation error: ' + str(e),
                                         exc_info=True)
                continue

    return events
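For context, a hypothetical call site for get_listen_events, assuming an active Flask app context; the user names, the time window, and obtaining the store via webserver.create_timescale are illustrative assumptions rather than part of the original example:

import time

# Hypothetical call site; assumes an active Flask app context.
db_conn = webserver.create_timescale(current_app)
events = get_listen_events(
    db_conn,
    musicbrainz_ids=["alice", "bob"],  # users the current user follows
    min_ts=0,
    max_ts=int(time.time()),
    count=50,  # listens fetched before the per-user cap is applied
    time_range=5,
)
# returns at most MAX_LISTEN_EVENTS_PER_USER events per followed user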
Example #4
def init_timescale_connection(logger, conf):
    global _ts
    while True:
        try:
            _ts = TimescaleListenStore(conf, logger)
            break
        except Exception as e:
            logger.error(
                "Couldn't create TimescaleListenStore instance: {}, sleeping and trying again..."
                .format(str(e)),
                exc_info=True)
            time.sleep(2)

    return _ts
Example #5
def init_timescale_connection(app):
    global _ts

    if not app.config.get("SQLALCHEMY_TIMESCALE_URI"):
        return

    while True:
        try:
            _ts = TimescaleListenStore(app.logger)
            break
        except Exception:
            app.logger.error(
                "Couldn't create TimescaleListenStore instance, sleeping and trying again...",
                exc_info=True)
            time.sleep(2)

    return _ts
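Both variants above block until TimescaleDB is reachable, cache the store in the module-level _ts global, and return it. A minimal startup sketch, assuming the newer app-based variant:

# Hypothetical wiring at service startup.
app = create_app()
listenstore = init_timescale_connection(app)  # retries every 2 seconds until the store connects
# returns None if SQLALCHEMY_TIMESCALE_URI is not configured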
Example #6
    def start(self):
        app = create_app()
        with app.app_context():
            current_app.logger.info("timescale-writer init")
            self._verify_hosts_in_config()

            if "SQLALCHEMY_TIMESCALE_URI" not in current_app.config:
                current_app.logger.critical(
                    "Timescale service not defined. Sleeping {0} seconds and exiting."
                    .format(self.ERROR_RETRY_DELAY))
                sleep(self.ERROR_RETRY_DELAY)
                sys.exit(-1)

            try:
                while True:
                    try:
                        self.ls = TimescaleListenStore(
                            {
                                'REDIS_HOST': current_app.config['REDIS_HOST'],
                                'REDIS_PORT': current_app.config['REDIS_PORT'],
                                'REDIS_NAMESPACE': current_app.config['REDIS_NAMESPACE'],
                                'SQLALCHEMY_TIMESCALE_URI': current_app.config['SQLALCHEMY_TIMESCALE_URI']
                            },
                            logger=current_app.logger)
                        break
                    except Exception as err:
                        current_app.logger.error(
                            "Cannot connect to timescale: %s. Retrying in %s seconds." %
                            (str(err), self.ERROR_RETRY_DELAY),
                            exc_info=True)
                        sleep(self.ERROR_RETRY_DELAY)

                while True:
                    try:
                        self.redis = Redis(
                            host=current_app.config['REDIS_HOST'],
                            port=current_app.config['REDIS_PORT'],
                            decode_responses=True)
                        self.redis.ping()
                        self.redis_listenstore = RedisListenStore(
                            current_app.logger, current_app.config)
                        break
                    except Exception as err:
                        current_app.logger.error(
                            "Cannot connect to redis: %s. Retrying in %s seconds." %
                            (str(err), self.ERROR_RETRY_DELAY),
                            exc_info=True)
                        sleep(self.ERROR_RETRY_DELAY)

                while True:
                    self.connect_to_rabbitmq()
                    self.incoming_ch = self.connection.channel()
                    self.incoming_ch.exchange_declare(
                        exchange=current_app.config['INCOMING_EXCHANGE'],
                        exchange_type='fanout')
                    self.incoming_ch.queue_declare(
                        current_app.config['INCOMING_QUEUE'], durable=True)
                    self.incoming_ch.queue_bind(
                        exchange=current_app.config['INCOMING_EXCHANGE'],
                        queue=current_app.config['INCOMING_QUEUE'])
                    self.incoming_ch.basic_consume(
                        queue=current_app.config['INCOMING_QUEUE'],
                        on_message_callback=lambda ch, method, properties, body:
                            self.static_callback(ch, method, properties, body, obj=self))

                    self.unique_ch = self.connection.channel()
                    self.unique_ch.exchange_declare(
                        exchange=current_app.config['UNIQUE_EXCHANGE'],
                        exchange_type='fanout')

                    try:
                        self.incoming_ch.start_consuming()
                    except pika.exceptions.ConnectionClosed:
                        current_app.logger.warning(
                            "Connection to rabbitmq closed. Re-opening.",
                            exc_info=True)
                        self.connection = None
                        continue

                    self.connection.close()

            except Exception:
                current_app.logger.error("failed to start timescale loop:",
                                         exc_info=True)
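The basic_consume call above funnels each delivery through a lambda into static_callback. A stripped-down sketch of the same consume pattern in plain pika (>= 1.0), with a hypothetical queue name and callback body:

import pika

# Hypothetical, minimal version of the consume loop used in start().
connection = pika.BlockingConnection(pika.ConnectionParameters(host="localhost"))
channel = connection.channel()
channel.queue_declare(queue="incoming", durable=True)

def callback(ch, method, properties, body):
    print("received:", body)  # a real consumer would write the batch to the store
    ch.basic_ack(delivery_tag=method.delivery_tag)

channel.basic_consume(queue="incoming", on_message_callback=callback)
channel.start_consuming()  # blocks; raises if the connection drops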
Example #7
def setUp(self):
    DatabaseTestCase.setUp(self)
    TimescaleTestCase.setUp(self)
    self.log = logging.getLogger(__name__)
    self.app = create_app()
    self.logstore = TimescaleListenStore(self.log)
Example #8
class TestTimescaleUtils(DatabaseTestCase, TimescaleTestCase):
    def setUp(self):
        DatabaseTestCase.setUp(self)
        TimescaleTestCase.setUp(self)
        self.log = logging.getLogger(__name__)
        self.app = create_app()
        self.logstore = TimescaleListenStore(self.log)

    def tearDown(self):
        self.logstore = None
        DatabaseTestCase.tearDown(self)
        TimescaleTestCase.tearDown(self)
        cache._r.flushdb()

    def _create_test_data(self, user, file=None):
        test_data = create_test_data_for_timescalelistenstore(
            user["musicbrainz_id"], user["id"], file)
        self.logstore.insert(test_data)
        return len(test_data)

    def _get_count_and_timestamp(self, user):
        with timescale.engine.connect() as connection:
            result = connection.execute(
                text("""
                    SELECT count, min_listened_at, max_listened_at
                      FROM listen_user_metadata
                     WHERE user_id = :user_id
                """),
                user_id=user["id"])
            return dict(**result.fetchone())

    def test_delete_listens_update_metadata(self):
        user_1 = db_user.get_or_create(1, "user_1")
        user_2 = db_user.get_or_create(2, "user_2")
        recalculate_all_user_data()

        self._create_test_data(user_1)
        self._create_test_data(user_2)
        update_user_listen_data()

        metadata_1 = self._get_count_and_timestamp(user_1)
        self.assertEqual(metadata_1["min_listened_at"], 1400000000)
        self.assertEqual(metadata_1["max_listened_at"], 1400000200)
        self.assertEqual(metadata_1["count"], 5)

        metadata_2 = self._get_count_and_timestamp(user_2)
        self.assertEqual(metadata_2["min_listened_at"], 1400000000)
        self.assertEqual(metadata_2["max_listened_at"], 1400000200)
        self.assertEqual(metadata_2["count"], 5)

        # test the case where the update script has not run since this listen was
        # inserted: the metadata in listen_user_metadata does not account for it
        # yet, so deleting it should not affect the metadata either.
        self._create_test_data(user_1,
                               "timescale_listenstore_test_listens_2.json")
        self.logstore.delete_listen(1400000500, user_1["id"],
                                    "4269ddbc-9241-46da-935d-4fa9e0f7f371")

        # test min_listened_at is updated if that listen is deleted for a user
        self.logstore.delete_listen(1400000000, user_1["id"],
                                    "4269ddbc-9241-46da-935d-4fa9e0f7f371")
        # test max_listened_at is updated if that listen is deleted for a user
        self.logstore.delete_listen(1400000200, user_1["id"],
                                    "4269ddbc-9241-46da-935d-4fa9e0f7f371")
        # test normal listen delete updates correctly
        self.logstore.delete_listen(1400000100, user_2["id"],
                                    "4269ddbc-9241-46da-935d-4fa9e0f7f371")

        delete_listens()

        metadata_1 = self._get_count_and_timestamp(user_1)
        self.assertEqual(metadata_1["min_listened_at"], 1400000050)
        self.assertEqual(metadata_1["max_listened_at"], 1400000150)
        self.assertEqual(metadata_1["count"], 3)

        metadata_2 = self._get_count_and_timestamp(user_2)
        self.assertEqual(metadata_2["min_listened_at"], 1400000000)
        self.assertEqual(metadata_2["max_listened_at"], 1400000200)
        self.assertEqual(metadata_2["count"], 4)
Example #9
class TimescaleWriterTestCase(IntegrationTestCase):
    def setUp(self):
        super(TimescaleWriterTestCase, self).setUp()
        self.ls = TimescaleListenStore(
            {
                'REDIS_HOST': config.REDIS_HOST,
                'REDIS_PORT': config.REDIS_PORT,
                'REDIS_NAMESPACE': config.REDIS_NAMESPACE,
                'SQLALCHEMY_TIMESCALE_URI': config.SQLALCHEMY_TIMESCALE_URI
            }, self.app.logger)
        self.rs = RedisListenStore(
            self.app.logger, {
                'REDIS_HOST': config.REDIS_HOST,
                'REDIS_PORT': config.REDIS_PORT,
                'REDIS_NAMESPACE': config.REDIS_NAMESPACE
            })

    def tearDown(self):
        self.rs.redis.flushall()

    def send_listen(self, user, filename):
        with open(self.path_to_data_file(filename)) as f:
            payload = json.load(f)
        return self.client.post(
            url_for('api_v1.submit_listen'),
            data=json.dumps(payload),
            headers={'Authorization': 'Token {}'.format(user['auth_token'])},
            content_type='application/json')

    def test_dedup(self):

        user = db_user.get_or_create(
            1, 'testtimescaleuser %d' % randint(1, 50000))

        # send the same listen twice
        r = self.send_listen(user, 'valid_single.json')
        self.assert200(r)
        time.sleep(2)
        r = self.send_listen(user, 'valid_single.json')
        self.assert200(r)
        time.sleep(2)

        to_ts = int(time.time())
        listens = self.ls.fetch_listens(user['musicbrainz_id'],
                                        to_ts=to_ts,
                                        time_range=-1)
        self.assertEqual(len(listens), 1)

        recent = self.rs.get_recent_listens(4)
        self.assertEqual(len(recent), 1)
        self.assertIsInstance(recent[0], Listen)

    def test_update_listen_count_per_day(self):
        """ Tests that timescale writer updates the listen count for the
        day in redis for each successful batch written
        """
        user = db_user.get_or_create(
            1, 'testtimescaleuser %d' % randint(1, 50000))
        r = self.send_listen(user, 'valid_single.json')
        self.assert200(r)
        time.sleep(2)

        self.assertEqual(1,
                         self.rs.get_listen_count_for_day(datetime.utcnow()))

    def test_dedup_user_special_characters(self):

        user = db_user.get_or_create(2, 'i have a\\weird\\user, name"\n')

        # send the same listen twice
        r = self.send_listen(user, 'valid_single.json')
        self.assert200(r)
        time.sleep(2)
        r = self.send_listen(user, 'valid_single.json')
        self.assert200(r)
        time.sleep(2)

        to_ts = int(time.time())
        listens = self.ls.fetch_listens(user['musicbrainz_id'],
                                        to_ts=to_ts,
                                        time_range=-1)
        self.assertEqual(len(listens), 1)

    def test_dedup_same_batch(self):

        user = db_user.get_or_create(3, 'phifedawg')
        r = self.send_listen(user, 'same_batch_duplicates.json')
        self.assert200(r)
        time.sleep(2)

        to_ts = int(time.time())
        listens = self.ls.fetch_listens(user['musicbrainz_id'],
                                        to_ts=to_ts,
                                        time_range=-1)
        self.assertEqual(len(listens), 1)

    def test_dedup_different_users(self):
        """
        Test to make sure timescale writer doesn't confuse listens with same timestamps
        but different users to be duplicates
        """

        user1 = db_user.get_or_create(1, 'testuser1')
        user2 = db_user.get_or_create(2, 'testuser2')

        r = self.send_listen(user1, 'valid_single.json')
        self.assert200(r)
        r = self.send_listen(user2, 'valid_single.json')
        self.assert200(r)

        time.sleep(2)  # sleep to allow timescale-writer to do its thing

        to_ts = int(time.time())
        listens = self.ls.fetch_listens(user1['musicbrainz_id'],
                                        to_ts=to_ts,
                                        time_range=-1)
        self.assertEqual(len(listens), 1)

        listens = self.ls.fetch_listens(user2['musicbrainz_id'],
                                        to_ts=to_ts,
                                        time_range=-1)
        self.assertEqual(len(listens), 1)

    def test_dedup_same_timestamp_different_tracks(self):
        """ Test to check that if there are two tracks w/ the same timestamp,
            they don't get considered as duplicates
        """

        user = db_user.get_or_create(1, 'difftracksametsuser')

        # send four different tracks with the same timestamp
        r = self.send_listen(user, 'valid_single.json')
        self.assert200(r)

        r = self.send_listen(user,
                             'same_timestamp_diff_track_valid_single.json')
        self.assert200(r)

        r = self.send_listen(user,
                             'same_timestamp_diff_track_valid_single_2.json')
        self.assert200(r)

        r = self.send_listen(user,
                             'same_timestamp_diff_track_valid_single_3.json')
        self.assert200(r)
        time.sleep(2)

        to_ts = int(time.time())
        listens = self.ls.fetch_listens(user['musicbrainz_id'],
                                        to_ts=to_ts,
                                        time_range=-1)
        self.assertEqual(len(listens), 4)
Example #10
class TestDumpListenStore(DatabaseTestCase, TimescaleTestCase):
    def setUp(self):
        DatabaseTestCase.setUp(self)
        TimescaleTestCase.setUp(self)
        self.app = create_app()
        self.logstore = TimescaleListenStore(self.app.logger)
        self.dumpstore = DumpListenStore(self.app)
        self.testuser = db_user.get_or_create(1, "test")
        self.testuser_name = self.testuser["musicbrainz_id"]
        self.testuser_id = self.testuser["id"]

    def tearDown(self):
        self.logstore = None
        self.dumpstore = None
        DatabaseTestCase.tearDown(self)
        TimescaleTestCase.tearDown(self)

    def _create_test_data(self, user_name, user_id, test_data_file_name=None):
        test_data = create_test_data_for_timescalelistenstore(
            user_name, user_id, test_data_file_name)
        self.logstore.insert(test_data)
        return len(test_data)

    def _insert_with_created(self, listens):
        """ Insert a batch of listens with 'created' field.
        """
        submit = []
        for listen in listens:
            submit.append((*listen.to_timescale(), listen.inserted_timestamp))

        query = """INSERT INTO listen (listened_at, track_name, user_name, user_id, data, created)
                        VALUES %s
                   ON CONFLICT (listened_at, track_name, user_id)
                    DO NOTHING
                """

        conn = timescale.engine.raw_connection()
        with conn.cursor() as curs:
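            # execute_values expands the single "VALUES %s" placeholder into one
            # tuple per row of `submit`, writing the whole batch in a single round
            # trip; the ON CONFLICT clause silently skips exact duplicates.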
            execute_values(curs, query, submit, template=None)

        conn.commit()

    def test_dump_listens(self):
        self._create_test_data(self.testuser_name, self.testuser_id)
        temp_dir = tempfile.mkdtemp()
        dump = self.dumpstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            end_time=datetime.now(),
        )
        self.assertTrue(os.path.isfile(dump))
        shutil.rmtree(temp_dir)

    def test_incremental_dump(self):
        base = 1500000000
        # generate 5 listens with inserted_ts base+1 .. base+5
        listens = generate_data(1, self.testuser_name, base - 4, 5, base + 1)
        self._insert_with_created(listens)
        # generate 5 listens with inserted_ts base+6 .. base+10
        listens = generate_data(1, self.testuser_name, base + 1, 5, base + 6)
        self._insert_with_created(listens)
        temp_dir = tempfile.mkdtemp()
        dump_location = self.dumpstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            start_time=datetime.utcfromtimestamp(base + 6),
            end_time=datetime.utcfromtimestamp(base + 10))
        self.assertTrue(os.path.isfile(dump_location))

        self.reset_timescale_db()
        self.logstore.import_listens_dump(dump_location)
        recalculate_all_user_data()

        listens, min_ts, max_ts = self.logstore.fetch_listens(
            user=self.testuser, to_ts=base + 11)
        self.assertEqual(len(listens), 4)
        self.assertEqual(listens[0].ts_since_epoch, base + 5)
        self.assertEqual(listens[1].ts_since_epoch, base + 4)
        self.assertEqual(listens[2].ts_since_epoch, base + 3)
        self.assertEqual(listens[3].ts_since_epoch, base + 2)

        shutil.rmtree(temp_dir)

    def test_time_range_full_dumps(self):
        base = 1500000000
        # generate 5 listens with listened_at base+1 .. base+5
        listens = generate_data(1, self.testuser_name, base + 1, 5)
        self.logstore.insert(listens)
        # generate 5 listens with listened_at base+6 .. base+10
        listens = generate_data(1, self.testuser_name, base + 6, 5)
        self.logstore.insert(listens)
        temp_dir = tempfile.mkdtemp()
        dump_location = self.dumpstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            end_time=datetime.utcfromtimestamp(base + 5))
        self.assertTrue(os.path.isfile(dump_location))

        self.reset_timescale_db()
        self.logstore.import_listens_dump(dump_location)
        recalculate_all_user_data()

        listens, min_ts, max_ts = self.logstore.fetch_listens(
            user=self.testuser, to_ts=base + 11)
        self.assertEqual(len(listens), 5)
        self.assertEqual(listens[0].ts_since_epoch, base + 5)
        self.assertEqual(listens[1].ts_since_epoch, base + 4)
        self.assertEqual(listens[2].ts_since_epoch, base + 3)
        self.assertEqual(listens[3].ts_since_epoch, base + 2)
        self.assertEqual(listens[4].ts_since_epoch, base + 1)

    # tests test_full_dump_listen_with_no_created and
    # test_incremental_dumps_listen_with_no_created have been removed because
    # with timescale all missing inserted timestamps will have been assigned
    # sane created timestamps by the migration script, and timescale will not
    # allow blank created timestamps, so those tests are pointless

    def test_import_listens(self):
        self._create_test_data(self.testuser_name, self.testuser_id)
        temp_dir = tempfile.mkdtemp()
        dump_location = self.dumpstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            end_time=datetime.now(),
        )
        self.assertTrue(os.path.isfile(dump_location))

        self.reset_timescale_db()
        self.logstore.import_listens_dump(dump_location)
        recalculate_all_user_data()

        listens, min_ts, max_ts = self.logstore.fetch_listens(
            user=self.testuser, to_ts=1400000300)
        self.assertEqual(len(listens), 5)
        self.assertEqual(listens[0].ts_since_epoch, 1400000200)
        self.assertEqual(listens[1].ts_since_epoch, 1400000150)
        self.assertEqual(listens[2].ts_since_epoch, 1400000100)
        self.assertEqual(listens[3].ts_since_epoch, 1400000050)
        self.assertEqual(listens[4].ts_since_epoch, 1400000000)
        shutil.rmtree(temp_dir)

    def test_dump_and_import_listens_escaped(self):
        user = db_user.get_or_create(3, 'i have a\\weird\\user, na/me"\n')
        self._create_test_data(user['musicbrainz_id'], user['id'])

        self._create_test_data(self.testuser_name, self.testuser_id)

        temp_dir = tempfile.mkdtemp()
        dump_location = self.dumpstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            end_time=datetime.now(),
        )
        self.assertTrue(os.path.isfile(dump_location))

        self.reset_timescale_db()
        self.logstore.import_listens_dump(dump_location)
        recalculate_all_user_data()

        listens, min_ts, max_ts = self.logstore.fetch_listens(user=user,
                                                              to_ts=1400000300)
        self.assertEqual(len(listens), 5)
        self.assertEqual(listens[0].ts_since_epoch, 1400000200)
        self.assertEqual(listens[1].ts_since_epoch, 1400000150)
        self.assertEqual(listens[2].ts_since_epoch, 1400000100)
        self.assertEqual(listens[3].ts_since_epoch, 1400000050)
        self.assertEqual(listens[4].ts_since_epoch, 1400000000)

        listens, min_ts, max_ts = self.logstore.fetch_listens(
            user=self.testuser, to_ts=1400000300)
        self.assertEqual(len(listens), 5)
        self.assertEqual(listens[0].ts_since_epoch, 1400000200)
        self.assertEqual(listens[1].ts_since_epoch, 1400000150)
        self.assertEqual(listens[2].ts_since_epoch, 1400000100)
        self.assertEqual(listens[3].ts_since_epoch, 1400000050)
        self.assertEqual(listens[4].ts_since_epoch, 1400000000)
        shutil.rmtree(temp_dir)

    # the test test_import_dump_many_users is gone -- why are we testing user dump/restore here??

    def create_test_dump(self,
                         archive_name,
                         archive_path,
                         schema_version=None):
        """ Creates a test dump to test the import listens functionality.
        Args:
            archive_name (str): the name of the archive
            archive_path (str): the full path to the archive
            schema_version (int): the version of the schema to be written into SCHEMA_SEQUENCE
                                  if not provided, the SCHEMA_SEQUENCE file is not added to the archive
        Returns:
            the full path to the archive created
        """

        temp_dir = tempfile.mkdtemp()
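        # 'w|xz' writes the tar as an xz-compressed stream (no seeking required)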
        with tarfile.open(archive_path, mode='w|xz') as tar:
            schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
            with open(schema_version_path, 'w') as f:
                f.write(str(schema_version or ' '))
            tar.add(schema_version_path,
                    arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

        return archive_path

    def test_schema_mismatch_exception_for_dump_incorrect_schema(self):
        """ Tests that SchemaMismatchException is raised when the schema of the dump is old """

        # create a temp archive with an incorrect schema version in SCHEMA_SEQUENCE
        temp_dir = tempfile.mkdtemp()
        archive_name = 'temp_dump'
        archive_path = os.path.join(temp_dir, archive_name + '.tar.xz')
        archive_path = self.create_test_dump(
            archive_name=archive_name,
            archive_path=archive_path,
            schema_version=LISTENS_DUMP_SCHEMA_VERSION - 1)
        with self.assertRaises(SchemaMismatchException):
            self.logstore.import_listens_dump(archive_path)

    def test_schema_mismatch_exception_for_dump_no_schema(self):
        """ Tests that SchemaMismatchException is raised when there is no schema version in the archive """

        temp_dir = tempfile.mkdtemp()
        archive_name = 'temp_dump'
        archive_path = os.path.join(temp_dir, archive_name + '.tar.xz')

        archive_path = self.create_test_dump(archive_name=archive_name,
                                             archive_path=archive_path)

        with self.assertRaises(SchemaMismatchException):
            self.logstore.import_listens_dump(archive_path)