def test_to_influx(self):
        listen = Listen(
            timestamp=int(time.time()),
            user_name='testuser',
            artist_msid=uuid.uuid4(),
            recording_msid=uuid.uuid4(),
            dedup_tag=3,
            data={
                'artist_name': 'Radiohead',
                'track_name': 'True Love Waits',
                'additional_info': {
                    'release_type': ["ALBUM", "REMIX"],
                }
            }
        )

        data = listen.to_influx(quote(listen.user_name))

        # Make sure every value that we don't explicitly support is a string
        for key in data['fields']:
            if key not in Listen.SUPPORTED_KEYS and key not in Listen.PRIVATE_KEYS:
                self.assertIsInstance(data['fields'][key], str)

        # Check values
        self.assertEqual(data['measurement'], quote(listen.user_name))
        self.assertEqual(data['time'], listen.ts_since_epoch)
        self.assertEqual(data['tags']['dedup_tag'], listen.dedup_tag)
        self.assertEqual(data['fields']['user_name'], listen.user_name)
        self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
        self.assertEqual(data['fields']['recording_msid'], listen.recording_msid)
        self.assertEqual(data['fields']['track_name'], listen.data['track_name'])
        self.assertEqual(data['fields']['artist_name'], listen.data['artist_name'])

        self.assertIn('inserted_timestamp', data['fields'])
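
For reference, the rough shape of the point dictionary that the assertions above expect to_influx() to return can be sketched as follows. This is an illustration inferred from the test, not the actual implementation, and all concrete values are placeholders.

# Illustrative only: field and tag names are taken from the assertions above,
# the values are made up.
expected_point = {
    'measurement': 'testuser',               # quote(listen.user_name)
    'time': 1514764800,                      # listen.ts_since_epoch
    'tags': {
        'dedup_tag': 3,                      # listen.dedup_tag
    },
    'fields': {
        'user_name': 'testuser',
        'artist_msid': '<artist_msid>',      # compared against listen.artist_msid
        'recording_msid': '<recording_msid>',
        'track_name': 'True Love Waits',
        'artist_name': 'Radiohead',
        'inserted_timestamp': 1514764800,    # presence asserted, value not checked
    },
}
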
    def test_to_influx(self):
        listen = Listen(timestamp=int(time.time()),
                        user_name='testuser',
                        artist_msid=uuid.uuid4(),
                        recording_msid=uuid.uuid4(),
                        data={
                            'artist_name': 'Radiohead',
                            'track_name': 'True Love Waits',
                            'additional_info': {
                                'release_type': ["ALBUM", "REMIX"],
                            }
                        })

        data = listen.to_influx(quote(listen.user_name))

        # Make sure every value that we don't explicitly support is a string
        for key in data['fields']:
            if key not in Listen.SUPPORTED_KEYS:
                self.assertIsInstance(data['fields'][key], str)

        # Check values
        self.assertEqual(data['measurement'], quote(listen.user_name))
        self.assertEqual(data['time'], listen.ts_since_epoch)
        self.assertEqual(data['tags']['user_name'], listen.user_name)
        self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
        self.assertEqual(data['fields']['recording_msid'],
                         listen.recording_msid)
        self.assertEqual(data['fields']['track_name'],
                         listen.data['track_name'])
        self.assertEqual(data['fields']['artist_name'],
                         listen.data['artist_name'])
    def insert(self, listens):
        """ Insert a batch of listens.
        """

        submit = []
        user_names = {}
        for listen in listens:
            user_names[listen.user_name] = 1
            submit.append(listen.to_influx(quote(listen.user_name)))

        if not self.influx.write_points(submit, time_precision='s'):
            self.log.error(
                "Cannot write data to influx. (write_points returned False)")

        # If we reach this point, we were able to write the listens to the InfluxListenStore.
        # So update the listen counts of the users cached in redis.
        for data in submit:
            user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT,
                                     data['tags']['user_name'])
            if self.redis.exists(user_key):
                self.redis.incr(user_key)

        # Invalidate cached data for user
        for user_name in user_names.keys():
            self.redis.delete(REDIS_USER_TIMESTAMPS % user_name)
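
A minimal usage sketch for the insert() method above, assuming listen_store is an already-configured instance of the listen store class these methods belong to (the variable name and the import path below are assumptions):

import time
import uuid

from listenbrainz.listen import Listen  # assumed import path

batch = [
    Listen(
        timestamp=int(time.time()),
        user_name='testuser',
        artist_msid=uuid.uuid4(),
        recording_msid=uuid.uuid4(),
        data={
            'artist_name': 'Radiohead',
            'track_name': 'True Love Waits',
            'additional_info': {},
        },
    ),
]

# Writes the batch to InfluxDB and bumps the cached per-user listen counts.
listen_store.insert(batch)
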
Example #4
    def import_listens_dump(self, archive_path, threads=None):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression (defaults to 1)
        """

        self.log.info('Beginning import of listens from dump %s...',
                      archive_path)

        pxz_command = ['pxz', '--decompress', '--stdout', archive_path]
        if threads is not None:
            pxz_command.append('-T {threads}'.format(threads=threads))
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]

                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info(
                        'Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException(
                            'Incorrect schema version! Expected: %d, got: %d. '
                            'Please ensure that the data dump version matches the code version '
                            'in order to import the data.' %
                            (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))

                elif file_name.endswith('.listens'):

                    # remove .listens from the filename to get the username
                    user_name = file_name[:-8]
                    self.log.info('Importing user %s', user_name)
                    listens = []
                    listen_count = 0

                    # iterate through files and keep writing listens in chunks
                    for listen in tar.extractfile(member):
                        influx_listen = Listen.from_json(
                            ujson.loads(listen)).to_influx(quote(user_name))
                        listens.append(influx_listen)
                        listen_count += 1

                        if listen_count > DUMP_CHUNK_SIZE:
                            self.write_points_to_db(listens)
                            listen_count = 0
                            listens = []

                    # if some listens are left, write them to db
                    if listen_count > 0:
                        self.write_points_to_db(listens)

        self.log.info('Import of listens from dump %s done!', archive_path)
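
A minimal usage sketch for import_listens_dump() above; the store variable and the archive path are made up for illustration:

# Streams the archive through pxz and writes the contained listens to InfluxDB
# in chunks of DUMP_CHUNK_SIZE.
listen_store.import_listens_dump(
    '/path/to/listenbrainz-listens-dump.tar.xz',  # hypothetical dump location
    threads=4,
)
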
    def insert(self, listens):
        """ Insert a batch of listens.
        """

        submit = []
        user_names = {}
        for listen in listens:
            user_names[listen.user_name] = 1
            submit.append(listen.to_influx(quote(listen.user_name)))

        if not self.influx.write_points(submit, time_precision='s'):
            self.log.error(
                "Cannot write data to influx. (write_points returned False), data=%s",
                json.dumps(submit, indent=3))

        # If we reach this point, we were able to write the listens to the InfluxListenStore.
        # So update the listen counts of the users cached in brainzutils cache.
        for data in submit:
            user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT,
                                     data['fields']['user_name'])

            cached_count = cache.get(user_key, decode=False)
            if cached_count:
                cache.increment(user_key)

        # Invalidate cached data for user
        for user_name in user_names.keys():
            cache.delete(REDIS_USER_TIMESTAMPS % user_name)

        if len(listens):
            # Enter a measurement to count items inserted
            submit = [{
                'measurement': TEMP_COUNT_MEASUREMENT,
                'tags': {
                    COUNT_MEASUREMENT_NAME: len(listens)
                },
                'fields': {
                    COUNT_MEASUREMENT_NAME: len(listens)
                }
            }]
            try:
                if not self.influx.write_points(submit):
                    self.log.error(
                        "Cannot write listen count to influx. (write_points returned False)"
                    )
            except (InfluxDBServerError, InfluxDBClientError,
                    ValueError) as err:
                self.log.error("Cannot write data to influx: %s, data: %s",
                               str(err),
                               json.dumps(submit, indent=3),
                               exc_info=True)
                raise
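
As a companion sketch, the per-user count that insert() keeps up to date in the brainzutils cache could be read back as shown below; the helper name is hypothetical and only the cache calls and key prefix already used above are assumed:

def get_cached_listen_count(user_name):
    # Hypothetical helper: return the cached listen count for a user,
    # or None if nothing has been cached yet.
    user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
    cached_count = cache.get(user_key, decode=False)
    return int(cached_count) if cached_count else None
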
Example #6
    def test_incremental_dumps_listen_with_no_insert_timestamp(self):
        """ Incremental dumps should only consider listens that have an
        inserted_timestamp.
        """
        t = datetime.now()
        sleep(1)
        listens = generate_data(1, self.testuser_name, 1, 5)

        # insert these listens into influx without an insert_timestamp
        influx_rows = [
            listen.to_influx(quote(self.testuser_name)) for listen in listens
        ]
        for row in influx_rows[1:]:
            row['fields'].pop('inserted_timestamp')

        self.logstore.write_points_to_db(influx_rows)
        sleep(1)
        listens_from_influx = self.logstore.fetch_listens(
            user_name=self.testuser_name, to_ts=11)
        self.assertEqual(len(listens_from_influx), 5)

        # incremental dump (with a start time) should not contain these listens
        temp_dir = tempfile.mkdtemp()
        dump_location = self.logstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            start_time=t,
            end_time=datetime.now(),
        )
        spark_dump_location = self.logstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            start_time=t,
            end_time=datetime.now(),
            spark_format=True,
        )
        self.assertTrue(os.path.isfile(dump_location))
        self.reset_influx_db()
        sleep(1)
        self.logstore.import_listens_dump(dump_location)
        sleep(1)
        listens_from_influx = self.logstore.fetch_listens(
            user_name=self.testuser_name, to_ts=11)
        self.assertEqual(len(listens_from_influx), 1)
        self.assert_spark_dump_contains_listens(spark_dump_location, 1)
        shutil.rmtree(temp_dir)
Example #7
    def test_full_dump_listen_with_no_insert_timestamp(self):
        """ We have listens with no `inserted_timestamp` in the production
        database, so full dumps should always be able to dump these listens
        as well. This test verifies that.
        """
        listens = generate_data(1, self.testuser_name, 1, 5)

        # insert these listens into influx without an insert_timestamp
        influx_rows = [
            listen.to_influx(quote(self.testuser_name)) for listen in listens
        ]
        for row in influx_rows[1:]:
            row['fields'].pop('inserted_timestamp')

        t = datetime.now()
        self.logstore.write_points_to_db(influx_rows)
        sleep(1)
        listens_from_influx = self.logstore.fetch_listens(
            user_name=self.testuser_name, to_ts=11)
        self.assertEqual(len(listens_from_influx), 5)

        # full dump (with no start time) should contain these listens
        temp_dir = tempfile.mkdtemp()
        dump_location = self.logstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            end_time=datetime.now(),
        )
        spark_dump_location = self.logstore.dump_listens(
            location=temp_dir,
            dump_id=1,
            end_time=datetime.now(),
            spark_format=True,
        )
        self.assertTrue(os.path.isfile(dump_location))
        self.reset_influx_db()
        sleep(1)
        self.logstore.import_listens_dump(dump_location)
        sleep(1)
        listens_from_influx = self.logstore.fetch_listens(
            user_name=self.testuser_name, to_ts=11)
        self.assertEqual(len(listens_from_influx), 5)
        self.assert_spark_dump_contains_listens(spark_dump_location, 5)
        shutil.rmtree(temp_dir)
Example #8
    def copy_measurement(self, src, dest, apply_filter=False):
        done = False
        offset = 0
        while True:
            result = self.ls.get_listens_batch_for_dump(
                src, self.max_time, offset)
            rows = []
            count = 0
            for row in result.get_points(get_measurement_name(src)):
                count += 1
                if apply_filter:
                    row = self.filter_function(row)
                if row:
                    rows.append(
                        self.convert_to_influx_insert_format(row, quote(dest)))
            self.ls.write_points_to_db(rows)
            offset += DUMP_CHUNK_SIZE
            if count == 0:
                break
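
copy_measurement() above delegates row filtering to self.filter_function when apply_filter is set, and keeps a row only if that call returns something truthy. A hypothetical filter compatible with that contract might look like this (the artist_msid check is an invented example condition):

    def filter_function(self, row):
        # Sketch of a filter: return the row to keep it, or None to drop it.
        if not row.get('artist_msid'):
            return None
        return row
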
    def import_listens_dump(self,
                            archive_path,
                            threads=DUMP_DEFAULT_THREAD_COUNT):
        """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

        Args:
            archive_path (str): the path to the listens dump .tar.xz archive to be imported
            threads (int): the number of threads to be used for decompression
                           (defaults to DUMP_DEFAULT_THREAD_COUNT)

        Returns:
            int: the number of users for whom listens have been imported
        """

        self.log.info('Beginning import of listens from dump %s...',
                      archive_path)

        # construct the pxz command to decompress the archive
        pxz_command = [
            'pxz', '--decompress', '--stdout', archive_path,
            '-T{threads}'.format(threads=threads)
        ]

        # run the command once to ensure schema version is correct
        # and load the index
        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        index = None
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            schema_check_done = False
            index_loaded = False
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name == 'SCHEMA_SEQUENCE':
                    self.log.info(
                        'Checking if schema version of dump matches...')
                    schema_seq = int(tar.extractfile(member).read().strip())
                    if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                        raise SchemaMismatchException(
                            'Incorrect schema version! Expected: %d, got: %d. '
                            'Please ensure that the data dump version matches the code version '
                            'in order to import the data.' %
                            (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                    schema_check_done = True

                elif file_name == 'index.json':
                    with tar.extractfile(member) as f:
                        index = ujson.load(f)
                    index_loaded = True

                if schema_check_done and index_loaded:
                    self.log.info(
                        'Schema version matched and index.json loaded!')
                    self.log.info('Starting import of listens...')
                    break
            else:
                raise SchemaMismatchException(
                    'Metadata files missing in dump, please ensure that the dump file is valid.'
                )

        # close pxz command and start over again, this time with the aim of importing all listens
        pxz.stdout.close()

        file_contents = defaultdict(list)
        for user, info in index.items():
            file_contents[info['file_name']].append({
                'user_name': user,
                'offset': info['offset'],
                'size': info['size'],
            })

        for file_name in file_contents:
            file_contents[file_name] = sorted(file_contents[file_name],
                                              key=lambda x: x['offset'])

        pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

        users_done = 0
        with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
            for member in tar:
                file_name = member.name.split('/')[-1]
                if file_name.endswith('.listens'):

                    file_name = file_name[:-8]
                    with tar.extractfile(member) as f:
                        for user in file_contents[file_name]:
                            self.log.info('Importing user %s...',
                                          user['user_name'])
                            assert (f.tell() == user['offset'])
                            bytes_read = 0
                            listens = []
                            while bytes_read < user['size']:
                                line = f.readline()
                                bytes_read += len(line)
                                listen = Listen.from_json(
                                    ujson.loads(line)).to_influx(
                                        quote(user['user_name']))
                                listens.append(listen)

                                if len(listens) > DUMP_CHUNK_SIZE:
                                    self.write_points_to_db(listens)
                                    listens = []

                            if len(listens) > 0:
                                self.write_points_to_db(listens)

                            self.log.info('Import of user %s done!',
                                          user['user_name'])
                            users_done += 1

        self.log.info('Import of listens from dump %s done!', archive_path)
        pxz.stdout.close()
        return users_done
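
For reference, the structure of index.json that this importer assumes, inferred from how index is consumed above; the user name, file name, and numbers below are made up:

index_example = {
    'some_user': {
        'file_name': '1',    # name of the .listens file, without the suffix
        'offset': 0,         # byte offset of this user's first listen in that file
        'size': 40960,       # number of bytes belonging to this user
    },
}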