def test_to_influx(self):
    listen = Listen(
        timestamp=int(time.time()),
        user_name='testuser',
        artist_msid=uuid.uuid4(),
        recording_msid=uuid.uuid4(),
        dedup_tag=3,
        data={
            'artist_name': 'Radiohead',
            'track_name': 'True Love Waits',
            'additional_info': {
                'release_type': ["ALBUM", "REMIX"],
            }
        }
    )
    data = listen.to_influx(quote(listen.user_name))

    # Make sure every value that we don't explicitly support is a string
    for key in data['fields']:
        if key not in Listen.SUPPORTED_KEYS and key not in Listen.PRIVATE_KEYS:
            self.assertIsInstance(data['fields'][key], str)

    # Check values
    self.assertEqual(data['measurement'], quote(listen.user_name))
    self.assertEqual(data['time'], listen.ts_since_epoch)
    self.assertEqual(data['tags']['dedup_tag'], listen.dedup_tag)
    self.assertEqual(data['fields']['user_name'], listen.user_name)
    self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
    self.assertEqual(data['fields']['recording_msid'], listen.recording_msid)
    self.assertEqual(data['fields']['track_name'], listen.data['track_name'])
    self.assertEqual(data['fields']['artist_name'], listen.data['artist_name'])
    self.assertIn('inserted_timestamp', data['fields'])
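# Rough shape of the dict returned by Listen.to_influx(), inferred from the
# assertions in the test above. This is a sketch, not the authoritative schema:
# only the keys asserted there are grounded; placeholder values and any other
# fields are assumptions.
#
# {
#     'measurement': '<quoted user_name>',
#     'time': <listen.ts_since_epoch>,
#     'tags': {'dedup_tag': 3},
#     'fields': {
#         'user_name': 'testuser',
#         'artist_msid': '<uuid>',
#         'recording_msid': '<uuid>',
#         'artist_name': 'Radiohead',
#         'track_name': 'True Love Waits',
#         'inserted_timestamp': <int>,
#         ...
#     }
# }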
def test_to_influx(self):
    listen = Listen(
        timestamp=int(time.time()),
        user_name='testuser',
        artist_msid=uuid.uuid4(),
        recording_msid=uuid.uuid4(),
        data={
            'artist_name': 'Radiohead',
            'track_name': 'True Love Waits',
            'additional_info': {
                'release_type': ["ALBUM", "REMIX"],
            }
        }
    )
    data = listen.to_influx(quote(listen.user_name))

    # Make sure every value that we don't explicitly support is a string
    for key in data['fields']:
        if key not in Listen.SUPPORTED_KEYS:
            self.assertIsInstance(data['fields'][key], str)

    # Check values
    self.assertEqual(data['measurement'], quote(listen.user_name))
    self.assertEqual(data['time'], listen.ts_since_epoch)
    self.assertEqual(data['tags']['user_name'], listen.user_name)
    self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
    self.assertEqual(data['fields']['recording_msid'], listen.recording_msid)
    self.assertEqual(data['fields']['track_name'], listen.data['track_name'])
    self.assertEqual(data['fields']['artist_name'], listen.data['artist_name'])
def insert(self, listens):
    """ Insert a batch of listens. """

    submit = []
    user_names = {}
    for listen in listens:
        user_names[listen.user_name] = 1
        submit.append(listen.to_influx(quote(listen.user_name)))

    if not self.influx.write_points(submit, time_precision='s'):
        self.log.error("Cannot write data to influx. (write_points returned False)")

    # If we reach this point, we were able to write the listens to the InfluxListenStore.
    # So update the listen counts of the users cached in redis.
    for data in submit:
        user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, data['tags']['user_name'])
        if self.redis.exists(user_key):
            self.redis.incr(user_key)

    # Invalidate cached data for user
    for user_name in user_names.keys():
        self.redis.delete(REDIS_USER_TIMESTAMPS % user_name)
def import_listens_dump(self, archive_path, threads=None):
    """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression (defaults to 1)
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)

    pxz_command = ['pxz', '--decompress', '--stdout', archive_path]
    if threads is not None:
        pxz_command.append('-T {threads}'.format(threads=threads))
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name == 'SCHEMA_SEQUENCE':
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException('Incorrect schema version! Expected: %d, got: %d. '
                                                  'Please ensure that the data dump version matches the code version '
                                                  'in order to import the data.'
                                                  % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))

            elif file_name.endswith('.listens'):
                # remove .listens from the filename to get the username
                user_name = file_name[:-8]
                self.log.info('Importing user %s', user_name)
                listens = []
                listen_count = 0

                # iterate through the file and keep writing listens in chunks
                for listen in tar.extractfile(member):
                    influx_listen = Listen.from_json(ujson.loads(listen)).to_influx(quote(user_name))
                    listens.append(influx_listen)
                    listen_count += 1

                    if listen_count > DUMP_CHUNK_SIZE:
                        self.write_points_to_db(listens)
                        listen_count = 0
                        listens = []

                # if some listens are left, write them to db
                if listen_count > 0:
                    self.write_points_to_db(listens)

    self.log.info('Import of listens from dump %s done!', archive_path)
def insert(self, listens):
    """ Insert a batch of listens. """

    submit = []
    user_names = {}
    for listen in listens:
        user_names[listen.user_name] = 1
        submit.append(listen.to_influx(quote(listen.user_name)))

    if not self.influx.write_points(submit, time_precision='s'):
        self.log.error("Cannot write data to influx. (write_points returned False), data=%s",
                       json.dumps(submit, indent=3))

    # If we reach this point, we were able to write the listens to the InfluxListenStore.
    # So update the listen counts of the users cached in brainzutils cache.
    for data in submit:
        user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, data['fields']['user_name'])
        cached_count = cache.get(user_key, decode=False)
        if cached_count:
            cache.increment(user_key)

    # Invalidate cached data for user
    for user_name in user_names.keys():
        cache.delete(REDIS_USER_TIMESTAMPS % user_name)

    if len(listens):
        # Enter a measurement to count items inserted
        submit = [{
            'measurement': TEMP_COUNT_MEASUREMENT,
            'tags': {
                COUNT_MEASUREMENT_NAME: len(listens)
            },
            'fields': {
                COUNT_MEASUREMENT_NAME: len(listens)
            }
        }]
        try:
            if not self.influx.write_points(submit):
                self.log.error("Cannot write listen count to influx. (write_points returned False)")
        except (InfluxDBServerError, InfluxDBClientError, ValueError) as err:
            self.log.error("Cannot write data to influx: %s, data: %s", str(err),
                           json.dumps(submit, indent=3), exc_info=True)
            raise
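# Minimal sketch of how a batch insert might be exercised, assuming a listen
# store object exposing the insert() method above and the Listen constructor
# used in the tests. The `listen_store` name is illustrative, not from the
# source.
#
#   listen = Listen(
#       timestamp=int(time.time()),
#       user_name='testuser',
#       artist_msid=uuid.uuid4(),
#       recording_msid=uuid.uuid4(),
#       data={'artist_name': 'Radiohead', 'track_name': 'True Love Waits'},
#   )
#   listen_store.insert([listen])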
def test_incremental_dumps_listen_with_no_insert_timestamp(self):
    """ Incremental dumps should only consider listens that have inserted_timestamps. """
    t = datetime.now()
    sleep(1)
    listens = generate_data(1, self.testuser_name, 1, 5)

    # insert these listens into influx without an insert_timestamp
    influx_rows = [listen.to_influx(quote(self.testuser_name)) for listen in listens]
    for row in influx_rows[1:]:
        row['fields'].pop('inserted_timestamp')
    self.logstore.write_points_to_db(influx_rows)

    sleep(1)
    listens_from_influx = self.logstore.fetch_listens(user_name=self.testuser_name, to_ts=11)
    self.assertEqual(len(listens_from_influx), 5)

    # incremental dump (with a start time) should not contain these listens
    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
        dump_id=1,
        start_time=t,
        end_time=datetime.now(),
    )
    spark_dump_location = self.logstore.dump_listens(
        location=temp_dir,
        dump_id=1,
        start_time=t,
        end_time=datetime.now(),
        spark_format=True,
    )
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()
    sleep(1)

    self.logstore.import_listens_dump(dump_location)
    sleep(1)
    listens_from_influx = self.logstore.fetch_listens(user_name=self.testuser_name, to_ts=11)
    self.assertEqual(len(listens_from_influx), 1)
    self.assert_spark_dump_contains_listens(spark_dump_location, 1)
    shutil.rmtree(temp_dir)
def test_full_dump_listen_with_no_insert_timestamp(self):
    """ The production database contains listens with no `inserted_timestamp`,
    so full dumps should always be able to dump those listens as well.
    This test ensures that they do.
    """
    listens = generate_data(1, self.testuser_name, 1, 5)

    # insert these listens into influx without an insert_timestamp
    influx_rows = [listen.to_influx(quote(self.testuser_name)) for listen in listens]
    for row in influx_rows[1:]:
        row['fields'].pop('inserted_timestamp')

    t = datetime.now()
    self.logstore.write_points_to_db(influx_rows)
    sleep(1)
    listens_from_influx = self.logstore.fetch_listens(user_name=self.testuser_name, to_ts=11)
    self.assertEqual(len(listens_from_influx), 5)

    # full dump (with no start time) should contain these listens
    temp_dir = tempfile.mkdtemp()
    dump_location = self.logstore.dump_listens(
        location=temp_dir,
        dump_id=1,
        end_time=datetime.now(),
    )
    spark_dump_location = self.logstore.dump_listens(
        location=temp_dir,
        dump_id=1,
        end_time=datetime.now(),
        spark_format=True,
    )
    self.assertTrue(os.path.isfile(dump_location))
    self.reset_influx_db()
    sleep(1)

    self.logstore.import_listens_dump(dump_location)
    sleep(1)
    listens_from_influx = self.logstore.fetch_listens(user_name=self.testuser_name, to_ts=11)
    self.assertEqual(len(listens_from_influx), 5)
    self.assert_spark_dump_contains_listens(spark_dump_location, 5)
    shutil.rmtree(temp_dir)
def copy_measurement(self, src, dest, apply_filter=False):
    """ Copy listens from the `src` measurement to the `dest` measurement in batches
    of DUMP_CHUNK_SIZE, optionally passing each row through `self.filter_function` first.
    """
    offset = 0
    while True:
        result = self.ls.get_listens_batch_for_dump(src, self.max_time, offset)
        rows = []
        count = 0
        for row in result.get_points(get_measurement_name(src)):
            count += 1
            if apply_filter:
                row = self.filter_function(row)
            if row:
                rows.append(self.convert_to_influx_insert_format(row, quote(dest)))
        self.ls.write_points_to_db(rows)
        offset += DUMP_CHUNK_SIZE
        if count == 0:
            break
def import_listens_dump(self, archive_path, threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression (defaults to DUMP_DEFAULT_THREAD_COUNT)

    Returns:
        int: the number of users for whom listens have been imported
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)

    # construct the pxz command to decompress the archive
    pxz_command = ['pxz', '--decompress', '--stdout', archive_path, '-T{threads}'.format(threads=threads)]

    # run the command once to ensure schema version is correct
    # and load the index
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    index = None
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        schema_check_done = False
        index_loaded = False
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name == 'SCHEMA_SEQUENCE':
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException('Incorrect schema version! Expected: %d, got: %d. '
                                                  'Please ensure that the data dump version matches the code version '
                                                  'in order to import the data.'
                                                  % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                schema_check_done = True

            elif file_name == 'index.json':
                with tar.extractfile(member) as f:
                    index = ujson.load(f)
                index_loaded = True

            if schema_check_done and index_loaded:
                self.log.info('Schema version matched and index.json loaded!')
                self.log.info('Starting import of listens...')
                break
        else:
            raise SchemaMismatchException('Metadata files missing in dump, please ensure that the dump file is valid.')

    # close pxz command and start over again, this time with the aim of importing all listens
    pxz.stdout.close()

    # group the index entries by the file that contains each user's listens,
    # sorted by their offset within that file
    file_contents = defaultdict(list)
    for user, info in index.items():
        file_contents[info['file_name']].append({
            'user_name': user,
            'offset': info['offset'],
            'size': info['size'],
        })

    for file_name in file_contents:
        file_contents[file_name] = sorted(file_contents[file_name], key=lambda x: x['offset'])

    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    users_done = 0
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name.endswith('.listens'):
                # strip the .listens extension to look the file up in the index
                file_name = file_name[:-8]
                with tar.extractfile(member) as f:
                    for user in file_contents[file_name]:
                        self.log.info('Importing user %s...', user['user_name'])
                        assert f.tell() == user['offset']
                        bytes_read = 0
                        listens = []
                        while bytes_read < user['size']:
                            line = f.readline()
                            bytes_read += len(line)
                            listen = Listen.from_json(ujson.loads(line)).to_influx(quote(user['user_name']))
                            listens.append(listen)
                            if len(listens) > DUMP_CHUNK_SIZE:
                                self.write_points_to_db(listens)
                                listens = []

                        if len(listens) > 0:
                            self.write_points_to_db(listens)

                        self.log.info('Import of user %s done!', user['user_name'])
                        users_done += 1

    self.log.info('Import of listens from dump %s done!', archive_path)
    pxz.stdout.close()
    return users_done
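# Expected shape of index.json, as implied by how `index` is consumed in
# import_listens_dump() above (a sketch inferred from the code, not the
# documented dump format):
#
# {
#     "<user_name>": {
#         "file_name": "<base name of the .listens file containing this user's listens>",
#         "offset": <byte offset of the user's first listen within that file>,
#         "size": <number of bytes occupied by the user's listens>
#     },
#     ...
# }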