def test_from_json_null_values(self):
    data = {
        "listened_at": 1618353413,
        "track_metadata": {
            "additional_info": {
                "recording_mbid": "99e087e1-5649-4e8c-b84f-eea05b8e143a",
                "release_mbid": "4b6ca48c-f7db-439d-ba57-6104b5fec61e",
                "artist_mbid": "e1564e98-978b-4947-8698-f6fd6f8b0181\u0000\ufeff9ad10546-b081-4cc8-a487-3d2eece82d9e\u0000\ufeff5245e5cd-4408-4d9e-a037-c71a53edce83",
                "artist_msid": "392f2883-724f-4c63-b155-81a7cc89a499",
                "release_msid": "632207f8-150f-4342-99ad-0fd5a6687e63"
            },
            "artist_name": "Fort Minor Feat. Holly Brook & Jonah Matranga",
            "track_name": "some name"
        }
    }
    with self.assertRaises(ValueError):
        Listen.from_json(data)
def test_from_json(self):
    json_row = {"track_metadata": {"additional_info": {}}}
    json_row.update({'listened_at': 123456})
    listen = Listen.from_json(json_row)
    self.assertEqual(listen.timestamp, json_row['listened_at'])

    del json_row['listened_at']
    json_row.update({'playing_now': True})
    listen = Listen.from_json(json_row)
    self.assertEqual(listen.timestamp, None)
def callback(self, ch, method, properties, body):
    listens = ujson.loads(body)

    msb_listens = []
    for chunk in chunked(listens, MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
        msb_listens.extend(self.messybrainz_lookup(chunk))

    submit = []
    for listen in msb_listens:
        try:
            submit.append(Listen.from_json(listen))
        except ValueError:
            pass

    ret = self.insert_to_listenstore(submit)

    # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
    if ret == LISTEN_INSERT_ERROR_SENTINEL:
        return ret

    while True:
        try:
            self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return ret
def get_playing_now(self, user_id):
    """ Return the current playing song of the user """
    data = self.redis.get('playing_now' + ':' + str(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'listened_at': MIN_ID + 1})
    return Listen.from_json(data)
def get_recent_listens(self, max=RECENT_LISTENS_MAX):
    """ Get the max number of most recent listens """
    recent = []
    for listen in cache._r.zrevrange(cache._prep_key(self.RECENT_LISTENS_KEY), 0, max - 1):
        recent.append(Listen.from_json(ujson.loads(listen)))
    return recent
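# For context, a hedged sketch of the writer this reader presumably pairs with:
# each new listen would be added to the Redis sorted set with its timestamp as
# the score, then the set trimmed to the newest RECENT_LISTENS_MAX entries.
# This helper is an illustration only (it is not part of the code above) and
# assumes redis-py >= 3, where zadd() takes a mapping.
def put_recent_listen(self, listen_json, listened_at):
    key = cache._prep_key(self.RECENT_LISTENS_KEY)
    # score by timestamp so zrevrange(0, max - 1) yields newest-first
    cache._r.zadd(key, {ujson.dumps(listen_json): listened_at})
    # keep only the newest RECENT_LISTENS_MAX members (ranks count up from the lowest score)
    cache._r.zremrangebyrank(key, 0, -(RECENT_LISTENS_MAX + 1))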
def import_listens_dump(self, archive_path, threads=None):
    """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression
                       (defaults to None, letting pxz choose)
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)
    pxz_command = ['pxz', '--decompress', '--stdout', archive_path]
    if threads is not None:
        pxz_command.append('-T{threads}'.format(threads=threads))
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name == 'SCHEMA_SEQUENCE':
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException(
                        'Incorrect schema version! Expected: %d, got: %d. '
                        'Please ensure that the data dump version matches the code version '
                        'in order to import the data.'
                        % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
            elif file_name.endswith('.listens'):
                # remove .listens from the filename to get the username
                user_name = file_name[:-8]
                self.log.info('Importing user %s', user_name)
                listens = []
                listen_count = 0

                # iterate through files and keep writing listens in chunks
                for listen in tar.extractfile(member):
                    influx_listen = Listen.from_json(ujson.loads(listen)).to_influx(quote(user_name))
                    listens.append(influx_listen)
                    listen_count += 1
                    if listen_count > DUMP_CHUNK_SIZE:
                        self.write_points_to_db(listens)
                        listen_count = 0
                        listens = []

                # if some listens are left, write them to db
                if listen_count > 0:
                    self.write_points_to_db(listens)

    self.log.info('Import of listens from dump %s done!', archive_path)
def get_recent_listens(self, max=RECENT_LISTENS_MAX):
    """ Get the max number of most recent listens """
    recent = []
    for listen in self.redis.zrevrange(self.ns + self.RECENT_LISTENS_KEY, 0, max - 1):
        recent.append(Listen.from_json(ujson.loads(listen)))
    return recent
def send_listens(self, event_name, message):
    listens = json.loads(message.body.decode("utf-8"))
    for data in listens:
        if event_name == "playing_now":
            listen = NowPlayingListen(
                user_id=data["user_id"],
                user_name=data["user_name"],
                data=data["track_metadata"]
            )
        else:
            data["track_metadata"] = data["data"]
            del data["data"]
            listen = Listen.from_json(data)
        self.socketio.emit(event_name, json.dumps(listen.to_api()), to=listen.user_name)
    message.ack()
def get_playing_now(self, user_id):
    """ Return the current playing song of the user

    Arguments:
        user_id (int): the id of the user in the db

    Returns:
        Listen object which is the currently playing song of the user
    """
    data = cache.get(self.PLAYING_NOW_KEY + str(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'playing_now': True})
    return Listen.from_json(data)
def get_playing_now(self, user_id):
    """ Return the current playing song of the user

    Arguments:
        user_id (int): the id of the user in the db

    Returns:
        Listen object which is the currently playing song of the user
    """
    data = self.redis.get('playing_now:{}'.format(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'playing_now': True})
    return Listen.from_json(data)
def get_playing_now(self, user_id):
    """ Return the current playing song of the user

    Arguments:
        user_id (int): the id of the user in the db

    Returns:
        Listen object which is the currently playing song of the user
    """
    data = self.redis.get('playing_now:{}'.format(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'listened_at': MIN_ID + 1})
    return Listen.from_json(data)
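# A hedged sketch of the writer side these getters imply: the playing-now
# payload is presumably stored as JSON under the same key with a short TTL so
# stale entries expire on their own. This helper and the
# PLAYING_NOW_MAX_DURATION constant are assumptions for illustration, not
# part of the code above.
def put_playing_now(self, user_id, data, expire_seconds=PLAYING_NOW_MAX_DURATION):
    # redis-py's set() accepts ex= for a TTL in seconds
    self.redis.set('playing_now:{}'.format(user_id), ujson.dumps(data), ex=expire_seconds)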
def callback(self, ch, method, properties, body):
    listens = ujson.loads(body)

    non_null_listens = []
    for listen in listens:
        try:
            check_recursively_for_nulls(listen)
        except ValueError:
            # temporary to make sure fix is working
            current_app.logger.error("Found null byte in listen. Skipping!", exc_info=True)
            continue
        non_null_listens.append(listen)

    msb_listens = []
    for chunk in chunked(non_null_listens, MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
        msb_listens.extend(self.messybrainz_lookup(chunk))

    submit = []
    for listen in msb_listens:
        try:
            submit.append(Listen.from_json(listen))
        except ValueError:
            pass

    ret = self.insert_to_listenstore(submit)

    # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
    if ret == LISTEN_INSERT_ERROR_SENTINEL:
        return ret

    while True:
        try:
            self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return ret
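# check_recursively_for_nulls is not shown in this section; a minimal sketch of
# what it plausibly does, given that callers expect a ValueError whenever a null
# byte appears anywhere in the payload (an assumption, not the project's actual helper):
def check_recursively_for_nulls(value):
    if isinstance(value, str):
        if '\x00' in value:
            raise ValueError('null byte found in value: %r' % value)
    elif isinstance(value, dict):
        for k, v in value.items():
            check_recursively_for_nulls(k)
            check_recursively_for_nulls(v)
    elif isinstance(value, (list, tuple)):
        for item in value:
            check_recursively_for_nulls(item)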
def callback(self, ch, method, properties, body):
    listens = ujson.loads(body)

    submit = []
    for listen in listens:
        try:
            submit.append(Listen.from_json(listen))
        except ValueError:
            pass

    ret = self.insert_to_listenstore(submit)
    if not ret:
        return ret

    while True:
        try:
            self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return ret
def write(self, listen_dicts):
    submit = []
    unique = []
    duplicate_count = 0
    unique_count = 0

    # Partition the listens on the basis of user names
    # and then store the time range for each user
    users = {}
    for listen in listen_dicts:
        t = int(listen['listened_at'])
        user_name = listen['user_name']

        if user_name not in users:
            users[user_name] = {
                'min_time': t,
                'max_time': t,
                'listens': [listen],
            }
            continue

        if t > users[user_name]['max_time']:
            users[user_name]['max_time'] = t

        if t < users[user_name]['min_time']:
            users[user_name]['min_time'] = t

        users[user_name]['listens'].append(listen)

    # get listens in the time range for each user and
    # remove duplicates on the basis of timestamps
    for user_name in users:
        # get the range of time that we need to get from influx for
        # deduplication of listens
        min_time = users[user_name]['min_time']
        max_time = users[user_name]['max_time']

        # querying for artist name here, since a field must be included in the query.
        query = """SELECT time, artist_name
                   FROM %s
                   WHERE time >= %s AND time <= %s
                """ % (get_escaped_measurement_name(user_name),
                       get_influx_query_timestamp(min_time),
                       get_influx_query_timestamp(max_time))
        while True:
            try:
                results = self.influx.query(query)
                break
            except Exception as e:
                self.log.error("Cannot query influx: %s" % str(e))
                sleep(3)

        # collect all the timestamps for this given time range.
        timestamps = {}
        for result in results.get_points(measurement=get_measurement_name(user_name)):
            timestamps[convert_to_unix_timestamp(result['time'])] = 1

        for listen in users[user_name]['listens']:
            # Check if this listen is already present in Influx DB and if it is
            # mark current listen as duplicate
            t = int(listen['listened_at'])
            if t in timestamps:
                duplicate_count += 1
                continue
            else:
                unique_count += 1
                submit.append(Listen.from_json(listen))
                unique.append(listen)
                timestamps[t] = 1

    t0 = time()
    submitted_count = self.insert_to_listenstore(submit)
    self.time += time() - t0

    self.log.error("dups: %d, unique: %d, submitted: %d" %
                   (duplicate_count, unique_count, submitted_count))
    if not unique_count:
        return True

    while True:
        try:
            self.unique_ch.basic_publish(
                exchange='unique',
                routing_key='',
                body=ujson.dumps(unique),
                properties=pika.BasicProperties(delivery_mode=2,),
            )
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return True
def write(self, listen_dicts):
    submit = []
    unique = []
    duplicate_count = 0
    unique_count = 0

    # Partition the listens on the basis of user names
    # and then store the time range for each user
    users = {}
    for listen in listen_dicts:
        t = int(listen['listened_at'])
        user_name = listen['user_name']

        if user_name not in users:
            users[user_name] = {
                'min_time': t,
                'max_time': t,
                'listens': [listen],
            }
            continue

        if t > users[user_name]['max_time']:
            users[user_name]['max_time'] = t

        if t < users[user_name]['min_time']:
            users[user_name]['min_time'] = t

        users[user_name]['listens'].append(listen)

    # get listens in the time range for each user and
    # remove duplicates on the basis of timestamps
    for user_name in users:
        # get the range of time that we need to get from influx for
        # deduplication of listens
        min_time = users[user_name]['min_time']
        max_time = users[user_name]['max_time']

        query = """SELECT time, recording_msid
                   FROM %s
                   WHERE time >= %s AND time <= %s
                """ % (get_escaped_measurement_name(user_name),
                       get_influx_query_timestamp(min_time),
                       get_influx_query_timestamp(max_time))
        while True:
            try:
                results = self.influx.query(query)
                break
            except Exception as e:
                self.log.error("Cannot query influx: %s" % str(e))
                sleep(3)

        # collect all the listens for this given time range, indexed by timestamp
        timestamps = defaultdict(list)  # dict of list of listens indexed by timestamp
        for result in results.get_points(measurement=get_measurement_name(user_name)):
            timestamps[convert_to_unix_timestamp(result['time'])].append(result)

        for listen in users[user_name]['listens']:
            # Check if a listen with the same timestamp and recording msid is already
            # present in Influx DB and if it is, mark current listen as duplicate
            t = int(listen['listened_at'])
            recording_msid = listen['recording_msid']
            dup = False

            if t in timestamps:
                for row in timestamps[t]:
                    if row['recording_msid'] == recording_msid:
                        duplicate_count += 1
                        dup = True
                        break
                else:
                    # if there are listens with the same timestamp but different
                    # metadata, we add a tag specifically for making sure that
                    # influxdb doesn't drop one of the listens. This value
                    # is monotonically increasing and defaults to 0
                    listen['dedup_tag'] = len(timestamps[t])

            if not dup:
                unique_count += 1
                submit.append(Listen.from_json(listen))
                unique.append(listen)
                timestamps[t].append({
                    'time': convert_timestamp_to_influx_row_format(t),
                    'recording_msid': recording_msid,
                })

    t0 = time()
    submitted_count = self.insert_to_listenstore(submit)
    self.time += time() - t0

    self.log.error("dups: %d, unique: %d, submitted: %d" %
                   (duplicate_count, unique_count, submitted_count))
    if not unique_count:
        return True

    while True:
        try:
            self.unique_ch.basic_publish(
                exchange=self.config.UNIQUE_EXCHANGE,
                routing_key='',
                body=ujson.dumps(unique),
                properties=pika.BasicProperties(delivery_mode=2,),
            )
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return True
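# To make the dedup_tag trick concrete: InfluxDB overwrites a point that has
# the same measurement, timestamp, and tag set as an existing one. A sketch of
# two same-second listens after the loop above, assuming to_influx() copies
# dedup_tag into the point's tags (all values here are made up):
listen_a = {'listened_at': 1618353413, 'recording_msid': 'msid-a', 'user_name': 'rob'}
listen_b = {'listened_at': 1618353413, 'recording_msid': 'msid-b', 'user_name': 'rob',
            'dedup_tag': 1}  # second row at this timestamp, so the tag sets differ
# with distinct tag sets, InfluxDB keeps both points instead of silently dropping one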
def import_listens_dump(self, archive_path, threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Imports listens into TimescaleDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression
                       (defaults to DUMP_DEFAULT_THREAD_COUNT)

    Returns:
        int: the number of listens imported
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)

    # construct the pxz command to decompress the archive
    pxz_command = ['pxz', '--decompress', '--stdout', archive_path,
                   '-T{threads}'.format(threads=threads)]
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    schema_checked = False
    total_imported = 0
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        listens = []
        for member in tar:
            if member.name.endswith('SCHEMA_SEQUENCE'):
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip() or '-1')
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException(
                        'Incorrect schema version! Expected: %d, got: %d. '
                        'Please ensure that the data dump version matches the code version '
                        'in order to import the data.'
                        % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                schema_checked = True

            if member.name.endswith(".listens"):
                if not schema_checked:
                    raise SchemaMismatchException("SCHEMA_SEQUENCE file missing from listen dump.")

                with tar.extractfile(member) as tarf:  # tarf, really? That's the name you're going with? Yep.
                    while True:
                        line = tarf.readline()
                        if not line:
                            break

                        listen = Listen.from_json(ujson.loads(line))
                        listens.append(listen)

                        if len(listens) > DUMP_CHUNK_SIZE:
                            total_imported += len(listens)
                            self.insert(listens)
                            listens = []

        if len(listens) > 0:
            total_imported += len(listens)
            self.insert(listens)

    if not schema_checked:
        raise SchemaMismatchException("SCHEMA_SEQUENCE file missing from listen dump.")

    self.log.info('Import of listens from dump %s done!', archive_path)
    pxz.stdout.close()
    return total_imported
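# A hedged usage sketch: `store`, its construction, and the dump path are
# placeholders for illustration, not taken from the code above.
store = TimescaleListenStore(config, log)  # hypothetical construction
imported = store.import_listens_dump('/path/to/listenbrainz-dump.tar.xz', threads=4)
log.info('imported %d listens', imported)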
def import_listens_dump(self, archive_path, threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression
                       (defaults to DUMP_DEFAULT_THREAD_COUNT)

    Returns:
        int: the number of users for whom listens have been imported
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)

    # construct the pxz command to decompress the archive
    pxz_command = ['pxz', '--decompress', '--stdout', archive_path,
                   '-T{threads}'.format(threads=threads)]

    # run the command once to ensure schema version is correct
    # and load the index
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    index = None
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        schema_check_done = False
        index_loaded = False
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name == 'SCHEMA_SEQUENCE':
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException(
                        'Incorrect schema version! Expected: %d, got: %d. '
                        'Please ensure that the data dump version matches the code version '
                        'in order to import the data.'
                        % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                schema_check_done = True

            elif file_name == 'index.json':
                with tar.extractfile(member) as f:
                    index = ujson.load(f)
                index_loaded = True

            if schema_check_done and index_loaded:
                self.log.info('Schema version matched and index.json loaded!')
                self.log.info('Starting import of listens...')
                break
        else:
            raise SchemaMismatchException('Metadata files missing in dump, please ensure that the dump file is valid.')

    # close pxz command and start over again, this time with the aim of importing all listens
    pxz.stdout.close()

    file_contents = defaultdict(list)
    for user, info in index.items():
        file_contents[info['file_name']].append({
            'user_name': user,
            'offset': info['offset'],
            'size': info['size'],
        })

    for file_name in file_contents:
        file_contents[file_name] = sorted(file_contents[file_name], key=lambda x: x['offset'])

    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    users_done = 0
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name.endswith('.listens'):
                # remove .listens from the filename to get its name in the index
                file_name = file_name[:-8]
                with tar.extractfile(member) as f:
                    for user in file_contents[file_name]:
                        self.log.info('Importing user %s...', user['user_name'])
                        assert f.tell() == user['offset']
                        bytes_read = 0
                        listens = []
                        while bytes_read < user['size']:
                            line = f.readline()
                            bytes_read += len(line)
                            listen = Listen.from_json(ujson.loads(line)).to_influx(quote(user['user_name']))
                            listens.append(listen)
                            if len(listens) > DUMP_CHUNK_SIZE:
                                self.write_points_to_db(listens)
                                listens = []

                        if len(listens) > 0:
                            self.write_points_to_db(listens)
                        self.log.info('Import of user %s done!', user['user_name'])
                        users_done += 1

    self.log.info('Import of listens from dump %s done!', archive_path)
    pxz.stdout.close()
    return users_done
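# For reference, the index.json shape implied by the loop above: a mapping from
# user name to the dump file holding that user's listens (named without the
# .listens extension) plus a byte offset and size. Values below are made up
# for illustration:
index = {
    'rob': {'file_name': '0', 'offset': 0, 'size': 53217},
    'lucifer': {'file_name': '0', 'offset': 53217, 'size': 1024},
}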