def test_to_influx(self):
    listen = Listen(
        timestamp=int(time.time()),
        user_name='testuser',
        artist_msid=uuid.uuid4(),
        recording_msid=uuid.uuid4(),
        dedup_tag=3,
        data={
            'artist_name': 'Radiohead',
            'track_name': 'True Love Waits',
            'additional_info': {
                'release_type': ["ALBUM", "REMIX"],
            }
        }
    )

    data = listen.to_influx(quote(listen.user_name))

    # Make sure every value that we don't explicitly support is a string
    for key in data['fields']:
        if key not in Listen.SUPPORTED_KEYS and key not in Listen.PRIVATE_KEYS:
            self.assertIsInstance(data['fields'][key], str)

    # Check values
    self.assertEqual(data['measurement'], quote(listen.user_name))
    self.assertEqual(data['time'], listen.ts_since_epoch)
    self.assertEqual(data['tags']['dedup_tag'], listen.dedup_tag)
    self.assertEqual(data['fields']['user_name'], listen.user_name)
    self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
    self.assertEqual(data['fields']['recording_msid'], listen.recording_msid)
    self.assertEqual(data['fields']['track_name'], listen.data['track_name'])
    self.assertEqual(data['fields']['artist_name'], listen.data['artist_name'])
    self.assertIn('inserted_timestamp', data['fields'])
def test_to_influx(self):
    listen = Listen(
        timestamp=int(time.time()),
        user_name='testuser',
        artist_msid=uuid.uuid4(),
        recording_msid=uuid.uuid4(),
        data={
            'artist_name': 'Radiohead',
            'track_name': 'True Love Waits',
            'additional_info': {
                'release_type': ["ALBUM", "REMIX"],
            }
        }
    )

    data = listen.to_influx(quote(listen.user_name))

    # Make sure every value that we don't explicitly support is a string
    for key in data['fields']:
        if key not in Listen.SUPPORTED_KEYS:
            self.assertIsInstance(data['fields'][key], str)

    # Check values
    self.assertEqual(data['measurement'], quote(listen.user_name))
    self.assertEqual(data['time'], listen.ts_since_epoch)
    self.assertEqual(data['tags']['user_name'], listen.user_name)
    self.assertEqual(data['fields']['artist_msid'], listen.artist_msid)
    self.assertEqual(data['fields']['recording_msid'], listen.recording_msid)
    self.assertEqual(data['fields']['track_name'], listen.data['track_name'])
    self.assertEqual(data['fields']['artist_name'], listen.data['artist_name'])
def test_to_timescale(self):
    listen = Listen(
        timestamp=int(time.time()),
        user_name='testuser',
        artist_msid=str(uuid.uuid4()),
        dedup_tag=3,
        user_id=1,
        data={
            'artist_name': 'Radiohead',
            'track_name': 'True Love Waits',
            'additional_info': {
                'release_type': ["ALBUM", "REMIX"],
                'recording_msid': str(uuid.uuid4()),
            }
        }
    )

    listened_at, track_name, user_name, data = listen.to_timescale()

    # Check data is of type string
    self.assertIsInstance(data, str)

    # Convert returned data to json
    json_data = ujson.loads(data)

    # Check that the required fields are dumped into data
    self.assertIn('track_metadata', json_data)
    self.assertIn('additional_info', json_data['track_metadata'])

    # Check the returned values
    self.assertEqual(listened_at, listen.ts_since_epoch)
    self.assertEqual(track_name, listen.data['track_name'])
    self.assertEqual(user_name, listen.user_name)
    self.assertEqual(json_data['user_id'], listen.user_id)
    self.assertEqual(json_data['track_metadata']['artist_name'], listen.data['artist_name'])
def test_from_json_null_values(self):
    data = {
        "listened_at": 1618353413,
        "track_metadata": {
            "additional_info": {
                "recording_mbid": "99e087e1-5649-4e8c-b84f-eea05b8e143a",
                "release_mbid": "4b6ca48c-f7db-439d-ba57-6104b5fec61e",
                "artist_mbid": "e1564e98-978b-4947-8698-f6fd6f8b0181\u0000\ufeff9ad10546-b081-4cc8-a487-3d2eece82d9e\u0000\ufeff5245e5cd-4408-4d9e-a037-c71a53edce83",
                "artist_msid": "392f2883-724f-4c63-b155-81a7cc89a499",
                "release_msid": "632207f8-150f-4342-99ad-0fd5a6687e63"
            },
            "artist_name": "Fort Minor Feat. Holly Brook & Jonah Matranga",
            "track_name": "some name"
        }
    }
    with self.assertRaises(ValueError):
        Listen.from_json(data)
def test_from_json(self):
    json_row = {"track_metadata": {"additional_info": {}}}
    json_row.update({'listened_at': 123456})
    listen = Listen.from_json(json_row)
    self.assertEqual(listen.timestamp, json_row['listened_at'])

    del json_row['listened_at']
    json_row.update({'playing_now': True})
    listen = Listen.from_json(json_row)
    self.assertEqual(listen.timestamp, None)
def callback(self, ch, method, properties, body):
    listens = ujson.loads(body)

    msb_listens = []
    for chunk in chunked(listens, MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
        msb_listens.extend(self.messybrainz_lookup(chunk))

    submit = []
    for listen in msb_listens:
        try:
            submit.append(Listen.from_json(listen))
        except ValueError:
            pass

    ret = self.insert_to_listenstore(submit)

    # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
    if ret == LISTEN_INSERT_ERROR_SENTINEL:
        return ret

    while True:
        try:
            self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return ret
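# The `chunked` helper used above is assumed to split a list into fixed-size
# slices so each MessyBrainz lookup stays under the per-request item cap.
# A minimal sketch of such a helper (hypothetical, for illustration only):
def chunked(items, chunk_size):
    """Yield successive chunk_size-sized slices from items."""
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]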
def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit, order):
    """ The timestamps are stored as UTC in the postgres database, while on
        retrieving the value they are converted to the local server's timezone. So to
        compare datetime objects we need to create an object in the same timezone as the server.

        from_ts: seconds since epoch, in float
        to_ts: seconds since epoch, in float
    """
    # Quote single quote characters which could be used to mount an injection attack.
    # Sadly, influxdb does not provide a means to do this in the client library
    query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)
    if from_ts is not None:
        query += " WHERE time > " + get_influx_query_timestamp(from_ts)
    else:
        query += " WHERE time < " + get_influx_query_timestamp(to_ts)
    query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)

    try:
        results = self.influx.query(query)
    except Exception as err:
        self.log.error("Cannot query influx while getting listens for user: %s: %s",
                       user_name, str(err), exc_info=True)
        return []

    listens = []
    for result in results.get_points(measurement=get_measurement_name(user_name)):
        listens.append(Listen.from_influx(result))

    if order == ORDER_ASC:
        listens.reverse()

    return listens
def create_test_data_for_timescalelistenstore(user_name: str, user_id: int, test_data_file_name: str = None):
    """Create listens for timescalelistenstore tests.

    From a json file in testdata it creates Listen objects with a specified user_name for tests.

    Args:
        user_name: MusicBrainz username of a user.
        user_id: listenbrainz row id of the user
        test_data_file_name: If specified, use the given file to create Listen objects.
                             DEFAULT = 'timescale_listenstore_test_listens.json'

    Returns:
        A list of Listen objects.
    """
    if not test_data_file_name:
        test_data_file_name = 'timescale_listenstore_test_listens.json'

    test_data_file = os.path.join(TEST_DATA_PATH, test_data_file_name)
    with open(test_data_file, 'r') as f:
        listens = json.load(f)

    test_data = []
    for listen in listens['payload']:
        listen['user_name'] = user_name
        listen['user_id'] = user_id
        test_data.append(Listen().from_json(listen))

    return test_data
def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit, order):
    """ The timestamps are stored as UTC in the postgres database, while on
        retrieving the value they are converted to the local server's timezone. So to
        compare datetime objects we need to create an object in the same timezone as the server.

        from_ts: seconds since epoch, in float
        to_ts: seconds since epoch, in float
    """
    # Quote single quote characters which could be used to mount an injection attack.
    # Sadly, influxdb does not provide a means to do this in the client library
    query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)
    if from_ts is not None:
        query += " WHERE time > " + get_influx_query_timestamp(from_ts)
    else:
        query += " WHERE time < " + get_influx_query_timestamp(to_ts)
    query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)

    try:
        results = self.influx.query(query)
    except Exception as err:
        self.log.error("Cannot query influx: %s", str(err))
        return []

    listens = []
    for result in results.get_points(measurement=get_measurement_name(user_name)):
        listens.append(Listen.from_influx(result))

    if order == ORDER_ASC:
        listens.reverse()

    return listens
def generate_data(test_user_id, user_name, from_ts, num_records, inserted_ts=None):
    test_data = []
    artist_msid = str(uuid.uuid4())

    for i in range(num_records):
        if not inserted_ts:
            inserted_timestamp = datetime.utcnow()
        else:
            inserted_timestamp = datetime.utcfromtimestamp(inserted_ts)
        timestamp = datetime.utcfromtimestamp(from_ts)
        item = Listen(
            user_name=user_name,
            user_id=test_user_id,
            timestamp=timestamp,
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4()),
            inserted_timestamp=inserted_timestamp,
            data={
                'artist_name': 'Frank Ocean',
                'track_name': 'Crack Rock',
                'additional_info': {},
            },
        )
        test_data.append(item)
        from_ts += 1  # Add one second
        if inserted_ts:
            inserted_ts += 1  # Add one second

    return test_data
def generate_data(test_user_id, user_name, from_ts, num_records):
    test_data = []
    artist_msid = str(uuid.uuid4())

    if from_ts is None:  # check for playing now listens
        timestamp = None
    else:
        from_ts += 1  # Add one second
        timestamp = datetime.utcfromtimestamp(from_ts)

    for i in range(num_records):
        item = Listen(
            user_name=user_name,
            user_id=test_user_id,
            timestamp=timestamp,
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4()),
            data={
                'artist_name': 'Frank Ocean',
                'track_name': 'Crack Rock',
                'additional_info': {},
            },
        )
        test_data.append(item)

    return test_data
def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
    """ Fetch recent listens for a list of users, given a limit which applies per user.
        If you have a limit of 3 and 3 users you should get 9 listens if they are available.

        user_list: A list containing the users for which you'd like to retrieve recent listens.
        limit: the maximum number of listens for each user to fetch.
        max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
    """
    args = {'user_list': tuple(user_list), 'ts': int(time.time()) - max_age, 'limit': limit}
    query = """SELECT * FROM (
                      SELECT listened_at, track_name, user_name, created, data,
                             row_number() OVER (PARTITION BY user_name ORDER BY listened_at DESC) AS rownum
                        FROM listen
                       WHERE user_name IN :user_list
                         AND listened_at > :ts
                    GROUP BY user_name, listened_at, track_name, created, data
                    ORDER BY listened_at DESC) tmp
               WHERE rownum <= :limit"""

    listens = []
    with timescale.engine.connect() as connection:
        curs = connection.execute(sqlalchemy.text(query), args)
        while True:
            result = curs.fetchone()
            if not result:
                break
            listens.append(Listen.from_timescale(result[0], result[1], result[2], result[3], result[4]))

    return listens
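# The query above caps results per user with a window function: row_number()
# restarts at 1 for each user_name partition, so the outer WHERE keeps only
# the newest `limit` rows per user. A pure-Python sketch of the same idea
# (hypothetical, for illustration only):
def limit_per_user(rows, limit):
    """Keep at most `limit` newest rows per user from (user_name, listened_at) tuples."""
    from collections import defaultdict
    kept = defaultdict(list)
    for user_name, listened_at in sorted(rows, key=lambda r: r[1], reverse=True):
        if len(kept[user_name]) < limit:
            kept[user_name].append((user_name, listened_at))
    return [row for user_rows in kept.values() for row in user_rows]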
def test_update_and_get_recent_listens(self):
    recent = self._redis.get_recent_listens()
    self.assertEqual(recent, [])

    listens = []
    t = int(time.time())
    for i in range(RedisListenStore.RECENT_LISTENS_MAX * 3):
        listen = Listen(
            user_id=self.testuser['id'],
            user_name=self.testuser['musicbrainz_id'],
            timestamp=t - i,
            data={
                'artist_name': str(uuid.uuid4()),
                'track_name': str(uuid.uuid4()),
                'additional_info': {},
            }
        )
        listens.append(listen)
    self._redis.update_recent_listens(listens)

    recent = self._redis.get_recent_listens()
    self.assertEqual(len(recent), RedisListenStore.RECENT_LISTENS_MAX)
    self.assertIsInstance(recent[0], Listen)
    for i, r in enumerate(recent):
        self.assertEqual(r.timestamp, listens[i].timestamp)

    recent = self._redis.get_recent_listens(5)
    self.assertEqual(len(recent), 5)
    for i, r in enumerate(recent):
        self.assertEqual(r.timestamp, listens[i].timestamp)
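# The test above implies update_recent_listens keeps only RECENT_LISTENS_MAX
# entries. Given the sorted-set reads used by get_recent_listens (zrevrange,
# shown later in this section), one plausible writer is a ZADD scored by
# timestamp followed by a rank trim. A minimal sketch (hypothetical, for
# illustration only; key namespacing and serialization format are assumptions):
def update_recent_listens(self, listens):
    """Add listens to the recent-listens sorted set, scored by timestamp, then trim it."""
    with self.redis.pipeline() as pipe:
        for listen in listens:
            pipe.zadd(self.RECENT_LISTENS_KEY, {ujson.dumps(listen.to_api()): listen.ts_since_epoch})
        # keep only the RECENT_LISTENS_MAX highest-scored (newest) members
        pipe.zremrangebyrank(self.RECENT_LISTENS_KEY, 0, -(self.RECENT_LISTENS_MAX + 1))
        pipe.execute()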
def generate_data(from_date, num_records, user_name):
    test_data = []
    current_date = to_epoch(from_date)
    artist_msid = str(uuid.uuid4())

    user = db_user.get_by_mb_id(user_name)
    if not user:
        db_user.create(user_name)
        user = db_user.get_by_mb_id(user_name)

    for i in range(num_records):
        current_date += 1  # Add one second
        item = Listen(
            user_id=user['id'],
            user_name=user_name,
            timestamp=datetime.utcfromtimestamp(current_date),
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4()),
            release_msid=str(uuid.uuid4()),
            data={
                'artist_name': 'Test Artist Pls ignore',
                'track_name': 'Hello Goodbye',
                'additional_info': {},
            },
        )
        test_data.append(item)

    return test_data
def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
    """ Fetch recent listens for a list of users, given a limit which applies per user.
        If you have a limit of 3 and 3 users you should get 9 listens if they are available.

        user_list: A list containing the users for which you'd like to retrieve recent listens.
        limit: the maximum number of listens for each user to fetch.
        max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
    """
    escaped_user_list = []
    for user_name in user_list:
        escaped_user_list.append(get_escaped_measurement_name(user_name))

    query = "SELECT username, * FROM " + ",".join(escaped_user_list)
    query += " WHERE time > " + get_influx_query_timestamp(int(time.time()) - max_age)
    query += " ORDER BY time DESC LIMIT " + str(limit)
    try:
        results = self.influx.query(query)
    except Exception as err:
        self.log.error("Cannot query influx while getting listens for users: %s: %s",
                       user_list, str(err), exc_info=True)
        return []

    listens = []
    for user in user_list:
        for result in results.get_points(measurement=get_measurement_name(user)):
            listens.append(Listen.from_influx(result))

    return listens
def fetch_listens_for_multiple_users_from_storage(self, user_names: List[str], from_ts: float, to_ts: float,
                                                  limit: int, order: int, time_range: int = 3):
    """ The timestamps are stored as UTC in the postgres database, while on
        retrieving the value they are converted to the local server's timezone. So to
        compare datetime objects we need to create an object in the same timezone as the server.

        from_ts: seconds since epoch, in float
        to_ts: seconds since epoch, in float
        limit: the maximum number of items to return
        order: 0 for ASCending order, 1 for DESCending order
        time_range: the time range (in units of 5 days) to search for listens. If none is given,
                    3 ranges (15 days) are searched. If -1 is given then all listens are searched,
                    which is slow and should be avoided if at all possible.
    """
    if time_range is None:
        time_range = 3

    if time_range < 0:
        max_timestamp_window = -1
    else:
        max_timestamp_window = SECONDS_IN_TIME_RANGE * time_range
        if to_ts is None:
            to_ts = from_ts + max_timestamp_window
        elif from_ts is None:
            from_ts = to_ts - max_timestamp_window

    query = """SELECT listened_at, track_name, created, data, user_name
                 FROM listen
                WHERE user_name IN :user_names """

    if max_timestamp_window < 0:
        if from_ts and to_ts:
            query += """AND listened_at > :from_ts
                        AND listened_at < :to_ts """
        elif from_ts is not None:
            query += "AND listened_at > :from_ts "
        else:
            query += "AND listened_at < :to_ts "
    else:
        query += """AND listened_at > :from_ts
                    AND listened_at < :to_ts """

    query += "ORDER BY listened_at " + ORDER_TEXT[order] + " LIMIT :limit"

    listens = []
    with timescale.engine.connect() as connection:
        curs = connection.execute(sqlalchemy.text(query), user_names=tuple(user_names),
                                  from_ts=from_ts, to_ts=to_ts, limit=limit)
        while True:
            result = curs.fetchone()
            if not result:
                break
            listens.append(Listen.from_timescale(result[0], result[1], result[4], result[2], result[3]))

    if order == ORDER_ASC:
        listens.reverse()

    return listens
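# The docstring above says each time_range unit covers 5 days, which suggests
# SECONDS_IN_TIME_RANGE = 5 * 24 * 60 * 60. That value is an assumption here,
# not read from the project's constants. Under it, the default window works
# out as follows:
assumed_seconds_in_time_range = 5 * 24 * 60 * 60      # 432000 seconds per range
default_window = assumed_seconds_in_time_range * 3    # default time_range of 3
assert default_window == 15 * 24 * 60 * 60            # i.e. 15 days of listens searched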
def convert_row(self, row):
    return Listen(
        user_id=row[1],
        user_name=row[2],
        timestamp=row[3],
        artist_msid=row[4],
        release_msid=row[5],
        recording_msid=row[6],
        data=row[7]
    )
def get_playing_now(self, user_id):
    """ Return the currently playing song of the user """
    data = self.redis.get('playing_now' + ':' + str(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'listened_at': MIN_ID + 1})
    return Listen.from_json(data)
def _create_test_data(self, user_name):
    test_data = []
    for jdata in TEST_LISTEN_JSON:
        x = ujson.loads(jdata)
        x['user_name'] = user_name
        test_data.append(Listen().from_json(x))

    self.logstore.insert(test_data)
    return len(test_data)
def test_from_influx(self):
    """ Test for the from_influx method """

    influx_row = {
        "time": "2017-06-07T17:23:05Z",
        "artist_mbids": "abaa7001-0d80-4e58-be5d-d2d246fd9d87",
        "artist_msid": "aa6130f2-a12d-47f3-8ffd-d0f71340de1f",
        "artist_name": "Majid Jordan",
        "best_song": "definitely",
        "genius_link": "https://genius.com/Majid-jordan-every-step-every-way-lyrics",
        "lastfm_link": "https://www.last.fm/music/Majid+Jordan/_/Every+Step+Every+Way",
        "other_stuff": "teststuffplsignore",
        "recording_mbid": None,
        "recording_msid": "db9a7483-a8f4-4a2c-99af-c8ab58850200",
        "release_msid": "cf138a00-05d5-4b35-8fce-181efcc15785",
        "release_name": "Majid Jordan",
        "track_name": "Every Step Every Way",
        "user_name": "iliekcomputers",
        "we_dict_now.hello": "afb",
        "we_dict_now.we_nested_now.hi": "312",
        "tags": "sing, song",
        "inserted_timestamp": 1525557084,
    }

    listen = Listen.from_influx(influx_row)

    # Check user name
    self.assertEqual(listen.user_name, influx_row['user_name'])

    # Check timestamp
    dt = datetime.strptime(influx_row['time'], '%Y-%m-%dT%H:%M:%SZ')
    ts = int(dt.strftime("%s"))
    self.assertEqual(listen.ts_since_epoch, ts)

    # Check artist mbids
    self.assertIsInstance(listen.data['additional_info']['artist_mbids'], list)
    self.assertEqual(listen.data['additional_info']['artist_mbids'], influx_row['artist_mbids'].split(','))

    # Check tags
    self.assertIsInstance(listen.data['additional_info']['tags'], list)
    self.assertEqual(listen.data['additional_info']['tags'], influx_row['tags'].split(','))

    # Check track name
    self.assertEqual(listen.data['track_name'], influx_row['track_name'])

    # Check additional info
    self.assertEqual(listen.data['additional_info']['best_song'], influx_row['best_song'])

    # Check msids
    self.assertEqual(listen.artist_msid, influx_row['artist_msid'])
    self.assertEqual(listen.release_msid, influx_row['release_msid'])
    self.assertEqual(listen.recording_msid, influx_row['recording_msid'])

    # make sure additional info does not contain stuff like artist names, track names
    self.assertNotIn('track_name', listen.data['additional_info'])
    self.assertNotIn('artist_name', listen.data['additional_info'])
    self.assertNotIn('release_name', listen.data['additional_info'])
def test_from_influx(self):
    """ Test for the from_influx method """

    influx_row = {
        "time": "2017-06-07T17:23:05Z",
        "artist_mbids": "abaa7001-0d80-4e58-be5d-d2d246fd9d87",
        "artist_msid": "aa6130f2-a12d-47f3-8ffd-d0f71340de1f",
        "artist_name": "Majid Jordan",
        "best_song": "definitely",
        "genius_link": "https://genius.com/Majid-jordan-every-step-every-way-lyrics",
        "lastfm_link": "https://www.last.fm/music/Majid+Jordan/_/Every+Step+Every+Way",
        "other_stuff": "teststuffplsignore",
        "recording_mbid": None,
        "recording_msid": "db9a7483-a8f4-4a2c-99af-c8ab58850200",
        "release_msid": "cf138a00-05d5-4b35-8fce-181efcc15785",
        "release_name": "Majid Jordan",
        "track_name": "Every Step Every Way",
        "user_name": "iliekcomputers",
        "we_dict_now.hello": "afb",
        "we_dict_now.we_nested_now.hi": "312",
        "tags": "sing, song"
    }

    listen = Listen.from_influx(influx_row)

    # Check user name
    self.assertEqual(listen.user_name, influx_row['user_name'])

    # Check timestamp
    dt = datetime.strptime(influx_row['time'], '%Y-%m-%dT%H:%M:%SZ')
    ts = int(dt.strftime("%s"))
    self.assertEqual(listen.ts_since_epoch, ts)

    # Check artist mbids
    self.assertIsInstance(listen.data['additional_info']['artist_mbids'], list)
    self.assertEqual(listen.data['additional_info']['artist_mbids'], influx_row['artist_mbids'].split(','))

    # Check tags
    self.assertIsInstance(listen.data['additional_info']['tags'], list)
    self.assertEqual(listen.data['additional_info']['tags'], influx_row['tags'].split(','))

    # Check track name
    self.assertEqual(listen.data['track_name'], influx_row['track_name'])

    # Check additional info
    self.assertEqual(listen.data['additional_info']['best_song'], influx_row['best_song'])

    # Check msids
    self.assertEqual(listen.artist_msid, influx_row['artist_msid'])
    self.assertEqual(listen.release_msid, influx_row['release_msid'])
    self.assertEqual(listen.recording_msid, influx_row['recording_msid'])
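# The "we_dict_now.hello" style keys in the rows above suggest that
# from_influx un-flattens dotted field names back into nested dicts (influx
# fields cannot hold nested structures directly). A minimal sketch of that
# un-flattening, assuming '.' never occurs in real key names (hypothetical,
# for illustration only):
def unflatten_dict(flat):
    """Convert {'a.b.c': 1} style keys into {'a': {'b': {'c': 1}}}."""
    nested = {}
    for key, value in flat.items():
        node = nested
        parts = key.split('.')
        for part in parts[:-1]:
            node = node.setdefault(part, {})
        node[parts[-1]] = value
    return nested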
def dump_user(self, username, fileobj, dump_time):
    """ Dump specified user's listens into specified file object.

    Args:
        username (str): the MusicBrainz ID of the user whose listens are to be dumped
        fileobj (file): the file into which listens should be written
        dump_time (datetime): the time at which the specific data dump was initiated

    Returns:
        int: the number of bytes this user's listens take in the dump file
    """
    t0 = time.time()
    offset = 0
    bytes_written = 0
    listen_count = 0

    # Get this user's listens in chunks
    while True:
        result = self.get_listens_batch_for_dump(username, dump_time, offset)
        rows_added = 0
        for row in result.get_points(get_measurement_name(username)):
            listen = Listen.from_influx(row).to_api()
            listen['user_name'] = username
            try:
                bytes_written += fileobj.write(ujson.dumps(listen))
                bytes_written += fileobj.write('\n')
                rows_added += 1
            except IOError as e:
                self.log.critical('IOError while writing listens into file for user %s',
                                  username, exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while creating json for user %s: %s',
                               username, str(e), exc_info=True)
                raise

        listen_count += rows_added
        if not rows_added:
            break
        offset += DUMP_CHUNK_SIZE

    time_taken = time.time() - t0
    self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
                  username, listen_count, listen_count / time_taken)

    # the size for this user should not include the last newline we wrote
    # hence return bytes_written - 1 as the size in the dump for this user
    return bytes_written - 1
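# A minimal usage sketch for dump_user above, assuming `listenstore` is an
# already-initialized listenstore instance (that variable name and setup are
# assumptions, not part of the code in this section):
from datetime import datetime

with open('/tmp/iliekcomputers.listens', 'w') as fileobj:
    size_in_dump = listenstore.dump_user('iliekcomputers', fileobj, datetime.utcnow())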
def get_recent_listens(self, max=RECENT_LISTENS_MAX):
    """ Get the max number of most recent listens """
    recent = []
    for listen in cache._r.zrevrange(cache._prep_key(self.RECENT_LISTENS_KEY), 0, max - 1):
        recent.append(Listen.from_json(ujson.loads(listen)))
    return recent
def import_listens_dump(self, archive_path, threads=None):
    """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression (defaults to 1)
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)

    pxz_command = ['pxz', '--decompress', '--stdout', archive_path]
    if threads is not None:
        pxz_command.append('-T {threads}'.format(threads=threads))
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name == 'SCHEMA_SEQUENCE':
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException('Incorrect schema version! Expected: %d, got: %d. '
                                                  'Please ensure that the data dump version matches the code version '
                                                  'in order to import the data.'
                                                  % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
            elif file_name.endswith('.listens'):
                # remove .listens from the filename to get the username
                user_name = file_name[:-8]
                self.log.info('Importing user %s', user_name)
                listens = []
                listen_count = 0

                # iterate through files and keep writing listens in chunks
                for listen in tar.extractfile(member):
                    influx_listen = Listen.from_json(ujson.loads(listen)).to_influx(quote(user_name))
                    listens.append(influx_listen)
                    listen_count += 1

                    if listen_count > DUMP_CHUNK_SIZE:
                        self.write_points_to_db(listens)
                        listen_count = 0
                        listens = []

                # if some listens are left, write them to db
                if listen_count > 0:
                    self.write_points_to_db(listens)

    self.log.info('Import of listens from dump %s done!', archive_path)
def get_recent_listens(self, max=RECENT_LISTENS_MAX):
    """ Get the max number of most recent listens """
    recent = []
    for listen in self.redis.zrevrange(self.ns + self.RECENT_LISTENS_KEY, 0, max - 1):
        recent.append(Listen.from_json(ujson.loads(listen)))
    return recent
def generate_data(test_user_id, from_ts, num_records):
    test_data = []
    artist_msid = str(uuid.uuid4())

    for i in range(num_records):
        from_ts += 1  # Add one second
        item = Listen(
            user_id=test_user_id,
            timestamp=datetime.utcfromtimestamp(from_ts),
            artist_msid=artist_msid,
            recording_msid=str(uuid.uuid4())
        )
        test_data.append(item)

    return test_data
def send_listens(self, event_name, message):
    listens = json.loads(message.body.decode("utf-8"))
    for data in listens:
        if event_name == "playing_now":
            listen = NowPlayingListen(
                user_id=data["user_id"],
                user_name=data["user_name"],
                data=data["track_metadata"]
            )
        else:
            data["track_metadata"] = data["data"]
            del data["data"]
            listen = Listen.from_json(data)

        self.socketio.emit(event_name, json.dumps(listen.to_api()), to=listen.user_name)
    message.ack()
def write_incremental_listens(self, start_time, end_time, temp_dir):
    """ Dump listens in the format for the ListenBrainz dump.

    Args:
        start_time and end_time (datetime): the range of time for the listens dump.
        temp_dir (str): the dir to use to write files before adding to archive
    """
    t0 = time.monotonic()
    offset = 0
    listen_count = 0
    unwritten_listens = {}

    while True:
        query, args = self.get_incremental_listens_query_batch(start_time, end_time, offset)
        rows_added = 0
        with timescale.engine.connect() as connection:
            curs = connection.execute(sqlalchemy.text(query), args)
            while True:
                result = curs.fetchone()
                if not result:
                    break

                listen = Listen.from_timescale(result[0], result[1], result[2], result[3], result[4]).to_json()
                timestamp = listen['timestamp']

                if timestamp.year not in unwritten_listens:
                    unwritten_listens[timestamp.year] = {}
                if timestamp.month not in unwritten_listens[timestamp.year]:
                    unwritten_listens[timestamp.year][timestamp.month] = []

                unwritten_listens[timestamp.year][timestamp.month].append(listen)
                rows_added += 1

        if rows_added == 0:
            break

        listen_count += rows_added
        offset += DUMP_CHUNK_SIZE

    self.write_incremental_listens_to_disk(unwritten_listens, temp_dir)
    self.log.info("%d listens dumped at %.2f listens / sec", listen_count,
                  listen_count / (time.monotonic() - t0))
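# A small design note on the year/month bucketing above: the nested dict
# bookkeeping could equivalently use a nested defaultdict, removing the two
# membership checks. A behavior-equivalent sketch (not the code used here):
from collections import defaultdict

unwritten_listens = defaultdict(lambda: defaultdict(list))
# inside the fetch loop this collapses to a single line:
# unwritten_listens[timestamp.year][timestamp.month].append(listen)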
def get_playing_now(self, user_id):
    """ Return the currently playing song of the user

        Arguments:
            user_id (int): the id of the user in the db

        Returns:
            Listen object which is the currently playing song of the user
    """
    data = self.redis.get('playing_now:{}'.format(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'listened_at': MIN_ID + 1})
    return Listen.from_json(data)
def get_playing_now(self, user_id):
    """ Return the currently playing song of the user

        Arguments:
            user_id (int): the id of the user in the db

        Returns:
            Listen object which is the currently playing song of the user
    """
    data = self.redis.get('playing_now:{}'.format(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'playing_now': True})
    return Listen.from_json(data)
def get_playing_now(self, user_id):
    """ Return the currently playing song of the user

        Arguments:
            user_id (int): the id of the user in the db

        Returns:
            Listen object which is the currently playing song of the user
    """
    data = cache.get(self.PLAYING_NOW_KEY + str(user_id))
    if not data:
        return None
    data = ujson.loads(data)
    data.update({'playing_now': True})
    return Listen.from_json(data)
def callback(self, ch, method, properties, body):
    listens = ujson.loads(body)

    non_null_listens = []
    for listen in listens:
        try:
            check_recursively_for_nulls(listen)
        except ValueError:
            # temporary to make sure fix is working
            current_app.logger.error("Found null byte in listen. Skipping!", exc_info=True)
            continue
        non_null_listens.append(listen)

    msb_listens = []
    for chunk in chunked(non_null_listens, MAX_ITEMS_PER_MESSYBRAINZ_LOOKUP):
        msb_listens.extend(self.messybrainz_lookup(chunk))

    submit = []
    for listen in msb_listens:
        try:
            submit.append(Listen.from_json(listen))
        except ValueError:
            pass

    ret = self.insert_to_listenstore(submit)

    # If there is an error, we do not ack the message so that rabbitmq redelivers it later.
    if ret == LISTEN_INSERT_ERROR_SENTINEL:
        return ret

    while True:
        try:
            self.incoming_ch.basic_ack(delivery_tag=method.delivery_tag)
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return ret
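# check_recursively_for_nulls above guards against the null-byte payloads
# exercised by test_from_json_null_values earlier in this section. A minimal
# sketch of such a validator (hypothetical, for illustration only):
def check_recursively_for_nulls(value):
    """Raise ValueError if any string in a nested JSON value contains a null byte."""
    if isinstance(value, str):
        if '\x00' in value:
            raise ValueError('null byte found in string: %r' % value)
    elif isinstance(value, dict):
        for key, item in value.items():
            check_recursively_for_nulls(key)
            check_recursively_for_nulls(item)
    elif isinstance(value, list):
        for item in value:
            check_recursively_for_nulls(item)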
def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
    """ Fetch recent listens for a list of users, given a limit which applies per user.
        If you have a limit of 3 and 3 users you should get 9 listens if they are available.

        user_list: A list containing the users for which you'd like to retrieve recent listens.
        limit: the maximum number of listens for each user to fetch.
        max_age: Only return listens if they are no more than max_age seconds old. Default 3600 seconds
    """
    args = {
        'user_list': tuple(user_list),
        'ts': int(time.time()) - max_age,
        'limit': limit
    }
    query = """SELECT * FROM (
                      SELECT listened_at, track_name, user_name, created, data,
                             recording_mbid, release_mbid, artist_mbids,
                             row_number() OVER (PARTITION BY user_name ORDER BY listened_at DESC) AS rownum
                        FROM listen l
             FULL OUTER JOIN listen_join_listen_mbid_mapping lj
                          ON (data->'track_metadata'->'additional_info'->>'recording_msid')::uuid = lj.recording_msid
             FULL OUTER JOIN listen_mbid_mapping m
                          ON lj.listen_mbid_mapping = m.id
                       WHERE user_name IN :user_list
                         AND listened_at > :ts
                    GROUP BY user_name, listened_at, track_name, created, data,
                             recording_mbid, release_mbid, artist_mbids
                    ORDER BY listened_at DESC) tmp
               WHERE rownum <= :limit"""

    listens = []
    with timescale.engine.connect() as connection:
        curs = connection.execute(sqlalchemy.text(query), args)
        while True:
            result = curs.fetchone()
            if not result:
                break
            listens.append(Listen.from_timescale(*result[0:8]))

    return listens
def create_test_data_for_influxlistenstore(user_name):
    """Create listens for influxlistenstore tests.

    From a json file 'influx_listenstore_test_listens.json' in testdata
    it creates Listen objects with a specified user_name for tests.

    Args:
        user_name (str): MusicBrainz username of a user.

    Returns:
        A list of Listen objects.
    """
    test_data_file = os.path.join(TEST_DATA_PATH, 'influx_listenstore_test_listens.json')
    with open(test_data_file, 'r') as f:
        listens = json.load(f)

    test_data = []
    for listen in listens['payload']:
        listen['user_name'] = user_name
        test_data.append(Listen().from_json(listen))

    return test_data
def import_listens_dump(self, archive_path, threads=DUMP_DEFAULT_THREAD_COUNT):
    """ Imports listens into InfluxDB from a ListenBrainz listens dump .tar.xz archive.

    Args:
        archive_path (str): the path to the listens dump .tar.xz archive to be imported
        threads (int): the number of threads to be used for decompression
                       (defaults to DUMP_DEFAULT_THREAD_COUNT)

    Returns:
        int: the number of users for whom listens have been imported
    """
    self.log.info('Beginning import of listens from dump %s...', archive_path)

    # construct the pxz command to decompress the archive
    pxz_command = ['pxz', '--decompress', '--stdout', archive_path, '-T{threads}'.format(threads=threads)]

    # run the command once to ensure schema version is correct
    # and load the index
    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    index = None
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        schema_check_done = False
        index_loaded = False
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name == 'SCHEMA_SEQUENCE':
                self.log.info('Checking if schema version of dump matches...')
                schema_seq = int(tar.extractfile(member).read().strip())
                if schema_seq != LISTENS_DUMP_SCHEMA_VERSION:
                    raise SchemaMismatchException('Incorrect schema version! Expected: %d, got: %d. '
                                                  'Please ensure that the data dump version matches the code version '
                                                  'in order to import the data.'
                                                  % (LISTENS_DUMP_SCHEMA_VERSION, schema_seq))
                schema_check_done = True

            elif file_name == 'index.json':
                with tar.extractfile(member) as f:
                    index = ujson.load(f)
                index_loaded = True

            if schema_check_done and index_loaded:
                self.log.info('Schema version matched and index.json loaded!')
                self.log.info('Starting import of listens...')
                break
        else:
            raise SchemaMismatchException('Metadata files missing in dump, please ensure that the dump file is valid.')

    # close pxz command and start over again, this time with the aim of importing all listens
    pxz.stdout.close()

    file_contents = defaultdict(list)
    for user, info in index.items():
        file_contents[info['file_name']].append({
            'user_name': user,
            'offset': info['offset'],
            'size': info['size'],
        })

    for file_name in file_contents:
        file_contents[file_name] = sorted(file_contents[file_name], key=lambda x: x['offset'])

    pxz = subprocess.Popen(pxz_command, stdout=subprocess.PIPE)

    users_done = 0
    with tarfile.open(fileobj=pxz.stdout, mode='r|') as tar:
        for member in tar:
            file_name = member.name.split('/')[-1]
            if file_name.endswith('.listens'):
                # remove .listens from the filename to get its name in the index
                file_name = file_name[:-8]
                with tar.extractfile(member) as f:
                    for user in file_contents[file_name]:
                        self.log.info('Importing user %s...', user['user_name'])
                        assert f.tell() == user['offset']
                        bytes_read = 0
                        listens = []
                        while bytes_read < user['size']:
                            line = f.readline()
                            bytes_read += len(line)
                            listen = Listen.from_json(ujson.loads(line)).to_influx(quote(user['user_name']))
                            listens.append(listen)
                            if len(listens) > DUMP_CHUNK_SIZE:
                                self.write_points_to_db(listens)
                                listens = []

                        if len(listens) > 0:
                            self.write_points_to_db(listens)
                        self.log.info('Import of user %s done!', user['user_name'])
                        users_done += 1

    self.log.info('Import of listens from dump %s done!', archive_path)
    pxz.stdout.close()
    return users_done
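# The import above implies an index.json keyed by user name, mapping each
# user to the dump file holding their listens plus a byte offset and size.
# A hypothetical entry consistent with the reads the code performs (the
# concrete values are illustrative, not from a real dump):
index = {
    "iliekcomputers": {
        "file_name": "0",    # matched against the .listens member name in the tar
        "offset": 0,         # byte offset of this user's first listen line
        "size": 53289,       # total bytes occupied by this user's listens
    },
}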
def dump_user(self, username, fileobj, dump_time):
    """ Dump specified user's listens into specified file object.

    Args:
        username (str): the MusicBrainz ID of the user whose listens are to be dumped
        fileobj (file): the file into which listens should be written
        dump_time (datetime): the time at which the specific data dump was initiated

    Returns:
        int: the number of bytes this user's listens take in the dump file
    """
    t0 = time.time()
    offset = 0
    bytes_written = 0
    listen_count = 0

    # Get this user's listens in chunks
    while True:
        # loop until we get this chunk of listens
        while True:
            try:
                result = self.influx.query("""
                    SELECT *
                      FROM {measurement}
                     WHERE time <= {timestamp}
                  ORDER BY time DESC
                     LIMIT {limit}
                    OFFSET {offset}
                """.format(
                    measurement=get_escaped_measurement_name(username),
                    timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                    limit=DUMP_CHUNK_SIZE,
                    offset=offset,
                ))
                break
            except Exception as e:
                self.log.error('Error while getting listens to dump for user %s: %s',
                               username, str(e), exc_info=True)
                time.sleep(3)

        rows_added = 0
        for row in result.get_points(get_measurement_name(username)):
            listen = Listen.from_influx(row).to_api()
            listen['user_name'] = username
            try:
                bytes_written += fileobj.write(ujson.dumps(listen))
                bytes_written += fileobj.write('\n')
                rows_added += 1
            except IOError as e:
                self.log.critical('IOError while writing listens into file for user %s',
                                  username, exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while creating json for user %s: %s',
                               username, str(e), exc_info=True)
                raise

        listen_count += rows_added
        if not rows_added:
            break

        offset += DUMP_CHUNK_SIZE

    time_taken = time.time() - t0
    self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
                  username, listen_count, listen_count / time_taken)

    # the size for this user should not include the last newline we wrote
    # hence return bytes_written - 1 as the size in the dump for this user
    return bytes_written - 1