def fetch_recent_listens_for_users(self, user_list, limit=2, max_age=3600):
    """ Fetch recent listens for a list of users, given a limit which applies per user.

    If you have a limit of 3 and 3 users you should get 9 listens if they are available.

    Args:
        user_list: A list containing the users for which you'd like to retrieve recent listens.
        limit: the maximum number of listens for each user to fetch.
        max_age: Only return listens if they are no more than max_age seconds old.
                 Default 3600 seconds.

    Returns:
        A list of Listen objects, grouped by user in the order of user_list.
        An empty list is returned when user_list is empty or the query fails.
    """
    # Without any measurement the FROM clause would be empty and the query
    # invalid; skip the round trip (and the error it would log).
    if not user_list:
        return []

    measurements = ",".join(
        get_escaped_measurement_name(user_name) for user_name in user_list)
    query = "SELECT username, * FROM " + measurements
    query += " WHERE time > " + get_influx_query_timestamp(int(time.time()) - max_age)
    query += " ORDER BY time DESC LIMIT " + str(limit)

    try:
        results = self.influx.query(query)
    except Exception as err:
        self.log.error("Cannot query influx while getting listens for users: %s: %s",
                       user_list, str(err), exc_info=True)
        return []

    listens = []
    for user in user_list:
        # results are keyed by the unescaped measurement name
        for result in results.get_points(measurement=get_measurement_name(user)):
            listens.append(Listen.from_influx(result))
    return listens
def delete(self, musicbrainz_id):
    """ Drop the measurement that holds all listens of the specified user.

    Up to 5 attempts are made, pausing 3 seconds after every failed attempt.

    Args:
        musicbrainz_id (str): the MusicBrainz ID of the user

    Raises:
        InfluxListenStoreException: if the user could not be deleted in 5 attempts
    """
    attempts_left = 5
    while attempts_left > 0:
        attempts_left -= 1
        try:
            self.influx.drop_measurement(get_measurement_name(musicbrainz_id))
            return
        except InfluxDBClientError as e:
            # influxdb-python raises a client error when the measurement does
            # not exist; there is nothing left to delete in that case.
            if 'measurement not found' in e.content:
                return
            self.log.error('Error in influx client while dropping user %s: %s',
                           musicbrainz_id, str(e), exc_info=True)
            time.sleep(3)
        except InfluxDBServerError as e:
            self.log.error('Error in influx server while dropping user %s: %s',
                           musicbrainz_id, str(e), exc_info=True)
            time.sleep(3)
        except Exception as e:
            self.log.error('Error while trying to drop user %s: %s',
                           musicbrainz_id, str(e), exc_info=True)
            time.sleep(3)

    # every attempt failed
    raise InfluxListenStoreException("Couldn't delete user with MusicBrainz ID: %s" % musicbrainz_id)
def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit, order):
    """ Fetch listens of a user that lie on one side of a timestamp.

    When from_ts is given, listens newer than from_ts are selected; otherwise
    listens older than to_ts are selected.

    Args:
        user_name: the user whose listens are to be fetched
        from_ts: seconds since epoch, in float (or None)
        to_ts: seconds since epoch, in float; only used when from_ts is None
        limit: the maximum number of listens to fetch
        order: key into ORDER_TEXT selecting the sort direction of the query

    Returns:
        A list of Listen objects. When order is ORDER_ASC the fetched rows are
        reversed before being returned. An empty list is returned if the
        query fails.
    """
    # The measurement name is escaped by hand to guard against injection
    # attacks. Sadly, influxdb does not provide a means to do this in the
    # client library.
    query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)
    # Keep an explicit space before WHERE so the query does not depend on the
    # escaped name ending in a delimiter.
    if from_ts is not None:
        query += " WHERE time > " + get_influx_query_timestamp(from_ts)
    else:
        query += " WHERE time < " + get_influx_query_timestamp(to_ts)
    query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)

    try:
        results = self.influx.query(query)
    except Exception as err:
        self.log.error("Cannot query influx while getting listens for user: %s: %s",
                       user_name, str(err), exc_info=True)
        return []

    listens = [
        Listen.from_influx(result)
        for result in results.get_points(measurement=get_measurement_name(user_name))
    ]
    if order == ORDER_ASC:
        listens.reverse()
    return listens
def dump_user_for_spark(self, username, dump_time, temp_dir):
    """ Collect all listens of a user in spark row format and write them to disk.

    Listens are fetched in chunks of DUMP_CHUNK_SIZE and grouped by the year
    and month of their timestamp before being handed to
    write_spark_listens_to_disk.

    Args:
        username: the user whose listens are to be dumped
        dump_time: passed through to get_listens_batch_for_dump
        temp_dir: passed through to write_spark_listens_to_disk
    """
    started_at = time.time()
    listens_by_month = {}  # {year: {month: [spark rows]}}
    total = 0
    offset = 0

    while True:
        batch = self.get_listens_batch_for_dump(username, dump_time, offset)
        batch_size = 0
        for row in batch.get_points(get_measurement_name(username)):
            spark_row = convert_influx_row_to_spark_row(row)
            listened_at = convert_influx_to_datetime(row['time'])
            months = listens_by_month.setdefault(listened_at.year, {})
            months.setdefault(listened_at.month, []).append(spark_row)
            batch_size += 1

        # an empty chunk means we have paged past the last listen
        if not batch_size:
            break
        total += batch_size
        offset += DUMP_CHUNK_SIZE

    self.write_spark_listens_to_disk(listens_by_month, temp_dir)
    self.log.info("%d listens for user %s dumped at %.2f listens / sec",
                  total, username, total / (time.time() - started_at))
def fetch_listens_from_storage(self, user_name, from_ts, to_ts, limit, order):
    """ Fetch listens of a user that lie on one side of a timestamp.

    When from_ts is given, listens newer than from_ts are selected; otherwise
    listens older than to_ts are selected.

    Args:
        user_name: the user whose listens are to be fetched
        from_ts: seconds since epoch, in float (or None)
        to_ts: seconds since epoch, in float; only used when from_ts is None
        limit: the maximum number of listens to fetch
        order: key into ORDER_TEXT selecting the sort direction of the query

    Returns:
        A list of Listen objects. When order is ORDER_ASC the fetched rows are
        reversed before being returned. An empty list is returned if the
        query fails.
    """
    # The measurement name is escaped by hand to guard against injection
    # attacks. Sadly, influxdb does not provide a means to do this in the
    # client library.
    query = 'SELECT * FROM ' + get_escaped_measurement_name(user_name)
    # Keep an explicit space before WHERE so the query does not depend on the
    # escaped name ending in a delimiter.
    if from_ts is not None:
        query += " WHERE time > " + get_influx_query_timestamp(from_ts)
    else:
        query += " WHERE time < " + get_influx_query_timestamp(to_ts)
    query += " ORDER BY time " + ORDER_TEXT[order] + " LIMIT " + str(limit)

    try:
        results = self.influx.query(query)
    except Exception as err:
        # lazy logging arguments; the emitted message is unchanged
        self.log.error("Cannot query influx: %s", str(err))
        return []

    listens = [
        Listen.from_influx(result)
        for result in results.get_points(measurement=get_measurement_name(user_name))
    ]
    if order == ORDER_ASC:
        listens.reverse()
    return listens
def dump_user(self, username, fileobj, dump_time):
    """ Write all listens of a user into a file object, one JSON document per line.

    Args:
        username (str): the MusicBrainz ID of the user whose listens are to be dumped
        fileobj (file): the file into which listens should be written
        dump_time (datetime): the time at which the specific data dump was initiated

    Returns:
        int: the sum of the values returned by fileobj.write, minus one for
             the trailing newline, i.e. the size of this user's listens in
             the dump file
    """
    started_at = time.time()
    written = 0
    total_listens = 0
    offset = 0

    # page through the user's listens, one chunk per iteration
    while True:
        batch = self.get_listens_batch_for_dump(username, dump_time, offset)
        batch_count = 0
        for row in batch.get_points(get_measurement_name(username)):
            listen = Listen.from_influx(row).to_api()
            listen['user_name'] = username
            try:
                written += fileobj.write(ujson.dumps(listen))
                written += fileobj.write('\n')
                batch_count += 1
            except IOError:
                self.log.critical('IOError while writing listens into file for user %s',
                                  username, exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while creating json for user %s: %s',
                               username, str(e), exc_info=True)
                raise

        total_listens += batch_count
        if batch_count == 0:
            break
        offset += DUMP_CHUNK_SIZE

    elapsed = time.time() - started_at
    self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
                  username, total_listens, total_listens / elapsed)

    # the last newline written is not counted as part of this user's data
    return written - 1
def insert_to_listenstore(self, data, retries=5):
    """ Insert a batch of listens into the ListenStore.

    If the batch keeps failing, it is split into two halves which are
    inserted recursively, until the culprit listen has been isolated.

    Args:
        data: the data to be inserted into the ListenStore
        retries: the number of retries to make before deciding that we've failed

    Returns:
        number of listens successfully sent
    """
    if not data:
        return 0

    failed_attempts = 0
    while True:
        try:
            self.ls.insert(data)
        except (InfluxDBServerError, InfluxDBClientError, ValueError):
            failed_attempts += 1
            if failed_attempts >= retries:
                break
            sleep(self.ERROR_RETRY_DELAY)
        except ConnectionError as e:
            # connection problems do not count towards the retry budget
            current_app.logger.error("Cannot write data to listenstore: %s. Sleep." % str(e), exc_info=True)
            sleep(self.ERROR_RETRY_DELAY)
        else:
            return len(data)

    # Reaching this point means the batch as a whole could not be written.
    if len(data) != 1:
        midpoint = len(data) // 2
        sent_first = self.insert_to_listenstore(data[:midpoint], retries)
        sent_second = self.insert_to_listenstore(data[midpoint:], retries)
        return sent_first + sent_second

    # A single bad listen: give it one last chance, then log it and move on.
    try:
        self.ls.insert(data)
    except (InfluxDBServerError, InfluxDBClientError, ValueError, ConnectionError) as e:
        error_message = 'Unable to insert bad listen to listenstore: {error}, listen={json}'
        influx_dict = data[0].to_influx(get_measurement_name(data[0].user_name))
        current_app.logger.error(
            error_message.format(error=str(e), json=json.dumps(influx_dict, indent=3)),
            exc_info=True)
        return 0
    return 1
def get_timestamps_for_user(self, user_name):
    """ Return (min_ts, max_ts) for a given user.

    The pair is read from the brainzutils cache when present; otherwise it is
    looked up in influx and stored in the cache as "min,max".
    """
    cache_key = REDIS_USER_TIMESTAMPS % user_name
    cached = cache.get(cache_key)
    if cached:
        low, high = cached.split(",")
        return int(low), int(high)

    escaped_name = get_escaped_measurement_name(user_name)
    measurement = get_measurement_name(user_name)
    min_ts = self._select_single_timestamp(
        'SELECT first(artist_msid) FROM ' + escaped_name, measurement)
    max_ts = self._select_single_timestamp(
        'SELECT last(artist_msid) FROM ' + escaped_name, measurement)

    cache.set(cache_key, "%d,%d" % (min_ts, max_ts), USER_CACHE_TIME)
    return min_ts, max_ts
def dump_user_for_spark(self, username, start_time, end_time, temp_dir):
    """ Dump listens for a particular user in the format for the ListenBrainz spark dump.

    Args:
        username (str): the MusicBrainz ID of the user
        start_time and end_time (datetime): the range of time for the listens dump.
            A start_time equal to the epoch requests a full dump.
        temp_dir (str): the dir to use to write files before adding to archive
    """
    t0 = time.time()
    offset = 0
    listen_count = 0
    unwritten_listens = {}  # {year: {month: [spark rows]}}
    full_dump = start_time == datetime.utcfromtimestamp(0)

    while True:
        if full_dump:
            result = self.get_listens_batch_for_dump(username, end_time, offset)
        else:
            result = self.get_incremental_listens_batch(username, start_time, end_time, offset)

        rows_fetched = 0
        for row in result.get_points(get_measurement_name(username)):
            rows_fetched += 1
            # make sure that listen was inserted in current dump's time range
            # need to do this check in python, because influx doesn't
            # do "IS NULL" operations and we have null inserted_timestamps from
            # old data
            if not self.row_inserted_before_or_equal(row, end_time):
                continue

            listen = convert_influx_row_to_spark_row(row)
            timestamp = convert_influx_to_datetime(row['time'])
            months = unwritten_listens.setdefault(timestamp.year, {})
            months.setdefault(timestamp.month, []).append(listen)
            listen_count += 1

        # Stop only when the query returned nothing. A chunk in which every
        # row was filtered out above must not end the pagination, otherwise
        # the listens in later chunks would be silently dropped.
        if rows_fetched == 0:
            break
        offset += DUMP_CHUNK_SIZE

    self.write_spark_listens_to_disk(unwritten_listens, temp_dir)
    self.log.info("%d listens for user %s dumped at %.2f listens / sec",
                  listen_count, username, listen_count / (time.time() - t0))
def get_timestamps_for_user(self, user_name):
    """ Return (min_ts, max_ts) for a given user.

    The pair is read from the brainzutils cache when present; otherwise it is
    looked up in influx and stored in the cache as "min,max".
    """
    cache_key = REDIS_USER_TIMESTAMPS % user_name
    cached = cache.get(cache_key)
    if cached:
        low, high = cached.split(",")
        return int(low), int(high)

    escaped_name = get_escaped_measurement_name(user_name)
    measurement = get_measurement_name(user_name)
    min_ts = self._select_single_timestamp(
        'SELECT first(artist_msid) FROM ' + escaped_name, measurement)
    max_ts = self._select_single_timestamp(
        'SELECT last(artist_msid) FROM ' + escaped_name, measurement)

    cache.set(cache_key, "%d,%d" % (min_ts, max_ts), USER_CACHE_TIME)
    return min_ts, max_ts
def delete(self, musicbrainz_id):
    """ Drop the measurement that holds all listens of the specified user.

    Up to 5 attempts are made, pausing 3 seconds after every failed attempt.

    Args:
        musicbrainz_id (str): the MusicBrainz ID of the user

    Raises:
        InfluxListenStoreException: if the user could not be deleted in 5 attempts
    """
    attempts_left = 5
    while attempts_left > 0:
        attempts_left -= 1
        try:
            self.influx.drop_measurement(get_measurement_name(musicbrainz_id))
            return
        except InfluxDBClientError as e:
            # influxdb-python raises a client error when the measurement does
            # not exist; there is nothing left to delete in that case.
            if 'measurement not found' in e.content:
                return
            self.log.error('Error in influx client while dropping user %s: %s',
                           musicbrainz_id, str(e), exc_info=True)
            time.sleep(3)
        except InfluxDBServerError as e:
            self.log.error('Error in influx server while dropping user %s: %s',
                           musicbrainz_id, str(e), exc_info=True)
            time.sleep(3)
        except Exception as e:
            self.log.error('Error while trying to drop user %s: %s',
                           musicbrainz_id, str(e), exc_info=True)
            time.sleep(3)

    # every attempt failed
    raise InfluxListenStoreException("Couldn't delete user with MusicBrainz ID: %s" % musicbrainz_id)
def copy_measurement(self, src, dest, apply_filter=False):
    """ Copy the listens of measurement src into measurement dest, chunk by chunk.

    Args:
        src: name of the measurement to read listens from
        dest: name of the measurement the converted rows are written to
        apply_filter: if True, every row is passed through self.filter_function
            first; rows for which it returns a falsy value are not copied
    """
    offset = 0
    while True:
        result = self.ls.get_listens_batch_for_dump(src, self.max_time, offset)
        rows = []
        count = 0
        for row in result.get_points(get_measurement_name(src)):
            count += 1
            if apply_filter:
                row = self.filter_function(row)
            if row:
                rows.append(self.convert_to_influx_insert_format(row, quote(dest)))

        # an empty chunk means the whole source measurement has been read
        if count == 0:
            break

        # don't bother the database with a write that contains no points
        if rows:
            self.ls.write_points_to_db(rows)
        offset += DUMP_CHUNK_SIZE
def get_listen_count_for_user(self, user_name, need_exact=False):
    """ Return the total number of listens for a user.

    The count is served from brainzutils cache unless an exact number is
    asked for; a freshly computed count is always written back to the cache.

    Args:
        user_name: the user to get listens for
        need_exact: if True, get an exact number of listens directly from the ListenStore
    """
    user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)

    if not need_exact:
        # The value is stored unencoded (encode=False below) because the
        # increment operation needs an integer value, hence decode=False here.
        cached_count = cache.get(user_key, decode=False)
        if cached_count:
            return int(cached_count)

    try:
        results = self.influx.query('SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
    except (InfluxDBServerError, InfluxDBClientError) as e:
        self.log.error("Cannot query influx: %s" % str(e), exc_info=True)
        raise

    # pull the count out of the first point, if there is one
    try:
        first_point = next(results.get_points(measurement=get_measurement_name(user_name)))
        count = first_point['count_recording_msid']
    except (KeyError, StopIteration):
        count = 0

    count = int(count)
    # store the value in brainzutils cache with an expiry time
    cache.set(user_key, count, InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME, encode=False)
    return count
def get_listen_count_for_user(self, user_name, need_exact=False):
    """ Return the total number of listens for a user.

    The count is served from a redis cache unless an exact number is asked
    for; a freshly computed count is always written back to redis.

    Args:
        user_name: the user to get listens for
        need_exact: if True, get an exact number of listens directly from the ListenStore
    """
    if not need_exact:
        # reuse the cached count when redis already has one
        cached_count = self.redis.get(REDIS_INFLUX_USER_LISTEN_COUNT + user_name)
        if cached_count:
            return int(cached_count)

    try:
        results = self.influx.query('SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
    except (InfluxDBServerError, InfluxDBClientError) as e:
        self.log.error("Cannot query influx: %s" % str(e))
        raise

    # pull the count out of the first point, if there is one
    try:
        first_point = next(results.get_points(measurement=get_measurement_name(user_name)))
        count = first_point['count_recording_msid']
    except (KeyError, StopIteration):
        count = 0

    # store the value in redis with an expiry time
    # NOTE(review): the (key, value, time) argument order matches only some
    # redis client classes - confirm against the client used for self.redis.
    user_key = "{}{}".format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)
    self.redis.setex(user_key, count, InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME)
    return int(count)
def get_listen_count_for_user(self, user_name, need_exact=False):
    """ Return the total number of listens for a user.

    The count is served from brainzutils cache unless an exact number is
    asked for; a freshly computed count is always written back to the cache.

    Args:
        user_name: the user to get listens for
        need_exact: if True, get an exact number of listens directly from the ListenStore
    """
    user_key = '{}{}'.format(REDIS_INFLUX_USER_LISTEN_COUNT, user_name)

    if not need_exact:
        # The value is stored unencoded (encode=False below) because the
        # increment operation needs an integer value, hence decode=False here.
        cached_count = cache.get(user_key, decode=False)
        if cached_count:
            return int(cached_count)

    try:
        results = self.influx.query('SELECT count(*) FROM ' + get_escaped_measurement_name(user_name))
    except (InfluxDBServerError, InfluxDBClientError) as e:
        self.log.error("Cannot query influx: %s" % str(e), exc_info=True)
        raise

    # pull the count out of the first point, if there is one
    try:
        first_point = next(results.get_points(measurement=get_measurement_name(user_name)))
        count = first_point['count_recording_msid']
    except (KeyError, StopIteration):
        count = 0

    count = int(count)
    # store the value in brainzutils cache with an expiry time
    cache.set(user_key, count, InfluxListenStore.USER_LISTEN_COUNT_CACHE_TIME, encode=False)
    return count
def dump_user(self, username, fileobj, start_time, end_time):
    """ Dump specified user's listens into specified file object.

    Args:
        username (str): the MusicBrainz ID of the user whose listens are to be dumped
        fileobj (file): the file into which listens should be written
        start_time and end_time (datetime): the range of time for which listens are to be dumped.
            A start_time equal to the epoch requests a full dump.

    Returns:
        int: the number of bytes this user's listens take in the dump file
    """
    t0 = time.time()
    offset = 0
    bytes_written = 0
    listen_count = 0
    full_dump = start_time == datetime.utcfromtimestamp(0)

    # Get this user's listens in chunks
    while True:
        if full_dump:
            result = self.get_listens_batch_for_dump(username, end_time, offset)
        else:
            result = self.get_incremental_listens_batch(username, start_time, end_time, offset)

        rows_fetched = 0
        rows_added = 0
        for row in result.get_points(get_measurement_name(username)):
            rows_fetched += 1
            # make sure that listen was inserted in current dump's time range
            # need to do this check in python, because influx doesn't
            # do "IS NULL" operations and we have null inserted_timestamps from
            # old data
            if not self.row_inserted_before_or_equal(row, end_time):
                continue

            listen = Listen.from_influx(row).to_api()
            listen['user_name'] = username
            try:
                bytes_written += fileobj.write(ujson.dumps(listen))
                bytes_written += fileobj.write('\n')
                rows_added += 1
            except IOError:
                self.log.critical('IOError while writing listens into file for user %s',
                                  username, exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while creating json for user %s: %s',
                               username, str(e), exc_info=True)
                raise

        listen_count += rows_added
        # Stop only when the query returned nothing. A chunk in which every
        # row was filtered out above must not end the pagination, otherwise
        # the listens in later chunks would be silently dropped.
        if not rows_fetched:
            break
        offset += DUMP_CHUNK_SIZE

    time_taken = time.time() - t0
    self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
                  username, listen_count, listen_count / time_taken)

    # the size for this user should not include the last newline we wrote
    # hence return bytes_written - 1 as the size in the dump for this user
    return bytes_written - 1
def dump_user(self, username, fileobj, dump_time):
    """ Dump specified user's listens into specified file object.

    Args:
        username (str): the MusicBrainz ID of the user whose listens are to be dumped
        fileobj (file): the file into which listens should be written
        dump_time (datetime): the time at which the specific data dump was initiated

    Returns:
        int: the number of bytes this user's listens take in the dump file
    """
    t0 = time.time()
    offset = 0
    bytes_written = 0
    listen_count = 0

    # Get this user's listens in chunks
    while True:
        # loop until we get this chunk of listens; failed queries are retried
        # every 3 seconds without an upper bound
        while True:
            try:
                result = self.influx.query("""
                    SELECT *
                      FROM {measurement}
                     WHERE time <= {timestamp}
                  ORDER BY time DESC
                     LIMIT {limit}
                    OFFSET {offset}
                """.format(
                    measurement=get_escaped_measurement_name(username),
                    timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                    limit=DUMP_CHUNK_SIZE,
                    offset=offset,
                ))
                break
            except Exception as e:
                # log with `username`; there is no `user` in this scope
                self.log.error('Error while getting listens to dump for user %s: %s',
                               username, str(e), exc_info=True)
                time.sleep(3)

        rows_added = 0
        for row in result.get_points(get_measurement_name(username)):
            listen = Listen.from_influx(row).to_api()
            listen['user_name'] = username
            try:
                bytes_written += fileobj.write(ujson.dumps(listen))
                bytes_written += fileobj.write('\n')
                rows_added += 1
            except IOError:
                self.log.critical('IOError while writing listens into file for user %s',
                                  username, exc_info=True)
                raise
            except Exception as e:
                self.log.error('Exception while creating json for user %s: %s',
                               username, str(e), exc_info=True)
                raise

        listen_count += rows_added
        if not rows_added:
            break
        offset += DUMP_CHUNK_SIZE

    time_taken = time.time() - t0
    self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
                  username, listen_count, listen_count / time_taken)

    # the size for this user should not include the last newline we wrote
    # hence return bytes_written - 1 as the size in the dump for this user
    return bytes_written - 1
def dump_listens(self, location, dump_time=None, threads=None):
    """ Fetches listens of each user from her measurement and dumps them into a file.
        These files are compressed into an archive.

    Args:
        location: the directory where the listens dump archive should be created
        dump_time (datetime): the time at which the data dump was started;
            defaults to the time of the call
        threads (int): the number of threads to use for compression

    Returns:
        the path to the dump archive
    """
    # Resolve the default at call time: a `datetime.today()` default in the
    # signature is evaluated only once, when the function is defined.
    if dump_time is None:
        dump_time = datetime.today()

    self.log.info('Beginning dump of listens from InfluxDB...')

    self.log.info('Getting list of users whose listens are to be dumped...')
    users = db_user.get_all_users()
    self.log.info('Total number of users: %d', len(users))

    archive_name = 'listenbrainz-listens-dump-{time}'.format(time=dump_time.strftime('%Y%m%d-%H%M%S'))
    archive_path = os.path.join(location, '{filename}.tar.xz'.format(filename=archive_name))
    with open(archive_path, 'w') as archive:

        pxz_command = ['pxz', '--compress']
        if threads is not None:
            pxz_command.append('-T {threads}'.format(threads=threads))

        # the tar stream is piped through pxz, which writes into the archive file
        pxz = subprocess.Popen(pxz_command, stdin=subprocess.PIPE, stdout=archive)

        try:
            with tarfile.open(fileobj=pxz.stdin, mode='w|') as tar:

                temp_dir = tempfile.mkdtemp()
                try:
                    try:
                        # add timestamp
                        timestamp_path = os.path.join(temp_dir, 'TIMESTAMP')
                        with open(timestamp_path, 'w') as f:
                            f.write(dump_time.isoformat(' '))
                        tar.add(timestamp_path,
                                arcname=os.path.join(archive_name, 'TIMESTAMP'))

                        # add schema version
                        schema_version_path = os.path.join(temp_dir, 'SCHEMA_SEQUENCE')
                        with open(schema_version_path, 'w') as f:
                            f.write(str(LISTENS_DUMP_SCHEMA_VERSION))
                        tar.add(schema_version_path,
                                arcname=os.path.join(archive_name, 'SCHEMA_SEQUENCE'))

                        # add copyright notice
                        tar.add(DUMP_LICENSE_FILE_PATH,
                                arcname=os.path.join(archive_name, 'COPYING'))

                    except IOError as e:
                        log_ioerrors(self.log, e)
                        raise
                    except Exception as e:
                        self.log.error('Exception while adding dump metadata: %s', str(e))
                        raise

                    listens_path = os.path.join(temp_dir, 'listens')
                    create_path(listens_path)

                    # get listens from all measurements and write them to files in
                    # a temporary dir before adding them to the archive
                    for user in users:
                        username = user['musicbrainz_id']
                        offset = 0

                        user_listens_file = '{username}.listens'.format(username=username)
                        user_listens_path = os.path.join(listens_path, user_listens_file)

                        with open(user_listens_path, 'w') as f:
                            # Get this user's listens in chunks
                            while True:

                                # loop until we get this chunk of listens
                                while True:
                                    try:
                                        result = self.influx.query("""
                                            SELECT *
                                              FROM {measurement}
                                             WHERE time <= {timestamp}
                                          ORDER BY time DESC
                                             LIMIT {limit}
                                            OFFSET {offset}
                                        """.format(
                                            measurement=get_escaped_measurement_name(username),
                                            timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                                            limit=DUMP_CHUNK_SIZE,
                                            offset=offset,
                                        ))
                                        break
                                    except Exception as e:
                                        self.log.error('Error while getting listens for user %s',
                                                       user['musicbrainz_id'])
                                        self.log.error(str(e))
                                        time.sleep(3)

                                rows = list(result.get_points(get_measurement_name(username)))
                                if not rows:
                                    break

                                for row in rows:
                                    listen = Listen.from_influx(row).to_api()
                                    try:
                                        f.write(ujson.dumps(listen))
                                        f.write('\n')
                                    except IOError as e:
                                        log_ioerrors(self.log, e)
                                        raise
                                    except Exception as e:
                                        self.log.error('Exception while creating json for user: %s',
                                                       user['musicbrainz_id'])
                                        self.log.error(str(e))
                                        raise

                                offset += DUMP_CHUNK_SIZE

                    # add the listens directory to the archive
                    self.log.info('Got all listens, adding them to the archive...')
                    tar.add(listens_path,
                            arcname=os.path.join(archive_name, 'listens'))
                finally:
                    # remove the temporary directory, also when the dump failed
                    shutil.rmtree(temp_dir)
        finally:
            # Closing stdin signals end of input to pxz; wait for it to finish
            # so the archive is completely written before we return.
            pxz.stdin.close()
            return_code = pxz.wait()
            if return_code != 0:
                self.log.error('pxz exited with return code %d', return_code)

    self.log.info('ListenBrainz listen dump done!')
    self.log.info('Dump present at %s!', archive_path)
    return archive_path
def dump_user(self, username, fileobj, dump_time):
    """ Dump specified user's listens into specified file object.

    Args:
        username (str): the MusicBrainz ID of the user whose listens are to be dumped
        fileobj (file): the file into which listens should be written
        dump_time (datetime): the time at which the specific data dump was initiated

    Returns:
        int: the number of bytes this user's listens take in the dump file
    """
    t0 = time.time()
    offset = 0
    bytes_written = 0
    listen_count = 0

    # Get this user's listens in chunks
    while True:
        # loop until we get this chunk of listens; failed queries are retried
        # every 3 seconds without an upper bound
        while True:
            try:
                result = self.influx.query("""
                    SELECT *
                      FROM {measurement}
                     WHERE time <= {timestamp}
                  ORDER BY time DESC
                     LIMIT {limit}
                    OFFSET {offset}
                """.format(
                    measurement=get_escaped_measurement_name(username),
                    timestamp=get_influx_query_timestamp(dump_time.strftime('%s')),
                    limit=DUMP_CHUNK_SIZE,
                    offset=offset,
                ))
                break
            except Exception as e:
                # log with `username`; there is no `user` in this scope
                self.log.error('Error while getting listens for user %s', username)
                self.log.error(str(e))
                time.sleep(3)

        rows_added = 0
        for row in result.get_points(get_measurement_name(username)):
            listen = Listen.from_influx(row).to_api()
            try:
                bytes_written += fileobj.write(ujson.dumps(listen))
                bytes_written += fileobj.write('\n')
                rows_added += 1
            except IOError as e:
                log_ioerrors(self.log, e)
                raise
            except Exception as e:
                self.log.error('Exception while creating json for user: %s', username)
                self.log.error(str(e))
                raise

        listen_count += rows_added
        if not rows_added:
            break
        offset += DUMP_CHUNK_SIZE

    time_taken = time.time() - t0
    self.log.info('Listens for user %s dumped, total %d listens written at %.2f listens / sec!',
                  username, listen_count, listen_count / time_taken)

    # the size for this user should not include the last newline we wrote
    # hence return bytes_written - 1 as the size in the dump for this user
    return bytes_written - 1
def write(self, listen_dicts):
    """ Deduplicate a batch of listens against InfluxDB, store the new ones
    and publish them on the unique exchange.

    A listen counts as a duplicate when influx (or an earlier listen of the
    same batch) already has a row with the same timestamp and recording_msid.

    Args:
        listen_dicts: listens as dicts with at least the 'listened_at',
            'user_name' and 'recording_msid' keys

    Returns:
        True
    """
    submit = []
    unique = []
    duplicate_count = 0
    unique_count = 0

    # Group the listens by user name, remembering the time span that each
    # user's listens cover.
    users = {}
    for listen in listen_dicts:
        t = int(listen['listened_at'])
        user_name = listen['user_name']
        entry = users.get(user_name)
        if entry is None:
            users[user_name] = {'min_time': t, 'max_time': t, 'listens': [listen]}
            continue
        entry['max_time'] = max(entry['max_time'], t)
        entry['min_time'] = min(entry['min_time'], t)
        entry['listens'].append(listen)

    for user_name, entry in users.items():
        # fetch what influx already holds in this user's time span
        query = """SELECT time, recording_msid
                     FROM %s
                    WHERE time >= %s
                      AND time <= %s
                """ % (get_escaped_measurement_name(user_name),
                       get_influx_query_timestamp(entry['min_time']),
                       get_influx_query_timestamp(entry['max_time']))

        # keep retrying until the query goes through
        while True:
            try:
                results = self.influx.query(query)
                break
            except Exception as e:
                self.log.error("Cannot query influx: %s" % str(e))
                sleep(3)

        # known rows, indexed by unix timestamp
        timestamps = defaultdict(list)
        for result in results.get_points(measurement=get_measurement_name(user_name)):
            timestamps[convert_to_unix_timestamp(result['time'])].append(result)

        for listen in entry['listens']:
            t = int(listen['listened_at'])
            recording_msid = listen['recording_msid']

            if t in timestamps:
                if any(row['recording_msid'] == recording_msid for row in timestamps[t]):
                    duplicate_count += 1
                    continue
                # Same timestamp but different metadata: add a tag so that
                # influxdb doesn't drop one of the listens. The value is
                # monotonically increasing and defaults to 0.
                listen['dedup_tag'] = len(timestamps[t])

            unique_count += 1
            submit.append(Listen.from_json(listen))
            unique.append(listen)
            # remember this listen so later ones in the batch are checked against it
            timestamps[t].append({
                'time': convert_timestamp_to_influx_row_format(t),
                'recording_msid': recording_msid
            })

    t0 = time()
    submitted_count = self.insert_to_listenstore(submit)
    self.time += time() - t0

    self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
    if not unique_count:
        return True

    # publish the unique listens, reconnecting whenever the connection was closed
    while True:
        try:
            self.unique_ch.basic_publish(
                exchange=self.config.UNIQUE_EXCHANGE,
                routing_key='',
                body=ujson.dumps(unique),
                properties=pika.BasicProperties(delivery_mode=2,),
            )
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return True
def write(self, listen_dicts):
    """ Deduplicate a batch of listens against InfluxDB, store the new ones
    and publish them on the 'unique' exchange.

    A listen counts as a duplicate when influx (or an earlier listen of the
    same batch) already has a row with the same timestamp for that user.

    Args:
        listen_dicts: listens as dicts with at least the 'listened_at' and
            'user_name' keys

    Returns:
        True
    """
    submit = []
    unique = []
    duplicate_count = 0
    unique_count = 0

    # Group the listens by user name, remembering the time span that each
    # user's listens cover.
    users = {}
    for listen in listen_dicts:
        t = int(listen['listened_at'])
        user_name = listen['user_name']
        entry = users.get(user_name)
        if entry is None:
            users[user_name] = {'min_time': t, 'max_time': t, 'listens': [listen]}
            continue
        entry['max_time'] = max(entry['max_time'], t)
        entry['min_time'] = min(entry['min_time'], t)
        entry['listens'].append(listen)

    for user_name, entry in users.items():
        # Fetch what influx already holds in this user's time span.
        # artist_name is selected because a field must be included in the query.
        query = """SELECT time, artist_name
                     FROM %s
                    WHERE time >= %s
                      AND time <= %s
                """ % (get_escaped_measurement_name(user_name),
                       get_influx_query_timestamp(entry['min_time']),
                       get_influx_query_timestamp(entry['max_time']))

        # keep retrying until the query goes through
        while True:
            try:
                results = self.influx.query(query)
                break
            except Exception as e:
                self.log.error("Cannot query influx: %s" % str(e))
                sleep(3)

        # unix timestamps that are already taken for this user
        seen = set()
        for result in results.get_points(measurement=get_measurement_name(user_name)):
            seen.add(convert_to_unix_timestamp(result['time']))

        for listen in entry['listens']:
            t = int(listen['listened_at'])
            if t in seen:
                duplicate_count += 1
                continue
            unique_count += 1
            submit.append(Listen.from_json(listen))
            unique.append(listen)
            seen.add(t)

    t0 = time()
    submitted_count = self.insert_to_listenstore(submit)
    self.time += time() - t0

    self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
    if not unique_count:
        return True

    # publish the unique listens, reconnecting whenever the connection was closed
    while True:
        try:
            self.unique_ch.basic_publish(
                exchange='unique',
                routing_key='',
                body=ujson.dumps(unique),
                properties=pika.BasicProperties(delivery_mode=2,),
            )
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return True