def get_total_listen_count(self, cache_value=True):
    """ Returns the total number of listens stored in the ListenStore.
        First checks the brainzutils cache for the value, if not present there
        makes a query to the db and caches it in brainzutils cache.
    """
    if cache_value:
        # NOTE(review): a cached value of 0 is falsy and forces a re-query;
        # presumably acceptable since an empty store is cheap to recount — confirm.
        count = cache.get(
            InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT, decode=False)
        if count:
            return int(count)
    # Fetch the latest stored count row from the timeline measurement.
    try:
        result = self.influx.query(
            """SELECT %s FROM "%s" ORDER BY time DESC LIMIT 1""" % (COUNT_MEASUREMENT_NAME, TIMELINE_COUNT_MEASUREMENT))
    except (InfluxDBServerError, InfluxDBClientError) as err:
        self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
        raise
    try:
        item = result.get_points(
            measurement=TIMELINE_COUNT_MEASUREMENT).__next__()
        count = int(item[COUNT_MEASUREMENT_NAME])
        timestamp = convert_to_unix_timestamp(item['time'])
    except (KeyError, ValueError, StopIteration):
        # No (or malformed) stored count: start from zero and sum over
        # all time in the query below.
        timestamp = 0
        count = 0

    # Now sum counts that have been added in the interval we're interested in
    try:
        result = self.influx.query(
            """SELECT sum(%s) as total FROM "%s" WHERE time > %s""" % (COUNT_MEASUREMENT_NAME, TEMP_COUNT_MEASUREMENT, get_influx_query_timestamp(timestamp)))
    except (InfluxDBServerError, InfluxDBClientError) as err:
        self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
        raise
    try:
        data = result.get_points(
            measurement=TEMP_COUNT_MEASUREMENT).__next__()
        count += int(data['total'])
    except StopIteration:
        # Nothing added since the stored count — nothing to add here.
        pass

    if cache_value:
        # Refresh the cached total so subsequent calls avoid the queries above.
        cache.set(
            InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT,
            int(count),
            InfluxListenStore.TOTAL_LISTEN_COUNT_CACHE_TIME,
            encode=False,
        )
    return count
def from_influx(cls, row):
    """ Factory that builds a Listen object out of a row returned by influx. """

    def _split_csv(text):
        # influx stores list-valued fields as comma separated strings;
        # empty or missing values map to an empty list.
        return text.split(',') if text else []

    listened_at = convert_to_unix_timestamp(row['time'])

    additional_info = {
        'release_msid': row.get('release_msid'),
        'release_mbid': row.get('release_mbid'),
        'release_name': row.get('release_name'),
        'recording_mbid': row.get('recording_mbid'),
        'release_group_mbid': row.get('release_group_mbid'),
        'artist_mbids': _split_csv(row.get('artist_mbids', '')),
        'tags': _split_csv(row.get('tags', '')),
        'work_mbids': _split_csv(row.get('work_mbids', '')),
        'isrc': row.get('isrc'),
        'spotify_id': row.get('spotify_id'),
        'tracknumber': row.get('tracknumber'),
        'track_mbid': row.get('track_mbid'),
    }

    # The influx row can contain many user-generated fields. Copy over every
    # one that carries a value, skipping the bookkeeping columns (time,
    # user_name, ...) that must not leak into additional_info.
    skipped = ('time', 'user_name', 'recording_msid', 'artist_mbids', 'tags')
    for key, value in row.items():
        if key in additional_info or key in skipped or value is None:
            continue
        additional_info[key] = value

    return cls(timestamp=listened_at,
               user_name=row.get('user_name'),
               artist_msid=row.get('artist_msid'),
               recording_msid=row.get('recording_msid'),
               release_msid=row.get('release_msid'),
               data={
                   'additional_info': additional_info,
                   'artist_name': row.get('artist_name'),
                   'track_name': row.get('track_name'),
               })
def get_total_listen_count(self, cache_value=True):
    """ Returns the total number of listens stored in the ListenStore.
        First checks the brainzutils cache for the value, if not present there
        makes a query to the db and caches it in brainzutils cache.
    """
    if cache_value:
        cached = cache.get(InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT, decode=False)
        if cached:
            return int(cached)

    def _query(query):
        # Run an influx query, logging and re-raising influx failures.
        try:
            return self.influx.query(query)
        except (InfluxDBServerError, InfluxDBClientError) as err:
            self.log.error("Cannot query influx: %s" % str(err), exc_info=True)
            raise

    # Latest stored count row from the timeline measurement, if any.
    result = _query("""SELECT %s FROM "%s" ORDER BY time DESC LIMIT 1""" % (COUNT_MEASUREMENT_NAME, TIMELINE_COUNT_MEASUREMENT))
    try:
        item = next(result.get_points(measurement=TIMELINE_COUNT_MEASUREMENT))
        total = int(item[COUNT_MEASUREMENT_NAME])
        timestamp = convert_to_unix_timestamp(item['time'])
    except (KeyError, ValueError, StopIteration):
        # No usable stored count: sum from the beginning of time below.
        timestamp = 0
        total = 0

    # Add the counts recorded after that timestamp.
    result = _query("""SELECT sum(%s) as total FROM "%s" WHERE time > %s""" % (COUNT_MEASUREMENT_NAME, TEMP_COUNT_MEASUREMENT, get_influx_query_timestamp(timestamp)))
    try:
        recent = next(result.get_points(measurement=TEMP_COUNT_MEASUREMENT))
        total += int(recent['total'])
    except StopIteration:
        pass

    if cache_value:
        cache.set(
            InfluxListenStore.REDIS_INFLUX_TOTAL_LISTEN_COUNT,
            int(total),
            InfluxListenStore.TOTAL_LISTEN_COUNT_CACHE_TIME,
            encode=False,
        )
    return total
def convert_to_influx_insert_format(self, row, measurement):
    """ Convert a listen row into the point dict format expected by the
        influx client's write API.

        Args:
            row: dict of listen fields; must contain a 'time' key and may
                contain a 'dedup_tag' key. The caller's dict is NOT modified.
            measurement: name of the influx measurement to write into.

        Returns:
            dict with 'measurement', 'time' and 'fields' keys, plus a 'tags'
            key holding the dedup_tag when one was present in the row.
    """
    # Work on a shallow copy: the previous implementation popped 'time' and
    # 'dedup_tag' out of the caller's dict, mutating shared input data.
    fields = dict(row)
    data = {
        'measurement': measurement,
        'time': convert_to_unix_timestamp(fields.pop('time')),
    }
    try:
        # dedup_tag is stored as an influx tag (not a field) so that rows
        # sharing a timestamp are not collapsed by influx.
        data['tags'] = {'dedup_tag': fields.pop('dedup_tag')}
    except KeyError:
        pass  # no dedup tag, don't need to do anything
    data['fields'] = fields
    return data
def from_influx(cls, row):
    """ Factory that builds a Listen object out of a row returned by influx. """
    listened_at = convert_to_unix_timestamp(row['time'])

    # artist_mbids and tags are comma separated strings in influx;
    # missing/empty values become empty lists.
    raw_mbids = row.get('artist_mbids')
    artist_mbids = raw_mbids.split(',') if raw_mbids else []
    raw_tags = row.get('tags')
    tags = raw_tags.split(',') if raw_tags else []

    additional_info = {
        'artist_mbids': artist_mbids,
        'release_msid': row.get('release_msid'),
        'release_mbid': row.get('release_mbid'),
        'release_name': row.get('release_name'),
        'recording_mbid': row.get('recording_mbid'),
        'tags': tags,
    }

    # The influx row can contain many user-generated fields. Keep every one
    # that carries a value, but never the bookkeeping columns (time,
    # user_name, ...) which do not belong in additional_info.
    excluded = ('time', 'user_name', 'recording_msid', 'artist_mbids', 'tags')
    additional_info.update(
        (key, value) for key, value in row.items()
        if key not in excluded and value is not None
    )

    return cls(timestamp=listened_at,
               user_name=row.get('user_name'),
               artist_msid=row.get('artist_msid'),
               recording_msid=row.get('recording_msid'),
               release_msid=row.get('release_msid'),
               data={
                   'additional_info': additional_info,
                   'artist_name': row.get('artist_name'),
                   'track_name': row.get('track_name'),
               })
def write(self, listen_dicts):
    """ Deduplicate incoming listens against influx and store the unique ones.

        Listens are grouped by user; for each user the rows already stored in
        influx over the covered time range are fetched, and an incoming listen
        counts as a duplicate when a stored row has the same timestamp AND the
        same recording_msid. Unique listens are inserted via
        insert_to_listenstore and then published to the unique exchange.

        Always returns True.
    """
    submit = []           # Listen objects to insert into the listenstore
    unique = []           # raw listen dicts to publish to rabbitmq
    duplicate_count = 0
    unique_count = 0

    # Partition the listens on the basis of user names
    # and then store the time range for each user
    users = {}
    for listen in listen_dicts:
        t = int(listen['listened_at'])
        user_name = listen['user_name']
        if user_name not in users:
            users[user_name] = {
                'min_time': t,
                'max_time': t,
                'listens': [listen],
            }
            continue
        if t > users[user_name]['max_time']:
            users[user_name]['max_time'] = t
        if t < users[user_name]['min_time']:
            users[user_name]['min_time'] = t
        users[user_name]['listens'].append(listen)

    # get listens in the time range for each user and
    # remove duplicates on the basis of timestamps
    for user_name in users:
        # get the range of time that we need to get from influx for
        # deduplication of listens
        min_time = users[user_name]['min_time']
        max_time = users[user_name]['max_time']
        query = """SELECT time, recording_msid FROM %s WHERE time >= %s AND time <= %s """ % (get_escaped_measurement_name(user_name), get_influx_query_timestamp(min_time), get_influx_query_timestamp(max_time))
        # Retry forever on query failure, sleeping between attempts.
        while True:
            try:
                results = self.influx.query(query)
                break
            except Exception as e:
                self.log.error("Cannot query influx: %s" % str(e))
                sleep(3)

        # collect all the timestamps for this given time range.
        timestamps = defaultdict(list)  # dict of list of listens indexed by timestamp
        for result in results.get_points(measurement=get_measurement_name(user_name)):
            timestamps[convert_to_unix_timestamp(result['time'])].append(result)

        for listen in users[user_name]['listens']:
            # Check if a listen with the same timestamp and recording msid is already present in
            # Influx DB and if it is, mark current listen as duplicate
            t = int(listen['listened_at'])
            recording_msid = listen['recording_msid']
            dup = False
            if t in timestamps:
                # for/else: the else branch runs only when no stored row
                # with the same recording_msid was found (no break).
                for row in timestamps[t]:
                    if row['recording_msid'] == recording_msid:
                        duplicate_count += 1
                        dup = True
                        break
                else:
                    # if there are listens with the same timestamp but different
                    # metadata, we add a tag specifically for making sure that
                    # influxdb doesn't drop one of the listens. This value
                    # is monotonically increasing and defaults to 0
                    listen['dedup_tag'] = len(timestamps[t])
            if not dup:
                unique_count += 1
                submit.append(Listen.from_json(listen))
                unique.append(listen)
                # Record the new listen locally so later listens in this
                # batch dedup against it too.
                timestamps[t].append({
                    'time': convert_timestamp_to_influx_row_format(t),
                    'recording_msid': recording_msid
                })

    t0 = time()
    submitted_count = self.insert_to_listenstore(submit)
    self.time += time() - t0

    # NOTE(review): logged at error level though it is informational —
    # presumably for visibility in production logs; confirm.
    self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
    if not unique_count:
        return True

    # Publish the unique listens, reconnecting to rabbitmq on closed
    # connections until the publish succeeds.
    while True:
        try:
            self.unique_ch.basic_publish(
                exchange=self.config.UNIQUE_EXCHANGE,
                routing_key='',
                body=ujson.dumps(unique),
                properties=pika.BasicProperties(delivery_mode = 2,),
            )
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return True
def from_influx(cls, row):
    """ Factory to make Listen objects from an influx row """
    t = convert_to_unix_timestamp(row['time'])
    # Fields with a fixed, known format go straight into additional_info.
    data = {
        'release_msid': row.get('release_msid'),
        'release_mbid': row.get('release_mbid'),
        'recording_mbid': row.get('recording_mbid'),
        'release_group_mbid': row.get('release_group_mbid'),
        'artist_mbids': convert_comma_seperated_string_to_list(row.get('artist_mbids', '')),
        'tags': convert_comma_seperated_string_to_list(row.get('tags', '')),
        'work_mbids': convert_comma_seperated_string_to_list(row.get('work_mbids', '')),
        'isrc': row.get('isrc'),
        'spotify_id': row.get('spotify_id'),
        'tracknumber': row.get('tracknumber'),
        'track_mbid': row.get('track_mbid'),
    }

    # The influx row can contain many fields that are user-generated.
    # We only need to add those fields which have some value in them to additional_info.
    # Also, we need to make sure that we don't add fields like time, user_name etc. into
    # the additional_info.
    for key, value in row.items():
        if key not in data and key not in Listen.TOP_LEVEL_KEYS + Listen.PRIVATE_KEYS and value is not None:
            # Values were historically serialised in different ways;
            # try JSON first.
            try:
                value = ujson.loads(value)
                data[key] = value
                continue
            except (ValueError, TypeError):
                pass

            # there are some lists in the database that were converted to string
            # via str(list) so they can't be loaded via json.
            # Example: "['Blank & Jones']"
            # However, yaml parses them safely and correctly
            # NOTE(review): including `Exception` in this tuple makes it a
            # catch-all, so the specific yaml error classes listed before it
            # are redundant — confirm whether a narrower catch was intended.
            try:
                value = yaml.safe_load(value)
                data[key] = value
                continue
            except (ValueError, yaml.scanner.ScannerError, yaml.parser.ParserError, Exception):
                pass

            # Neither parser accepted the value; keep the raw string.
            data[key] = value

    return cls(timestamp=t,
               user_name=row.get('user_name'),
               artist_msid=row.get('artist_msid'),
               recording_msid=row.get('recording_msid'),
               release_msid=row.get('release_msid'),
               inserted_timestamp=row.get('inserted_timestamp'),
               data={
                   'additional_info': data,
                   'artist_name': row.get('artist_name'),
                   'track_name': row.get('track_name'),
                   'release_name': row.get('release_name'),
               })
def from_influx(cls, row):
    """ Factory to make Listen objects from an influx row """

    def _csv_to_list(text):
        # Influx stores list-valued fields as comma separated strings;
        # empty or missing values become an empty list.
        if not text:
            return []
        return text.split(',')

    def _decode_field(raw):
        # User-generated values were stored in several historical formats:
        # valid JSON, or str(list) strings such as "['Blank & Jones']" that
        # json rejects but yaml parses safely. If neither parser accepts
        # the value, fall back to the raw string.
        try:
            return ujson.loads(raw)
        except (ValueError, TypeError):
            pass
        try:
            return yaml.safe_load(raw)
        except ValueError:
            pass
        return raw

    listened_at = convert_to_unix_timestamp(row['time'])

    additional_info = {
        'release_msid': row.get('release_msid'),
        'release_mbid': row.get('release_mbid'),
        'recording_mbid': row.get('recording_mbid'),
        'release_group_mbid': row.get('release_group_mbid'),
        'artist_mbids': _csv_to_list(row.get('artist_mbids', '')),
        'tags': _csv_to_list(row.get('tags', '')),
        'work_mbids': _csv_to_list(row.get('work_mbids', '')),
        'isrc': row.get('isrc'),
        'spotify_id': row.get('spotify_id'),
        'tracknumber': row.get('tracknumber'),
        'track_mbid': row.get('track_mbid'),
    }

    # Any remaining user-generated field that carries a value is decoded and
    # added to additional_info; reserved columns (time, user_name, ...) and
    # keys already handled above are skipped.
    reserved = Listen.TOP_LEVEL_KEYS + Listen.PRIVATE_KEYS
    for key, value in row.items():
        if key in additional_info or key in reserved or value is None:
            continue
        additional_info[key] = _decode_field(value)

    return cls(
        timestamp=listened_at,
        user_name=row.get('user_name'),
        artist_msid=row.get('artist_msid'),
        recording_msid=row.get('recording_msid'),
        release_msid=row.get('release_msid'),
        inserted_timestamp=row.get('inserted_timestamp'),
        data={
            'additional_info': additional_info,
            'artist_name': row.get('artist_name'),
            'track_name': row.get('track_name'),
            'release_name': row.get('release_name'),
        }
    )
def write(self, listen_dicts):
    """ Deduplicate incoming listens against influx and store the unique ones.

        Listens are grouped by user; for each user the timestamps already
        stored in influx over the covered time range are fetched, and an
        incoming listen counts as a duplicate when its timestamp is already
        present. Unique listens are inserted via insert_to_listenstore and
        then published to the 'unique' exchange.

        Always returns True.
    """
    submit = []           # Listen objects to insert into the listenstore
    unique = []           # raw listen dicts to publish to rabbitmq
    duplicate_count = 0
    unique_count = 0

    # Partition the listens on the basis of user names
    # and then store the time range for each user
    users = {}
    for listen in listen_dicts:
        t = int(listen['listened_at'])
        user_name = listen['user_name']
        if user_name not in users:
            users[user_name] = {
                'min_time': t,
                'max_time': t,
                'listens': [listen],
            }
            continue
        if t > users[user_name]['max_time']:
            users[user_name]['max_time'] = t
        if t < users[user_name]['min_time']:
            users[user_name]['min_time'] = t
        users[user_name]['listens'].append(listen)

    # get listens in the time range for each user and
    # remove duplicates on the basis of timestamps
    for user_name in users:
        # get the range of time that we need to get from influx for
        # deduplication of listens
        min_time = users[user_name]['min_time']
        max_time = users[user_name]['max_time']
        # quering for artist name here, since a field must be included in the query.
        query = """SELECT time, artist_name FROM %s WHERE time >= %s AND time <= %s """ % (get_escaped_measurement_name(user_name), get_influx_query_timestamp(min_time), get_influx_query_timestamp(max_time))
        # Retry forever on query failure, sleeping between attempts.
        while True:
            try:
                results = self.influx.query(query)
                break
            except Exception as e:
                self.log.error("Cannot query influx: %s" % str(e))
                sleep(3)

        # collect all the timestamps for this given time range.
        # Used as a set: the value 1 is a placeholder, only key presence matters.
        timestamps = {}
        for result in results.get_points(
                measurement=get_measurement_name(user_name)):
            timestamps[convert_to_unix_timestamp(result['time'])] = 1

        for listen in users[user_name]['listens']:
            # Check if this listen is already present in Influx DB and if it is
            # mark current listen as duplicate
            t = int(listen['listened_at'])
            if t in timestamps:
                duplicate_count += 1
                continue
            else:
                unique_count += 1
                submit.append(Listen.from_json(listen))
                unique.append(listen)
                # Record the timestamp locally so later listens in this
                # batch dedup against it too.
                timestamps[t] = 1

    t0 = time()
    submitted_count = self.insert_to_listenstore(submit)
    self.time += time() - t0

    # NOTE(review): logged at error level though it is informational —
    # presumably for visibility in production logs; confirm.
    self.log.error("dups: %d, unique: %d, submitted: %d" % (duplicate_count, unique_count, submitted_count))
    if not unique_count:
        return True

    # Publish the unique listens, reconnecting to rabbitmq on closed
    # connections until the publish succeeds.
    while True:
        try:
            self.unique_ch.basic_publish(exchange='unique',
                                         routing_key='',
                                         body=ujson.dumps(unique),
                                         properties=pika.BasicProperties(
                                             delivery_mode=2, ))
            break
        except pika.exceptions.ConnectionClosed:
            self.connect_to_rabbitmq()

    return True