import time
from multiprocessing import cpu_count

import numpy as np
import pykka


def analyze_missing_similar():
    s = SongSegment()
    segment_data = []
    # Fetch segments that do not yet have a full list of MATCHES similar
    # segments. Stop at the first segment whose features have not been
    # extracted yet.
    for segment in s._db._db[s._dbcol].find({
            'similar.' + str(MATCHES - 1): {
                '$exists': False
            }
    }).limit(1000000):
        if segment['mfcc'] is None or segment['chroma'] is None \
                or segment['tempogram'] is None:
            break
        feature = _create_feature(np.frombuffer(segment['mfcc']),
                                  np.frombuffer(segment['chroma']),
                                  np.frombuffer(segment['tempogram']))
        segment_data.append((segment['_id'], segment['song_id'],
                             segment['time_from'], feature))
    s.close()

    print("Updating similar for " + str(len(segment_data)) + " segments")
    if len(segment_data):
        analyze_segments(segment_data)
    else:
        # This seems like the wrong place for this, but good enough for now
        time.sleep(60 * 10)
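# For reference, a plausible shape for the _create_feature helper used above
# (a hypothetical sketch, not the module's actual implementation, which may
# weight or normalise the blocks differently): the MFCC, chroma and tempogram
# vectors are concatenated into the single flat vector that the LSH bucket
# indexes.


def _create_feature_sketch(mfcc, chroma, tempogram):
    # One flat float vector per segment, matching what _create_bucket indexes.
    return np.concatenate((mfcc, chroma, tempogram))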
class Loader(pykka.ThreadingActor):
    """Actor wrapping _load_song, keeping one SongSegment connection per
    actor so loads can run in parallel without sharing a connection."""

    def __init__(self):
        super().__init__()
        self.seg_db = SongSegment()

    def load(self, song):
        return _load_song(song[0], song[1], self.seg_db)

    def on_stop(self):
        self.seg_db.close()
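# Usage sketch (the load_songs helper is hypothetical, not part of this
# module): Loader is a pykka actor, so proxy calls return futures and a pool
# of loaders processes songs in parallel. The shape of each `song` tuple is
# whatever _load_song expects for its first two arguments.


def load_songs(songs, workers=4):
    loaders = [Loader.start().proxy() for _ in range(workers)]
    try:
        # Round-robin the songs over the pool; each call returns a future.
        futures = [
            loaders[i % len(loaders)].load(song)
            for i, song in enumerate(songs)
        ]
        # Block until every song has been loaded.
        return pykka.get_all(futures)
    finally:
        # Stopping an actor triggers on_stop, which closes its SongSegment.
        for loader in loaders:
            loader.stop()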
def query_similar(song_id, from_time, to_time):
    """Queries the database for segments similar to the segment provided

    Parameters
    ----------
    song_id : string
        Id of the given song
    from_time : int
        The start time of the segment
    to_time : int
        The end time of the segment

    Returns
    -------
    list of dict
        All segments similar to the given one, each with the keys
        'song_id', 'from_time', 'to_time' and 'distance', or None if no
        matching segment with similarity data exists
    """
    seg_db = SongSegment()
    segments = seg_db.get_all_by_song_id(song_id)

    # Pick the stored segment whose start time is closest to the query.
    best = None
    for segment in segments:
        localdist = abs(from_time - segment['time_from'])
        if best is None or localdist < best[0]:
            best = (localdist, segment)

    if best is None or 'similar' not in best[1]:
        seg_db.close()
        return None

    segment = best[1]
    similar = segment['similar']
    similar_ids = list(map(lambda sim: sim['id'], similar))
    similar_full = seg_db.get_by_ids(similar_ids)

    similar_segments = []
    for i in range(len(similar)):
        sim_seg = next(seg for seg in similar_full
                       if seg['_id'] == similar[i]['id'])
        similar_segments.append(
            dict({
                'song_id': sim_seg['song_id'],
                'from_time': sim_seg['time_from'],
                'to_time': sim_seg['time_to'],
                'distance': similar[i]['distance'],
            }))

    seg_db.close()
    return similar_segments
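# Example usage (the song id is a placeholder): fetch the matches recorded
# for whichever segment of the song starts nearest to 30000 ms, and print
# them in the stored order.
#
#     similar = query_similar('<some-song-id>', 30000, 45000)
#     if similar is not None:
#         for match in similar:
#             print(match['song_id'], match['from_time'],
#                   match['to_time'], match['distance'])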
def analyze_segments(segs):
    ss = SongSegment()
    count = ss.count()
    allMatches = list(map(lambda x: [], segs))

    # One matcher actor per CPU core to run the LSH queries in parallel.
    matchers = [Matcher.start().proxy() for _ in range(cpu_count())]

    print("Searching through " + str(count // BUCKET_SIZE + 1) +
          " buckets, with " + str(BUCKET_SIZE) + " segments in each")

    for i in range(count // BUCKET_SIZE + 1):
        print("Bucket: " + str(i + 1))
        # Only segments with all three features extracted can be indexed.
        established_segments = list(
            filter(
                lambda x: x['mfcc'] is not None and x['chroma'] is not None
                and x['tempogram'] is not None,
                ss.get_all_in_range(i * BUCKET_SIZE, (i + 1) * BUCKET_SIZE)))
        established_segments = list(
            map(_process_db_segment, established_segments))

        # Build an LSH bucket over this range and query it for every segment.
        data = np.array(list(map(lambda x: x[3], established_segments)))
        bucket = _create_bucket(data)
        query_object = bucket[1].construct_query_pool()
        query_object.set_num_probes(25)

        matched = []
        # Use a separate index name here; `i` is the bucket loop variable.
        for k, seg in enumerate(segs):
            matched.append(matchers[k % len(matchers)].match(
                seg, query_object))
        matches = pykka.get_all(matched)

        for j in range(len(matches)):
            allMatches[j].append(
                list(map(lambda x: established_segments[x], matches[j])))

        # Free the bucket before building the next one.
        del data
        del bucket
        del established_segments
        del query_object
        del matches

    for matcher in matchers:
        matcher.stop()

    for i in range(len(segs)):
        best = _find_best_matches(_flatten(allMatches[i]), segs[i])
        # Note: this assumes get_by_ids returns documents in the same order
        # as the ids passed in, so matches[j] corresponds to best[j].
        matches = ss.get_by_ids(list(map(lambda match: match[0][0], best)))
        matches = list(
            map(lambda match: (match['_id'], match['similar']), matches))

        # Add the new segment to each match's own similar-list, deduplicate
        # by id, and keep only the ten closest.
        for j in range(len(matches)):
            matches[j][1].append(
                dict({
                    'id': segs[i][0],
                    'distance': best[j][1],
                }))
            match_ids = list(set(map(lambda x: x['id'], matches[j][1])))
            innerMatches = list(
                map(
                    lambda match_id: next(x for x in matches[j][1]
                                          if x['id'] == match_id),
                    match_ids))
            innerMatches.sort(key=lambda m: m['distance'])
            ss.update_similar(matches[j][0], innerMatches[:10])

        # Store the best matches for the new segment itself.
        formatted = []
        for match in best:
            formatted.append(dict({'id': match[0][0], 'distance': match[1]}))
        ss.update_similar(segs[i][0], formatted)
    ss.close()
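# For context, a minimal sketch of the Matcher actor used above (hypothetical;
# the real class lives elsewhere in the module and may filter candidates
# further). It assumes a FALCONN-style query pool, which the
# construct_query_pool/set_num_probes calls above suggest: `match` receives a
# (id, song_id, time_from, feature) tuple plus the pool, and returns indices
# into the current bucket's established_segments.


class MatcherSketch(pykka.ThreadingActor):
    def match(self, seg, query_object):
        # seg[3] is the flat feature vector; the pool returns the indices of
        # the MATCHES nearest neighbours in the bucket.
        return query_object.find_k_nearest_neighbors(seg[3], MATCHES)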