# Must remove .mp3 from the path to get generic path 'path': row[4].replace('.mp3', '') } for n, row in enumerate(uspop2002_list) ]
# NOTE(review): the line above is the tail of a list comprehension whose
# opening lies before this chunk.  It is left untouched here; re-split it
# together with its opening lines.

# Write the uspop2002 entries into the Whoosh index stored under
# BASE_DATA_PATH/uspop2002/index.
whoosh_search.create_index(
    os.path.join(BASE_DATA_PATH, 'uspop2002', 'index'), uspop2002_list)

# Quick test: run one artist/title query against each dataset index and
# print whatever whoosh_search.search returns.
# Single-argument parenthesised print emits the same text under the
# Python 2 print statement and also parses as a Python 3 call.
artist = 'bon jovi'
title = 'livin on a prayer'

index = whoosh_search.get_whoosh_index(
    os.path.join(BASE_DATA_PATH, 'cal500', 'index'))
with index.searcher() as searcher:
    print('cal500:\t{}'.format(
        whoosh_search.search(searcher, index.schema, artist, title)))

index = whoosh_search.get_whoosh_index(
    os.path.join(BASE_DATA_PATH, 'cal10k', 'index'))
with index.searcher() as searcher:
    print('cal10k:\t{}'.format(
        whoosh_search.search(searcher, index.schema, artist, title)))

index = whoosh_search.get_whoosh_index(
    os.path.join(BASE_DATA_PATH, 'msd', 'index'))
with index.searcher() as searcher:
    print('msd:\t{}'.format(
        whoosh_search.search(searcher, index.schema, artist, title)))

# The uspop2002 index is opened last; its use continues past this chunk.
index = whoosh_search.get_whoosh_index(
    os.path.join(BASE_DATA_PATH, 'uspop2002', 'index'))
os.path.join(FILE_LIST_PATH, 'uspop2002.txt')) uspop2002_list = [{'id': unicode(n), 'artist': row[1], 'title': row[3], # Must remove .mp3 from the path to get generic path 'path': row[4].replace('.mp3', '')} for n, row in enumerate(uspop2002_list)] whoosh_search.create_index( os.path.join(BASE_DATA_PATH, 'uspop2002', 'index'), uspop2002_list) # Quick test artist = 'bon jovi' title = 'livin on a prayer' index = whoosh_search.get_whoosh_index( os.path.join(BASE_DATA_PATH, 'cal500', 'index')) with index.searcher() as searcher: print 'cal500:\t{}'.format(whoosh_search.search(searcher, index.schema, artist, title)) index = whoosh_search.get_whoosh_index( os.path.join(BASE_DATA_PATH, 'cal10k', 'index')) with index.searcher() as searcher: print 'cal10k:\t{}'.format(whoosh_search.search(searcher, index.schema, artist, title)) index = whoosh_search.get_whoosh_index( os.path.join(BASE_DATA_PATH, 'msd', 'index')) with index.searcher() as searcher: print 'msd:\t{}'.format(whoosh_search.search(searcher, index.schema, artist, title)) index = whoosh_search.get_whoosh_index( os.path.join(BASE_DATA_PATH, 'uspop2002', 'index'))
# NOTE(review): this fragment reads like the body of an enclosing loop and
# relies on `midi_entry` being bound before this chunk; re-indent it to the
# enclosing block's depth when the file is reassembled.

# Get all entries with the same artist/title
midi_matches = [
    e for e in midi_list
    if (e['artist'] == midi_entry['artist'] and
        e['title'] == midi_entry['title'])
]
# Remove these matches so we don't use them more than once
for match in midi_matches:
    midi_list.remove(match)
# This should never happen
if not midi_matches:
    print("Error: No matches found for {}".format(midi_entry))

# Match each of these MIDIs against each dataset
dataset_matches = []
for dataset in DATASETS:
    matches = whoosh_search.search(
        searchers[dataset], indices[dataset].schema,
        midi_entry['artist'], midi_entry['title'])
    # Add each matched dataset entry if we haven't already
    for match in matches:
        entry = [dataset, match[0]]
        if entry not in dataset_matches:
            dataset_matches.append(entry)

# If there are any matches, add them to pairs
if dataset_matches:
    pairs.append([[m['id'] for m in midi_matches], dataset_matches])

# Find other pairs which include one of these dataset entries.
# NOTE(review): the collapsed source does not show whether this lookup sat
# inside the `if` above; with no dataset matches the loop appends nothing
# either way.
merge_indices = []
for n, pair in enumerate(pairs):
    for dataset_match in dataset_matches:
        if dataset_match in pair[1]:
            merge_indices.append(n)
            # One shared entry is enough to flag this pair
            break
# Take the last remaining MIDI entry as the reference artist/title.
# NOTE(review): this fragment reads like the body of an enclosing loop;
# re-indent it to the enclosing block's depth when the file is reassembled.
midi_entry = midi_list[-1]

# Get all entries with the same artist/title
midi_matches = [
    e for e in midi_list
    if (e['artist'] == midi_entry['artist'] and
        e['title'] == midi_entry['title'])
]
# Remove these matches so we don't use them more than once
for match in midi_matches:
    midi_list.remove(match)
# This should never happen
if not midi_matches:
    print("Error: No matches found for {}".format(midi_entry))

# Match each of these MIDIs against each dataset
dataset_matches = []
for dataset in DATASETS:
    matches = whoosh_search.search(
        searchers[dataset], indices[dataset].schema,
        midi_entry['artist'], midi_entry['title'])
    # Add each matched dataset entry if we haven't already
    for match in matches:
        entry = [dataset, match[0]]
        if entry not in dataset_matches:
            dataset_matches.append(entry)

# If there are any matches, add them to pairs
if dataset_matches:
    pairs.append([[m['id'] for m in midi_matches], dataset_matches])

# Find other pairs which include one of these dataset entries.
# NOTE(review): the collapsed source does not show whether this lookup sat
# inside the `if` above; with no dataset matches the loop appends nothing
# either way.
merge_indices = []
for n, pair in enumerate(pairs):
    for dataset_match in dataset_matches:
        if dataset_match in pair[1]:
            merge_indices.append(n)
            # One shared entry is enough to flag this pair
            break