def _compare_to_track(self, track):
    """
    Compare file metadata to a MusicBrainz track.

    Weights:
      * title                = 13
      * artist name          = 4
      * release name         = 5
      * length               = 10
      * number of tracks     = 4
      * album type           = 20
      * release country      = 2
      * format               = 2

    Returns a ``(similarity, release_id)`` tuple for the best scoring
    release listed on the track node.  When the node lists no releases,
    ``release_id`` is ``None`` and the similarity is computed from the
    track-level parts (title, artist, length) alone.
    """
    def weighted_score(weighted_parts, weight_total):
        # Linear combination: each score contributes weight / weight_total.
        result = 0.0
        for score, weight in weighted_parts:
            result += score * weight / weight_total
        return result

    total = 0.0
    parts = []
    w = self.comparison_weights

    if 'title' in self.metadata:
        a = self.metadata['title']
        b = track.title[0].text
        parts.append((similarity2(a, b), w["title"]))
        total += w["title"]

    if 'artist' in self.metadata:
        a = self.metadata['artist']
        b = artist_credit_from_node(track.artist_credit[0], self.config)[0]
        parts.append((similarity2(a, b), w["artist"]))
        total += w["artist"]

    a = self.metadata.length
    if a > 0 and 'length' in track.children:
        b = int(track.length[0].text)
        # A difference of 30000 or more (in the units of `length`) scores 0.
        score = 1.0 - min(abs(a - b), 30000) / 30000.0
        parts.append((score, w["length"]))
        total += w["length"]

    releases = []
    if "release_list" in track.children and "release" in track.release_list[0].children:
        releases = track.release_list[0].release

    if not releases:
        # Return a normalised similarity here, like the release branch does,
        # rather than the raw sum of the weights.
        return (weighted_score(parts, total), None)

    scores = []
    for release in releases:
        t, p = self.metadata.compare_to_release(release, w, self.config)
        scores.append((weighted_score(list(parts) + p, total + t), release.id))

    return max(scores, key=lambda x: x[0])
def _compare_to_release(self, release):
    """
    Compare cluster metadata to a MusicBrainz release.

    The result is a linear combination of weighted scores expressing how
    likely it is that the cluster is the given album.

    Weights:
      * title                = 17
      * artist name          = 6
      * number of tracks     = 5
      * release country      = 2
      * format               = 2
    """
    weights = Cluster.comparison_weights

    # The album artist of the cluster is always compared.
    cluster_artist = self.metadata['albumartist']
    release_artist = artist_credit_from_node(release.artist_credit[0], self.config)[0]
    parts = [(similarity2(cluster_artist, release_artist), weights["artist"])]
    total = 0.0
    total += weights["artist"]

    # Remaining parts come from the generic metadata/release comparison.
    release_total, release_parts = self.metadata.compare_to_release(
        release, weights, self.config)
    total += release_total
    parts.extend(release_parts)

    probability = 0.0
    for score, weight in parts:
        probability += score * weight / total
    return probability
def compare(self, other):
    """
    Compare this metadata object with ``other``.

    Builds a list of ``(score, weight)`` parts -- one for the length (weight
    8, only when both lengths are truthy) and one per tag named in
    ``self.__weights`` that is set on both sides -- and returns their
    weighted average.  Returns 0.0 when nothing could be compared.
    """
    parts = []
    total = 0

    if self.length and other.length:
        # A difference of 30000 or more (in the units of `length`) scores 0.
        score = 1.0 - min(abs(self.length - other.length), 30000) / 30000.0
        parts.append((score, 8))
        total += 8

    for name, weight in self.__weights:
        a = self[name]
        b = other[name]
        if not (a and b):
            continue
        if name in ('tracknumber', 'totaltracks'):
            # Numeric tags either match exactly or not at all; fall back to
            # comparing the raw values when they are not integers.
            try:
                ia = int(a)
                ib = int(b)
            except ValueError:
                ia = a
                ib = b
            # Equivalent to ``1.0 - abs(cmp(ia, ib))`` without relying on
            # the Python-2-only ``cmp`` builtin.
            score = 1.0 if ia == ib else 0.0
        else:
            score = similarity2(a, b)
        parts.append((score, weight))
        total += weight

    result = 0.0
    for score, weight in parts:
        result += score * weight / total
    return result
def compare(self, other):
    """
    Compare this metadata object with ``other``.

    Collects ``(score, weight)`` parts for the length (weight 8) and for
    every tag listed in ``self.__weights`` and hands them to
    ``linear_combination_of_weights``.  A tag that is set on one side and
    listed in the other side's ``deleted_tags`` counts as a mismatch.
    """
    numeric_tags = ('tracknumber', 'totaltracks')
    parts = []

    if self.length and other.length:
        parts.append((self.length_score(self.length, other.length), 8))

    for name, weight in self.__weights:
        mine = self[name]
        theirs = other[name]
        if mine and theirs:
            if name in numeric_tags:
                # Numeric tags match exactly or not at all.
                try:
                    left, right = int(mine), int(theirs)
                except ValueError:
                    left, right = mine, theirs
                score = 0.0 if left != right else 1.0
            else:
                score = similarity2(mine, theirs)
            parts.append((score, weight))
        elif ((mine and name in other.deleted_tags)
              or (theirs and name in self.deleted_tags)):
            # Present here but explicitly deleted on the other side.
            parts.append((0, weight))

    return linear_combination_of_weights(parts)
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a MusicBrainz track node.

    Returns ``(similarity, release_group, release, track)`` for the best
    scoring release of the track, or ``(similarity, None, None, track)``
    when the track node lists no releases.
    """
    def combine(weighted, denominator):
        # Weighted sum of the scores, normalised by the total weight.
        acc = 0.0
        for score, weight in weighted:
            acc += score * weight / denominator
        return acc

    parts = []
    total = 0.0

    if 'title' in self:
        own_title = self['title']
        parts.append((similarity2(own_title, track.title[0].text), weights["title"]))
        total += weights["title"]

    if 'artist' in self:
        own_artist = self['artist']
        credited = artist_credit_from_node(track.artist_credit[0])[0]
        parts.append((similarity2(own_artist, credited), weights["artist"]))
        total += weights["artist"]

    own_length = self.length
    if own_length > 0 and 'length' in track.children:
        delta = abs(own_length - int(track.length[0].text))
        parts.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))
        total += weights["length"]

    has_releases = ("release_list" in track.children
                    and "release" in track.release_list[0].children)
    releases = track.release_list[0].release if has_releases else []
    if not releases:
        return (combine(parts, total), None, None, track)

    best = (-1, )
    for release in releases:
        extra_total, extra_parts = self.compare_to_release(
            release, weights, return_parts=True)
        sim = combine(parts + extra_parts, total + extra_total)
        if sim > best[0]:
            if "release_group" in release.children:
                rg = release.release_group[0]
            else:
                rg = None
            best = (sim, rg, release, track)
    return best
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a track dictionary.

    Returns a ``SimMatchTrack``.  The similarity is the linear combination
    of the collected parts multiplied by ``get_score(track)``.  Without
    releases on the track, the release group and release are ``None``.
    """
    parts = []

    if 'title' in self:
        own_title = self['title']
        parts.append((similarity2(own_title, track.get('title', '')), weights["title"]))

    if 'artist' in self:
        own_artist = self['artist']
        credited = artist_credit_from_node(track.get('artist-credit', []))[0]
        parts.append((similarity2(own_artist, credited), weights["artist"]))

    own_length = self.length
    if own_length > 0 and 'length' in track:
        track_length = track['length']
        parts.append((self.length_score(own_length, track_length), weights["length"]))

    releases = track['releases'] if "releases" in track else []
    search_score = get_score(track)

    if not releases:
        return SimMatchTrack(
            similarity=linear_combination_of_weights(parts) * search_score,
            releasegroup=None,
            release=None,
            track=track)

    if 'isvideo' in weights:
        # Reward agreement on the video flag; only reached with releases.
        metadata_is_video = self['~video'] == '1'
        track_is_video = track.get('video', False)
        if metadata_is_video == track_is_video:
            video_score = 1
        else:
            video_score = 0
        parts.append((video_score, weights['isvideo']))

    best = SimMatchTrack(similarity=-1, releasegroup=None, release=None, track=None)
    for release in releases:
        candidate_parts = parts + self.compare_to_release_parts(release, weights)
        sim = linear_combination_of_weights(candidate_parts) * search_score
        if sim > best.similarity:
            release_group = release['release-group'] if "release-group" in release else None
            best = SimMatchTrack(similarity=sim, releasegroup=release_group,
                                 release=release, track=track)
    return best
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a MusicBrainz track node.

    Returns ``(similarity, release_group, release, track)`` for the release
    with the highest similarity, or ``(similarity, None, None, track)`` when
    the track node lists no releases.
    """
    config = QObject.config
    weighted = []
    weight_sum = 0.0

    if 'title' in self:
        own_title = self['title']
        track_title = track.title[0].text
        weighted.append((similarity2(own_title, track_title), weights["title"]))
        weight_sum += weights["title"]

    if 'artist' in self:
        own_artist = self['artist']
        track_artist = artist_credit_from_node(track.artist_credit[0], config)[0]
        weighted.append((similarity2(own_artist, track_artist), weights["artist"]))
        weight_sum += weights["artist"]

    own_length = self.length
    if own_length > 0 and 'length' in track.children:
        track_length = int(track.length[0].text)
        length_score = 1.0 - min(abs(own_length - track_length), 30000) / 30000.0
        weighted.append((length_score, weights["length"]))
        weight_sum += weights["length"]

    if "release_list" in track.children and "release" in track.release_list[0].children:
        releases = track.release_list[0].release
    else:
        releases = []

    if not releases:
        sim = 0.0
        for score, weight in weighted:
            sim += score * weight / weight_sum
        return (sim, None, None, track)

    result = (-1,)
    for release in releases:
        release_sum, release_weighted = self.compare_to_release(
            release, weights, return_parts=True)
        sim = 0.0
        for score, weight in weighted + release_weighted:
            sim += score * weight / (weight_sum + release_sum)
        if sim > result[0]:
            rg = None
            if "release_group" in release.children:
                rg = release.release_group[0]
            result = (sim, rg, release, track)
    return result
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a track dictionary.

    Returns ``(similarity, release_group, release, track)`` for the best
    scoring release, or ``(similarity, None, None, track)`` when the track
    has no releases.  In the release loop the similarity is scaled by
    ``track['score'] / 100`` when the track carries a ``score`` key.
    """
    parts = []

    if 'title' in self:
        own_title = self['title']
        parts.append((similarity2(own_title, track.get('title', '')), weights["title"]))

    if 'artist' in self:
        own_artist = self['artist']
        credited = artist_credit_from_node(track.get('artist-credit', []))[0]
        parts.append((similarity2(own_artist, credited), weights["artist"]))

    own_length = self.length
    if own_length > 0 and 'length' in track:
        track_length = track['length']
        parts.append((self.length_score(own_length, track_length), weights["length"]))

    releases = track['releases'] if "releases" in track else []
    if not releases:
        return (linear_combination_of_weights(parts), None, None, track)

    best = (-1, )
    for release in releases:
        candidate_parts = parts + self.compare_to_release_parts(release, weights)
        sim = linear_combination_of_weights(candidate_parts)
        if 'score' in track:
            sim *= track['score'] / 100
        if sim > best[0]:
            if "release-group" in release:
                release_group = release['release-group']
            else:
                release_group = None
            best = (sim, release_group, release, track)
    return best
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a track dictionary.

    Returns a ``SimMatchTrack`` describing the best scoring release.  When
    the track has no releases, the release group and release are ``None``.
    In the release loop the similarity is scaled by ``track['score'] / 100``
    when the track carries a ``score`` key.
    """
    collected = []

    if 'title' in self:
        own_title = self['title']
        track_title = track.get('title', '')
        collected.append((similarity2(own_title, track_title), weights["title"]))

    if 'artist' in self:
        own_artist = self['artist']
        track_artist = artist_credit_from_node(track.get('artist-credit', []))[0]
        collected.append((similarity2(own_artist, track_artist), weights["artist"]))

    own_length = self.length
    if own_length > 0 and 'length' in track:
        track_length = track['length']
        collected.append((self.length_score(own_length, track_length), weights["length"]))

    if "releases" in track:
        releases = track['releases']
    else:
        releases = []

    if not releases:
        return SimMatchTrack(similarity=linear_combination_of_weights(collected),
                             releasegroup=None, release=None, track=track)

    best = SimMatchTrack(similarity=-1, releasegroup=None, release=None, track=None)
    for release in releases:
        from_release = self.compare_to_release_parts(release, weights)
        sim = linear_combination_of_weights(collected + from_release)
        if 'score' in track:
            sim *= track['score'] / 100
        if sim > best.similarity:
            release_group = release['release-group'] if "release-group" in release else None
            best = SimMatchTrack(similarity=sim, releasegroup=release_group,
                                 release=release, track=track)
    return best
def compare_to_release_parts(self, release, weights):
    """
    Collect the ``(score, weight)`` parts for comparing this metadata to a
    release dictionary.

    Covers album title, album artist (only when weighted), total tracks,
    preferred countries and formats, release type (only when weighted) and
    a fixed bonus for releases already loaded in their release group.
    """
    parts = []

    if "album" in self:
        release_title = release['title']
        parts.append((similarity2(self["album"], release_title), weights["album"]))

    if "albumartist" in self and "albumartist" in weights:
        own_artist = self["albumartist"]
        release_artist = artist_credit_from_node(release['artist-credit'])[0]
        parts.append((similarity2(own_artist, release_artist), weights["albumartist"]))

    # A missing or non-numeric value on either side skips this part.
    try:
        own_count = int(self["totaltracks"])
        release_count = release['track-count']
        if own_count > release_count:
            score = 0.0
        elif own_count < release_count:
            score = 0.3
        else:
            score = 1.0
        parts.append((score, weights["totaltracks"]))
    except (ValueError, KeyError):
        pass

    weights_from_preferred_countries(
        parts, release,
        config.setting["preferred_release_countries"],
        weights["releasecountry"])

    weights_from_preferred_formats(
        parts, release,
        config.setting["preferred_release_formats"],
        weights["format"])

    if "releasetype" in weights:
        weights_from_release_type_scores(
            parts, release,
            config.setting["release_type_scores"],
            weights["releasetype"])

    release_group = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
    if release['id'] in release_group.loaded_albums:
        parts.append((1.0, 6))

    return parts
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a MusicBrainz track node.

    Returns ``(similarity, release_group, release, track)`` for the best
    scoring release, or ``(similarity, None, None, track)`` when the track
    node lists no releases.
    """
    parts = []

    if 'title' in self:
        own_title = self['title']
        parts.append((similarity2(own_title, track.title[0].text), weights["title"]))

    if 'artist' in self:
        own_artist = self['artist']
        credited = artist_credit_from_node(track.artist_credit[0])[0]
        parts.append((similarity2(own_artist, credited), weights["artist"]))

    own_length = self.length
    if own_length > 0 and 'length' in track.children:
        delta = abs(own_length - int(track.length[0].text))
        parts.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))

    has_releases = ("release_list" in track.children
                    and "release" in track.release_list[0].children)
    releases = track.release_list[0].release if has_releases else []
    if not releases:
        return (linear_combination_of_weights(parts), None, None, track)

    best = (-1, )
    for release in releases:
        candidate_parts = parts + self.compare_to_release_parts(release, weights)
        sim = linear_combination_of_weights(candidate_parts)
        if sim > best[0]:
            if "release_group" in release.children:
                rg = release.release_group[0]
            else:
                rg = None
            best = (sim, rg, release, track)
    return best
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a track dictionary.

    Returns ``(similarity, release_group, release, track)`` for the best
    scoring release, or ``(similarity, None, None, track)`` when the track
    has no releases.
    """
    collected = []

    if 'title' in self:
        own_title = self['title']
        track_title = track['title']
        collected.append((similarity2(own_title, track_title), weights["title"]))

    if 'artist' in self:
        own_artist = self['artist']
        track_artist = artist_credit_from_node(track['artist-credit'])[0]
        collected.append((similarity2(own_artist, track_artist), weights["artist"]))

    own_length = self.length
    if own_length > 0 and 'length' in track:
        delta = abs(own_length - track['length'])
        collected.append((1.0 - min(delta, 30000) / 30000.0, weights["length"]))

    if "releases" in track:
        releases = track['releases']
    else:
        releases = []

    if not releases:
        return (linear_combination_of_weights(collected), None, None, track)

    best = (-1,)
    for release in releases:
        from_release = self.compare_to_release_parts(release, weights)
        sim = linear_combination_of_weights(collected + from_release)
        if sim > best[0]:
            release_group = release['release-group'] if "release-group" in release else None
            best = (sim, release_group, release, track)
    return best
def compare_to_track(self, track, weights):
    """
    Compare this metadata to a MusicBrainz track node.

    Title, artist and length parts are collected first; each release of the
    track then adds its own parts.  Returns the best
    ``(similarity, release_group, release, track)`` tuple, or
    ``(similarity, None, None, track)`` when there are no releases.
    """
    track_parts = []

    if 'title' in self:
        own_title = self['title']
        node_title = track.title[0].text
        track_parts.append((similarity2(own_title, node_title), weights["title"]))

    if 'artist' in self:
        own_artist = self['artist']
        node_artist = artist_credit_from_node(track.artist_credit[0])[0]
        track_parts.append((similarity2(own_artist, node_artist), weights["artist"]))

    own_length = self.length
    if own_length > 0 and 'length' in track.children:
        node_length = int(track.length[0].text)
        length_score = 1.0 - min(abs(own_length - node_length), 30000) / 30000.0
        track_parts.append((length_score, weights["length"]))

    if "release_list" in track.children and "release" in track.release_list[0].children:
        releases = track.release_list[0].release
    else:
        releases = []

    if not releases:
        return (linear_combination_of_weights(track_parts), None, None, track)

    winner = (-1,)
    for release in releases:
        all_parts = track_parts + self.compare_to_release_parts(release, weights)
        sim = linear_combination_of_weights(all_parts)
        if sim > winner[0]:
            rg = None
            if "release_group" in release.children:
                rg = release.release_group[0]
            winner = (sim, rg, release, track)
    return winner
def get_close_matches(search_string, search_set, max_results, min_score):
    """
    Return up to ``max_results`` entries of ``search_set`` whose
    ``similarity2`` score against ``search_string`` is strictly greater
    than ``min_score``, ordered from highest to lowest score.
    """
    winners = []
    for candidate in search_set:
        score = similarity2(search_string, candidate)
        if score > min_score:
            winners.append([score, candidate])

    if not winners:
        return []

    # Ascending sort followed by a reversal keeps the original tie order.
    ranked = sorted(winners)
    ranked.reverse()

    matches = []
    for _, candidate in ranked:
        if len(matches) >= max_results:
            break
        matches.append(candidate)
    return matches
def compare(self, other):
    """
    Compare this metadata object with ``other``.

    Builds ``(score, weight)`` parts for the length (weight 8, only when
    both lengths are truthy) and for every tag in ``self.__weights`` that is
    set on both sides, then returns ``linear_combination_of_weights`` of
    those parts.
    """
    parts = []

    if self.length and other.length:
        # A difference of 30000 or more (in the units of `length`) scores 0.
        score = 1.0 - min(abs(self.length - other.length), 30000) / 30000.0
        parts.append((score, 8))

    for name, weight in self.__weights:
        a = self[name]
        b = other[name]
        if not (a and b):
            continue
        if name in ('tracknumber', 'totaltracks'):
            # Numeric tags either match exactly or not at all; fall back to
            # comparing the raw values when they are not integers.
            try:
                ia = int(a)
                ib = int(b)
            except ValueError:
                ia = a
                ib = b
            # Equivalent to ``1.0 - abs(cmp(ia, ib))`` without relying on
            # the Python-2-only ``cmp`` builtin.
            score = 1.0 if ia == ib else 0.0
        else:
            score = similarity2(a, b)
        parts.append((score, weight))

    return linear_combination_of_weights(parts)
def test_2(self):
    """Expect a perfect score for the same words with other case and separators."""
    plain, decorated = "a b c", "A,B•C"
    self.assertEqual(similarity2(plain, decorated), 1.0)
def test_4(self):
    """Expect a perfect score for the same words in a different order."""
    ordered, rotated = "a b c", "c a b"
    self.assertEqual(similarity2(ordered, rotated), 1.0)
def test_1(self):
    """Expect a perfect score when a string is compared with itself."""
    text = "a b c"
    self.assertEqual(similarity2(text, text), 1.0)
def test_not_a(self):
    """Expect a zero score when the first string is empty."""
    empty, other = "", "def"
    self.assertEqual(similarity2(empty, other), 0.0)
def test_a_b_totally_different(self):
    """Expect a zero score for two strings sharing no words."""
    first, second = "abc", "def"
    self.assertEqual(similarity2(first, second), 0.0)
def test_empty_lists(self):
    """Expect a zero score when both strings contain only whitespace."""
    blank_a, blank_b = " ", " "
    self.assertEqual(similarity2(blank_a, blank_b), 0.0)
def test_not_a_and_not_b(self):
    """Expect a zero score when both strings are empty."""
    empty_a, empty_b = "", ""
    self.assertEqual(similarity2(empty_a, empty_b), 0.0)
def similarity(a, b):
    """
    Return ``similarity2`` of the two values, converted with ``to_unicode``,
    as an integer: the score multiplied by 100 and truncated with ``int``.
    """
    ratio = similarity2(to_unicode(a), to_unicode(b))
    return int(ratio * 100)
def test_5(self):
    """Expect roughly 0.6 (one decimal place) when one of three words differs."""
    left, right = "a b c", "a b d"
    self.assertAlmostEqual(similarity2(left, right), 0.6, 1)
if m: try: original = shs.lookup('recording', int(m.group(1))) if 'performer' in original: shs_artists.append(original['performer']['artist']) except ValueError: pass except urllib2.HTTPError: pass for shs_artist in shs_artists: shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName'])) mb_artist_name = mangle_name(artist['name']) if shs_artist_name == mb_artist_name: artist_uri = shs_artist['uri'] break elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85: print " * '%s' has a similarity of %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name))) artist_uri = shs_artist['uri'] break if artist_uri: matched_artists.add(artist['gid']) colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri)) edit_note = 'Guessing artist SecondHandSongs URL from work https://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url']) out(' * edit note: %s' % (edit_note,)) mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note) else: colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],)) if artist['processed'] is None and artist['gid'] not in seen_artists:
def compare_to_release(self, release, weights, config):
    """
    Compare this metadata to a MusicBrainz release node.

    Returns ``(total, parts)``: ``parts`` is a list of ``(score, weight)``
    tuples and ``total`` is the sum of the weights of the album, total
    tracks and release type parts.  The country and format parts are
    appended to ``parts`` without being added to ``total``.

    ``config.setting`` supplies the space separated
    ``preferred_release_countries`` and ``preferred_release_formats``
    strings and, when ``"releasetype"`` is in ``weights``,
    ``release_type_scores``.
    """
    total = 0.0
    parts = []

    if "album" in self:
        b = release.title[0].text
        parts.append((similarity2(self["album"], b), weights["album"]))
        total += weights["album"]

    if "totaltracks" in self:
        try:
            a = int(self["totaltracks"])
        except ValueError:
            # A non-numeric tag value cannot be compared; skip this part
            # instead of aborting the whole comparison.
            pass
        else:
            # With a "title" weight, use the track count of the first
            # medium; otherwise the track count on the medium list.
            if "title" in weights:
                b = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                b = int(release.medium_list[0].track_count[0].text)
            if a > b:
                score = 0.0
            elif a < b:
                score = 0.3
            else:
                score = 1.0
            parts.append((score, weights["totaltracks"]))
            total += weights["totaltracks"]

    preferred_countries = config.setting["preferred_release_countries"].split(" ")
    preferred_formats = config.setting["preferred_release_formats"].split(" ")

    total_countries = len(preferred_countries)
    if total_countries:
        score = 0.0
        if "country" in release.children:
            try:
                # Earlier entries in the preference list score higher.
                i = preferred_countries.index(release.country[0].text)
                score = float(total_countries - i) / float(total_countries)
            except ValueError:
                pass
        parts.append((score, weights["releasecountry"]))

    total_formats = len(preferred_formats)
    if total_formats:
        score = 0.0
        subtotal = 0
        for medium in release.medium_list[0].medium:
            if "format" in medium.children:
                try:
                    i = preferred_formats.index(medium.format[0].text)
                    score += float(total_formats - i) / float(total_formats)
                except ValueError:
                    pass
                subtotal += 1
        if subtotal > 0:
            # Average over the media that declare a format.
            score /= subtotal
        parts.append((score, weights["format"]))

    if "releasetype" in weights:
        type_scores = load_release_type_scores(config.setting["release_type_scores"])
        if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
            release_type = release.release_group[0].type
            score = type_scores.get(release_type, type_scores.get('Other', 0.5))
        else:
            score = 0.0
        parts.append((score, weights["releasetype"]))
        total += weights["releasetype"]

    return (total, parts)
def test_7(self):
    """Expect a zero score for two unrelated single words."""
    first, second = "abc", "def"
    self.assertEqual(similarity2(first, second), 0.0)
def test_6(self):
    """Expect roughly 0.3 (one decimal place) when two of three words differ."""
    left, right = "a b c", "a f d"
    self.assertAlmostEqual(similarity2(left, right), 0.3, 1)
def compare_to_release_parts(self, release, weights):
    """
    Collect the ``(score, weight)`` parts for comparing this metadata to a
    MusicBrainz release node.

    Covers album title, album artist (only when weighted), total tracks,
    preferred countries and formats, release type (only when weighted) and
    a fixed bonus for releases already loaded in their release group.
    """
    parts = []

    if "album" in self:
        release_title = release.title[0].text
        parts.append((similarity2(self["album"], release_title), weights["album"]))

    if "albumartist" in self and "albumartist" in weights:
        own_artist = self["albumartist"]
        release_artist = artist_credit_from_node(release.artist_credit[0])[0]
        parts.append((similarity2(own_artist, release_artist), weights["albumartist"]))

    if "totaltracks" in self:
        try:
            own_count = int(self["totaltracks"])
        except ValueError:
            # Non-numeric tag value: nothing to compare.
            pass
        else:
            if "title" in weights:
                release_count = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                release_count = int(release.medium_list[0].track_count[0].text)
            if own_count > release_count:
                score = 0.0
            elif own_count < release_count:
                score = 0.3
            else:
                score = 1.0
            parts.append((score, weights["totaltracks"]))

    preferred_countries = config.setting["preferred_release_countries"]
    preferred_formats = config.setting["preferred_release_formats"]

    country_count = len(preferred_countries)
    if country_count:
        score = 0.0
        if "country" in release.children:
            try:
                rank = preferred_countries.index(release.country[0].text)
            except ValueError:
                pass
            else:
                # Earlier entries in the preference list score higher.
                score = float(country_count - rank) / float(country_count)
        parts.append((score, weights["releasecountry"]))

    format_count = len(preferred_formats)
    if format_count:
        accumulated = 0.0
        counted = 0
        for medium in release.medium_list[0].medium:
            if "format" not in medium.children:
                continue
            counted += 1
            try:
                rank = preferred_formats.index(medium.format[0].text)
            except ValueError:
                continue
            accumulated += float(format_count - rank) / float(format_count)
        # Average over the media that declare a format.
        score = accumulated / counted if counted > 0 else accumulated
        parts.append((score, weights["format"]))

    if "releasetype" in weights:
        type_scores = dict(config.setting["release_type_scores"])
        score = 0.0
        if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
            release_type = release.release_group[0].type
            score = type_scores.get(release_type, type_scores.get('Other', 0.5))
        parts.append((score, weights["releasetype"]))

    release_group = QObject.tagger.get_release_group_by_id(release.release_group[0].id)
    if release.id in release_group.loaded_albums:
        parts.append((1.0, 6))

    return parts
def compare_to_release_parts(self, release, weights):
    """
    Collect the ``(score, weight)`` parts for comparing this metadata to a
    release dictionary.

    Covers album title, album artist (only when weighted), total tracks,
    preferred countries and formats, release type (only when weighted) and
    a fixed bonus for releases already loaded in their release group.
    """
    parts = []

    if "album" in self:
        release_title = release['title']
        parts.append((similarity2(self["album"], release_title), weights["album"]))

    if "albumartist" in self and "albumartist" in weights:
        own_artist = self["albumartist"]
        release_artist = artist_credit_from_node(release['artist-credit'])[0]
        parts.append((similarity2(own_artist, release_artist), weights["albumartist"]))

    try:
        own_count = int(self["totaltracks"])
    except (ValueError, KeyError):
        # Missing or non-numeric tag value: nothing to compare.
        pass
    else:
        try:
            if "title" in weights:
                release_count = release['media'][0]['track-count']
            else:
                release_count = release['track-count']
        except KeyError:
            # No track count on the release: compare against zero.
            release_count = 0
        if own_count > release_count:
            score = 0.0
        elif own_count < release_count:
            score = 0.3
        else:
            score = 1.0
        parts.append((score, weights["totaltracks"]))

    preferred_countries = config.setting["preferred_release_countries"]
    preferred_formats = config.setting["preferred_release_formats"]

    country_count = len(preferred_countries)
    if country_count:
        score = 0.0
        if "country" in release:
            try:
                rank = preferred_countries.index(release['country'])
            except ValueError:
                pass
            else:
                # Earlier entries in the preference list score higher.
                score = float(country_count - rank) / float(country_count)
        parts.append((score, weights["releasecountry"]))

    format_count = len(preferred_formats)
    if format_count and 'media' in release:
        accumulated = 0.0
        counted = 0
        for medium in release['media']:
            if "format" not in medium:
                continue
            counted += 1
            try:
                rank = preferred_formats.index(medium['format'])
            except ValueError:
                continue
            accumulated += float(format_count - rank) / float(format_count)
        # Average over the media that declare a format.
        score = accumulated / counted if counted > 0 else accumulated
        parts.append((score, weights["format"]))

    if "releasetype" in weights:
        # Score between the configured release type preferences: the scores
        # of the primary type and any secondary types are averaged.  A type
        # without a configured score uses the 'Other' score, or 0.5 when
        # 'Other' is not configured either.  A release without a primary
        # type keeps a score of 0.0.
        type_scores = dict(config.setting["release_type_scores"])
        score = 0.0
        other_score = type_scores.get('Other', 0.5)
        if 'release-group' in release and 'primary-type' in release['release-group']:
            types_found = [release['release-group']['primary-type']]
            if 'secondary-types' in release['release-group']:
                types_found += release['release-group']['secondary-types']
            for found_type in types_found:
                score += type_scores.get(found_type, other_score)
            score /= len(types_found)
        parts.append((score, weights["releasetype"]))

    release_group = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
    if release['id'] in release_group.loaded_albums:
        parts.append((1.0, 6))

    return parts
def test_full_match(self):
    """Expect a perfect score when a string is compared with itself."""
    text = "a b c"
    self.assertEqual(similarity2(text, text), 1.0)
def compare_to_release(self, release, weights, return_parts=False):
    """
    Compare metadata to a MusicBrainz release.

    Produces a probability as a linear combination of weights that the
    metadata matches a certain album.

    Returns ``(total, parts)`` when ``return_parts`` is true, otherwise
    ``(similarity, release)``.  ``total`` sums the weights of the album,
    album artist, total tracks and release type parts only; the country,
    format and loaded-album parts are appended without being added to it.
    """
    total = 0.0
    parts = []

    if "album" in self:
        b = release.title[0].text
        parts.append((similarity2(self["album"], b), weights["album"]))
        total += weights["album"]

    if "albumartist" in self and "albumartist" in weights:
        a = self["albumartist"]
        b = artist_credit_from_node(release.artist_credit[0])[0]
        parts.append((similarity2(a, b), weights["albumartist"]))
        total += weights["albumartist"]

    if "totaltracks" in self:
        try:
            a = int(self["totaltracks"])
        except ValueError:
            # A non-numeric tag value cannot be compared; skip this part
            # instead of aborting the whole comparison.
            pass
        else:
            if "title" in weights:
                b = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                b = int(release.medium_list[0].track_count[0].text)
            score = 0.0 if a > b else 0.3 if a < b else 1.0
            parts.append((score, weights["totaltracks"]))
            total += weights["totaltracks"]

    preferred_countries = config.setting["preferred_release_countries"].split(" ")
    preferred_formats = config.setting["preferred_release_formats"].split(" ")

    total_countries = len(preferred_countries)
    if total_countries:
        score = 0.0
        if "country" in release.children:
            try:
                # Earlier entries in the preference list score higher.
                i = preferred_countries.index(release.country[0].text)
                score = float(total_countries - i) / float(total_countries)
            except ValueError:
                pass
        parts.append((score, weights["releasecountry"]))

    total_formats = len(preferred_formats)
    if total_formats:
        score = 0.0
        subtotal = 0
        for medium in release.medium_list[0].medium:
            if "format" in medium.children:
                try:
                    i = preferred_formats.index(medium.format[0].text)
                    score += float(total_formats - i) / float(total_formats)
                except ValueError:
                    pass
                subtotal += 1
        if subtotal > 0:
            # Average over the media that declare a format.
            score /= subtotal
        parts.append((score, weights["format"]))

    if "releasetype" in weights:
        type_scores = load_release_type_scores(config.setting["release_type_scores"])
        if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
            release_type = release.release_group[0].type
            score = type_scores.get(release_type, type_scores.get('Other', 0.5))
        else:
            score = 0.0
        parts.append((score, weights["releasetype"]))
        total += weights["releasetype"]

    # Fixed bonus for a release already loaded in its release group.
    rg = QObject.tagger.get_release_group_by_id(release.release_group[0].id)
    if release.id in rg.loaded_albums:
        parts.append((1.0, 6))

    if return_parts:
        return (total, parts)

    similarity = 0.0
    for score, weight in parts:
        similarity += score * weight / total
    return (similarity, release)
if m: try: original = shs.lookup('recording', int(m.group(1))) if 'performer' in original: shs_artists.append(original['performer']['artist']) except ValueError: pass except urllib2.HTTPError: pass for shs_artist in shs_artists: shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName'])) mb_artist_name = mangle_name(artist['name']) if shs_artist_name == mb_artist_name: artist_uri = shs_artist['uri'] break elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85: print " * '%s' has a similarity of %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name))) artist_uri = shs_artist['uri'] break if artist_uri: matched_artists.add(artist['gid']) colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri)) edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url']) out(' * edit note: %s' % (edit_note,)) mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note) else: colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],)) if artist['processed'] is None and artist['gid'] not in seen_artists:
def test_a_longer_than_b(self):
    """Expect roughly 0.88 (one decimal place) when the first string has an extra word."""
    longer, shorter = "a b c d", "a d c"
    self.assertAlmostEqual(similarity2(longer, shorter), 0.88, 1)
def compare_to_release(self, release, weights, return_parts=False):
    """
    Compare metadata to a MusicBrainz release.

    Produces a probability as a linear combination of weights that the
    metadata matches a certain album.

    Returns ``(total, parts)`` when ``return_parts`` is true, otherwise
    ``(similarity, release)``.  ``total`` sums the weights of the album,
    album artist, total tracks and release type parts only; the country,
    format and loaded-album parts are appended without being added to it.
    """
    total = 0.0
    parts = []

    if "album" in self:
        b = release.title[0].text
        parts.append((similarity2(self["album"], b), weights["album"]))
        total += weights["album"]

    if "albumartist" in self and "albumartist" in weights:
        a = self["albumartist"]
        b = artist_credit_from_node(release.artist_credit[0])[0]
        parts.append((similarity2(a, b), weights["albumartist"]))
        total += weights["albumartist"]

    if "totaltracks" in self:
        try:
            a = int(self["totaltracks"])
        except ValueError:
            # A non-numeric tag value cannot be compared; skip this part
            # instead of aborting the whole comparison.
            pass
        else:
            if "title" in weights:
                b = int(release.medium_list[0].medium[0].track_list[0].count)
            else:
                b = int(release.medium_list[0].track_count[0].text)
            score = 0.0 if a > b else 0.3 if a < b else 1.0
            parts.append((score, weights["totaltracks"]))
            total += weights["totaltracks"]

    preferred_countries = config.setting["preferred_release_countries"]
    preferred_formats = config.setting["preferred_release_formats"]

    total_countries = len(preferred_countries)
    if total_countries:
        score = 0.0
        if "country" in release.children:
            try:
                # Earlier entries in the preference list score higher.
                i = preferred_countries.index(release.country[0].text)
                score = float(total_countries - i) / float(total_countries)
            except ValueError:
                pass
        parts.append((score, weights["releasecountry"]))

    total_formats = len(preferred_formats)
    if total_formats:
        score = 0.0
        subtotal = 0
        for medium in release.medium_list[0].medium:
            if "format" in medium.children:
                try:
                    i = preferred_formats.index(medium.format[0].text)
                    score += float(total_formats - i) / float(total_formats)
                except ValueError:
                    pass
                subtotal += 1
        if subtotal > 0:
            # Average over the media that declare a format.
            score /= subtotal
        parts.append((score, weights["format"]))

    if "releasetype" in weights:
        type_scores = load_release_type_scores(config.setting["release_type_scores"])
        if 'release_group' in release.children and 'type' in release.release_group[0].attribs:
            release_type = release.release_group[0].type
            score = type_scores.get(release_type, type_scores.get('Other', 0.5))
        else:
            score = 0.0
        parts.append((score, weights["releasetype"]))
        total += weights["releasetype"]

    # Fixed bonus for a release already loaded in its release group.
    rg = QObject.tagger.get_release_group_by_id(release.release_group[0].id)
    if release.id in rg.loaded_albums:
        parts.append((1.0, 6))

    if return_parts:
        return (total, parts)

    similarity = 0.0
    for score, weight in parts:
        similarity += score * weight / total
    return (similarity, release)
def compare_to_release_parts(self, release, weights):
    """
    Collect the ``(score, weight)`` parts for comparing this metadata to a
    release dictionary.

    Covers album title, album artist (only when weighted), total tracks,
    release date, preferred countries and formats, release type (only when
    weighted) and a fixed bonus for releases already loaded in their
    release group.  Returns the list of parts.
    """
    parts = []

    if "album" in self:
        b = release['title']
        parts.append((similarity2(self["album"], b), weights["album"]))

    if "albumartist" in self and "albumartist" in weights:
        a = self["albumartist"]
        b = artist_credit_from_node(release['artist-credit'])[0]
        parts.append((similarity2(a, b), weights["albumartist"]))

    # A missing or non-numeric value on either side skips this part.
    try:
        a = int(self["totaltracks"])
        b = release['track-count']
        score = 0.0 if a > b else 0.3 if a < b else 1.0
        parts.append((score, weights["totaltracks"]))
    except (ValueError, KeyError):
        pass

    # Date Logic
    # The factor stays 0.0 when the dates differ and a year cannot be
    # extracted from either of them.
    date_match_factor = 0.0
    if "date" in release and release['date'] != '':
        release_date = release['date']
        if "date" in self:
            metadata_date = self['date']
            if release_date == metadata_date:
                # release has a date and it matches what our metadata had exactly.
                date_match_factor = self.__date_match_factors['exact']
            else:
                release_year = extract_year_from_date(release_date)
                if release_year is not None:
                    metadata_year = extract_year_from_date(metadata_date)
                    if metadata_year is not None:
                        if release_year == metadata_year:
                            # release has a date and it matches what our metadata had for year exactly.
                            date_match_factor = self.__date_match_factors['year']
                        elif abs(release_year - metadata_year) <= 2:
                            # release has a date and it matches what our metadata had closely (year +/- 2).
                            date_match_factor = self.__date_match_factors['close_year']
                        else:
                            # release has a date but it does not match ours (all else equal,
                            # it's better to have an unknown date than a wrong date, since
                            # the unknown could actually be correct)
                            date_match_factor = self.__date_match_factors['differed']
        else:
            # release has a date but we don't have one (all else equal, we prefer
            # tracks that have non-blank date values)
            date_match_factor = self.__date_match_factors['exists_vs_null']
    else:
        # release has no date (all else equal, we don't prefer this
        # release since its date is missing)
        date_match_factor = self.__date_match_factors['no_release_date']

    parts.append((date_match_factor, weights['date']))

    config = get_config()
    weights_from_preferred_countries(parts, release,
                                     config.setting["preferred_release_countries"],
                                     weights["releasecountry"])

    weights_from_preferred_formats(parts, release,
                                   config.setting["preferred_release_formats"],
                                   weights["format"])

    if "releasetype" in weights:
        weights_from_release_type_scores(parts, release,
                                         config.setting["release_type_scores"],
                                         weights["releasetype"])

    # Fixed bonus for a release already loaded in its release group.
    rg = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
    if release['id'] in rg.loaded_albums:
        parts.append((1.0, 6))

    return parts
if m: try: original = shs.lookup('recording', int(m.group(1))) if 'performer' in original: shs_artists.append(original['performer']['artist']) except ValueError: pass except urllib2.HTTPError: pass for shs_artist in shs_artists: shs_artist_name = mangle_name(re.sub(' \[\d+\]$', '', shs_artist['commonName'])) mb_artist_name = mangle_name(artist['name']) if shs_artist_name == mb_artist_name: artist_uri = shs_artist['uri'] break elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85: print "%s => similarity = %.2f" % (shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name))) artist_uri = shs_artist['uri'] break if artist_uri: matched_artists.add(artist['gid']) colored_out(bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri)) edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % (artist['work_gid'], artist['shs_url']) out(' * edit note: %s' % (edit_note,)) mb.add_url('artist', artist['gid'], str(307), artist_uri, edit_note) else: colored_out(bcolors.NONE, ' * using %s, no artist SHS URL has been found' % (artist['shs_url'],)) if artist['processed'] is None and artist['gid'] not in seen_artists:
def test_match_various_separators_2(self):
    """Expect a perfect score despite leading, trailing and mixed separators."""
    plain, decorated = "a b c", ",A, B •C•"
    self.assertEqual(similarity2(plain, decorated), 1.0)
def test_3(self):
    """Expect a perfect score despite leading, trailing and mixed separators."""
    plain, decorated = "a b c", ",A, B •C•"
    self.assertEqual(similarity2(plain, decorated), 1.0)
def compare_to_release_parts(self, release, weights):
    """
    Collect the ``(score, weight)`` parts for comparing this metadata to a
    release dictionary.

    Covers album title, album artist (only when weighted), total tracks,
    preferred countries and formats, release type (only when weighted) and
    a fixed bonus for releases already loaded in their release group.
    """
    parts = []

    if "album" in self:
        release_title = release['title']
        parts.append((similarity2(self["album"], release_title), weights["album"]))

    if "albumartist" in self and "albumartist" in weights:
        own_artist = self["albumartist"]
        release_artist = artist_credit_from_node(release['artist-credit'])[0]
        parts.append((similarity2(own_artist, release_artist), weights["albumartist"]))

    try:
        own_count = int(self["totaltracks"])
    except (ValueError, KeyError):
        # Missing or non-numeric tag value: nothing to compare.
        pass
    else:
        if "title" in weights:
            release_count = release['media'][0]['track-count']
        else:
            release_count = release['track-count']
        if own_count > release_count:
            score = 0.0
        elif own_count < release_count:
            score = 0.3
        else:
            score = 1.0
        parts.append((score, weights["totaltracks"]))

    preferred_countries = config.setting["preferred_release_countries"]
    preferred_formats = config.setting["preferred_release_formats"]

    country_count = len(preferred_countries)
    if country_count:
        score = 0.0
        if "country" in release:
            try:
                rank = preferred_countries.index(release['country'])
            except ValueError:
                pass
            else:
                # Earlier entries in the preference list score higher.
                score = float(country_count - rank) / float(country_count)
        parts.append((score, weights["releasecountry"]))

    format_count = len(preferred_formats)
    if format_count:
        accumulated = 0.0
        counted = 0
        for medium in release['media']:
            if "format" not in medium:
                continue
            counted += 1
            try:
                rank = preferred_formats.index(medium['format'])
            except ValueError:
                continue
            accumulated += float(format_count - rank) / float(format_count)
        # Average over the media that declare a format.
        score = accumulated / counted if counted > 0 else accumulated
        parts.append((score, weights["format"]))

    if "releasetype" in weights:
        type_scores = dict(config.setting["release_type_scores"])
        score = 0.0
        if 'release-group' in release and 'primary-type' in release['release-group']:
            release_type = release['release-group']['primary-type']
            score = type_scores.get(release_type, type_scores.get('Other', 0.5))
        parts.append((score, weights["releasetype"]))

    release_group = QObject.tagger.get_release_group_by_id(release['release-group']['id'])
    if release['id'] in release_group.loaded_albums:
        parts.append((1.0, 6))

    return parts
try: original = shs.lookup('recording', int(m.group(1))) if 'performer' in original: shs_artists.append(original['performer']['artist']) except ValueError: pass except urllib2.HTTPError: pass for shs_artist in shs_artists: shs_artist_name = mangle_name( re.sub(' \[\d+\]$', '', shs_artist['commonName'])) mb_artist_name = mangle_name(artist['name']) if shs_artist_name == mb_artist_name: artist_uri = shs_artist['uri'] break elif similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name)) > 0.85: print " * '%s' has a similarity of %.2f" % ( shs_artist['commonName'], similarity2(to_unicode(shs_artist_name), to_unicode(mb_artist_name))) artist_uri = shs_artist['uri'] break if artist_uri: matched_artists.add(artist['gid']) colored_out( bcolors.HEADER, ' * using %s, found artist SHS URL: %s' % (artist['shs_url'], artist_uri)) edit_note = 'Guessing artist SecondHandSongs URL from work http://musicbrainz.org/work/%s linked to %s' % ( artist['work_gid'], artist['shs_url']) out(' * edit note: %s' % (edit_note, ))