Exemplo n.º 1
0
def find_genre_via_discogs(artist, album):
    """
    Try to find the genre from discogs.com using only the artist and album
    as base.

    The following strategy is taken:

        1) Try to find the artist/album combination using
           levenshtein fuzzy matching.
        2) If the exact combination was not found, the genre
           is taken from the other known albums of this artist.

    The resulting genre may not be very informative for humans, but is easily
    split into separate sub-genres by the GenreTreeProvider, which is good
    for comparisons.

    Future versions might include a version that tries to find a more
    human readable string.

    Example output: ::

        Genre: Non-Music; Folk, World, & Country; Stage & Screen / Comedy, Monolog, Spoken Word, Political

    .. note::

        Tip: Use :class:`munin.distance.GenreTreeAvgDistance` with this data.
        The normal :class:`munin.distance.GenreTreeProvider` uses Single Linkage,
        which may give too good distances often enough.

    .. note::

        NOTE(review): the example above separates genres with ``;`` and has a
        ``Genre:`` prefix, while this function joins genres with ``", "`` and
        adds no prefix -- confirm which format is intended.

    Errors raised while fetching or decoding the discogs response are not
    caught here and propagate to the caller.

    :param artist: The artist string to search for (gets normalised)
    :param album: The album string to search for (gets normalised)
    :returns: A genre string of the form ``genres / styles`` or None if
              neither a genre nor a style could be found.
    """
    # Get the data from discogs. The response is used as a context manager
    # so the underlying connection is closed even if reading/decoding fails.
    api_root = DISCOGS_API_SEARCH_URL.format(artist=quote(artist))
    with urlopen(api_root) as response:
        html_doc = response.read().decode('utf-8')
    json_doc = json.loads(html_doc)

    # Normalize the input artist/album; only the first element of each
    # normalizer result is used.
    artist, *_ = ArtistNormalizeProvider().do_process(artist)
    album, *_ = AlbumNormalizeProvider().do_process(album)

    # First pass: insist on a matching album.
    genre_set, style_set = _find_right_genre(json_doc, artist, album, True)
    if not (genre_set or style_set):
        # Lower the expectations, just take the genre of
        # all known albums of this artist, if any:
        genre_set, style_set = _find_right_genre(json_doc, artist, album,
                                                 False)

    # Still not? Welp.
    if not (genre_set or style_set):
        return None

    # Build a genre string that is formatted this way:
    #  genre1, genre2 [,...] / style1, style2, style3 [,...]
    #  blues, rock / blues rock, country rock, christian blues
    # Only the 3 most common genres and the 4 most common styles are kept.
    genres = ', '.join(name for name, _ in genre_set.most_common(3))
    styles = ', '.join(name for name, _ in style_set.most_common(4))
    return ' / '.join((genres, styles))
Exemplo n.º 2
0
def _find_right_genre(json_doc, artist, album, persist_on_album):
    """
    Try to read the correct genre from the json document by discogs.

    Every entry of ``json_doc['results']`` that carries a ``style`` key and
    whose title has the form ``"<artist> - <album>"`` is compared against the
    given artist (and, optionally, album). The genres and styles of all
    matching entries are counted.

    :param json_doc: The decoded discogs search response (needs a ``results`` list).
    :param artist: a normalized artist
    :param album: a normalized album.
    :param persist_on_album: If False, other albums of this artist are valid sources too.
    :returns: A tuple of two :class:`collections.Counter` objects: the music
              genres (i.e. rock) and the styles (i.e. death metal).
    """
    genre_set, style_set = Counter(), Counter()

    # The normalizers do not depend on the item, so build them only once.
    artist_normalizer = ArtistNormalizeProvider()
    album_normalizer = AlbumNormalizeProvider()

    for item in json_doc['results']:
        # Some artist items have not a style in them.
        # Skip these items.
        if 'style' not in item:
            continue

        # Get the remote artist/album from the title. Titles that are not
        # of the form "artist - album" cannot be compared and are skipped
        # instead of raising on the unpacking.
        remote_artist, separator, remote_album = (
            item.get('title', '').partition(' - ')
        )
        if not separator:
            continue

        # Also normalise them, the same way the input was normalised.
        remote_artist, *_ = artist_normalizer.do_process(remote_artist)
        remote_album, *_ = album_normalizer.do_process(remote_album)

        # Try to outweigh spelling errors, or small
        # pre/suffixes to the artist. (i.e. 'the beatles' <-> 'beatles')
        # NOTE(review): levenshtein presumably returns a normalised
        # distance where 0 means identical -- confirm.
        if levenshtein(artist, remote_artist) > 0.5:
            continue

        # Same for the album:
        if persist_on_album and levenshtein(album, remote_album) > 0.5:
            continue

        # Remember all genres and styles; an item without a 'genre' key
        # simply contributes no genres.
        genre_set.update(item.get('genre', ()))
        style_set.update(item['style'])

    # The return values of the filters are ignored, so they presumably
    # modify the counters in place -- TODO confirm.
    _filter_spam(genre_set)
    _filter_spam(style_set)
    _filter_crosslinks(genre_set, style_set)
    return genre_set, style_set
Exemplo n.º 3
0
def _find_right_genre(json_doc, artist, album, persist_on_album):
    """
    Try to read the correct genre from the json document by discogs.

    Every entry of ``json_doc['results']`` that carries a ``style`` key and
    whose title has the form ``"<artist> - <album>"`` is compared against the
    given artist (and, optionally, album). The genres and styles of all
    matching entries are counted.

    :param json_doc: The decoded discogs search response (needs a ``results`` list).
    :param artist: a normalized artist
    :param album: a normalized album.
    :param persist_on_album: If False, other albums of this artist are valid sources too.
    :returns: A tuple of two :class:`collections.Counter` objects: the music
              genres (i.e. rock) and the styles (i.e. death metal).
    """
    genre_set, style_set = Counter(), Counter()

    # The normalizers do not depend on the item, so build them only once.
    artist_normalizer = ArtistNormalizeProvider()
    album_normalizer = AlbumNormalizeProvider()

    for item in json_doc['results']:
        # Some artist items have not a style in them.
        # Skip these items.
        if 'style' not in item:
            continue

        # Get the remote artist/album from the title. Titles that are not
        # of the form "artist - album" cannot be compared and are skipped
        # instead of raising on the unpacking.
        remote_artist, separator, remote_album = (
            item.get('title', '').partition(' - ')
        )
        if not separator:
            continue

        # Also normalise them, the same way the input was normalised.
        remote_artist, *_ = artist_normalizer.do_process(remote_artist)
        remote_album, *_ = album_normalizer.do_process(remote_album)

        # Try to outweigh spelling errors, or small
        # pre/suffixes to the artist. (i.e. 'the beatles' <-> 'beatles')
        # NOTE(review): levenshtein presumably returns a normalised
        # distance where 0 means identical -- confirm.
        if levenshtein(artist, remote_artist) > 0.5:
            continue

        # Same for the album:
        if persist_on_album and levenshtein(album, remote_album) > 0.5:
            continue

        # Remember all genres and styles; an item without a 'genre' key
        # simply contributes no genres.
        genre_set.update(item.get('genre', ()))
        style_set.update(item['style'])

    # The return values of the filters are ignored, so they presumably
    # modify the counters in place -- TODO confirm.
    _filter_spam(genre_set)
    _filter_spam(style_set)
    _filter_crosslinks(genre_set, style_set)
    return genre_set, style_set
Exemplo n.º 4
0
def find_genre_via_discogs(artist, album):
    """
    Try to find the genre from discogs.com using only the artist and album
    as base.

    The following strategy is taken:

        1) Try to find the artist/album combination using
           levenshtein fuzzy matching.
        2) If the exact combination was not found, the genre
           is taken from the other known albums of this artist.

    The resulting genre may not be very informative for humans, but is easily
    split into separate sub-genres by the GenreTreeProvider, which is good
    for comparisons.

    Future versions might include a version that tries to find a more
    human readable string.

    Example output: ::

        Genre: Non-Music; Folk, World, & Country; Stage & Screen / Comedy, Monolog, Spoken Word, Political

    .. note::

        Tip: Use :class:`munin.distance.GenreTreeAvgDistance` with this data.
        The normal :class:`munin.distance.GenreTreeProvider` uses Single Linkage,
        which may give too good distances often enough.

    .. note::

        NOTE(review): the example above separates genres with ``;`` and has a
        ``Genre:`` prefix, while this function joins genres with ``", "`` and
        adds no prefix -- confirm which format is intended.

    Errors raised while fetching or decoding the discogs response are not
    caught here and propagate to the caller.

    :param artist: The artist string to search for (gets normalised)
    :param album: The album string to search for (gets normalised)
    :returns: A genre string of the form ``genres / styles`` or None if
              neither a genre nor a style could be found.
    """
    # Get the data from discogs. The response is used as a context manager
    # so the underlying connection is closed even if reading/decoding fails.
    api_root = DISCOGS_API_SEARCH_URL.format(artist=quote(artist))
    with urlopen(api_root) as response:
        html_doc = response.read().decode('utf-8')
    json_doc = json.loads(html_doc)

    # Normalize the input artist/album; only the first element of each
    # normalizer result is used.
    artist, *_ = ArtistNormalizeProvider().do_process(artist)
    album, *_ = AlbumNormalizeProvider().do_process(album)

    # First pass: insist on a matching album.
    genre_set, style_set = _find_right_genre(json_doc, artist, album, True)
    if not (genre_set or style_set):
        # Lower the expectations, just take the genre of
        # all known albums of this artist, if any:
        genre_set, style_set = _find_right_genre(json_doc, artist, album, False)

    # Still not? Welp.
    if not (genre_set or style_set):
        return None

    # Build a genre string that is formatted this way:
    #  genre1, genre2 [,...] / style1, style2, style3 [,...]
    #  blues, rock / blues rock, country rock, christian blues
    # Only the 3 most common genres and the 4 most common styles are kept.
    genres = ', '.join(name for name, _ in genre_set.most_common(3))
    styles = ', '.join(name for name, _ in style_set.most_common(4))
    return ' / '.join((genres, styles))