Exemplo n.º 1
0
    def checkSanity(self):
        """Apply field-specific checks to eliminate bogus results.

        The current checks:
            - date is an int between 1600 and current year + 1
            - tracktotal is an int equal to the number of tracks in directory
            - tracknumbers are sequential ints -- none are missing or repeated

        A release without any tracks passes trivially: there is nothing to
        check.

        Possible checks:
            - master MB check (what exactly does this mean?)

        Raises:
            ReleaseManagerError: If any of the current checks fails.
        """

        # Check date is an int between 1600 and current year + 1.
        # (The range test itself is delegated to functions.isDate.)
        for track in self.release.tracks:
            try:
                year = int(track.metadata["date"])
            except ValueError:
                raise ReleaseManagerError("Year is not an integer.")
            if not functions.isDate(year):
                raise ReleaseManagerError(
                    "Year is not between 1600 and next year.")

        # Check tracktotal is equal to number of tracks in the directory.
        for track in self.release.tracks:
            try:
                tracktotal = int(track.metadata["tracktotal"])
            except ValueError:
                raise ReleaseManagerError("Tracktotal is not an integer.")
            if tracktotal != len(track.parent.tracks):
                err = (
                    "Tracktotal does not equal number of tracks in directory"
                    " indicating one or more tracks are missing.")
                raise ReleaseManagerError(err)

        # Check tracks are sequential -- none missing or repeated.
        tracknumbers = []
        for track in self.release.tracks:
            try:
                tracknumber = int(track.metadata["tracknumber"])
            except (KeyError, TypeError, ValueError):
                # Missing tag, non-string value or non-numeric text. Kept
                # narrower than a bare except so that KeyboardInterrupt and
                # SystemExit are not converted into a sanity failure.
                raise ReleaseManagerError(
                    "One or more tracknumber is not an integer.")
            if not functions.isTrackNumber(tracknumber):
                raise ReleaseManagerError(
                    "Track number is not between 1 and 99.")
            tracknumbers.append(tracknumber)

        if not tracknumbers:
            # No tracks at all: nothing can be missing or repeated, and the
            # tracknumbers[-1] lookup below would raise IndexError.
            return

        tracknumbers.sort()
        highest = tracknumbers[-1]

        # Every number from 1 up to the highest one seen must be present.
        present = set(tracknumbers)
        for expected in range(1, highest + 1):
            if expected not in present:
                s = "The release is missing one or more tracks. "
                s += "Track numbers: " + str(tracknumbers)
                raise ReleaseManagerError(s)

        # With no gaps, having more entries than the highest number means at
        # least one number occurs twice.
        if len(tracknumbers) > highest:
            # TODO: Choose the better version of the repeated track
            s = "The release has one or more repeated tracks. "
            s += "Track numbers: " + str(tracknumbers)
            raise ReleaseManagerError(s)
Exemplo n.º 2
0
    def checkSanity(self):
        """Apply field-specific checks to eliminate bogus results.

        The current checks:
            - date is an int between 1600 and current year + 1
            - tracktotal is an int equal to the number of tracks in directory
            - tracknumbers are sequential ints -- none are missing or repeated

        A release without any tracks passes trivially: there is nothing to
        check.

        Possible checks:
            - master MB check (what exactly does this mean?)

        Raises:
            ReleaseManagerError: If any of the current checks fails.
        """

        # Check date is an int between 1600 and current year + 1.
        # (The range test itself is delegated to functions.isDate.)
        for track in self.release.tracks:
            try:
                year = int(track.metadata["date"])
            except ValueError:
                raise ReleaseManagerError("Year is not an integer.")
            if not functions.isDate(year):
                raise ReleaseManagerError(
                    "Year is not between 1600 and next year.")

        # Check tracktotal is equal to number of tracks in the directory.
        for track in self.release.tracks:
            try:
                tracktotal = int(track.metadata["tracktotal"])
            except ValueError:
                raise ReleaseManagerError("Tracktotal is not an integer.")
            if tracktotal != len(track.parent.tracks):
                err = (
                    "Tracktotal does not equal number of tracks in directory"
                    " indicating one or more tracks are missing.")
                raise ReleaseManagerError(err)

        # Check tracks are sequential -- none missing or repeated.
        tracknumbers = []
        for track in self.release.tracks:
            try:
                tracknumber = int(track.metadata["tracknumber"])
            except (KeyError, TypeError, ValueError):
                # Missing tag, non-string value or non-numeric text. Kept
                # narrower than a bare except so that KeyboardInterrupt and
                # SystemExit are not converted into a sanity failure.
                raise ReleaseManagerError(
                    "One or more tracknumber is not an integer.")
            if not functions.isTrackNumber(tracknumber):
                raise ReleaseManagerError(
                    "Track number is not between 1 and 99.")
            tracknumbers.append(tracknumber)

        if not tracknumbers:
            # No tracks at all: nothing can be missing or repeated, and the
            # tracknumbers[-1] lookup below would raise IndexError.
            return

        tracknumbers.sort()
        highest = tracknumbers[-1]

        # Every number from 1 up to the highest one seen must be present.
        present = set(tracknumbers)
        for expected in range(1, highest + 1):
            if expected not in present:
                s = "The release is missing one or more tracks. "
                s += "Track numbers: " + str(tracknumbers)
                raise ReleaseManagerError(s)

        # With no gaps, having more entries than the highest number means at
        # least one number occurs twice.
        if len(tracknumbers) > highest:
            # TODO: Choose the better version of the repeated track
            s = "The release has one or more repeated tracks. "
            s += "Track numbers: " + str(tracknumbers)
            raise ReleaseManagerError(s)
Exemplo n.º 3
0
def findFuzzyMatch(field, match, track, preFilter, postFilter):
    """Fuzzily match unreliable data (from tags and filename) to MusicBrainz.

    Tags and filenames especially may contain special characters or extraneous
    data which will make a MusicBrainz search fail. This function removes
    special characters and, if the full string does not match, splits it
    based on a delimiters list and tries the substrings.

    Example:

    Filename: "2000 -*- The Better Life (Advance) -[[EAK-group]]-"
    Initial search for full string fails. String is broken into substrings.
    Substrings: "2000", "The Better Life", "Advance", "EAK", "group"
    Without any other filters "The Better Life" and "Advance" will both match
    and unable to choose one over the other, we will fail.
    With a filter (like the artist or date) then only "The Better Life" will
    match and the search will succeed.

    Fuzzy matching is only used for artist, release and title fields, because
    these are the only fields with strings to fuzzily match against.

    Args:
        field: Name of the field being looked up; expected to be "artist",
            "release" or "title".
        match: The unreliable string to search for. A FilepathString is
            first split into directory and file name and each part is tried.
        track: Track whose metadata supplies the already known artist,
            release and title values used to discard candidates that belong
            to a different field.
        preFilter: Passed through unchanged to executeQuery.
        postFilter: Passed through unchanged to executeQuery.

    Returns:
        The single query result that was selected, or u"" when nothing
        matched or the candidates could not be narrowed down to exactly one.
    """

    if isinstance(match, FilepathString):
        log("Splitting path into directory and file name, then trying each.")
        dirName, fileName = os.path.split(match)

        dirResult = executeQuery(field, dirName, preFilter, postFilter)
        if dirResult:
            # We won't look to see if the filename matches, because even if it
            # did, the directory generally has better odds of containing
            # an artist or release anyway. (We know we are looking for an
            # artist or release, because only requests for those fields pass in
            # a filepath. Track title requests just pass in the file name.)
            return dirResult

        fileResult = executeQuery(field, fileName, preFilter, postFilter)
        if fileResult:
            return fileResult

    else:
        result = executeQuery(field, match, preFilter, postFilter)
        if result:
            return result

    delimiters = r"[/()\-~+_\[\]\{\}*]"
    substrings = [part.strip() for part in re.split(delimiters, match)]
    substrings = [part for part in substrings if part]

    log("MB did not find a match for the full string.")
    log("Searching for a match in substrings.")
    log("Substrings: %s\n" % substrings)

    # Query every substring. whatFromWhere remembers which substring produced
    # each result; if two substrings yield the same result the later one wins.
    matches = set()
    whatFromWhere = {}
    for substring in substrings:
        result = executeQuery(field, substring, preFilter, postFilter)
        if result:
            whatFromWhere[result] = substring
            matches.add(result)

    def discardWhileAmbiguous(isBogus):
        """Drop candidates flagged by isBogus, but never the last one left."""
        for candidate in matches.copy():
            if len(matches) <= 1:
                break
            if isBogus(candidate):
                matches.remove(candidate)

    if len(matches) > 1:
        # If we have more than one result, attempt to remove matches which
        # probably are not correct until we have only one match left or we run
        # out of methods for removing bogus results.
        #
        # The current filters (in the order they are applied):
        #   - result looks like tracknumber or year
        #   - result is digits
        #   - substring was digits
        #   - result is (about) equal to, or contains, an already known
        #     artist, release or title
        #   - result is very different from substring

        # TODO:
        #   Order tests correctly.
        #   Use difflib in addition to aboutEqual.
        #   Use two levels of delimiters.
        #   Add filter to remove results which are (about) equal to one another.
        #
        #   Order #1 (filter all results, filter all substring)
        #   - result looks like tracknumber or year
        #   - result is digits
        #   - result is (about) equal to already known artist, release or title
        #
        #   - substring looked like tracknumber or year
        #   - substring was digits
        #   - substring was (about) equal artist, release, title
        #
        #   Order #2 (filter result then substring, then next filter)
        #   - result looks like tracknumber or year
        #   - substring looked like tracknumber or year
        #
        #   - result is digits
        #   - substring was digits
        #
        #   - result is (about) equal to already known artist, release or title
        #   - substring was ... artist, release, title

        log("Multiple substrings matched: %s" % matches)
        log("Removing matches which are probably wrong.")

        # Remove matches which are either a tracknumber or a year.
        # Tracknumbers are identified by being one or two digits (possibly with
        # leading zero) under 99.
        # Years are four consecutive digits between 1600 and current year + 1.
        def looksLikeTrackNumberOrYear(candidate):
            if not candidate.isdigit():
                return False
            num = int(candidate)
            return functions.isTrackNumber(num) or functions.isDate(num)

        discardWhileAmbiguous(looksLikeTrackNumberOrYear)

        # Remove matches which are just digits.
        discardWhileAmbiguous(lambda candidate: candidate.isdigit())

        # Remove results which came from strings of digits.
        discardWhileAmbiguous(
            lambda candidate: whatFromWhere[candidate].isdigit())

        # If we still have more than one result, than we will remove values that
        # are known to be correct for a different field. In particular, we'll
        # look at the artist, album and title fields and remove matches
        # equivalent to those fields - in that order.
        #
        # The loop variable must not be called "field": rebinding the
        # parameter here would make the success log below report the wrong
        # field. Empty values are skipped because an empty string is contained
        # in every candidate and would prune candidates arbitrarily.
        relatedData = []
        for relatedField in ("artist", "release", "title"):
            if relatedField == field:
                continue
            if relatedField in track.metadata and track.metadata[relatedField]:
                relatedData.append(track.metadata[relatedField])

        # Remove matches which are the same as the already known artist,
        # release or title intelligently.
        def equal(candidate, datum):
            return candidate == datum

        def inside(candidate, datum):
            return datum.lower() in candidate.lower()

        for datum in relatedData:
            for equivalenceFunc in (equal, aboutEqual, inside):
                discardWhileAmbiguous(
                    lambda candidate: equivalenceFunc(candidate, datum))

        # Remove matches which are signficantly different than the substring
        # they came from. get_close_matches returns an empty list when the
        # similarity falls below its default cutoff.
        discardWhileAmbiguous(
            lambda candidate: not difflib.get_close_matches(
                whatFromWhere[candidate], [candidate]))

    if len(matches) == 1:
        bestMatch = matches.pop()
        log("MB matched a string to a %s: %s" % (field, quote(bestMatch)))
        return bestMatch

    log("%d substrings matched." % len(matches))
    if matches:
        log("Unable to select between them.")
        log("Filtered matches: %s" % matches)
    log("Fuzzy matching failed.")
    return u""
Exemplo n.º 4
0
def findFuzzyMatch(field, match, track, preFilter, postFilter):
    """Fuzzily match unreliable data (from tags and filename) to MusicBrainz.

    Tags and filenames especially may contain special characters or extraneous
    data which will make a MusicBrainz search fail. This function removes
    special characters and, if the full string does not match, splits it
    based on a delimiters list and tries the substrings.

    Example:

    Filename: "2000 -*- The Better Life (Advance) -[[EAK-group]]-"
    Initial search for full string fails. String is broken into substrings.
    Substrings: "2000", "The Better Life", "Advance", "EAK", "group"
    Without any other filters "The Better Life" and "Advance" will both match
    and unable to choose one over the other, we will fail.
    With a filter (like the artist or date) then only "The Better Life" will
    match and the search will succeed.

    Fuzzy matching is only used for artist, release and title fields, because
    these are the only fields with strings to fuzzily match against.

    Args:
        field: Name of the field being looked up; expected to be "artist",
            "release" or "title".
        match: The unreliable string to search for. A FilepathString is
            first split into directory and file name and each part is tried.
        track: Track whose metadata supplies the already known artist,
            release and title values used to discard candidates that belong
            to a different field.
        preFilter: Passed through unchanged to executeQuery.
        postFilter: Passed through unchanged to executeQuery.

    Returns:
        The single query result that was selected, or u"" when nothing
        matched or the candidates could not be narrowed down to exactly one.
    """

    if isinstance(match, FilepathString):
        log("Splitting path into directory and file name, then trying each.")
        dirName, fileName = os.path.split(match)

        dirResult = executeQuery(field, dirName, preFilter, postFilter)
        if dirResult:
            # We won't look to see if the filename matches, because even if it
            # did, the directory generally has better odds of containing
            # an artist or release anyway. (We know we are looking for an
            # artist or release, because only requests for those fields pass in
            # a filepath. Track title requests just pass in the file name.)
            return dirResult

        fileResult = executeQuery(field, fileName, preFilter, postFilter)
        if fileResult:
            return fileResult

    else:
        result = executeQuery(field, match, preFilter, postFilter)
        if result:
            return result

    delimiters = r"[/()\-~+_\[\]\{\}*]"
    substrings = [part.strip() for part in re.split(delimiters, match)]
    substrings = [part for part in substrings if part]

    log("MB did not find a match for the full string.")
    log("Searching for a match in substrings.")
    log("Substrings: %s\n" % substrings)

    # Query every substring. whatFromWhere remembers which substring produced
    # each result; if two substrings yield the same result the later one wins.
    matches = set()
    whatFromWhere = {}
    for substring in substrings:
        result = executeQuery(field, substring, preFilter, postFilter)
        if result:
            whatFromWhere[result] = substring
            matches.add(result)

    def discardWhileAmbiguous(isBogus):
        """Drop candidates flagged by isBogus, but never the last one left."""
        for candidate in matches.copy():
            if len(matches) <= 1:
                break
            if isBogus(candidate):
                matches.remove(candidate)

    if len(matches) > 1:
        # If we have more than one result, attempt to remove matches which
        # probably are not correct until we have only one match left or we run
        # out of methods for removing bogus results.
        #
        # The current filters (in the order they are applied):
        #   - result looks like tracknumber or year
        #   - result is digits
        #   - substring was digits
        #   - result is (about) equal to, or contains, an already known
        #     artist, release or title
        #   - result is very different from substring

        # TODO:
        #   Order tests correctly.
        #   Use difflib in addition to aboutEqual.
        #   Use two levels of delimiters.
        #   Add filter to remove results which are (about) equal to one another.
        #
        #   Order #1 (filter all results, filter all substring)
        #   - result looks like tracknumber or year
        #   - result is digits
        #   - result is (about) equal to already known artist, release or title
        #
        #   - substring looked like tracknumber or year
        #   - substring was digits
        #   - substring was (about) equal artist, release, title
        #
        #   Order #2 (filter result then substring, then next filter)
        #   - result looks like tracknumber or year
        #   - substring looked like tracknumber or year
        #
        #   - result is digits
        #   - substring was digits
        #
        #   - result is (about) equal to already known artist, release or title
        #   - substring was ... artist, release, title

        log("Multiple substrings matched: %s" % matches)
        log("Removing matches which are probably wrong.")

        # Remove matches which are either a tracknumber or a year.
        # Tracknumbers are identified by being one or two digits (possibly with
        # leading zero) under 99.
        # Years are four consecutive digits between 1600 and current year + 1.
        def looksLikeTrackNumberOrYear(candidate):
            if not candidate.isdigit():
                return False
            num = int(candidate)
            return functions.isTrackNumber(num) or functions.isDate(num)

        discardWhileAmbiguous(looksLikeTrackNumberOrYear)

        # Remove matches which are just digits.
        discardWhileAmbiguous(lambda candidate: candidate.isdigit())

        # Remove results which came from strings of digits.
        discardWhileAmbiguous(
            lambda candidate: whatFromWhere[candidate].isdigit())

        # If we still have more than one result, than we will remove values that
        # are known to be correct for a different field. In particular, we'll
        # look at the artist, album and title fields and remove matches
        # equivalent to those fields - in that order.
        #
        # The loop variable must not be called "field": rebinding the
        # parameter here would make the success log below report the wrong
        # field. Empty values are skipped because an empty string is contained
        # in every candidate and would prune candidates arbitrarily.
        relatedData = []
        for relatedField in ("artist", "release", "title"):
            if relatedField == field:
                continue
            if relatedField in track.metadata and track.metadata[relatedField]:
                relatedData.append(track.metadata[relatedField])

        # Remove matches which are the same as the already known artist,
        # release or title intelligently.
        def equal(candidate, datum):
            return candidate == datum

        def inside(candidate, datum):
            return datum.lower() in candidate.lower()

        for datum in relatedData:
            for equivalenceFunc in (equal, aboutEqual, inside):
                discardWhileAmbiguous(
                    lambda candidate: equivalenceFunc(candidate, datum))

        # Remove matches which are signficantly different than the substring
        # they came from. get_close_matches returns an empty list when the
        # similarity falls below its default cutoff.
        discardWhileAmbiguous(
            lambda candidate: not difflib.get_close_matches(
                whatFromWhere[candidate], [candidate]))

    if len(matches) == 1:
        bestMatch = matches.pop()
        log("MB matched a string to a %s: %s" % (field, quote(bestMatch)))
        return bestMatch

    log("%d substrings matched." % len(matches))
    if matches:
        log("Unable to select between them.")
        log("Filtered matches: %s" % matches)
    log("Fuzzy matching failed.")
    return u""