Example #1
0
    def update_clusters(self):
        """
        Place every term from ``self.all_terms`` into ``self.clusters``.

        A term already present in some cluster is skipped.  Otherwise its
        mean string similarity to each existing cluster is computed; if the
        best score is below ``self.similarity_threshold`` the term seeds a
        new single-member cluster, else it joins the closest cluster.
        Progress is echoed to stdout and appended to ``self.log_file``.

        Mutates ``self.clusters`` in place; returns None.
        """
        if not self.all_terms:
            print("Calling get_terms function to find all the terms.")
            self.get_terms()

        # list(...) is required on Python 3: dict.values() is a view,
        # not a sequence that np.concatenate accepts.
        existing_terms = np.concatenate(list(self.clusters.values()))
        updated_clusters = []
        for term in self.all_terms:
            if term in existing_terms:
                print(term, " exists in our catalogue\n")
                self.log_file.write(term + " exists in our catalogue\n")
                continue

            # find similarities of a new term to each of the existing clusters
            similarities = []
            for key, values in self.clusters.items():
                score = sum(sd.similarity(term, existing_term)
                            for existing_term in values)
                # float() guards against integer division if sd.similarity
                # ever returns ints — TODO confirm its return type.
                score /= float(len(values))
                similarities.append([key, score])
            similarities = np.array(similarities)

            tmp = similarities[:, 1].astype("float")
            max_index = np.argmax(tmp)
            # name of the nearest existing cluster (hoisted: both branches use it)
            cluster_name = similarities[max_index][0]
            if tmp[max_index] < self.similarity_threshold:
                # nearest cluster is not close enough: start a new one
                print("Added new cluster for", term, "(Nearest cluster was ", cluster_name, ") \n")
                self.log_file.write("Added new cluster for " + term +
                                    " (Nearest cluster was " + cluster_name + ") \n")
                self.clusters[term] = [term]
                updated_clusters.append(term)
            else:
                self.clusters[cluster_name].append(term)
                print(term, "is now part of ", cluster_name, "cluster with", str(tmp[max_index]), "confidence\n")
                self.log_file.write(term + " is now part of " + cluster_name +
                                    " cluster with " + str(tmp[max_index]) + " confidence\n")
                updated_clusters.append(cluster_name)

        updated_clusters = np.unique(updated_clusters)
        self.log_file.write("The following are the updated clusters:\n")
        for name in updated_clusters:
            self.log_file.write(name + "\n")
Example #2
0
File: utils.py  Project: EQ4/pycompmusic
def updateTaalaClusters(taalaClusters, newterms=None, mbids=None, simThresh=0.6):
    """
    Sync new taala terms into taalaClusters.

    newterms: the new taala terms which need to be synced with taalaClusters;
    if empty/None, the function expects a valid mbids list.
    mbids: Those mbids which do not already have a key at taalaMBIDs.yaml
    taalaClusters: The clusters discovered using string matching function, and
    manually corrected. It is a dictionary with keys as actual taala names and
    values as all the possible spelling variations (including the key!).
    simThresh: minimum best similarity for a term to join an existing cluster;
    below it the term starts a cluster of its own.

    returns just the updated taalaClusters (untouched clusters are removed
    from the dict, which is mutated in place); returns None if neither
    newterms nor mbids is given. Check them and sync with the taalaClusters
    already stored on filesystem.

    Once you get updatedClusters, write them to yaml file, make corrections,
    reload it to a dictionary, and call mergeClusters(updatedClusters,
    allClusters)
    """

    unwantedChars = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "(",
                     ")", "[", "]", ".", "-", " "]
    # Defaults are None (not []) and the list is copied: the old mutable
    # defaults were appended to in the mbids branch, leaking terms across
    # calls.
    newterms = [] if newterms is None else list(newterms)
    if not newterms:
        if not mbids:
            print("Either newterms or mbids must be a valid non-empty list!")
            return
        for mbid in mbids:
            newterm = getTag(mbid, tag="taala")
            if newterm:
                newterms.append(newterm)
    newterms = np.unique(newterms)

    # list(...) because dict.values() is a view on Python 3
    oldterms = np.concatenate(list(taalaClusters.values()))
    updatedClusters = []
    for newterm in newterms:
        if newterm in oldterms:
            print(newterm, "exists in our cluster\n")
            continue

        # best (max) similarity of the new term to each existing cluster,
        # computed on char-stripped spellings
        similarities = []
        for key, values in taalaClusters.items():
            best = max(sd.similarity(sd.stripChars(term, unwantedChars),
                                     sd.stripChars(newterm, unwantedChars))
                       for term in values)
            similarities.append([key, best])
        similarities = np.array(similarities)

        tmp = similarities[:, 1].astype("float")
        maxIndex = np.argmax(tmp)
        # nearest existing cluster (hoisted: both branches use it)
        taala = similarities[maxIndex][0]
        if tmp[maxIndex] < simThresh:
            print("Added new cluster for", newterm, "(Nearest cluster was",
                  taala, " with", tmp[maxIndex], "confidence)\n")
            taalaClusters[newterm] = [newterm]
            updatedClusters.append(newterm)
        else:
            taalaClusters[taala].append(newterm)
            print(newterm, "is now part of", taala, "cluster with",
                  tmp[maxIndex], "confidence\n")
            updatedClusters.append(taala)

    # keep only the clusters that were touched; list(...) lets us pop
    # while iterating (mutating a dict during direct iteration raises)
    updatedClusters = np.unique(updatedClusters)
    for key in list(taalaClusters.keys()):
        if key not in updatedClusters:
            taalaClusters.pop(key)

    return taalaClusters
Example #3
0
File: utils.py  Project: EQ4/pycompmusic
def updateRaagaClusters(raagaClusters, newterms=None, mbids=None, simThresh=0.6):
    """
    Sync new raaga terms into raagaClusters.

    newterms: the new raaga terms which need to be synced with raagaClusters;
    if empty/None, the function expects a valid mbids list.
    mbids: Those mbids which do not already have a key at raagaMBIDs.yaml
    raagaClusters: The clusters discovered using string matching function, and
    manually corrected. It is a dictionary with keys as actual raaga names and
    values as all the possible spelling variations (including the key!).
    simThresh: minimum mean similarity for a term to join an existing cluster;
    below it the term starts a cluster of its own.

    returns just the updated raagaClusters (untouched clusters are removed
    from the dict, which is mutated in place); returns None if neither
    newterms nor mbids is given. Check them and sync with the raagaClusters
    already stored on filesystem.

    Once you get updatedClusters, write them to yaml file, make corrections,
    reload it to a dictionary, and call mergeClusters(updatedClusters,
    allClusters)
    """
    # Defaults are None (not []) and the list is copied: the old mutable
    # defaults were appended to in the mbids branch, leaking terms across
    # calls.
    newterms = [] if newterms is None else list(newterms)
    if not newterms:
        if not mbids:
            print("Either newterms or mbids must be a valid non-empty list!")
            return
        for mbid in mbids:
            newterm = getTag(mbid, tag="raaga")
            if newterm:
                newterms.append(newterm)
        # tag values carry numbering/bracket noise; only mbid-derived
        # terms need this cleanup
        newterms = [i.strip("0123456789()[]-") for i in newterms]
    # dedupe unconditionally (previously only mbid-derived terms were
    # deduped, so duplicate user-supplied terms were processed twice;
    # the taala variant already dedupes all terms)
    newterms = np.unique(newterms)

    # list(...) because dict.values() is a view on Python 3
    oldterms = np.concatenate(list(raagaClusters.values()))
    updatedClusters = []
    for newterm in newterms:
        if newterm in oldterms:
            print(newterm, "exists in our cluster\n")
            continue

        # mean similarity of the new term to each existing cluster
        similarities = []
        for key, values in raagaClusters.items():
            score = sum(sd.similarity(term, newterm) for term in values)
            # float() guards against integer division if sd.similarity
            # ever returns ints — TODO confirm its return type.
            score /= float(len(values))
            similarities.append([key, score])
        similarities = np.array(similarities)

        tmp = similarities[:, 1].astype("float")
        maxIndex = np.argmax(tmp)
        # nearest existing cluster (hoisted: both branches use it)
        raaga = similarities[maxIndex][0]
        if tmp[maxIndex] < simThresh:
            print("Added new cluster for", newterm, "(Nearest cluster was",
                  raaga, ") \n")
            raagaClusters[newterm] = [newterm]
            updatedClusters.append(newterm)
        else:
            raagaClusters[raaga].append(newterm)
            print(newterm, "is now part of", raaga, "cluster with",
                  tmp[maxIndex], "confidence\n")
            updatedClusters.append(raaga)

    # keep only the clusters that were touched; list(...) lets us pop
    # while iterating (mutating a dict during direct iteration raises)
    updatedClusters = np.unique(updatedClusters)
    for key in list(raagaClusters.keys()):
        if key not in updatedClusters:
            raagaClusters.pop(key)

    return raagaClusters