import numpy as np

# `sd` below is the project's string-similarity helper (it provides
# sd.similarity and, for the taala functions, sd.stripChars); getTag and
# mergeClusters are other project helpers referenced further down.


def update_clusters(self):
    """
    Assign every term in self.all_terms to its nearest existing cluster in
    self.clusters, or open a new cluster for it when no cluster is similar
    enough (self.similarity_threshold). Every decision is written to
    self.log_file.
    """
    if len(self.all_terms) == 0:
        print("Calling get_terms function to find all the terms.")
        self.get_terms()

    existing_terms = np.concatenate(list(self.clusters.values()))
    updated_clusters = []
    for term in self.all_terms:
        if term in existing_terms:
            print(term, "exists in our catalogue\n")
            self.log_file.write(term + " exists in our catalogue\n")
            continue

        # Average similarity of the new term to each of the existing clusters.
        similarities = []
        for key, values in self.clusters.items():
            s = 0
            for existing_term in values:
                s += sd.similarity(term, existing_term)
            s /= len(values)
            similarities.append([key, s])

        similarities = np.array(similarities)
        tmp = similarities[:, 1].astype("float")
        max_index = np.argmax(tmp)
        if tmp[max_index] < self.similarity_threshold:
            # No cluster is similar enough: the term starts a cluster of its own.
            cluster_name = similarities[max_index][0]
            print("Added new cluster for", term, "(nearest cluster was", cluster_name, ")\n")
            self.log_file.write("Added new cluster for " + term +
                                " (nearest cluster was " + cluster_name + ")\n")
            self.clusters[term] = [term]
            updated_clusters.append(term)
        else:
            # Attach the term to the most similar existing cluster.
            cluster_name = similarities[max_index][0]
            self.clusters[cluster_name].append(term)
            print(term, "is now part of", cluster_name, "cluster with",
                  tmp[max_index], "confidence\n")
            self.log_file.write(term + " is now part of " + cluster_name +
                                " cluster with " + str(tmp[max_index]) + " confidence\n")
            updated_clusters.append(cluster_name)

    updated_clusters = np.unique(updated_clusters)
    self.log_file.write("The following are the updated clusters:\n")
    for cluster in updated_clusters:
        self.log_file.write(cluster + "\n")
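# A minimal usage sketch for update_clusters(). The owning class name
# (TermClusterer) and its constructor are hypothetical; all_terms, clusters,
# log_file, similarity_threshold and get_terms() are the attributes and
# method the code above actually relies on.
#
#   clusterer = TermClusterer(similarity_threshold=0.6)  # hypothetical class
#   clusterer.get_terms()        # populates clusterer.all_terms
#   clusterer.update_clusters()  # files each new term under its nearest
#                                # cluster, or opens a new one
#   clusterer.log_file.close()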
def updateTaalaClusters(taalaClusters, newterms=None, mbids=None, simThresh=0.6):
    """
    newterms: the new taala terms which need to be synced with taalaClusters;
    if this is empty, the function expects a valid mbids list.

    mbids: those mbids which do not already have a key in taalaMBIDs.yaml.

    taalaClusters: the clusters discovered using the string matching function
    and manually corrected. It is a dictionary with keys as actual taala names
    and values as all the possible spelling variations (including the key!).

    Returns just the updated taalaClusters. Check them and sync with the
    taalaClusters already stored on the filesystem: once you get
    updatedClusters, write them to a yaml file, make corrections, reload them
    into a dictionary, and call mergeClusters(updatedClusters, allClusters).
    """
    # Avoid mutable default arguments: work on local copies instead.
    newterms = list(newterms) if newterms else []
    mbids = list(mbids) if mbids else []

    unwantedChars = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                     "(", ")", "[", "]", ".", "-", " "]
    if newterms == []:
        if mbids == []:
            print("Either newterms or mbids must be a valid non-empty list!")
            return
        for mbid in mbids:
            newterm = getTag(mbid, tag="taala")
            if newterm:
                newterms.append(newterm)
    newterms = np.unique(newterms)

    oldterms = np.concatenate(list(taalaClusters.values()))
    updatedClusters = []
    for newterm in newterms:
        if newterm in oldterms:
            print(newterm, "exists in our cluster\n")
            continue

        # Best similarity of the new taala term to each of the existing
        # clusters, comparing the stripped-down forms of the terms.
        similarities = []
        for key, values in taalaClusters.items():
            s = []
            for term in values:
                s.append(sd.similarity(sd.stripChars(term, unwantedChars),
                                       sd.stripChars(newterm, unwantedChars)))
            s = max(s)
            similarities.append([key, s])

        similarities = np.array(similarities)
        tmp = similarities[:, 1].astype("float")
        maxIndex = np.argmax(tmp)
        if tmp[maxIndex] < simThresh:
            taala = similarities[maxIndex][0]
            print("Added new cluster for", newterm, "(nearest cluster was",
                  taala, "with", tmp[maxIndex], "confidence)\n")
            taalaClusters[newterm] = [newterm]
            updatedClusters.append(newterm)
        else:
            taala = similarities[maxIndex][0]
            taalaClusters[taala].append(newterm)
            print(newterm, "is now part of", taala, "cluster with",
                  tmp[maxIndex], "confidence\n")
            updatedClusters.append(taala)

    updatedClusters = np.unique(updatedClusters)
    # Keep only the clusters that were actually touched in this run.
    for key in list(taalaClusters.keys()):
        if key not in updatedClusters:
            taalaClusters.pop(key)
    return taalaClusters
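# The docstring above describes a round trip through yaml before merging.
# A minimal sketch of that workflow; the file names and pendingMBIDs are
# assumptions, while getTag and mergeClusters are the helpers the docstring
# refers to.
#
#   import yaml
#
#   clusters = yaml.safe_load(open("taalaClusters.yaml"))            # assumed file name
#   updated = updateTaalaClusters(clusters, mbids=pendingMBIDs)      # mbids of new recordings
#   yaml.safe_dump(updated, open("updatedTaalaClusters.yaml", "w"))  # write, correct by hand, ...
#   corrected = yaml.safe_load(open("updatedTaalaClusters.yaml"))    # ... and reload
#   allClusters = yaml.safe_load(open("taalaClusters.yaml"))         # the full catalogue on disk
#   mergeClusters(corrected, allClusters)                            # merge as the docstring suggests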
def updateRaagaClusters(raagaClusters, newterms=None, mbids=None, simThresh=0.6):
    """
    newterms: the new raaga terms which need to be synced with raagaClusters;
    if this is empty, the function expects a valid mbids list.

    mbids: those mbids which do not already have a key in raagaMBIDs.yaml.

    raagaClusters: the clusters discovered using the string matching function
    and manually corrected. It is a dictionary with keys as actual raaga names
    and values as all the possible spelling variations (including the key!).

    Returns just the updated raagaClusters. Check them and sync with the
    raagaClusters already stored on the filesystem: once you get
    updatedClusters, write them to a yaml file, make corrections, reload them
    into a dictionary, and call mergeClusters(updatedClusters, allClusters).
    """
    # Avoid mutable default arguments: work on local copies instead.
    newterms = list(newterms) if newterms else []
    mbids = list(mbids) if mbids else []

    if newterms == []:
        if mbids == []:
            print("Either newterms or mbids must be a valid non-empty list!")
            return
        for mbid in mbids:
            newterm = getTag(mbid, tag="raaga")
            if newterm:
                newterms.append(newterm)
    newterms = [i.strip("0123456789()[]-") for i in newterms]
    newterms = np.unique(newterms)

    oldterms = np.concatenate(list(raagaClusters.values()))
    updatedClusters = []
    for newterm in newterms:
        if newterm in oldterms:
            print(newterm, "exists in our cluster\n")
            continue

        # Average similarity of the new raaga term to each of the existing clusters.
        similarities = []
        for key, values in raagaClusters.items():
            s = 0
            for term in values:
                s += sd.similarity(term, newterm)
            s = s / len(values)
            similarities.append([key, s])

        similarities = np.array(similarities)
        tmp = similarities[:, 1].astype("float")
        maxIndex = np.argmax(tmp)
        if tmp[maxIndex] < simThresh:
            raaga = similarities[maxIndex][0]
            print("Added new cluster for", newterm, "(nearest cluster was", raaga, ")\n")
            raagaClusters[newterm] = [newterm]
            updatedClusters.append(newterm)
        else:
            raaga = similarities[maxIndex][0]
            raagaClusters[raaga].append(newterm)
            print(newterm, "is now part of", raaga, "cluster with",
                  tmp[maxIndex], "confidence\n")
            updatedClusters.append(raaga)

    updatedClusters = np.unique(updatedClusters)
    # Keep only the clusters that were actually touched in this run.
    for key in list(raagaClusters.keys()):
        if key not in updatedClusters:
            raagaClusters.pop(key)
    return raagaClusters
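# The raaga variant follows the same yaml round trip; a short sketch of
# driving it with raw tag strings instead of mbids (the spellings below and
# the file names are made-up examples):
#
#   raagaClusters = yaml.safe_load(open("raagaClusters.yaml"))
#   updated = updateRaagaClusters(raagaClusters,
#                                 newterms=["Shankarabharanam", "sankarabharana (29)"])
#   yaml.safe_dump(updated, open("updatedRaagaClusters.yaml", "w"))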