import sys

import pandas as pd
from similarity.normalized_levenshtein import NormalizedLevenshtein
from sklearn.cluster import AgglomerativeClustering


def main():
    if len(sys.argv) != 2:
        print("Usage: python ML-Agglomerative.py path-name")
        sys.exit()
    path = sys.argv[1]  # e.g. "../ames/test.csv"
    df = pd.read_csv(path)

    # Pairwise normalized Levenshtein distances between column names.
    names = df.columns.values
    NL = NormalizedLevenshtein()
    vectors = pd.DataFrame([[NL.distance(i, j) for j in names] for i in names])

    # Ward linkage implies Euclidean distances between rows of `vectors`;
    # the deprecated `affinity` and `pooling_func` kwargs are dropped.
    clusters = 5
    agg = AgglomerativeClustering(n_clusters=clusters, linkage='ward').fit(vectors)
    for i in range(clusters):
        print("Cluster " + str(i) + ":")
        print(pd.Series(names[agg.labels_ == i]))
        print('\n')
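# A minimal entry point so the script runs directly from the command line,
# e.g. `python ML-Agglomerative.py ../ames/test.csv`.
if __name__ == '__main__':
    main()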
from similarity.normalized_levenshtein import NormalizedLevenshtein


def simDif(w1, w2):
    normalized_levenshtein = NormalizedLevenshtein()
    dis = normalized_levenshtein.distance(w1, w2)
    sim = normalized_levenshtein.similarity(w1, w2)
    print('distance: ' + str(dis) + ' similarity: ' + str(sim))
    # This line prints a separator after comparing a segment of the fixed
    # text against each segment of the active text.
    print('----------------------------')
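# A quick sanity check with two illustrative words (not from the original).
# NormalizedLevenshtein guarantees distance + similarity == 1: the edit
# distance between 'kitten' and 'sitting' is 3 over a max length of 7, so
# this prints distance ≈ 0.43 and similarity ≈ 0.57.
simDif('kitten', 'sitting')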
import torch
from similarity.normalized_levenshtein import NormalizedLevenshtein

import enchant  # pyenchant, used by similarity_plus below
import utils    # project-local helpers (provides clear())


def similarity(outputs_batch, labels_batch, dic):
    """Average normalized-Levenshtein similarity between decoded predictions
    and their labels; `dic` maps class indices to characters."""
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)  # joins/cleans the decoded characters
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
def similarity_plus(outputs_batch, labels_batch, dic):
    """Like `similarity`, but spell-corrects each prediction with pyenchant
    before scoring."""
    d = enchant.Dict("en_US")
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        if not d.check(pred):
            suggestions = d.suggest(pred)  # suggest() returns a list of words
            if suggestions:
                pred = suggestions[0]      # keep the top suggestion
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
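# A hedged usage sketch: `similarity` expects raw model outputs of shape
# (sequence_length, batch_size, num_classes). The dictionary and labels below
# are made-up placeholders, and the call assumes the project-local
# utils.clear() helper is importable.
dic = {0: 'a', 1: 'b', 2: 'c'}
outputs = torch.randn(4, 2, len(dic))  # seq_len=4, batch=2, 3 classes
labels = ['abca', 'bcab']
print(similarity(outputs, labels, dic))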
import json
import uuid

import numpy as np
import pandas as pd
from similarity.normalized_levenshtein import NormalizedLevenshtein
from sklearn.cluster import KMeans


def data_classifier(arr, sources):
    NL = NormalizedLevenshtein()
    vectors = pd.DataFrame([[NL.distance(i, j) for j in arr] for i in arr])
    clusters = int(len(arr) ** .5)
    if clusters <= 1:
        clusters = 2
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(vectors)
    field_ids = ["field-" + str(uuid.uuid4()) for field in arr]

    # Distance of each field's profile to every cluster centre
    # (`dist` is the helper sketched below).
    uncert = pd.DataFrame()
    for i in range(clusters):
        uncert[i] = vectors.apply(dist, args=(kmeans.cluster_centers_[i],), axis=0)
    uncert.index = field_ids

    classifications_obj = {}
    fields_obj = {}
    for i in range(clusters):
        cluster = {}
        fields = pd.Series(field_ids)[kmeans.labels_ == i]
        for field_id in fields:
            cluster[field_id] = uncert.loc[field_id][i]
            idx = field_ids.index(field_id)
            fields_obj[field_id] = {"name": arr[idx], "source": sources[idx]}
        cid_obj = {}
        cid_obj["name"] = "classification" + str(i)
        cid_obj["metadata"] = None
        cid_obj["values"] = cluster
        cid_obj["distribution"] = np.array(list(cluster.values())).mean()
        classifications_obj["classification-" + str(uuid.uuid4())] = cid_obj

    data = {}
    data["Classifications"] = classifications_obj
    data["Fields"] = fields_obj
    return json.dumps(data, sort_keys=True, indent=4)
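# `dist` is not defined in this snippet; a minimal sketch, assuming it is the
# Euclidean distance between one field's distance profile and a k-means centre:
def dist(column, center):
    return np.linalg.norm(np.asarray(column) - np.asarray(center))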
# Class-method excerpt. Requires jieba plus the string-similarity metric
# classes (Cosine, Damerau, Jaccard, JaroWinkler, Levenshtein,
# LongestCommonSubsequence, MetricLCS, NGram, NormalizedLevenshtein,
# OptimalStringAlignment, QGram, SorensenDice, WeightedLevenshtein) and the
# project's CharSub substitution-cost class; `read_from` and `folder_path`
# are defined elsewhere in the class/module.
def similarity(self, question, answer):
    # Load the SSE-specific stopword list ('上证专用停用词.txt' =
    # "SSE-specific stopwords") and strip newlines/spaces from each entry.
    stopword = self.read_from(folder_path + '上证专用停用词.txt')
    stopwords = []
    for sw in stopword:
        stopwords.append(sw.strip('\n').strip(' '))

    # Tokenize both texts with jieba and drop stopwords.
    meaningful_words1 = []
    meaningful_words2 = []
    for word in jieba.cut(str(question)):
        if word not in stopwords:
            meaningful_words1.append(word)
    for word in jieba.cut(str(answer)):
        if word not in stopwords:
            meaningful_words2.append(word)
    s2 = ''.join(meaningful_words1)
    s3 = ''.join(meaningful_words2)

    # One instance of each metric; CharSub supplies the character
    # substitution costs for WeightedLevenshtein.
    a1 = Cosine(1)
    b1 = Damerau()
    c1 = Jaccard(1)
    d1 = JaroWinkler()
    e1 = Levenshtein()
    f1 = LongestCommonSubsequence()
    g1 = MetricLCS()
    h1 = NGram(2)
    i1 = NormalizedLevenshtein()
    j1 = OptimalStringAlignment()
    k1 = QGram(1)
    l1 = SorensenDice(2)
    m1 = WeightedLevenshtein(character_substitution=CharSub())

    # 18 scores in a fixed order: cosine sim/dist, Damerau, Jaccard dist/sim,
    # Jaro-Winkler dist/sim, Levenshtein, LCS, MetricLCS, NGram, normalized
    # Levenshtein dist/sim, OSA, QGram, Sorensen-Dice dist/sim, weighted
    # Levenshtein distance.
    calls = [
        a1.similarity, a1.distance, b1.distance, c1.distance, c1.similarity,
        d1.distance, d1.similarity, e1.distance, f1.distance, g1.distance,
        h1.distance, i1.distance, i1.similarity, j1.distance, k1.distance,
        l1.distance, l1.similarity, m1.distance,
    ]
    line_sim = [fn(s2, s3) for fn in calls]
    return line_sim
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.jaccard import Jaccard

s1 = '中华人民共和国'  # "People's Republic of China"
s2 = '中国'            # "China"

normalized_levenshtein = NormalizedLevenshtein()
print('Normalized Levenshtein: ', normalized_levenshtein.distance(s1, s2))

jaccard_distance = Jaccard(1)  # unigram shingles
print('Jaccard: ', jaccard_distance.distance(s1, s2))
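# Worked by hand: s1 has 7 characters, s2 has 2, and turning s1 into s2 takes
# 5 deletions, so the normalized Levenshtein distance is 5/7 ≈ 0.714. With
# unigram shingles the Jaccard distance is 1 - 2/7, which is also 5/7 here.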
import csv
import json

from similarity.jarowinkler import JaroWinkler
from similarity.normalized_levenshtein import NormalizedLevenshtein

# First pass: keep records whose title is Jaro-Winkler-similar to the query.
# `my_string`, `temp_article`, and `data` are defined earlier in the script;
# the enclosing loop is reconstructed from the stray index `i` in the excerpt.
jarowinkler = JaroWinkler()
filter_thresh_45 = []
for i in range(len(data)):
    sim = jarowinkler.similarity(my_string, temp_article[i])
    if sim > 0.45:
        filter_thresh_45.append(data[i])

# Second pass: note this thresholds the normalized Levenshtein *distance*,
# so it keeps records at least 0.7 away from the query string.
normalized_levenshtein = NormalizedLevenshtein()
filter_normalized_levenshtein = []
for i in range(len(filter_thresh_45)):
    dist = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])
    if dist >= 0.7:
        filter_normalized_levenshtein.append(filter_thresh_45[i])

# `with` closes the file automatically; the redundant close() is dropped.
with open('filtered_levenshtein_human_mobility.txt', 'w', encoding="ISO-8859-1") as outfile:
    json.dump(filter_normalized_levenshtein, outfile)

with open("filtered_levenshtein_mobility_prediction.csv", 'w') as resultFile:
    wr = csv.writer(resultFile, dialect='excel')
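    # A hedged sketch of the likely next step (the original excerpt stops
    # right after creating the writer), assuming each kept record should
    # become one CSV row:
    for row in filter_normalized_levenshtein:
        wr.writerow(row)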