import sys

import pandas as pd
from similarity.normalized_levenshtein import NormalizedLevenshtein
from sklearn.cluster import AgglomerativeClustering


def main():
    if len(sys.argv) != 2:
        print("Usage: python ML-Agglomerative.py path-name")
        sys.exit(1)
    path = sys.argv[1]  # "../ames/test.csv"

    df = pd.read_csv(path)
    names = df.columns.values

    # Pairwise normalized-Levenshtein distance between every pair of
    # column names; similarly spelled names get small distances.
    NL = NormalizedLevenshtein()
    vectors = pd.DataFrame([[NL.distance(i, j) for j in names] for i in names])

    clusters = 5
    agg = AgglomerativeClustering(n_clusters=clusters,
                                  linkage='ward').fit(vectors)

    for i in range(clusters):
        print(f"Cluster {i}:")
        print(pd.Series(names[agg.labels_ == i]))
        print('\n')
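For intuition about the matrix this script clusters: each row and column is a column name, and each entry is a normalized Levenshtein distance, so similarly spelled names land near each other. A minimal sketch on hypothetical names (not taken from the Ames data):

from similarity.normalized_levenshtein import NormalizedLevenshtein

NL = NormalizedLevenshtein()
names = ["SalePrice", "SaleType", "LotArea"]  # hypothetical column names
matrix = [[NL.distance(a, b) for b in names] for a in names]
# The diagonal is 0.0, and "SalePrice"/"SaleType" are far closer to each
# other than either is to "LotArea", which is the structure the clustering exploits.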
Example #2
from similarity.normalized_levenshtein import NormalizedLevenshtein


def simDif(w1, w2):
    normalized_levenshtein = NormalizedLevenshtein()
    dis = normalized_levenshtein.distance(w1, w2)
    sim = normalized_levenshtein.similarity(w1, w2)
    print('distance: ' + str(dis) + ' similarity: ' + str(sim))
    # Separator printed after comparing one segment of the fixed text
    # against each segment of the moving text.
    print('----------------------------')
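A hypothetical call showing the output format ("kitten" vs "sitting" has the classic edit distance of 3 over a maximum length of 7):

simDif('kitten', 'sitting')
# prints: distance: 0.428... similarity: 0.571..., then the separator line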
Example #3
def similarity(outputs_batch, labels_batch, dic):
    # Batch similarity score: 1 minus the mean normalized-Levenshtein
    # distance between each decoded prediction and its label.
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        # Decode predicted indices to characters; utils.clear (helper not
        # shown) presumably joins them into a cleaned string.
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
Example #4
def similarity_plus(outputs_batch, labels_batch, dic):
    # Like similarity(), but spell-corrects the decoded prediction with
    # enchant before scoring it.
    d = enchant.Dict("en_US")
    norm_lev = NormalizedLevenshtein()
    outputs_batch = torch.argmax(outputs_batch, -1)
    avg_sim = 0
    for j in range(outputs_batch.size(-1)):
        pred = [dic[int(k)] for k in outputs_batch[:, j]]
        pred = utils.clear(pred)
        if not d.check(pred):
            # suggest() returns a list of candidates; fall back to the
            # original prediction when there are none.
            suggestions = d.suggest(pred)
            if suggestions:
                pred = suggestions[0]
        avg_sim += norm_lev.distance(pred, labels_batch[j])
    avg_sim = 1 - avg_sim / outputs_batch.size(-1)
    return avg_sim
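The fallback above matters because enchant's suggest() returns a ranked list, not a string; a minimal illustration:

import enchant

d = enchant.Dict("en_US")
d.check("helo")    # False: not a dictionary word
d.suggest("helo")  # ranked candidate list, e.g. ['hello', 'help', ...]; [0] is the best guess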
Example #5
def data_classifier(arr, sources):
    # Cluster field names by their pairwise normalized-Levenshtein distances.
    NL = NormalizedLevenshtein()
    vectors = pd.DataFrame([[NL.distance(i, j) for j in arr] for i in arr])

    # Heuristic: roughly sqrt(n) clusters, with a floor of 2.
    clusters = max(2, int(len(arr) ** 0.5))
    kmeans = KMeans(n_clusters=clusters, random_state=0).fit(vectors)

    field_ids = ["field-" + str(uuid.uuid4()) for _ in arr]

    # Distance from each field's distance vector to every cluster centre;
    # dist is a helper defined elsewhere (see the sketch after this example).
    uncert = pd.DataFrame()
    for i in range(clusters):
        uncert[i] = vectors.apply(dist,
                                  args=(kmeans.cluster_centers_[i], ),
                                  axis=0)
    uncert.index = field_ids

    classifications_obj = {}
    fields_obj = {}
    for i in range(clusters):
        cluster = {}
        fields = pd.Series(field_ids)[kmeans.labels_ == i]
        for field_id in fields:
            cluster[field_id] = uncert.loc[field_id, i]
            idx = field_ids.index(field_id)
            fields_obj[field_id] = {"name": arr[idx], "source": sources[idx]}
        cid_obj = {}
        cid_obj["name"] = "classification" + str(i)
        cid_obj["metadata"] = None
        cid_obj["values"] = cluster
        cid_obj["distribution"] = np.array(list(cluster.values())).mean()
        classifications_obj["classification-" + str(uuid.uuid4())] = cid_obj

    data = {}
    data["Classifications"] = classifications_obj
    data["Fields"] = fields_obj
    return json.dumps(data, sort_keys=True, indent=4)
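data_classifier calls a dist helper that the snippet never defines; a minimal sketch, assuming it measures Euclidean distance between one column of the pairwise-distance matrix and a k-means cluster centre:

import numpy as np

def dist(column, center):
    # Hypothetical reconstruction of the missing helper: Euclidean distance
    # between a field's distance vector (one DataFrame column) and a centre.
    return np.linalg.norm(np.asarray(column) - np.asarray(center))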
Example #6
    def similarity(self, question, answer):

        # Load the Shanghai-exchange stop-word list and strip whitespace.
        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = [sw.strip() for sw in stopword]

        # Segment both strings with jieba and drop stop-words.
        words1 = jieba.cut(str(question))
        words2 = jieba.cut(str(answer))
        meaningful_words1 = [w for w in words1 if w not in stopwords]
        meaningful_words2 = [w for w in words2 if w not in stopwords]
        s2 = ''.join(meaningful_words1)
        s3 = ''.join(meaningful_words2)
        # One instance of each string-distance/similarity measure.
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        # Collect every measure in a fixed order:
        # cosine sim/dist, Damerau, Jaccard dist/sim, Jaro-Winkler dist/sim,
        # Levenshtein, LCS, MetricLCS, 2-gram, normalized Levenshtein
        # dist/sim, OSA, QGram, Sorensen-Dice dist/sim, weighted Levenshtein.
        line_sim = [
            a1.similarity(s2, s3), a1.distance(s2, s3),
            b1.distance(s2, s3),
            c1.distance(s2, s3), c1.similarity(s2, s3),
            d1.distance(s2, s3), d1.similarity(s2, s3),
            e1.distance(s2, s3),
            f1.distance(s2, s3),
            g1.distance(s2, s3),
            h1.distance(s2, s3),
            i1.distance(s2, s3), i1.similarity(s2, s3),
            j1.distance(s2, s3),
            k1.distance(s2, s3),
            l1.distance(s2, s3), l1.similarity(s2, s3),
            m1.distance(s2, s3),
        ]

        return line_sim
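WeightedLevenshtein above expects a CharSub object that the snippet does not define; a minimal sketch in the shape the similarity package expects (a constant cost makes it behave like plain Levenshtein):

class CharSub:
    # Hypothetical substitution-cost callback; real use would return lower
    # costs for visually or phonetically close characters.
    def cost(self, c0, c1):
        return 1.0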
Example #7
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.jaccard import Jaccard

s1 = '中华人民共和国'
s2 = '中国'

normalized_levenshtein = NormalizedLevenshtein()
print('Normalized Levenshtein distance:', normalized_levenshtein.distance(s1, s2))

jaccard_distance = Jaccard(1)
print('Jaccard distance:', jaccard_distance.distance(s1, s2))
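Jaccard(1) shingles at the character level, so for these two strings it compares character sets; the overlap driving the score is easy to check:

set(s1) & set(s2)  # {'中', '国'}: the two shared characters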

Example #8
import csv
import json

from similarity.jarowinkler import JaroWinkler
from similarity.normalized_levenshtein import NormalizedLevenshtein

# First pass: keep entries whose Jaro-Winkler similarity to my_string exceeds
# 0.45 (enclosing loop reconstructed; my_string, temp_article, and data come
# from earlier, unshown code).
jarowinkler = JaroWinkler()
filter_thresh_45 = []
for i in range(len(temp_article)):
    sim = jarowinkler.similarity(my_string, temp_article[i])
    if sim > 0.45:
        filter_thresh_45.append(data[i])

normalized_levenshtein = NormalizedLevenshtein()

# Second pass: note this uses distance(), not similarity(), so the threshold
# keeps entries whose normalized edit distance from my_string is at least 0.7.
filter_normalized_levenshtein = []
for i in range(len(filter_thresh_45)):
    dist = normalized_levenshtein.distance(my_string, filter_thresh_45[i][0])
    if dist >= 0.7:
        filter_normalized_levenshtein.append(filter_thresh_45[i])

with open('filtered_levenshtein_human_mobility.txt',
          'w',
          encoding="ISO-8859-1") as outfile:
    json.dump(filter_normalized_levenshtein, outfile)

with open("filtered_levenshtein_mobility_prediction.csv", 'w') as resultFile:

    wr = csv.writer(resultFile, dialect='excel')