Example #1
def RatingSorensen(revista):

    # Get the file path
    dirname = os.path.dirname(__file__)
    loc = os.path.join(dirname, r'JCR2018.xlsx')

    #Initialize reader
    workbook = xlrd.open_workbook(loc)
    sheet = workbook.sheet_by_index(0)

    tuplas = []

    start_time = time()

    for i in range(sheet.nrows):
        valor = (sheet.cell_value(i, 1),
                 sorensen(revista, sheet.cell_value(i, 1)))
        tuplas.append(valor)

    final_time = time()
    execution_time = round(final_time - start_time, 2)

    # Sort by Sorensen distance, ascending (closest titles first)
    tuplas.sort(key=lambda tupla: tupla[1])

    top_10 = tuplas[:10]

    result = (top_10, execution_time)

    return result
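A minimal usage sketch, assuming the module imports os, xlrd, time.time and a sorensen function (e.g. distance.sorensen), and that column index 1 of JCR2018.xlsx holds the journal titles; the query string is purely illustrative:

# Rank the spreadsheet titles by Sorensen distance to the query title
matches, seconds = RatingSorensen("Journal of Informetrics")
for title, dist in matches:
    print(title, dist)
print("lookup took", seconds, "seconds")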
Example #2
def compute_similarity(X):
    """
    Compute similarity matrix with mean of 3 distances
    :param X: List of contracts ssdeep hashes
    :return: Similarity matrix
    """
    jaccard_matrix = pdist(X, lambda x, y: distance.jaccard(x[0], y[0]))
    np.savetxt("../data/jaccard_matrix.csv",
               np.asarray(squareform(jaccard_matrix)),
               delimiter=",")

    sorensen_matrix = pdist(X, lambda x, y: distance.sorensen(x[0], y[0]))
    np.savetxt("../data/sorensen_matrix.csv",
               np.asarray(squareform(sorensen_matrix)),
               delimiter=",")

    # normalized, so that the results can be meaningfully compared
    # method=1 means the shortest alignment between the sequences is taken as factor
    levenshtein_matrix = pdist(
        X, lambda x, y: distance.nlevenshtein(x[0], y[0], method=1))
    np.savetxt("../data/levenshtein_matrix.csv",
               np.asarray(squareform(levenshtein_matrix)),
               delimiter=",")

    mean_matrix = 1 - np.mean(np.array(
        [jaccard_matrix, sorensen_matrix, levenshtein_matrix]),
                              axis=0)
    np.savetxt("../data/similarity_matrix.csv",
               np.asarray(mean_matrix),
               delimiter=",")

    print("Similarity matrix computed.")
    return mean_matrix
Example #3
def sorensen_plus(a: str, b: str) -> float:
    length = min(len(a), len(b))
    ng = [
        distance.sorensen(ngrams(a, n), ngrams(b, n))
        for n in range(1, length + 1)
    ]
    return 1 - np.sum(ng) / length
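An illustrative call, assuming the ngrams(s, n) helper used above returns the character n-grams of s; the function averages the Sorensen similarity over every n-gram order up to the length of the shorter string:

score = sorensen_plus("decide", "resize")
print(score)  # a float in [0, 1]; higher means more similar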
Example #4
def title_similarity_np(row1, row2, method="difflib"):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(row1[1], row2[1], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row1[1], row2[1])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row1[1], row2[1])
    return difflib.SequenceMatcher(None, row1[1], row2[1]).quick_ratio()
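For illustration, the rows can be any indexable records with the title at position 1, which is how the function reads row1[1] and row2[1]; the record IDs here are hypothetical:

row_a = ("rec-1", "Deep learning for natural language processing")
row_b = ("rec-2", "Deep Learning for Natural Language Processing tasks")
print(title_similarity_np(row_a, row_b, method="sorensen"))
print(title_similarity_np(row_a, row_b))  # default: difflib quick_ratio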
Example #5
 def compare_ocr_strings_sorensen(ocr_string1, ocr_string2):
     """
     Sorensen distance
     :param ocr_string1:
     :param ocr_string2:
     :return:
     """
     result = distpkg.sorensen(ocr_string1, ocr_string2)
     return result
Example #6
def extract_basic_distance_feat(df):
    ## jaccard coef/dice dist of n-gram
    print "generate jaccard coef and dice dist for n-gram"
    dists = ["jaccard_coef", "dice_dist"]
    grams = ["unigram", "bigram", "trigram"]
    feat_names = ["origsent", "candsent"]
    for stem in ["", "_stem"]:
        for dist in dists:
            for gram in grams:
                for i in range(len(feat_names) - 1):
                    for j in range(i + 1, len(feat_names)):
                        target_name = feat_names[i]
                        obs_name = feat_names[j]
                        df["%s_of_%s_between_%s_%s%s" %
                           (dist, gram, target_name, obs_name, stem)] = list(
                               df.apply(lambda x: compute_dist(
                                   x[target_name + "_" + gram + stem], x[
                                       obs_name + "_" + gram + stem], dist),
                                        axis=1))

    print "generate rest all features"
    gram_ext = [
        "_unigram", "_bigram", "_trigram", "_char_unigram", "_char_bigram",
        "_char_trigram"
    ]
    for stem in ["", "_stem"]:
        for gram in gram_ext:
            df["levenshtein_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.nlevenshtein(
                    x["origsent" + gram + stem],
                    x["candsent" + gram + stem],
                    method=2),
                         axis=1))
            df["sorensen_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: distance.sorensen(
                    x["origsent" + gram + stem], x["candsent" + gram + stem]),
                         axis=1))
            df["cosine_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: cosine(x["origsent" + gram + stem], x[
                    "candsent" + gram + stem]),
                         axis=1))
            df["precision_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    x["origsent" + gram + stem]),
                         axis=1))
            df["recall1gram_%s%s" % (gram, stem)] = list(
                df.apply(lambda x: precision_recall(
                    x["origsent" + gram + stem], x["candsent" + gram + stem],
                    x["candsent" + gram + stem]),
                         axis=1))
            df["f1gram_%s%s" % (gram, stem)] = list(
                df.apply(
                    lambda x: fmeasure(x["precision_%s%s" %
                                         (gram, stem)], x["recall1gram_%s%s" %
                                                          (gram, stem)]),
                    axis=1))
Example #7
def similarity_string(a,b,measure):
	a = a.lower()
	b = b.lower()
	measure = measure.lower()
	if (measure == "matcher"):
		return SequenceMatcher(None,a,b).ratio()
	elif (measure == "sorensen"):
		return 1 - distance.sorensen(a, b)
	else:
		return 0
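A small sanity check, assuming SequenceMatcher and the distance package are imported as in the original module:

print(similarity_string("New York", "new york", "matcher"))   # 1.0 after lowercasing
print(similarity_string("New York", "Newark", "sorensen"))
print(similarity_string("New York", "Newark", "unknown"))     # 0 for an unsupported measure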
Example #8
def title_similarity_pd(row, method='difflib'):
    if method.lower() == "levenshtein":
        return 1 - distance.nlevenshtein(
            row["title"], row["title_R"], method=1)
    if method.lower() == "sorensen":
        return 1 - distance.sorensen(row["title"], row["title_R"])
    if method.lower() == "jaccard":
        return 1 - distance.jaccard(row["title"], row["title_R"])
    return difflib.SequenceMatcher(None, row["title"],
                                   row["title_R"]).quick_ratio()
Example #9
def findPizza(pizzaType):
    lowestScore = 1
    match = ''

    for pizza in PIZZAS:
        score = sorensen(pizza.lower(), pizzaType.lower())
        # print(pizza, score)
        if score < lowestScore:
            lowestScore = score
            match = pizza

    return match
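A minimal sketch of how this might be driven, with a hypothetical PIZZAS list; sorensen here is the distance-package function, so the lowest score wins:

PIZZAS = ["Margherita", "Pepperoni", "Quattro Formaggi"]
print(findPizza("peperoni"))  # picks "Pepperoni" despite the typo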
Example #10
def get_features(raw_data):
    fet_data = pd.DataFrame()

    print "extracting count features..."
    fet_data["q_len"] = raw_data["query"].map(word_len)
    fet_data["t_len"] = raw_data["product_title"].map(word_len)
    fet_data["d_len"] = raw_data["product_description"].map(word_len)

    print "extracting basic distance features from q and t..."
    fet_data["nleven1"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=1), axis=1)
    fet_data["nleven2"] = raw_data.apply(lambda x: distance.nlevenshtein(x.q, x.t, method=2), axis=1)
    fet_data["sorensen"] = raw_data.apply(lambda x: distance.sorensen(x.q, x.t), axis=1)
    fet_data["jaccard"] = raw_data.apply(lambda x: distance.jaccard(x.q, x.t), axis=1)
    fet_data["ncd"] = raw_data.apply(lambda x: ncd(x.q, x.t), axis=1)

    print "extracting basic distance features from q_ex and t..."
    fet_data["sorensen_ex"] = raw_data.apply(lambda x: distance.sorensen(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["jaccard_ex"] = raw_data.apply(lambda x: distance.jaccard(get_uniq_words_text(x.q_ex), x.t), axis=1)
    print "extracting basic distance features from q_ex and t..."
    fet_data["ncd_ex"] = raw_data.apply(lambda x: ncd(get_uniq_words_text(x.q_ex), x.t), axis=1)

    return fet_data
Example #11
def calculate_distance_numeric(u1, u2, d_type, weights):
    # if the data is preprocessed and all fields are converted to numeric
    return {
        "jaccard":
        distance.jaccard(u1, u2),
        "euclidean":
        sqrt(sum(
            pow((1 / w) * (a - b), 2) for a, b, w in zip(u1, u2, weights))),
        "cosine":
        spatial.distance.cosine(u1, u2),
        "sorensen":
        distance.sorensen(u1, u2),
        "hamming":
        distance.hamming(u1, u2, normalized=True)
    }[d_type]
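An illustrative call with two small numeric vectors, assuming sqrt, scipy.spatial and the distance package are imported as in the original module; note that every entry in the dict is evaluated before d_type selects one, so all metrics must be well defined for the inputs (e.g. hamming needs equal-length vectors):

u1, u2 = [1.0, 2.0, 3.0], [1.0, 2.5, 2.0]
weights = [1.0, 1.0, 1.0]
print(calculate_distance_numeric(u1, u2, "euclidean", weights))
print(calculate_distance_numeric(u1, u2, "sorensen", weights))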
Example #12
    def similarity_sentence_ngram(self, s1, s2):
        ng1 = self.init_list_of_objects(min(len(s1.split()) + 1, self.max_ngrams) - 2)
        ng2 = self.init_list_of_objects(min(len(s2.split()) + 1, self.max_ngrams) - 2)
        for j in range(2, min(len(s1.split()) + 1, self.max_ngrams)):
            for ngram in ngrams(s1.split(), j):
                ng1[j - 2].append(ngram)
        for j in range(2, min(len(s2.split()) + 1, self.max_ngrams)):
            for ngram in ngrams(s2.split(), j):
                ng2[j - 2].append(ngram)
        total = 0
        for j in range(min(min(len(s1.split()) + 1, len(s2.split()) + 1), self.max_ngrams) - 2):
            total += np.sum(distance.sorensen(ng1[j][i], ng2[j][i])
                            for i in range(min(len(ng1[j]), len(ng2[j])))) / min(len(ng1[j]), len(ng2[j]))
        total = total / min(min(len(s1.split()) + 1, len(s2.split()) + 1), self.max_ngrams)

        return 1 - total
Example #13
def calculate_edit_distance(code_block1,
                            code_block2,
                            ignore_literals,
                            distance_metric,
                            verbose=False):
    if ignore_literals:  # TODO: only ignore differences in strings if they are substantially different
        block1 = abstract(code_block1)
        block2 = abstract(code_block2)
        if verbose:
            print("[.] Abstracted code blocks:")
            print(block1.strip())
            print(block2.strip())

    else:
        block1 = code_block1
        block2 = code_block2

    # Tokenize
    tokens1 = tokenize_fine_grained(block1, keep_whitespace=False)
    tokens2 = tokenize_fine_grained(block2, keep_whitespace=False)

    if not tokens1 or not tokens2:
        return float('inf')

    if not has_alpha(tokens1) or not has_alpha(tokens2):
        return float('inf')

    if verbose:
        print(tokens1)
        print(tokens2)

    # https://github.com/doukremt/distance
    if distance_metric == "j":
        return distance.jaccard(tokens1, tokens2)
    elif distance_metric == "l":
        return distance.levenshtein(tokens1, tokens2)
    elif distance_metric == "h":
        return distance.hamming(tokens1, tokens2)
    elif distance_metric == "s":
        return distance.sorensen(tokens1, tokens2)
    elif distance_metric == "n":  # Normalized Levenshtein
        return distance.nlevenshtein(tokens1, tokens2)
    elif distance_metric == "c":  # Collapsed Levenshtein edit distance
        return collapse_edit_distance(tokens1, tokens2, verbose=verbose)
    elif distance_metric == "nc":  # Normalized collapsed Levenshtein edit distance
        collapsed = collapse_edit_distance(tokens1, tokens2, verbose=verbose)
        return collapsed / max(len(tokens1), len(tokens2))
Example #14
def train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = []):
    allTrainX = list()
    allTrainY = list()
    with open("./data/train.csv") as f:
        for line in f:
            lin = line.split(",")
            if len(lin) == 3:
                st1 = lin[0].lower()
                st2 = lin[1].lower()

                temp = [
                        1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
                        lev.jaro(st1,st2),
                        lev.jaro_winkler(st1,st2),
                        lev.ratio(st1,st2),
                        distance.sorensen(st1,st2),
                        jaccard(set(st1),set(st2)),
                        1. - distance.nlevenshtein(st1,st2,method=1),
                        1. - distance.nlevenshtein(st1,st2,method=2),
                        dice_coefficient(st1,st2,lenGram=2),
                        dice_coefficient(st1,st2,lenGram=3),
                        dice_coefficient(st1,st2,lenGram=4),
                        cosineWords(st1,st2,dictTrain,tfidf_matrix_train),
                        cosineBigrams(st1,st2,dictTrainBigrams,tfidf_matrix_trainBigrams,lenGram)
                    ]
                if len(delete) > 0:
                    for elem in delete:
                        temp[elem] = 0.
                allTrainX.append(temp)
                allTrainY.append(int(lin[2]))


    X = np.array(allTrainX,dtype=float)
    y = np.array(allTrainY,dtype=float)
    clf = svm.LinearSVC(C=1.,dual=False,loss='l2', penalty='l1')
    clf2 = linear_model.LogisticRegression(C=1.,dual=False, penalty='l1')
    clf.fit(X, y)
    clf2.fit(X, y)
    weights = np.array(clf.coef_[0])
    print(weights)
    weights = np.array(clf2.coef_[0])
    print(weights)


    return clf,clf2
Example #15
File: models.py  Project: vlasy/skool
 def update_page_recs(cls, document):
     res = {}
     for site in Page.objects():
         if site != document:
             kwa, kwb = document.label_model, site.label_model
             ca = set(kwa)
             cb = set(kwb)
             if len(ca) > 0 and len(cb) > 0:
                 res[site.id] = distance.sorensen(ca, cb)
             else:
                 res[site.id] = 1  # 1 = totally different
     best = sorted(res.iteritems(), key=operator.itemgetter(1), reverse=False)[:10]
     ret = []
     for (obj, score) in best:
         s = Page.objects(id=obj).first()
         ret.append(s)
     document.recs = ret
     document.save()
Example #16
File: models.py  Project: vlasy/skool
 def update_site_recs(cls, document, sites=None):
     res = {}
     if not sites:
         sites = Site.objects()
         sites.timeout(False)
     for site in sites:
         if site != document:
             kwa, kwb = document.keywords, site.keywords
             ca = set(kwa)
             cb = set(kwb)
             if len(ca) > 0 and len(cb) > 0:
                 res[site.id] = distance.sorensen(ca, cb)
             else:
                 res[site.id] = 1  # 1 = totally different
     best = sorted(res.iteritems(), key=operator.itemgetter(1), reverse=False)[:10]
     ret = []
     for (obj, score) in best:
         s = Site.objects(id=obj).first()
         ret.append(s)
     document.recs = ret
     document.save()
Example #17
def similarity_sentence_ngram(s1, s2):
    ng1 = init_list_of_objects(min(len(s1.split()) + 1, MAX_NGRAM) - 2)
    ng2 = init_list_of_objects(min(len(s2.split()) + 1, MAX_NGRAM) - 2)
    for j in range(2, min(len(s1.split()) + 1, MAX_NGRAM)):
        for ngram in ngrams(s1.split(), j):
            ng1[j - 2].append(ngram)
    for j in range(2, min(len(s2.split()) + 1, MAX_NGRAM)):
        for ngram in ngrams(s2.split(), j):
            ng2[j - 2].append(ngram)
    summ = 0
    for j in range(
            min(min(len(s1.split()) + 1,
                    len(s2.split()) + 1), MAX_NGRAM) - 2):
        summ += np.sum(
            distance.sorensen(ng1[j][i], ng2[j][i])
            for i in range(min(len(ng1[j]), len(ng2[j])))) / min(
                len(ng1[j]), len(ng2[j]))
    summ = summ / min(min(len(s1.split()) + 1, len(s2.split()) + 1), MAX_NGRAM)

    print(summ)

    return 1 - summ
Example #18
def distance_vec(s1, s2):
    edit_distance = distance.levenshtein(s1, s2)
    jaccard_distance = distance.jaccard(s1, s2)
    sorensen_distance = distance.sorensen(s1, s2)
    # hamming_distance = distance.hamming(s1, s2)
    fc_distance = distance.fast_comp(s1, s2, transpositions=True)
    substring_distance = distance.lcsubstrings(s1, s2, positions=True)[0]
    common_words_distance = len(get_common_words(s1, s2))
    tf_distance = tf_similarity(s1, s2)
    tfidf_distance = tfidf_similarity(s1, s2)
    vec = np.array([
        edit_distance,  # Levenshtein edit distance
        jaccard_distance,  # Jaccard distance
        sorensen_distance,  # Sorensen distance
        # hamming_distance,  # Hamming distance
        fc_distance,  # fast_comp distance
        substring_distance,  # length of the longest common substring
        common_words_distance,  # number of common words
        tf_distance,  # single-text TF similarity
        tfidf_distance  # single-text TF-IDF similarity
    ])
    return vec
Example #19
def str_sorensen(str1, str2):

    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    res = distance.sorensen(str1_list, str2_list)
    return res
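Because the inputs are split on spaces first, this variant compares word sets rather than character sets; a quick illustration:

print(str_sorensen("machine learning methods", "deep learning methods"))
# 1 - 2*2/(3+3) = 0.333..., since two of the three words are shared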
Example #20
def sorensen_word(s1, s2):
    N = min(len(s1), len(s2))
    ng1 = [ngrams_word(s1, j) for j in range(2, min(len(s1) + 1, MAX_NGRAM))]
    ng2 = [ngrams_word(s2, j) for j in range(2, min(len(s2) + 1, MAX_NGRAM))]
    return 1 - np.sum(
        distance.sorensen(ng1[0][i], ng2[0][i]) for i in range(N)) / (N)
Example #21
    if not entry.name:
        delete_empty(id, entry)
        continue

    if len(entry.email) < 3:
        # most likely not an email leakage
        continue

    full_name = entry.name.full_name or entry.name.family_name or entry.name.given_name
    if full_name is not None:
        full_name = full_name.text

    if not full_name:
        print "empty full name"
        continue

    l_full_name = full_name.lower()
    min_distance = 0.5
    keep_emails = []
    for email in entry.email:
        username = email.address.split('@')[0]
        #d = distance.nlevenshtein(username.lower(), l_full_name)
        #d2 = distance.jaccard(username.lower(), l_full_name)
        d3 = distance.sorensen(username.lower(), l_full_name)
        if d3 <= min_distance:
            keep_emails.append(email.address)

    if len(keep_emails) != len(entry.email):
        delete_extra_emails(id, keep_emails, full_name, entry)
Example #22
 def calculate(self, row):
     seq1 = str(row['question1'])
     seq2 = str(row['question2'])
     jaccard = distance.jaccard(seq1, seq2)
     sorensen = distance.sorensen(seq1, seq2)
     return [jaccard, sorensen]
Example #23
def sorensen(doc1, doc2):
    z = distance.sorensen(doc1.lower().strip(), doc2.lower().strip())
    return z
Example #24
def str_sorensen(str1, str2):
    res = distance.sorensen(str1, str2)
    return res
Example #25
 def dist_fn(self, xs, ys):
     try:
         return distance.sorensen(xs, ys)
     except ZeroDivisionError:
         return 1
Example #26
def str_sorensen(str1, str2):

    str1_list = str1.split(' ')
    str2_list = str2.split(' ')
    res = distance.sorensen(str1_list, str2_list)
    return res
Example #27
def calsulateDistances(st1, st2):
    diffl = difflib.SequenceMatcher(None, st1, st2).ratio()
    lev = Levenshtein.ratio(st1, st2)
    sor = 1 - distance.sorensen(st1, st2)
    jac = 1 - distance.jaccard(st1, st2)
    return diffl, lev, sor, jac
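A quick illustrative call, assuming difflib, the Levenshtein package and the distance package are imported as in the surrounding module; all four values are similarities in [0, 1]:

diffl, lev, sor, jac = calsulateDistances("colour", "color")
print(diffl, lev, sor, jac)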
Example #28
def sorensen_similarity_ratio(actual_content, expected_content):
    return 1 - distance.sorensen(actual_content, expected_content)
Example #29
 def sorensen_word(self, ng1, ng2):
     #ng1 = [ngrams(a, i) for i in range(1, min(len(a), len(b)))]
     #ng2 = [ngrams(b, i) for i in range(1, min(len(a), len(b)))]
     N = min(len(ng1), len(ng2))
     return 1 - np.sum(distance.sorensen(ng1[i], ng2[i]) for i in range(N)) / N
Example #30
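This example clusters words with AffinityPropagation on a precomputed similarity matrix; a minimal sketch of how words and lev_similarity might have been built beforehand (negated Levenshtein distances, so that larger values mean more similar):

words = np.asarray(["decide", "derive", "resize", "deride"])
lev_similarity = -1 * np.array(
    [[distance.levenshtein(w1, w2) for w1 in words] for w2 in words])
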
affprop = sklearn.cluster.AffinityPropagation(affinity="precomputed", damping=0.5)
affprop.fit(lev_similarity)
for cluster_id in np.unique(affprop.labels_):
    exemplar = words[affprop.cluster_centers_indices_[cluster_id]]
    cluster = np.unique(words[np.nonzero(affprop.labels_==cluster_id)])
    cluster_str = ", ".join(cluster)
    print(" - *%s:* %s" % (exemplar, cluster_str))
    

t1 = ("de", "ci", "si", "ve")
t2 = ("de", "ri", "si", "ve")
distance.levenshtein(t1, t2)

sent1 = ['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']
sent2 = ['the', 'lazy', 'fox', 'jumps', 'over', 'the', 'crazy', 'dog']
distance.levenshtein(sent1, sent2)

distance.hamming("fat", "cat", normalized=True)
#0.3333333333333333
distance.nlevenshtein("abc", "acd", method=1)  # shortest alignment
#0.6666666666666666
distance.nlevenshtein("abc", "acd", method=2)  # longest alignment
#0.5


distance.sorensen("decide", "resize")
#0.5555555555555556
distance.jaccard("decide", "resize")
#0.7142857142857143
Example #31
def stats(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete = [],plotX=False):
    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            pass

    dimMatrix = 16
    predict = np.zeros((i+1,dimMatrix))


    clf1,clf2 = train(tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram,delete=delete)

    with open("./data/stats.csv") as infile:
        for i,line in enumerate(infile):
            a = line.rstrip().split("\t")

            ## create same vector with more distances
            st1 = a[0].lower()
            st2 = a[1].lower()

            temp = [
            1.-(lev.distance(st1,st2)*2/(len(st1)+len(st2))),
            lev.jaro(st1,st2),
            lev.jaro_winkler(st1,st2),
            lev.ratio(st1,st2),
            distance.sorensen(st1,st2),
            jaccard(set(st1),set(st2)),
            1. - distance.nlevenshtein(st1,st2,method=1),
            1. - distance.nlevenshtein(st1,st2,method=2),
            dice_coefficient(st1,st2,lenGram=2),
            dice_coefficient(st1,st2,lenGram=3),
            dice_coefficient(st1,st2,lenGram=4),
            cosineWords(st1,st2),
            cosineBigrams(st1,st2)]

            if len(delete) > 0:
                for elem in delete:
                    temp[elem] = 0.

            predict[i,:-3] = temp
            predict[i,-3] = clf1.decision_function(np.array(temp,dtype=float))
            predict[i,-2] = clf2.decision_function(np.array(temp,dtype=float))
            predict[i,-1] = a[-1]


    if plotX:
        labelsM = ["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"]
        f1matrix = np.zeros((100,dimMatrix-1))

        fig = plt.figure()
        fig.set_size_inches(9,6)
        ax = fig.add_subplot(111)
        iC = -1
        for i in np.linspace(0,1,100):
            iC += 1
            for j in range(dimMatrix-1):
                t = np.array(predict[:,j])
                if j >= dimMatrix-3:
                    t = (t - np.min(t))/(np.max(t)-np.min(t))
                f1matrix[iC,j] = f1_score(y_pred=t>i ,y_true=predict[:,-1])
        F1scores = []
        for j in range(dimMatrix-1):
            F1scores.append(np.max(f1matrix[:,j]))
            #ax.plot(np.linspace(0,1,100),f1matrix[:,j],label=labelsM[j],color=tableau20[j])
        ax.bar(range(dimMatrix-1),F1scores)
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        ax.set_ylabel("F1 score")
        ax.set_xlabel("Parameter")
        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("f1_bar.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)

        AUCScores = []
        for j in range(dimMatrix-1):
            # Compute ROC curve and area the curve
            fpr, tpr, thresholds = roc_curve(predict[:,-1], predict[:,j])
            AUCScores.append(auc(fpr, tpr))


            # Plot ROC curve
            ax.plot(fpr, tpr, label=labelsM[j],color=tableau20[j])
            ax.plot([0, 1], [0, 1], 'k--')
            plt.xlim([0.0, 1.0])
            plt.ylim([0.0, 1.0])
            ax.set_xlabel('False Positive Rate')
            ax.set_ylabel('True Positive Rate')
            ax.set_title('ROC Curve')

        plt.legend(loc=2)
        customaxis(ax)
        plt.savefig("roc.pdf")
        plt.show()

        fig = plt.figure()
        fig.set_size_inches(9, 6)
        ax = fig.add_subplot(111)
        ax.bar(range(dimMatrix-1),AUCScores)
        ax.set_ylabel('Area Under Curve')
        plt.xticks(np.arange(dimMatrix-1)+0.5,["Lev","Jaro","Jaro-Winkler","Ratio","Sorensen","Jaccard","Lev1","Lev2","Dice_2","Dice_3","Dice_4","cosineWords","cosineBigrams","SVM","Logit"],rotation=45)
        customaxis(ax)
        plt.savefig("roc_bar.pdf")
        plt.show()
Example #32
def feature_extraction(features):

    try:
        features["origsent_unigram"] = list(
            features.apply(lambda x: preprocess_token(x["origsent"]), axis=1))
    except:
        features["origsent_unigram"] = list(
            features.apply(lambda x: preprocess_data2(x["origsent"]), axis=1))
    try:
        features["candsent_unigram"] = list(
            features.apply(lambda x: preprocess_token(x["candsent"]), axis=1))
    except:
        features["candsent_unigram"] = list(
            features.apply(lambda x: preprocess_data2(x["candsent"]), axis=1))

    features["origsent_unigram_stem"] = list(
        features.apply(lambda x: preprocess_data(x["origsent"]), axis=1))
    features["candsent_unigram_stem"] = list(
        features.apply(lambda x: preprocess_data(x["candsent"]), axis=1))
    features["origsent_stem"] = list(features["origsent"].apply(preprocess))
    features["candsent_stem"] = list(features["candsent"].apply(preprocess))

    print "generate bigram"
    join_str = "_"
    try:
        features["origsent_bigram"] = list(
            features.apply(
                lambda x: getBigram(x["origsent_unigram"], join_str), axis=1))
    except:
        templist = []
        for x in features["origsent_unigram"].iteritems():
            templist.append(getBigram(x, join_str))
        features["origsent_unigram"] = templist

    try:
        features["origsent_bigram_stem"] = list(
            features.apply(
                lambda x: getBigram(x["origsent_unigram_stem"], join_str),
                axis=1))
    except:
        templist = []
        for x in features["origsent_unigram_stem"].iteritems():
            templist.append(getBigram(x, join_str))
        features["origsent_unigram_stem"] = templist

    features["candsent_bigram"] = list(
        features.apply(lambda x: getBigram(x["candsent_unigram"], join_str),
                       axis=1))
    features["candsent_bigram_stem"] = list(
        features.apply(
            lambda x: getBigram(x["candsent_unigram_stem"], join_str), axis=1))
    ## trigram
    print "generate trigram"
    join_str = "_"
    features["origsent_trigram"] = list(
        features.apply(lambda x: getTrigram(x["origsent_unigram"], join_str),
                       axis=1))
    features["candsent_trigram"] = list(
        features.apply(lambda x: getTrigram(x["candsent_unigram"], join_str),
                       axis=1))
    features["origsent_trigram_stem"] = list(
        features.apply(
            lambda x: getTrigram(x["origsent_unigram_stem"], join_str),
            axis=1))
    features["candsent_trigram_stem"] = list(
        features.apply(
            lambda x: getTrigram(x["candsent_unigram_stem"], join_str),
            axis=1))

    #print "Generate Wordnet Features"
    #features["wordnet-similarity"] = list(features.apply(lambda x: ss.similarity(x["origsent"], x["candsent"], False), axis=1))
    #features["wordnet-similarity-norm"] = list(features.apply(lambda x: ss.similarity(x["origsent"], x["candsent"], True), axis=1))

    print "generate char gram"
    feat_names = ["origsent", "candsent"]
    grams = ["unigram", "bigram", "trigram"]
    for stem in ["", "_stem"]:
        for feat in feat_names:
            for gram in grams:
                try:
                    features["%s_char_%s%s" % (feat, gram, stem)] = list(
                        features.apply(
                            lambda x: word2ngrams(x[feat + stem], gram),
                            axis=1))
                except:
                    continue
                nonnumeric_columns.add("%s_char_%s%s" % (feat, gram, stem))

    features["candsent_char_trigram"] = list(
        features.apply(lambda x: word2ngrams(x["candsent"], "trigram"),
                       axis=1))
    features["origsent_char_bigram_stem"] = list(
        features.apply(lambda x: word2ngrams(x["candsent_stem"], "bigram"),
                       axis=1))
    features["origsent_char_trigram_stem"] = list(
        features.apply(lambda x: word2ngrams(x["candsent_stem"], "trigram"),
                       axis=1))

    print "generate common word features"
    gram_ext = [
        "_unigram", "_bigram", "_trigram", "_char_unigram", "_char_bigram",
        "_char_trigram"
    ]
    for stem in ["", "_stem"]:
        for gram in gram_ext:
            features["common-words_%s%s" % (gram, stem)] = list(
                features.apply(lambda x: len(
                    intersect(x["origsent" + gram + stem], x["candsent" + gram
                                                             + stem])),
                               axis=1))

    features["origsent_tag"] = list(
        features.apply(lambda x: preprocess_tag(x["origsenttag"]), axis=1))
    features["candsent_tag"] = list(
        features.apply(lambda x: preprocess_tag(x["candsenttag"]), axis=1))

    features["origsent_tag_unigram"] = features["origsent_tag"]
    features["candsent_tag_unigram"] = features["candsent_tag"]

    features["origsent_tag_left"] = list(
        features.apply(
            lambda x: word_left(x["origsent_unigram"], x["origsent_tag"]),
            axis=1))
    features["candsent_tag_left"] = list(
        features.apply(
            lambda x: word_left(x["candsent_unigram"], x["candsent_tag"]),
            axis=1))

    features["origsent_tag_right"] = list(
        features.apply(
            lambda x: word_right(x["origsent_unigram"], x["origsent_tag"]),
            axis=1))
    features["candsent_tag_right"] = list(
        features.apply(
            lambda x: word_right(x["candsent_unigram"], x["candsent_tag"]),
            axis=1))

    features["origsent_NER"] = list(
        features.apply(lambda x: preprocess_NER(x["origsenttag"]), axis=1))
    features["candsent_NER"] = list(
        features.apply(lambda x: preprocess_NER(x["candsenttag"]), axis=1))

    features["origsent_NER_unigram"] = features["origsent_NER"]
    features["candsent_NER_unigram"] = features["candsent_NER"]

    features["origsent_Event"] = list(
        features.apply(lambda x: preprocess_Event(x["origsenttag"]), axis=1))
    features["candsent_Event"] = list(
        features.apply(lambda x: preprocess_Event(x["candsenttag"]), axis=1))

    features["origsent_Event_unigram"] = features["origsent_Event"]
    features["candsent_Event_unigram"] = features["candsent_Event"]

    print "generate bigram for Tags"
    feattag = [
        "origsent_tag", "candsent_tag", "origsent_NER", "candsent_NER",
        "origsent_Event", "candsent_Event"
    ]
    for feat in feattag:
        join_str = "_"
        features["%s_bigram" % (feat)] = list(
            features.apply(lambda x: getBigram(x["%s_unigram" %
                                                 (feat)], join_str),
                           axis=1))
        features["%s_trigram" % (feat)] = list(
            features.apply(lambda x: getTrigram(x["%s_unigram" %
                                                  (feat)], join_str),
                           axis=1))

    gram_tags = ["_tag_unigram", "_tag_bigram", "_tag_trigram"]

    for gram in gram_tags:
        features["common-words_%s" % (gram)] = list(
            features.apply(lambda x: len(
                intersect(x["origsent" + gram], x["candsent" + gram])),
                           axis=1))
        features["levenshtein_%s" % (gram)] = list(
            features.apply(lambda x: distance.nlevenshtein(
                x["origsent" + gram], x["candsent" + gram], method=2),
                           axis=1))
        features["sorensen_%s" % (gram)] = list(
            features.apply(lambda x: distance.sorensen(x["origsent" + gram], x[
                "candsent" + gram]),
                           axis=1))
        features["cosine_%s" % (gram)] = list(
            features.apply(
                lambda x: cosine(x["origsent" + gram], x["candsent" + gram]),
                axis=1))
        features["precision_%s" % (gram)] = list(
            features.apply(lambda x: precision_recall(x["origsent" + gram], x[
                "candsent" + gram], x["origsent" + gram]),
                           axis=1))
        features["recall1gram_%s" % (gram)] = list(
            features.apply(lambda x: precision_recall(x["origsent" + gram], x[
                "candsent" + gram], x["candsent" + gram]),
                           axis=1))
        features["f1gram_%s" % (gram)] = list(
            features.apply(lambda x: fmeasure(x["precision_%s" %
                                                (gram)], x["recall1gram_%s" %
                                                           (gram)]),
                           axis=1))

    features["common_Event"] = list(
        features.apply(
            lambda x: len(intersect(x["origsent_Event"], x["candsent_Event"])),
            axis=1))

    features["common_NER"] = list(
        features.apply(
            lambda x: len(intersect(x["origsent_Event"], x["candsent_Event"])),
            axis=1))

    return features
Example #33
def similar_sorensen(a, b):
    return (1 - distance.sorensen(a, b))
Example #34
def sorencen(q1, q2):
    return distance.sorensen(q1, q2)
Example #35
def sorensen_plus(a, b):
    ng1 = [ngrams(a, i) for i in range(1, min(len(a), len(b)) + 1)]
    ng2 = [ngrams(b, i) for i in range(1, min(len(a), len(b)) + 1)]
    N = min(len(ng1), len(ng2))
    return 1 - np.sum(distance.sorensen(ng1[i], ng2[i]) for i in range(N)) / N
Example #36
     if checked == 0:
         if a == b:
             checked += 1
             for v1 in gt[a]:
                 partials = []
                 levs = []
                 jacs = []
                 sors = []
                 for v2 in pc[b]:
                     v2 = str(v2).translate(None, string.punctuation)
                     v2 = str(v2).replace('\t',' ')
                     try:
                         partials.append((1-(fuzz.partial_ratio(v1, v2)/100.0)))
                         levs.append(distance.levenshtein(v1,v2, normalized=True))
                         jacs.append(distance.jaccard(v1, v2))
                         sors.append(distance.sorensen(v1, v2))
                     except UnicodeDecodeError:
                         partials.append(1)
                         levs.append(1)
                         jacs.append(1)
                         sors.append(1)
                 ls_partials.append(partials)
                 ls_levs.append(levs)
                 ls_jacs.append(jacs)
                 ls_sors.append(sors)
         else:
             pass
     else:
         pass
 # create distance score matrices with row index as hand coded titles and 
 # column index as parscit coded titles
Example #37
def sor_tok_distance(q1, q2, t1, t2):
    return distance.sorensen(t1, t2)
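As written, only the token sequences t1 and t2 contribute to the result; a minimal illustrative call:

print(sor_tok_distance("q1", "q2", "what is ai".split(), "what is ml".split()))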