Пример #1
0
def comparison(file1, file2):
    """Print a rough lexical comparison of two UTF-8 text files.

    Printed, in order:

    1. NLTK's ``binary_distance`` between the two raw texts.
    2. Every token of ``file1`` (split on single spaces) that does not occur
       among the single-space tokens of ``file2``, one per line, duplicates
       included.
    3. The number of such tokens.
    4. The number of single-space tokens in ``file2``.
    5. ``by words``: (3) divided by (4).
    6. ``by symbols``: the same ratio computed over whitespace-split tokens
       (``str.split()`` with no argument); ``nan`` when ``file2`` has no such
       tokens.

    Args:
        file1: Path of the first text file, read as UTF-8.
        file2: Path of the second text file, read as UTF-8.

    Returns:
        None. All results are written to standard output.
    """
    from nltk.metrics import distance as dst

    # Context managers close the files even if reading or a helper fails.
    with codecs.open(file1, 'r', 'utf-8') as f1:
        text1 = f1.read()
    t1 = text1.split()
    # NOTE(review): the return value was never used. The call is retained
    # because freqdict is defined elsewhere and may have side effects --
    # confirm and drop it if it is pure.
    freqdict(t1)

    with codecs.open(file2, 'r', 'utf-8') as f2:
        text2 = f2.read()
    print(dst.binary_distance(text1, text2))
    t2 = text2.split()
    freqdict(t2)  # see the note on the first freqdict call

    words1 = text1.split(' ')
    words2 = text2.split(' ')

    # Set lookups keep the scan linear; order and duplicates of the missing
    # tokens are unchanged because the result is still built from words1.
    known_words = set(words2)
    arr = [word for word in words1 if word not in known_words]
    for word in arr:
        print(word)
    print(len(arr))
    print(len(words2))
    # str.split(' ') never returns an empty list, so len(words2) >= 1 here.
    # The doubled space reproduces the separator the old two-argument
    # print statement emitted.
    print('by words:  %s' % (float(len(arr)) / len(words2)))

    known_tokens = set(t2)
    arr2 = [token for token in t1 if token not in known_tokens]
    # NOTE(review): despite the label, this compares whitespace-delimited
    # tokens, not individual characters.
    if t2:
        symbol_ratio = float(len(arr2)) / len(t2)
    else:
        # An empty or whitespace-only file2 has no tokens: the ratio is
        # undefined, so report nan instead of raising ZeroDivisionError.
        symbol_ratio = float('nan')
    print('by symbols:  %s' % symbol_ratio)
Пример #2
0
def comparison(file1, file2):
    """Print a rough lexical comparison of two UTF-8 text files.

    Printed, in order:

    1. NLTK's ``binary_distance`` between the two raw texts.
    2. Every token of ``file1`` (split on single spaces) that does not occur
       among the single-space tokens of ``file2``, one per line, duplicates
       included.
    3. The number of such tokens.
    4. The number of single-space tokens in ``file2``.
    5. ``by words``: (3) divided by (4).
    6. ``by symbols``: the same ratio computed over whitespace-split tokens
       (``str.split()`` with no argument); ``nan`` when ``file2`` has no such
       tokens.

    Args:
        file1: Path of the first text file, read as UTF-8.
        file2: Path of the second text file, read as UTF-8.

    Returns:
        None. All results are written to standard output.
    """
    from nltk.metrics import distance as dst

    # Context managers close the files even if reading or a helper fails.
    with codecs.open(file1, 'r', 'utf-8') as f1:
        text1 = f1.read()
    t1 = text1.split()
    # NOTE(review): the return value was never used. The call is retained
    # because freqdict is defined elsewhere and may have side effects --
    # confirm and drop it if it is pure.
    freqdict(t1)

    with codecs.open(file2, 'r', 'utf-8') as f2:
        text2 = f2.read()
    print(dst.binary_distance(text1, text2))
    t2 = text2.split()
    freqdict(t2)  # see the note on the first freqdict call

    words1 = text1.split(' ')
    words2 = text2.split(' ')

    # Set lookups keep the scan linear; order and duplicates of the missing
    # tokens are unchanged because the result is still built from words1.
    known_words = set(words2)
    arr = [word for word in words1 if word not in known_words]
    for word in arr:
        print(word)
    print(len(arr))
    print(len(words2))
    # str.split(' ') never returns an empty list, so len(words2) >= 1 here.
    # The doubled space reproduces the separator the old two-argument
    # print statement emitted.
    print('by words:  %s' % (float(len(arr)) / len(words2)))

    known_tokens = set(t2)
    arr2 = [token for token in t1 if token not in known_tokens]
    # NOTE(review): despite the label, this compares whitespace-delimited
    # tokens, not individual characters.
    if t2:
        symbol_ratio = float(len(arr2)) / len(t2)
    else:
        # An empty or whitespace-only file2 has no tokens: the ratio is
        # undefined, so report nan instead of raising ZeroDivisionError.
        symbol_ratio = float('nan')
    print('by symbols:  %s' % symbol_ratio)
Пример #3
0
    def get_ngram_stats(row, n, qcolumns, char=False):
        """Return n-gram comparison statistics for two texts taken from ``row``.

        ``row[qcolumns[0]]`` and ``row[qcolumns[1]]`` are split on whitespace.
        N-grams of size ``n`` are built over the resulting tokens or, when
        ``char == True``, over the characters of those tokens joined together
        without separators.

        Returns a list with, in order:
            total n-grams in text 1, total n-grams in text 2, absolute
            difference of the totals, that difference normalised by the mean
            total, larger total, smaller total, distinct n-grams in text 1,
            distinct n-grams in text 2, absolute difference of the distinct
            counts, multiset-overlap ratio, distinct-overlap ratio, Jaccard
            distance, ``distance.binary_distance`` and
            ``distance.masi_distance`` of the two n-gram sets.

        When neither text yields an n-gram, the normalised difference, both
        overlap ratios and the MASI distance are ``-1`` and the Jaccard
        distance is ``1``.
        """
        # Equality with True (not plain truthiness) mirrors the established
        # behaviour for non-bool arguments.
        by_char = char == True  # noqa: E712

        units = []
        for position in (0, 1):
            pieces = row[qcolumns[position]].split()
            units.append(''.join(pieces) if by_char else pieces)

        grams_a = list(ngrams(units[0], n))
        grams_b = list(ngrams(units[1], n))
        distinct_a = set(grams_a)
        distinct_b = set(grams_b)

        total_a, total_b = len(grams_a), len(grams_b)
        combined_total = total_a + total_b
        length_gap = abs(total_a - total_b)
        larger = max(total_a, total_b)
        smaller = min(total_a, total_b)

        n_distinct_a, n_distinct_b = len(distinct_a), len(distinct_b)
        distinct_gap = abs(n_distinct_a - n_distinct_b)

        # Multiset intersection: each n-gram counted min(count_a, count_b) times.
        multiset_overlap = Counter(grams_a) & Counter(grams_b)
        n_shared = len(distinct_a & distinct_b)
        n_union = len(distinct_a | distinct_b)

        if combined_total != 0:
            gap_norm = length_gap / combined_total * 2
            overlap_ratio = sum(multiset_overlap.values()) / combined_total * 2
            distinct_overlap_ratio = n_shared / (n_distinct_a + n_distinct_b) * 2
            masi = distance.masi_distance(distinct_a, distinct_b)
        else:
            # No n-grams on either side: the ratios are undefined.
            gap_norm = -1
            overlap_ratio = -1
            distinct_overlap_ratio = -1
            masi = -1

        if n_union != 0:
            jaccard = (n_union - n_shared) / n_union
        else:
            jaccard = 1

        binary = distance.binary_distance(distinct_a, distinct_b)

        return [
            total_a, total_b, length_gap, gap_norm, larger, smaller,
            n_distinct_a, n_distinct_b, distinct_gap, overlap_ratio,
            distinct_overlap_ratio, jaccard, binary, masi,
        ]
Пример #4
0
    def get_ngram_stats(row, n, qcolumns, char=False, append=''):
        """Compute n-gram overlap statistics for the two texts held in ``row``.

        The texts are read from ``row[qcolumns[0]]`` and ``row[qcolumns[1]]``
        and split on whitespace. N-grams of size ``n`` are built over the
        resulting tokens or, when ``char`` is truthy, over the characters of
        the text with all whitespace removed.

        Args:
            row: Object indexable by the entries of ``qcolumns``; both
                selected values must provide ``.split()``.
            n: N-gram size, passed to ``ngrams``.
            qcolumns: Sequence whose first two items are the keys of the two
                texts in ``row``.
            char: Build character n-grams instead of token n-grams.
            append: Suffix appended to every key of the result.

        Returns:
            A ``pd.Series`` whose keys are ``<stat><n><append>`` for the stats
            ``q1_sum``, ``q2_sum``, ``diff``, ``diff_norm``, ``max``, ``min``,
            ``q1_uni``, ``q2_uni``, ``diff_uni``, ``intersect_r``,
            ``inter_uni_r``, ``jaccard_dist``, ``bin_dist`` and ``masi_dist``.
            When neither text yields an n-gram (for example both are shorter
            than ``n``), ``diff_norm``, ``intersect_r``, ``inter_uni_r`` and
            ``masi_dist`` are ``-1`` and ``jaccard_dist`` is ``1``.
        """
        if char:
            q1 = ''.join(row[qcolumns[0]].split())
            q2 = ''.join(row[qcolumns[1]].split())
        else:
            q1 = row[qcolumns[0]].split()
            q2 = row[qcolumns[1]].split()

        q1_ngram_list = list(ngrams(q1, n))
        q2_ngram_list = list(ngrams(q2, n))

        q1_ngram_set = set(q1_ngram_list)
        q2_ngram_set = set(q2_ngram_list)

        q1_sum = len(q1_ngram_list)
        q2_sum = len(q2_ngram_list)
        total = q1_sum + q2_sum

        diff = abs(q1_sum - q2_sum)
        maximum = max(q1_sum, q2_sum)
        minimum = min(q1_sum, q2_sum)

        q1_unique = len(q1_ngram_set)
        q2_unique = len(q2_ngram_set)
        diff_unique = abs(q1_unique - q2_unique)

        # Computed once and reused by the distinct-overlap ratio and Jaccard.
        shared_count = len(q1_ngram_set & q2_ngram_set)
        union_count = len(q1_ngram_set | q2_ngram_set)

        if total != 0:
            # At least one text produced n-grams, so every denominator below
            # (total, q1_unique + q2_unique, union_count) is non-zero.
            diff_norm = diff / total * 2
            # Multiset intersection: min(count in q1, count in q2) per n-gram.
            overlap = Counter(q1_ngram_list) & Counter(q2_ngram_list)
            intersect_r = sum(overlap.values()) / total * 2
            intersect_unique_r = shared_count / (q1_unique + q2_unique) * 2
            jaccard_dist = (union_count - shared_count) / union_count
            masi_dist = distance.masi_distance(q1_ngram_set, q2_ngram_set)
        else:
            # Both n-gram sets are empty. masi_distance divides by the size
            # of the union and would raise ZeroDivisionError here, so it gets
            # the same -1 "undefined" marker as the other ratios.
            diff_norm = -1
            intersect_r = -1
            intersect_unique_r = -1
            jaccard_dist = 1
            masi_dist = -1

        bin_dist = distance.binary_distance(q1_ngram_set, q2_ngram_set)

        listout = [
            q1_sum, q2_sum, diff, diff_norm, maximum, minimum, q1_unique,
            q2_unique, diff_unique, intersect_r, intersect_unique_r,
            jaccard_dist, bin_dist, masi_dist
        ]

        keys = [
            'q1_sum', 'q2_sum', 'diff', 'diff_norm', 'max', 'min', 'q1_uni',
            'q2_uni', 'diff_uni', 'intersect_r', 'inter_uni_r', 'jaccard_dist',
            'bin_dist', 'masi_dist'
        ]
        # Tag each statistic with the n-gram size and the caller's suffix so
        # results for several n / modes can share one frame.
        keys = [x + str(n) + append for x in keys]
        dictout = dict(zip(keys, listout))

        return pd.Series(dictout)