def comparison(file1, file2):
    """Print rough dissimilarity statistics for two UTF-8 text files.

    Output, in order:

    1. ``dst.binary_distance`` of the two raw file contents.
    2. Every token of ``file1`` (split on single spaces) that does not occur
       among the single-space tokens of ``file2``, one per line.
    3. The number of such tokens.
    4. The number of single-space tokens in ``file2``.
    5. ``'by words: '`` followed by (3) divided by (4).
    6. ``'by symbols: '`` followed by the same ratio computed on tokens
       obtained with whitespace splitting (``str.split()``); ``nan`` is
       printed when ``file2`` has no such tokens.

    Args:
        file1: Path of the first file, decoded as UTF-8.
        file2: Path of the second file, decoded as UTF-8.

    Returns:
        None. All results are printed.

    NOTE(review): this file defines ``comparison`` a second time further
    down; the later definition is the one that stays bound.
    NOTE(review): both ratios are computed over tokens, not characters, so
    the 'by symbols' label looks misleading - confirm the intent.
    """
    from nltk.metrics import distance as dst

    # Context managers close the handles even if reading/decoding fails.
    with codecs.open(file1, 'r', 'utf-8') as f1:
        text1 = f1.read()
    t1 = text1.split()
    # Result is not used here; the call is kept in case freqdict has
    # side effects - TODO confirm and drop if it is pure.
    freqdict(t1)

    with codecs.open(file2, 'r', 'utf-8') as f2:
        text2 = f2.read()
    # Every print below passes one parenthesised argument, so the output is
    # the same whether `print` is a statement (Python 2) or a function.
    print(dst.binary_distance(text1, text2))
    t2 = text2.split()
    freqdict(t2)

    words1 = text1.split(' ')
    words2 = text2.split(' ')

    # Set membership instead of scanning the list for every token.
    known_words = set(words2)
    arr = [word for word in words1 if word not in known_words]
    for word in arr:
        print(word)
    print(len(arr))
    print(len(words2))
    # str.split(' ') always yields at least one element, so len(words2) >= 1.
    print('%s %s' % ('by words: ', float(len(arr)) / len(words2)))

    known_tokens = set(t2)
    arr2 = [token for token in t1 if token not in known_tokens]
    # str.split() yields [] for empty/whitespace-only text; report nan
    # instead of raising ZeroDivisionError after everything else printed.
    if t2:
        symbol_ratio = float(len(arr2)) / len(t2)
    else:
        symbol_ratio = float('nan')
    print('%s %s' % ('by symbols: ', symbol_ratio))
def comparison(file1, file2):
    """Print rough dissimilarity statistics for two UTF-8 text files.

    Output, in order:

    1. ``dst.binary_distance`` of the two raw file contents.
    2. Every token of ``file1`` (split on single spaces) that does not occur
       among the single-space tokens of ``file2``, one per line.
    3. The number of such tokens.
    4. The number of single-space tokens in ``file2``.
    5. ``'by words: '`` followed by (3) divided by (4).
    6. ``'by symbols: '`` followed by the same ratio computed on tokens
       obtained with whitespace splitting (``str.split()``); ``nan`` is
       printed when ``file2`` has no such tokens.

    Args:
        file1: Path of the first file, decoded as UTF-8.
        file2: Path of the second file, decoded as UTF-8.

    Returns:
        None. All results are printed.

    NOTE(review): this re-defines the ``comparison`` declared earlier in
    this file and therefore replaces it.
    NOTE(review): both ratios are computed over tokens, not characters, so
    the 'by symbols' label looks misleading - confirm the intent.
    """
    from nltk.metrics import distance as dst

    # Context managers close the handles even if reading/decoding fails.
    with codecs.open(file1, 'r', 'utf-8') as f1:
        text1 = f1.read()
    t1 = text1.split()
    # Result is not used here; the call is kept in case freqdict has
    # side effects - TODO confirm and drop if it is pure.
    freqdict(t1)

    with codecs.open(file2, 'r', 'utf-8') as f2:
        text2 = f2.read()
    # Every print below passes one parenthesised argument, so the output is
    # the same whether `print` is a statement (Python 2) or a function.
    print(dst.binary_distance(text1, text2))
    t2 = text2.split()
    freqdict(t2)

    words1 = text1.split(' ')
    words2 = text2.split(' ')

    # Set membership instead of scanning the list for every token.
    known_words = set(words2)
    arr = [word for word in words1 if word not in known_words]
    for word in arr:
        print(word)
    print(len(arr))
    print(len(words2))
    # str.split(' ') always yields at least one element, so len(words2) >= 1.
    print('%s %s' % ('by words: ', float(len(arr)) / len(words2)))

    known_tokens = set(t2)
    arr2 = [token for token in t1 if token not in known_tokens]
    # str.split() yields [] for empty/whitespace-only text; report nan
    # instead of raising ZeroDivisionError after everything else printed.
    if t2:
        symbol_ratio = float(len(arr2)) / len(t2)
    else:
        symbol_ratio = float('nan')
    print('%s %s' % ('by symbols: ', symbol_ratio))
def get_ngram_stats(row, n, qcolumns, char=False):
    """Compute n-gram overlap statistics for two text fields of ``row``.

    The two texts are ``row[qcolumns[0]]`` and ``row[qcolumns[1]]``. With
    ``char`` equal to ``True`` the n-grams are taken over the text with all
    whitespace removed; otherwise over the whitespace-separated tokens.

    Args:
        row: Mapping-like object indexed by the entries of ``qcolumns``;
            both selected values must provide ``.split()``.
        n: N-gram order handed to ``ngrams``.
        qcolumns: Two keys selecting the texts to compare.
        char: Switch between character-level and token-level n-grams.

    Returns:
        A 14-element list, in this order: n-gram count of text 1, n-gram
        count of text 2, absolute count difference, normalised count
        difference, larger count, smaller count, distinct n-grams in
        text 1, distinct n-grams in text 2, absolute difference of the
        distinct counts, multiset overlap ratio, distinct overlap ratio,
        Jaccard distance, ``distance.binary_distance`` and
        ``distance.masi_distance`` of the two n-gram sets. The normalised
        difference, both overlap ratios and the MASI distance are ``-1``
        when neither text yields an n-gram; the Jaccard distance is ``1``
        in that case.
    """
    # Both modes start from the whitespace split; char mode glues the
    # pieces back together so n-grams run over characters.
    seq_a = row[qcolumns[0]].split()
    seq_b = row[qcolumns[1]].split()
    if char == True:
        seq_a = ''.join(seq_a)
        seq_b = ''.join(seq_b)

    grams_a = list(ngrams(seq_a, n))
    grams_b = list(ngrams(seq_b, n))
    distinct_a = set(grams_a)
    distinct_b = set(grams_b)

    count_a, count_b = len(grams_a), len(grams_b)
    total = count_a + count_b
    count_gap = abs(count_a - count_b)
    larger = max([count_a, count_b])
    smaller = min([count_a, count_b])

    n_distinct_a, n_distinct_b = len(distinct_a), len(distinct_b)
    distinct_gap = abs(n_distinct_a - n_distinct_b)

    shared = distinct_a & distinct_b
    combined = distinct_a | distinct_b

    if total != 0:
        gap_norm = count_gap / total * 2
        # Counter intersection keeps the minimum multiplicity per n-gram.
        multiset_overlap = Counter(grams_a) & Counter(grams_b)
        overlap_ratio = sum(multiset_overlap.values()) / total * 2
        distinct_overlap_ratio = (
            len(shared) / (n_distinct_a + n_distinct_b) * 2)
        masi = distance.masi_distance(distinct_a, distinct_b)
    else:
        # No n-grams at all: the ratios are undefined, flagged with -1.
        gap_norm = -1
        overlap_ratio = -1
        distinct_overlap_ratio = -1
        masi = -1

    if len(combined) != 0:
        jaccard = (len(combined) - len(shared)) / len(combined)
    else:
        jaccard = 1

    binary = distance.binary_distance(distinct_a, distinct_b)

    return [
        count_a, count_b, count_gap, gap_norm, larger, smaller,
        n_distinct_a, n_distinct_b, distinct_gap,
        overlap_ratio, distinct_overlap_ratio,
        jaccard, binary, masi,
    ]
def get_ngram_stats(row, n, qcolumns, char=False, append=''):
    """Compute n-gram overlap statistics for two text fields of ``row``.

    The two texts are ``row[qcolumns[0]]`` and ``row[qcolumns[1]]``. With
    ``char`` equal to ``True`` the n-grams are taken over the text with all
    whitespace removed; otherwise over the whitespace-separated tokens.

    Args:
        row: Mapping-like object indexed by the entries of ``qcolumns``;
            both selected values must provide ``.split()``.
        n: N-gram order handed to ``ngrams``; also embedded in the keys.
        qcolumns: Two keys selecting the texts to compare.
        char: Switch between character-level and token-level n-grams.
        append: Suffix added after ``n`` in every key of the result.

    Returns:
        ``pd.Series`` whose index entries are ``<name><n><append>`` for the
        names ``q1_sum``, ``q2_sum``, ``diff``, ``diff_norm``, ``max``,
        ``min``, ``q1_uni``, ``q2_uni``, ``diff_uni``, ``intersect_r``,
        ``inter_uni_r``, ``jaccard_dist``, ``bin_dist`` and ``masi_dist``.
        When neither text yields an n-gram, ``diff_norm``, ``intersect_r``,
        ``inter_uni_r`` and ``masi_dist`` are ``-1`` and ``jaccard_dist``
        is ``1``.
    """
    # Equality (not truthiness) is kept so non-bool flag values behave
    # exactly as before.
    if char == True:
        q1 = ''.join(row[qcolumns[0]].split())
        q2 = ''.join(row[qcolumns[1]].split())
    else:
        q1 = row[qcolumns[0]].split()
        q2 = row[qcolumns[1]].split()

    q1_ngram_list = list(ngrams(q1, n))
    q2_ngram_list = list(ngrams(q2, n))
    q1_ngram_set = set(q1_ngram_list)
    q2_ngram_set = set(q2_ngram_list)

    q1_sum = len(q1_ngram_list)
    q2_sum = len(q2_ngram_list)
    total = q1_sum + q2_sum
    diff = abs(q1_sum - q2_sum)
    maximum = max(q1_sum, q2_sum)
    minimum = min(q1_sum, q2_sum)

    q1_unique = len(q1_ngram_set)
    q2_unique = len(q2_ngram_set)
    diff_unique = abs(q1_unique - q2_unique)

    # Computed once and reused by the overlap ratio and Jaccard distance.
    common = q1_ngram_set & q2_ngram_set
    union_size = len(q1_ngram_set | q2_ngram_set)

    if total != 0:
        diff_norm = diff / total * 2
        # Counter intersection keeps the minimum multiplicity per n-gram.
        shared_counts = Counter(q1_ngram_list) & Counter(q2_ngram_list)
        intersect_r = sum(shared_counts.values()) / total * 2
        intersect_unique_r = len(common) / (q1_unique + q2_unique) * 2
        masi_dist = distance.masi_distance(q1_ngram_set, q2_ngram_set)
    else:
        # No n-grams at all. masi_distance divides by the size of the set
        # union and would raise ZeroDivisionError here, so it gets the same
        # -1 sentinel as the other undefined ratios.
        diff_norm = -1
        intersect_r = -1
        intersect_unique_r = -1
        masi_dist = -1

    if union_size != 0:
        jaccard_dist = (union_size - len(common)) / union_size
    else:
        jaccard_dist = 1

    bin_dist = distance.binary_distance(q1_ngram_set, q2_ngram_set)

    listout = [
        q1_sum, q2_sum, diff, diff_norm, maximum, minimum,
        q1_unique, q2_unique, diff_unique,
        intersect_r, intersect_unique_r,
        jaccard_dist, bin_dist, masi_dist,
    ]
    keys = [
        'q1_sum', 'q2_sum', 'diff', 'diff_norm', 'max', 'min',
        'q1_uni', 'q2_uni', 'diff_uni',
        'intersect_r', 'inter_uni_r',
        'jaccard_dist', 'bin_dist', 'masi_dist',
    ]
    keys = [x + str(n) + append for x in keys]
    return pd.Series(dict(zip(keys, listout)))