def query(self, ltoks, mingram, maxngram, nbest, sortByEDist, idx_tst):
    if idx_tst >= 0:  # single sentence
        print("[{}]\t{}".format(idx_tst, ' '.join(ltoks[0])))
    query_idx = []
    for toks in ltoks:
        query_idx.append(self.convert(toks))
    subphrases = self.getSubPhrase(query_idx, mingram, maxngram)
    counts = defaultdict(int)
    for subphrase in subphrases:
        result = self.getSentenceIds(subphrase)
        for idx_trn in result:
            counts[idx_trn] += 1
    if not sortByEDist:  ### sort by counts
        sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        for idx_trn, ngrams_count in sorted_counts[:nbest]:
            entry = []
            entry.append("{}".format(ngrams_count))  ### ngrams_counts
            trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn + 1] - 1]
            if idx_tst >= 0:  # single sentence
                sm = edit_distance.SequenceMatcher(a=query_idx[0], b=trn_vec_idx)
                entry.append("{:.4f}".format(sm.ratio()))  ### edit distance
            entry.append("{}".format(idx_tst))  ### index tst
            entry.append("{}".format(idx_trn))  ### index trn
            entry.append(' '.join(self.convert(trn_vec_idx)))  ### trn
            print('\t'.join(entry))
    else:  ### sort by edit distance
        if idx_tst == -1:
            sys.stderr.write('error: -testSet cannot be used with -sortByEDist')
            sys.exit()
        edist = {}
        mbest = nbest * 10  # rank the mbest count-sorted candidates by edit distance
        sorted_counts = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        for idx_trn, ngrams_count in sorted_counts[:mbest]:
            trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn + 1] - 1]
            sm = edit_distance.SequenceMatcher(a=query_idx[0], b=trn_vec_idx)
            edist[idx_trn] = sm.ratio()
        sorted_edist = sorted(edist.items(), key=lambda x: x[1], reverse=True)
        for idx_trn, ratio in sorted_edist[:nbest]:  # loop var renamed from `edist` to avoid shadowing the dict
            entry = []
            entry.append("{}".format(counts[idx_trn]))  ### ngrams_counts
            trn_vec_idx = self.corpus[self.sentences[idx_trn]:self.sentences[idx_trn + 1] - 1]
            entry.append("{:.4f}".format(ratio))  ### edit distance
            entry.append("{}".format(idx_tst))  ### index tst
            entry.append("{}".format(idx_trn))  ### index trn
            entry.append(' '.join(self.convert(trn_vec_idx)))  ### trn
            print('\t'.join(entry))
def evaluate_all(self):
    num_samples = len(self.all_recognition_text)

    def _normalize_text(text):
        # keep only alphanumerics, then lowercase
        text = ''.join(filter(lambda x: x in (string.digits + string.ascii_letters), text))
        return text.lower()

    num_correct = 0
    num_incorrect = 0
    total_edit_distance = 0
    incorrect_pairs = []
    for i in range(num_samples):
        recognition = _normalize_text(self.all_recognition_text[i])
        groundtruth = _normalize_text(self.all_groundtruth_text[i])
        if recognition == groundtruth:
            num_correct += 1
        else:
            num_incorrect += 1
            incorrect_pairs.append((recognition, groundtruth))
        sm = edit_distance.SequenceMatcher(a=recognition, b=groundtruth)
        normalized_ed = sm.distance() / len(groundtruth)
        total_edit_distance += normalized_ed

    num_print = min(len(incorrect_pairs), 100)
    # print('*** Groundtruth => Prediction ***')
    # for i in range(num_print):
    #     recognition, groundtruth = incorrect_pairs[i]
    #     print('{} => {}'.format(groundtruth, recognition))
    # print('**********************************')
    case_insensitive_accuracy = 1.0 * num_correct / (num_correct + num_incorrect)
    metrics = {
        'WordAccuracy': case_insensitive_accuracy,
        'TotalEditDistance': total_edit_distance,
    }
    return metrics
def target_sentence_sampling(s_cluster, candidates, edit_distance_matrix, sent2idx_dict):
    longest = -1
    longest_p = []
    longest_p_nearest_p = []
    for point in candidates:
        idx_p = sent2idx_dict[' '.join(point)]
        min_dist = 999999
        min_p = []
        for s_point in s_cluster:
            idx_sp = sent2idx_dict[' '.join(s_point)]
            if edit_distance_matrix[idx_p][idx_sp] >= 0:
                dist = edit_distance_matrix[idx_p][idx_sp]
            else:
                sm = edit_distance.SequenceMatcher(a=point, b=s_point)
                dist = sm.distance()
                # cache the symmetric distance for later lookups
                edit_distance_matrix[idx_p][idx_sp] = dist
                edit_distance_matrix[idx_sp][idx_p] = dist
            if dist < min_dist:
                min_p = s_point
            min_dist = min(dist, min_dist)
        if min_dist > longest:
            longest = min_dist
            longest_p = point
            longest_p_nearest_p = min_p
    return longest_p, longest, longest_p_nearest_p
def update_centers(data_set, assignments, edit_distance_matrix, sent2idx_dict):
    new_means = {}
    centers = []
    for assignment, point in zip(assignments, data_set):
        if assignment not in new_means:
            new_means[assignment] = [point]
        else:
            new_means[assignment].append(point)
    for center in new_means:
        points = new_means[center]
        shortest = 999999  # positive infinity
        shortest_p = []
        # the new center is the medoid: the point with minimal total
        # edit distance to the rest of its cluster
        for i, point in enumerate(points):
            total_dist = 0
            for j, point2 in enumerate(points):
                idx_p = sent2idx_dict[' '.join(point)]
                idx_p2 = sent2idx_dict[' '.join(point2)]
                if edit_distance_matrix[idx_p][idx_p2] >= 0:
                    dist = edit_distance_matrix[idx_p][idx_p2]
                else:
                    sm = edit_distance.SequenceMatcher(a=point, b=point2)
                    dist = sm.distance()
                    edit_distance_matrix[idx_p][idx_p2] = dist
                    edit_distance_matrix[idx_p2][idx_p] = dist
                total_dist += dist
            if total_dist < shortest:
                shortest = total_dist
                shortest_p = point
        centers.append(shortest_p)
    return centers
import edit_distance

def edrs(a, b):
    # Edit Distance with Real sequences
    a = a.tolist()
    b = b.tolist()
    sm = edit_distance.SequenceMatcher(a, b)
    return sm.distance()
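# Illustrative call (hedged; edrs expects array-likes with a .tolist()
# method, e.g. numpy arrays):
import numpy as np
print(edrs(np.array([1, 2, 3]), np.array([1, 3, 4])))  # -> 2 (two edits)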
def get_similar_words(input_word, num_of_words):
    global vectors, ids
    p = np.array([nlp.vocab[input_word].vector])
    closest_index = distance.cdist(p, vectors)
    output_list = []
    closest_indexes = closest_index.argsort()
    closest_indexes = np.squeeze(closest_indexes)
    closest_indexes = closest_indexes[0:105]
    for i in closest_indexes:
        word_id = ids[i]
        output_word = nlp.vocab[word_id]
        output_word = output_word.text.lower()
        sm = edit_distance.SequenceMatcher(input_word.lower(), output_word.lower())
        levin_dist = sm.distance()
        # keep only words that differ from the input by more than two edits
        if output_word.lower() != input_word.lower() and levin_dist > 2:
            output_list.append(output_word)
            if len(output_list) >= num_of_words:
                return output_list
    return output_list
def get_top_songs(artist):
    top_songs = []
    # might want to have this number be dynamic based on attribute, e.g. set time/popularity?
    song_count = 5
    artist_search = spotify.search(q='artist:' + artist, type='artist')
    if len(artist_search['artists']['items']) == 0:
        return []
    result_name = artist_search['artists']['items'][0]['name']
    # edit distance catches near-miss spellings; SequenceMatcher.ratio()
    # lies in [0, 1], so the original `>= 66` threshold could never fire --
    # 0.66 is the intended cutoff
    if (result_name.lower() == artist.lower()
            or edit_distance.SequenceMatcher(
                artist.lower(), result_name.lower()).ratio() >= 0.66):
        artist_id = artist_search['artists']['items'][0]['id']
        top_tracks = spotify.artist_top_tracks(artist_id)['tracks']
        for track in top_tracks:
            top_songs.append(track['id'])
            if len(top_songs) >= song_count:
                break
    else:
        print("error could not find artist spotify name {} scraped name {}".format(
            result_name, artist))
    return top_songs
def find_teachers(subject, target):
    with open(filename, 'r') as f:
        subjects_fo = json.load(f)
    teachers = list(subjects_fo[subject].keys())
    target = target.lower()
    # sort teachers by edit distance to the (lowercased) target name
    teachers.sort(key=lambda t: edit_distance.SequenceMatcher(
        a=target, b=t.lower()).distance())
    return teachers
def similarity(self, predicted: List[int], targets: torch.LongTensor,
               target_mask: torch.LongTensor) -> float:
    # remove the padding tokens
    actual_len = target_mask.sum()
    targets_trimmed = targets[:actual_len]
    targets_trimmed = list(targets_trimmed.cpu().data.numpy())
    sm = edit_distance.SequenceMatcher(a=predicted, b=targets_trimmed)
    # edit-distance similarity ratio between the two lists
    return sm.ratio()
def __call__(self, l1, l2):
    if (len(l1) == 1 and len(l1[0]) == 0) or (len(l2) == 1 and len(l2[0]) == 0):
        return 0.0, [''], ['']
    ### initially all words are discarded (replaced by self.u)
    L1 = [self.u] * len(l1)
    L2 = [self.u] * len(l2)
    if self.lc:  ### case-insensitive matching via .casefold()
        sm = edit_distance.SequenceMatcher(
            a=[s.casefold() for s in l1],
            b=[s.casefold() for s in l2],
            action_function=edit_distance.highest_match_action)
    else:
        sm = edit_distance.SequenceMatcher(
            a=l1, b=l2,
            action_function=edit_distance.highest_match_action)
    for (code, b1, e1, b2, e2) in sm.get_opcodes():
        if code == 'equal':  ### keep matched words
            L1[b1] = l1[b1]
            L2[b2] = l2[b2]
    return sm.ratio(), L1, L2
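# Hedged example of the filter above, assuming a hypothetical instance
# `filt` with filt.u == '_' and filt.lc == True: words matched by the
# case-folded alignment are kept, everything else becomes the placeholder.
ratio, kept1, kept2 = filt(['The', 'cat', 'sat'], ['the', 'dog', 'sat'])
# ratio -> 2 * 2 / (3 + 3) = 0.667 (two matches)
# kept1 -> ['The', '_', 'sat'], kept2 -> ['the', '_', 'sat']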
def get_sequence_matches(sequence_1, sequence_2):
    if sequence_1 and sequence_2:
        sm = edit_distance.SequenceMatcher(a=sequence_1, b=sequence_2)
        # only the match count is needed; the original's discarded calls to
        # get_opcodes/ratio/get_matching_blocks/distance are dropped here
        num_matches = sm.matches()
        return num_matches
    else:
        return 0
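# Quick check: three of the four characters line up.
print(get_sequence_matches("abcd", "abed"))  # -> 3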
def calc_global_alignment(file_f, file_s):
    opcodes_f = getOpcodeForFile(file_f)
    opcodes_s = getOpcodeForFile(file_s)
    # an earlier variant used Bio.pairwise2 global alignment here;
    # the edit-distance similarity ratio replaced it
    sm = edit_distance.SequenceMatcher(a=opcodes_f, b=opcodes_s)
    return sm.ratio()
def cal_edit(self, data0, data1):
    edit_result = []
    # compare only queries present in both result sets
    for query in data0.keys():
        if query in data1.keys():
            list_0 = data0[query]
            list_1 = data1[query]
            sm = edit_distance.SequenceMatcher(list_0, list_1)
            edit_score = sm.distance()
            edit_result.append(edit_score)
    return edit_result
def calcurate_edit_distance():
    ratio = 0.0
    ds = random.sample(test_data, 100)
    for fr, _, to in ds:
        result = model.translate([model.xp.array(fr)])[0]
        result = tree2normalizedsentense(result)
        to = tree2normalizedsentense(to)
        # accumulate 1 - similarity ratio as a distance
        ratio += 1.0 - edit_distance.SequenceMatcher(to, result).ratio()
    ratio /= len(ds)
    return ratio
def get_neg_candidates_edit_distance(candidate_term, terms):
    closest_term_dist = 5
    closest_term = ""
    for term in terms:
        sequence_matcher = edit_distance.SequenceMatcher(a=candidate_term, b=term)
        edit_dist = sequence_matcher.distance()
        # return the first distinct term within the distance threshold
        if edit_dist <= closest_term_dist and candidate_term != term:
            closest_term = term
            break
    return closest_term
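# Illustration: "neutral" is the first term within 5 edits of "neural"
# that is not "neural" itself.
print(get_neg_candidates_edit_distance("neural", ["neural", "neutral", "kernel"]))
# -> "neutral"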
def parallel_decoding(data):
    posteriors, true_length, text, hmm = data
    posteriors = posteriors[:true_length]
    best_path, pstar = hmm.viterbi_decode(posteriors)
    word_seq = hmm.getTranscription(best_path)
    ref_seq = text.split(' ')
    # edit distance between reference and decoded word sequences
    res = edit_distance.SequenceMatcher(a=ref_seq, b=word_seq)
    return word_seq, best_path, pstar, res.distance()
def edit_dist_with_repl_similarity(tx_numb, rx_numb, word2numb):
    """
    Aligns two sequences according to edit distance, then subtracts the
    similarity between replaced words from the edit distance. The Wu-Palmer
    similarity measure is used for this task.

    Args:
        tx_numb: the number representation of the tx sentence
        rx_numb: the number representation of the rx sentence
        word2numb: word-to-number object

    Returns:
        dist_measur: the distance measure
    """
    # get the word representation
    tx_txt = word2numb.convert_n2w(tx_numb)
    rx_txt = word2numb.convert_n2w(rx_numb)
    ed_aligned = edit_distance.SequenceMatcher(a=tx_numb, b=rx_numb)
    dist_measur = ed_aligned.distance()  # this is the plain edit distance
    indx_tx = 0
    indx_rx = 0
    # walk through insertions, deletions, and replacements in the alignment
    for i, op in enumerate(ed_aligned.get_opcodes()):
        if op[0] == 'equal':
            indx_tx += 1
            indx_rx += 1
            continue
        elif op[0] == 'replace':
            # for a replacement, discount the similarity of the swapped words
            tx_syn = get_synset(pos_tag([tx_txt[indx_tx]]))
            rx_syn = get_synset(pos_tag([rx_txt[indx_rx]]))
            sim = 0
            if (tx_syn is not None) and (rx_syn is not None):
                sim = tx_syn.wup_similarity(rx_syn)  # Wu-Palmer similarity measure
                if sim is None:
                    sim = 0
            dist_measur -= sim
            indx_tx += 1
            indx_rx += 1
        elif op[0] == 'delete':
            indx_tx += 1
        elif op[0] == 'insert':
            indx_rx += 1
        else:
            print("****************** ERROR ***************")
            break
    return dist_measur
def cal_distance(label_list, pre_list):
    y = ed.SequenceMatcher(a=label_list, b=pre_list)
    yy = y.get_opcodes()
    insert = 0
    delete = 0
    replace = 0
    # sum the span lengths of each opcode type
    for item in yy:
        if item[0] == 'insert':
            insert += item[-1] - item[-2]
        if item[0] == 'delete':
            delete += item[2] - item[1]
        if item[0] == 'replace':
            replace += item[-1] - item[-2]
    distance = insert + delete + replace
    return distance, (delete, replace, insert)
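# Hedged sanity check (assumes `import edit_distance as ed` as in the
# function above): "abc" -> "axcd" needs one replace (b -> x) and one
# insert (d).
dist, (dels, repls, ins) = cal_distance(list("abc"), list("axcd"))
# dist -> 2, (dels, repls, ins) -> (0, 1, 1)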
def compute_pdists_in_docs(docs, length_ratio_threshold=None):
    pdists = []
    n = len(docs)
    for j in range(n - 1):
        for i in range(j + 1, n):
            len_i = len(docs[i])
            len_j = len(docs[j])
            len_ratio = min(len_i, len_j) / max(len_i, len_j)
            if (length_ratio_threshold is not None
                    and len_ratio < length_ratio_threshold):
                # too dissimilar in length: assign the maximum distance
                ratio = 1.0
            else:
                sm = edit_distance.SequenceMatcher(a=docs[j], b=docs[i])
                ratio = sm.distance() * 2 / (len_j + len_i)
            pdists.append(ratio)
    return pdists
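# Small illustration over token lists (no length threshold, so every
# pair is compared directly):
docs = [["a", "b"], ["a", "c"], ["a", "b", "c"]]
print(compute_pdists_in_docs(docs))  # -> [0.5, 0.4, 0.4]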
def validation(model, val_fn, decode_fn, datagen, mb_size=64):
    """ Validation routine for speech-models
    Params:
        model (keras.model): Constructed keras model
        val_fn (theano.function): A theano function that calculates the cost
            over a validation set
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
    Returns:
        val_cost (float): Average validation cost over the whole validation set
        val_acc (float): Average validation accuracy
    """
    avg_cost = 0.0
    avg_acc = 0.0
    i = 0
    for batch in datagen.iterate_validation(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        texts = batch['texts']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        ctc_input_lens = ctc_input_length(model, input_lengths)
        prediction, ctc_cost = val_fn(
            [inputs, ctc_input_lens, labels, label_lengths, True])
        predict_str = argmax_decode(prediction, decode_fn, ctc_input_lens)
        avg_cost += ctc_cost.mean()
        print('predict_str:' + str(predict_str))
        print('texts:' + str(texts))
        acc_sum = 0
        for index, text in enumerate(texts):
            sm = edit_distance.SequenceMatcher(a=text, b=predict_str[index])
            acc = 1.0 - sm.distance() / len(text)
            acc_sum = acc_sum + acc
        avg_acc += acc_sum * 1.0 / len(texts)  # mean accuracy over the batch
        i += 1
    if i == 0:
        return 0.0, 0.0
    return avg_cost / i, avg_acc / i
def correctErrors(newStr, dic):
    # scan the string in 4-character codeword chunks
    for i in range(0, len(newStr), 4):
        str1 = newStr[i:i + 4]
        error = 1
        for j in range(8):
            if str1 == dic[str(j)]:
                error = 0
                break
        if error == 1:
            # replace the chunk with the codeword at minimum edit distance
            edit_dist = []
            for k in range(8):
                edit_dist.append(
                    edit_distance.SequenceMatcher(a=str1, b=dic[str(k)]).distance())
            min_index = edit_dist.index(min(edit_dist))
            newStr = newStr[:i] + dic[str(min_index)] + newStr[i + 4:]
    return newStr
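# Illustration with the 8-codeword dictionary sketched in the comment of
# codeBookGen below (assumes `import edit_distance`): "ATTG" is not a
# codeword and snaps to "ATTC", one edit away.
dic = {"0": "TCTA", "1": "ATTC", "2": "ACTA", "3": "ATTA",
       "4": "TATA", "5": "AATC", "6": "ACAA", "7": "TTTC"}
print(correctErrors("ATTGTATA", dic))  # -> "ATTCTATA"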
def test(model, test_fn, decode_fn, datagen, mb_size=16, conv_context=11,
         conv_border_mode='valid', conv_stride=2):
    """ Testing routine for speech-models
    Params:
        model (keras.model): Constructed keras model
        test_fn (theano.function): A theano function that calculates the cost
            over a test set
        datagen (DataGenerator)
        mb_size (int): Size of each minibatch
        conv_context (int): Convolution context
        conv_border_mode (str): Convolution border mode
        conv_stride (int): Convolution stride
    Returns:
        test_cost (float): Average test cost over the whole test set
    """
    acc_list = []
    for batch in datagen.iterate_test(mb_size):
        inputs = batch['x']
        labels = batch['y']
        input_lengths = batch['input_lengths']
        label_lengths = batch['label_lengths']
        ground_truth = batch['texts']
        # Due to convolution, the number of timesteps of the output
        # is different from the input length. Calculate the resulting
        # timesteps
        ctc_input_lens = ctc_input_length(model, input_lengths)
        prediction, ctc_cost = test_fn(
            [inputs, ctc_input_lens, labels, label_lengths, True])
        prediction_str = argmax_decode(prediction, decode_fn, ctc_input_lens)
        # inner loop var renamed from `i` to avoid shadowing in the original
        for idx, prediction in enumerate(prediction_str):
            truth = ground_truth[idx]
            sm = edit_distance.SequenceMatcher(a=truth, b=prediction)
            acc = 1 - sm.distance() / len(truth)
            acc_list.append(acc)
            print("Truth: {}, Prediction: {}, acc: {}".format(truth, prediction, acc))
    print(acc_list)
    print('avg_acc:' + str(np.array(acc_list).mean()))
    return ''
def EDAlignment(self):
    # this vector points to the corresponding src word for each tst word
    # (or -1 if there is no correspondence)
    self.tst2src = [-1] * len(self.tst)
    self.src2tst = [-1] * len(self.src)
    sm = edit_distance.SequenceMatcher(self.tst, self.src)
    blocks = sm.get_matching_blocks()
    for block in blocks:
        self.tst2src[block[0]] = block[1]
        self.src2tst[block[1]] = block[0]
    if self.verbose:
        for x in range(len(self.tst2src)):
            if self.tst2src[x] != -1:
                print('TST2SRC [{}:{} {}:{}]'.format(
                    x, self.tst[x], self.tst2src[x], self.src[self.tst2src[x]]))
    return
def assign_points(data_points, centers, edit_distance_matrix, sent2idx_dict):
    assignments = []
    for point in data_points:
        shortest = 999999  # positive infinity
        shortest_index = 0
        for i, center in enumerate(centers):
            idx_p = sent2idx_dict[' '.join(point)]
            idx_c = sent2idx_dict[' '.join(center)]
            if edit_distance_matrix[idx_p][idx_c] >= 0:
                dist = edit_distance_matrix[idx_p][idx_c]
            else:
                sm = edit_distance.SequenceMatcher(a=point, b=center)
                dist = sm.distance()
                # cache the symmetric distance
                edit_distance_matrix[idx_p][idx_c] = dist
                edit_distance_matrix[idx_c][idx_p] = dist
            if dist < shortest:
                shortest = dist
                shortest_index = i
        assignments.append(shortest_index)
    return assignments
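# Minimal sketch of the memoised-distance pattern shared by assign_points,
# update_centers, and target_sentence_sampling: the matrix starts at -1
# and is filled lazily with symmetric edit distances.
points = [["a", "b"], ["a", "c"]]
centers = [["a", "b"]]
sent2idx = {"a b": 0, "a c": 1}
matrix = [[-1, -1], [-1, -1]]
print(assign_points(points, centers, matrix, sent2idx))  # -> [0, 0]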
def test(self, dataset):
    # init stats
    E, N = 0, 0
    with tqdm(total=len(dataset), bar_format=' {l_bar}{bar:30}{r_bar}') as pbar:
        for x, y, text, y_true_length in dataset.generator():
            posteriors = self.features_to_posteriors(x)
            # run Viterbi to get the recognized words
            best_path, pstar = self.hmm.viterbi_decode(posteriors)
            word_seq = self.hmm.getTranscription(best_path)
            # get the original text
            ref_seq = text.split(' ')
            # accumulate edit-distance errors and reference length
            res = edit_distance.SequenceMatcher(a=ref_seq, b=word_seq)
            E += res.distance()
            N += len(ref_seq)
            accuracy = (N - E) / N
            # update progress bar
            pbar.set_description(f'Test acc. {accuracy:.6f}')
            pbar.update(1)
    return accuracy
def _get_operation_counts(
        source_string: str, destination_string: str
) -> Tuple[int, int, int, int]:
    """
    Check how many edit operations (delete, insert, replace) are required to
    transform the source string into the destination string. The number of
    hits can be given by subtracting the number of deletes and substitutions
    from the total length of the source string.

    :param source_string: the source string to transform into the destination string
    :param destination_string: the destination to transform the source string into
    :return: a tuple of #hits, #substitutions, #deletions, #insertions
    """
    # editops = Levenshtein.editops(source_string, destination_string)
    editops = edit_distance.SequenceMatcher(
        a=source_string, b=destination_string).get_opcodes()

    substitutions = sum(1 if op[0] == "replace" else 0 for op in editops)
    deletions = sum(1 if op[0] == "delete" else 0 for op in editops)
    insertions = sum(1 if op[0] == "insert" else 0 for op in editops)
    hits = len(source_string) - (substitutions + deletions)

    return hits, substitutions, deletions, insertions
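# Worked example of the docstring's definition: "kitten" -> "sitting"
# takes 2 substitutions, 0 deletions, and 1 insertion, leaving 4 hits.
print(_get_operation_counts("kitten", "sitting"))  # -> (4, 2, 0, 1)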
adv_digit = attack_dir.parent.parent.parent.name[-1]
target_filename = attack_dir.parent.parent.parent.parent.name
target_speaker = '-'.join(target_filename.split("-")[:-1])
original_digit = target_filename[-2]

E, N = 0, 0
for test_filename, r in attack_res['test_res'].items():
    pred_word_seq, label_word_seq = r['pred_word_seq'], r['label_word_seq']
    label_word_seq = ' '.join([str(d) for d in tools.str_to_digits(label_word_seq.split())])
    pred_word_seq = ' '.join([str(d) for d in tools.str_to_digits(pred_word_seq.split())])
    # skip utterances from the target speaker; score the rest by edit distance
    if test_filename.startswith(target_speaker):
        continue
    else:
        res = edit_distance.SequenceMatcher(a=label_word_seq, b=pred_word_seq)
        E += res.distance()
        N += len(label_word_seq.split(" "))

speaker_E, speaker_N = 0, 0
speaker_target_file_num = 0
speaker_succeeded_targets = []
for test_filename, r in attack_res['speaker_res'].items():
    pred_word_seq, label_word_seq = r['pred_word_seq'], r['label_word_seq']
    label_word_seq = ' '.join([str(d) for d in tools.str_to_digits(label_word_seq.split())])
    pred_word_seq = ' '.join([str(d) for d in tools.str_to_digits(pred_word_seq.split())])
    if test_filename == target_filename:
        continue
    if test_filename.startswith(target_speaker):
# Evaluate BLEU, edit-distance ratio, and exact-match accuracy over
# five equal slices of the test set.
output = []
net_accuracy = []
edit_distance_output = []
for j in range(5):
    net_score = 0
    total_words = 0
    edit_distance_score = 0
    accuracy = 0.0
    for i in range(int(floor(j * len(a) / 5.0)), int(floor((j + 1) * len(a) / 5.0))):
        reference = [a[i]]
        candidate = s[i]
        score = sentence_bleu(reference, candidate)
        net_score += score
        total_words += 1
        sm = edit_distance.SequenceMatcher(a=a[i], b=s[i])
        edit_distance_score += sm.ratio()
        if a[i] == s[i][:-1]:
            accuracy += 1
    net_accuracy.append(accuracy / total_words)
    edit_distance_output.append(edit_distance_score / total_words)
    output.append(net_score / total_words)
print(output)
plt.plot([1, 2, 3, 4, 5], output, 'ro')
plt.axis([0, 6, 0.7, 0.8])
plt.xlabel('Test Set')
plt.ylabel('Bleu Score')
plt.show()
print(net_accuracy)
import edit_distance

def editDistance(s1, s2):
    return edit_distance.SequenceMatcher(a=s1, b=s2).distance()
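# Classic sanity check: "kitten" -> "sitting" is 3 edits.
assert editDistance("kitten", "sitting") == 3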
import random
import edit_distance

def codeBookGen():
    # dic = {"1":"ATTC","2":"ACTA","3":"ATTA","4":"TATA","5":"AATC","6":"ACAA","7":"TTTC","0":"TCTA"}
    # generate all possible sequences of length 4
    pool = []
    print("Making dictionary")
    for i in range(256):
        temp = format(i, '08b')
        str2 = ""
        for j in range(4):
            # map each 2-bit pair to a base
            x = temp[2 * j:2 * j + 2]
            if x == '00':
                str2 += 'A'
            elif x == '01':
                str2 += 'C'
            elif x == '10':
                str2 += 'G'
            else:
                str2 += 'T'
        pool.append(str2)
    # pool has been created; now test for repetitiveness
    pool1 = []
    for str2 in pool:
        myset = set()
        for i in range(16):
            ss = ""  # subsequence
            b = format(i, '04b')
            for j in range(4):
                if b[j] == '1':
                    ss += str2[j]
            myset.add(ss)
        ratio = len(myset) / 16
        if ratio > 0.75:
            pool1.append(str2)
    # Sequences with high repetitiveness have been removed.
    # Now remove ones with undesirable GC content (<40% or >60%):
    # pool2 = []
    # for str2 in pool1:
    #     countGC = str2.count('G') + str2.count('C')
    #     if countGC == 2:
    #         pool2.append(str2)
    pool2 = pool1  # REMOVE THIS IF GC CONTENT CONSTRAINT NEEDS TO BE INCLUDED
    no = 0
    while no != 8:
        # now apply the edit-distance constraint: greedily grow a codebook
        # in which every pair of codewords is at distance >= 3
        x = random.randint(0, len(pool2) - 1)
        codewords = [pool2[x]]
        pool3 = []
        temppool = pool2
        for i in range(len(pool2)):
            str1 = codewords[i]
            for str2 in temppool:
                dist = edit_distance.SequenceMatcher(a=str2, b=str1).distance()
                if dist >= 3:
                    pool3.append(str2)
            if len(pool3) == 0:
                break
            x = random.randint(0, len(pool3) - 1)
            codewords.append(pool3[x])
            temppool = pool3
            pool3 = []
        no = len(codewords)
    return codewords