def run402_01():
    """Print the longest match between two demo strings, with and without junk.

    ``SequenceMatcher.find_longest_match`` is run twice over the same pair of
    strings: first with no junk filter, then with spaces treated as junk. For
    each run the match coordinates and the matched slices are printed.
    """
    A = " abcd"
    B = "abcd abcd"

    def show_results(match):
        # A Match is a named tuple (a, b, size); unpack it for the slices.
        start_a, start_b, length = match
        print('a = {}'.format(match.a))
        print('b = {}'.format(match.b))
        print('size = {}'.format(match.size))
        print('A[a:a+size] = {!r}'.format(A[start_a:start_a + length]))
        print('B[b:b+size] = {!r}'.format(B[start_b:start_b + length]))

    print('A = {!r}'.format(A))
    print('B = {!r}'.format(B))

    scenarios = (
        ('\nWithout junk detection:', None),
        ('\nTreat spaces as junk:', lambda x: x == " "),
    )
    for heading, is_junk in scenarios:
        print(heading)
        matcher = SequenceMatcher(is_junk, A, B)
        show_results(matcher.find_longest_match(0, len(A), 0, len(B)))
def get_close_matches_indexes(word, possibilities, junk_seq=None, n=3, cutoff=0.5):
    """Return the possibilities that share the longest substring with ``word``.

    Each candidate is scored as ``longest common substring length / len(word)``.
    Candidates scoring at least ``cutoff`` are kept, ordered best first (ties
    keep their input order), and at most ``n`` of them are returned.

    Args:
        word: The sequence to look for.
        possibilities: Indexable collection of candidate sequences.
        junk_seq: Optional element (e.g. ``" "``) to treat as junk; elements of
            ``word`` equal to it cannot start a match.
        n: Maximum number of results; must be > 0.
        cutoff: Minimum score in ``[0.0, 1.0]``.

    Returns:
        A list of ``(possibility, score)`` tuples. Never padded: if fewer than
        ``n`` candidates reach the cutoff, the list is simply shorter.

    Raises:
        ValueError: If ``n`` or ``cutoff`` is out of range.
    """
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n, ))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff, ))
    if not word:
        # The score divides by len(word); an empty word cannot match anything.
        return []

    # To exclude a "junk" element (e.g. the space character), hand
    # SequenceMatcher a predicate such as ``lambda x: x == " "``.
    if junk_seq:
        matcher = SequenceMatcher(isjunk=lambda x: x == junk_seq)
    else:
        matcher = SequenceMatcher()
    # SequenceMatcher caches its analysis of seq2, so the fixed word goes there.
    matcher.set_seq2(word)

    scored = []
    for idx, candidate in enumerate(possibilities):
        matcher.set_seq1(candidate)
        longest = matcher.find_longest_match(0, len(candidate), 0, len(word))
        ratio = longest.size / len(word)
        if ratio >= cutoff:
            scored.append((idx, ratio))

    # Stable sort: equal scores keep the order in which they were found.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [(possibilities[idx], ratio) for idx, ratio in scored[:n]]
def markBIO(annotated_text):
    """Build a token -> BIO tag mapping for one annotated span of a text.

    ``annotated_text`` is indexed as ``(text, (span, label))``. The span is
    located in the text as the longest common substring (a single match only).
    Whitespace tokens of the text are then walked with a running character
    offset: a token whose offset lies inside the match writes ``B-<label>`` /
    ``I-<label>`` entries keyed by the tokens of the matched text, every other
    token is tagged ``'O'``.

    The match bounds and the final mapping are printed.

    Returns:
        dict mapping token string to tag. Repeated tokens share one entry.
    """
    sentence = annotated_text[0]
    raw_span, label = annotated_text[1][0], annotated_text[1][1]
    target = raw_span.strip()

    matcher = SequenceMatcher(None, sentence, target, autojunk=False)
    # Single match only
    hit = matcher.find_longest_match(0, len(sentence), 0, len(target))
    start, end = hit.a, hit.a + hit.size
    print("start: {}, end: {}".format(start, end))

    matched_text = sentence[start:end]
    matched_tokens = matched_text.split()

    tags = {}
    offset = 0
    for token in sentence.split():
        if start <= offset < end:
            if len(matched_tokens) > 1:
                tags[matched_tokens[0]] = 'B-' + label
                for inner in matched_tokens[1:]:
                    tags[inner] = 'I-' + label
            else:
                tags[matched_text] = 'B-' + label
        else:
            tags[token] = 'O'
        # +1 accounts for the single separator assumed between tokens.
        offset += len(token) + 1

    print(tags)
    return tags
def main(str1: str, str2: str) -> str:
    """Return the longest contiguous substring shared by ``str1`` and ``str2``.

    Returns the empty string when the two strings have nothing in common.
    """
    sequence_matcher = SequenceMatcher(None, str1, str2)
    longest_match = sequence_matcher.find_longest_match(
        0, len(str1), 0, len(str2))
    # Match.b is the start offset inside str2, not an end index into str1;
    # the matched run in str1 is [a, a + size).
    return str1[longest_match.a:longest_match.a + longest_match.size]
def _process_internal(self, sentence_list: List) -> List: print('up to know did not help a lot, due to wrong implementation, condenses too much!') # split sentences first to a and b sent_list_a = sentence_list[:len(sentence_list) // 2] sent_list_b = sentence_list[len(sentence_list) // 2:] new_sentence_list_a = [] new_sentence_list_b = [] total_shortend_sentences = [] for idx, (sent_a, sent_b) in enumerate(zip(sent_list_a, sent_list_b)): sent_a = [item for sublist in sent_a for item in sublist] sent_b = [item for sublist in sent_b for item in sublist] s = SequenceMatcher(None, sent_a, sent_b) m = s.find_longest_match(0, len(sent_a), 0, len(sent_b)) if m.size > self.ngrams_num: to_add_a = sent_a[:m.a] + sent_a[m.a+m.size:] to_add_b = sent_b[:m.b] + sent_b[m.b+m.size:] if len(to_add_a) <= self.n_min or len(to_add_b) <= self.n_min: to_add_a = sent_a to_add_b = sent_b else: total_shortend_sentences.append(idx) else: to_add_a = sent_a to_add_b = sent_b new_sentence_list_a.append([to_add_a]) new_sentence_list_b.append([to_add_b]) print(total_shortend_sentences) print(len(total_shortend_sentences)) return new_sentence_list_a + new_sentence_list_b
def is_acceptable_similar(name):
    """Return True when ``name`` is similar enough to any entry of ``df['names']``.

    Similarity to one entry is the length of the longest common substring as a
    percentage of ``len(name)``; it is acceptable when it reaches the
    module-level ``affordable_rate``.

    NOTE(review): the original referenced undefined names (``enumerable``,
    ``text``, ``comp``) and decided on a single comparison; "any entry is
    acceptable" is the assumed intent -- confirm against the caller.
    """
    if not name:
        # The percentage divides by len(name); an empty name matches nothing.
        return False
    for comp_name in df['names']:
        sequence = SequenceMatcher(None, name, comp_name)
        longest_match = sequence.find_longest_match(0, len(name), 0, len(comp_name))
        percent = (longest_match.size / len(name)) * 100
        if percent >= affordable_rate:
            return True
    return False
def diff(fp1, fp2):
    """Score how closely the lines of file ``fp2`` match those of file ``fp1``.

    The two files are read in lockstep (stopping at the shorter one). For each
    pair of lines the reward grows by:

    * 0.1 per character of the longest common block of the right-stripped lines;
    * 0.5 per position where the ``t2a`` codes are equal, and 0.2 per position
      where they differ but both characters belong to the same class
      (``alphas``, ``numbers``, ``space`` or ``symbols``).

    A length-difference penalty of 0.25 per extra code is then subtracted, with
    the running reward floored at 0.1.

    NOTE(review): ``t2a`` presumably turns a line into a sequence of integer
    character codes (its items are passed to ``chr``) -- confirm.
    NOTE(review): the penalty and the 0.1 floor are applied per line pair here;
    confirm this matches the intended layout.

    Returns:
        The accumulated reward as a number.
    """
    reward = 0
    with open(fp1) as f1, open(fp2) as f2:
        for line1, line2 in zip(f1, f2):
            # Find longest matching block
            stripped1 = line1.rstrip()
            stripped2 = line2.rstrip()
            s = SequenceMatcher(None, stripped1, stripped2)
            longest_matching_block = s.find_longest_match(
                0, len(stripped1), 0, len(stripped2))
            reward += longest_matching_block.size * 0.1

            # Find exact matching characters
            correct = 0
            line1 = t2a(line1)
            line2 = t2a(line2)
            for c, d in zip_longest(line1, line2):
                # Positions missing from the shorter line compare as code 0.
                if c is None:
                    c = 0
                if d is None:
                    d = 0
                if c == d:
                    correct += 5
                elif any(chr(c) in group and chr(d) in group
                         for group in (alphas, numbers, space, symbols)):
                    # Baseline credit for a wrong character of the right class.
                    correct += 2
            reward += correct * 0.1

            # Penalize extra characters
            penalty = abs(len(line2) - len(line1)) * 0.25
            if reward - penalty > 0.1:
                reward -= penalty
            else:
                reward = 0.1
    return reward
def lcs(str1, str2):
    """Return the longest contiguous run common to ``str1`` and ``str2``.

    The run is sliced out of ``str1``; ``''`` is returned when the two
    sequences share nothing.
    """
    start, _, length = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    if length <= 0:
        return ''
    return str1[start:start + length]
def longest_sequence_match_with_query(self, query, passage):
    """Return the length, in words, of the longest word run shared by both texts.

    Both texts are lowercased and split on whitespace; full stops are removed
    from the passage before splitting.

    Args:
        query: Query string.
        passage: Passage string.

    Returns:
        int: number of consecutive words in the longest common run.
    """
    query_tokens = query.lower().split()
    passage_tokens = passage.lower().replace(".", "").split()
    match = SequenceMatcher(None, query_tokens, passage_tokens)
    # The bounds must describe the token lists the matcher actually holds.
    # Counting tokens of the raw passage over-counts when a token consists
    # only of dots, which pushes the upper bound past the end of the list and
    # makes find_longest_match raise IndexError.
    matching_block = match.find_longest_match(
        0, len(query_tokens), 0, len(passage_tokens))
    return matching_block[2]
def get_best_match(first_name, possible_matches):
    """Pick the candidate that best matches ``first_name``.

    A candidate qualifies when it shares at least one character run with
    ``first_name`` and starts with the same character. Among qualifying
    candidates the one with the longest common substring wins; on a tie the
    later candidate wins.

    Args:
        first_name: The name to match.
        possible_matches: Iterable of candidate strings.

    Returns:
        The winning candidate, or ``""`` when none qualifies.
    """
    winner = ""
    winner_len = 0
    matcher = SequenceMatcher(None, first_name, '')

    for candidate in possible_matches:
        matcher.set_seq2(candidate)
        shared = matcher.find_longest_match(
            0, len(first_name), 0, len(candidate)).size
        # A zero-length match also means one side may be empty, so bail out
        # before indexing the first characters.
        if shared == 0:
            continue
        if candidate[0] == first_name[0] and shared >= winner_len:
            winner, winner_len = candidate, shared

    return winner
def longest_substring_mine(str1, str2):
    """Measure how much of two strings is covered by repeated common substrings.

    The longest common substring is found, printed, and removed (every
    occurrence) from both strings; this repeats while the stripped length of
    the last substring found is at least ``ALLOWED_MATCH_LEN``. The stripped
    lengths of all substrings found are summed.

    The two coverage ratios are printed before returning.

    Returns:
        ``(percent_of_str1, percent_of_str2)`` relative to the original
        lengths, or ``(0, 0)`` when either input is empty.
    """
    len1 = len(str1)
    len2 = len(str2)
    if len1 == 0 or len2 == 0:
        return 0, 0

    remaining1, remaining2 = str1, str2
    matched_total = 0
    last_len = ALLOWED_MATCH_LEN
    while last_len >= ALLOWED_MATCH_LEN:
        hit = SequenceMatcher(None, remaining1, remaining2).find_longest_match(
            0, len(remaining1), 0, len(remaining2))
        if hit.size == 0:
            # Nothing left in common; let the loop condition decide.
            last_len = 0
            continue
        longest = str(remaining1[hit.a:hit.a + hit.size])
        print(longest)
        last_len = len(longest.strip())
        matched_total += last_len
        remaining1 = remaining1.replace(longest, '')
        remaining2 = remaining2.replace(longest, '')

    print('str1 ratio', matched_total / len1)
    print('str2 ratio', matched_total / len2)
    return (matched_total * 100) / len1, (matched_total * 100) / len2
def longestSubstring(str1, str2):
    """Decide whether two strings are similar based on their longest common substring.

    Returns True when the strings are equal, when the longest common substring
    has at least 5 characters and sits at the start or end of both strings, or
    when it equals one of the strings entirely. Returns False otherwise,
    including when nothing is shared.
    """
    if str1 == str2:
        return True

    matcher = SequenceMatcher(None, str1, str2)
    # Result has the form Match(a=0, b=0, size=5).
    a_start, _, length = matcher.find_longest_match(0, len(str1), 0, len(str2))
    if length == 0:
        return False

    shared = str1[a_start:a_start + length]
    if len(shared) >= 5:
        anchored_in_1 = str1.startswith(shared) or str1.endswith(shared)
        anchored_in_2 = str2.startswith(shared) or str2.endswith(shared)
        if anchored_in_1 and anchored_in_2:
            return True
    return shared == str1 or shared == str2
def compute_similarity_sentence(self, dst, src):
    """Return the share of ``src`` covered by its longest common substring with ``dst``.

    Both strings are lowercased, stripped and passed through
    ``ViUtils.remove_accents`` before matching.

    Returns:
        ``0`` when nothing is shared, otherwise ``match size / len(src)``
        measured on the normalised ``src``.
    """
    lowered_dst = dst.lower()
    lowered_src = src.lower()
    target = str(ViUtils.remove_accents(lowered_dst.strip()))
    source = str(ViUtils.remove_accents(lowered_src.strip()))

    hit = SequenceMatcher(None, source, target).find_longest_match(
        0, len(source), 0, len(target))
    if hit.size == 0:
        return 0
    return hit.size / len(source)
def longestSubstring(str1,str2):
    """Print the longest common substring of the two strings.

    Prints ``I can't find it.`` when the strings share nothing. Returns None.
    """
    hit = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    if hit.size == 0:
        print ("I can't find it.")
        return
    print (str1[hit.a: hit.a + hit.size])
def match(str1, str2):
    """Return the part of ``str1`` that case-insensitively matches ``str2`` best.

    The comparison runs on lowercased copies with the space character treated
    as junk; the result is sliced from the original ``str1`` so its casing is
    preserved.

    NOTE(review): search bounds and the slice use the original strings while
    matching uses the lowercased ones; these only disagree if lowercasing
    changes a string's length.
    """
    lowered_a = str1.lower()
    lowered_b = str2.lower()
    matcher = SequenceMatcher(isjunk=lambda x: x in " ", a=lowered_a, b=lowered_b)
    hit = matcher.find_longest_match(0, len(str1), 0, len(str2))
    return str1[hit.a:hit.a + hit.size]
def common_longest(a, b):
    """Return the longest contiguous run shared by ``a`` and ``b``, or ``""``."""
    found = SequenceMatcher(None, a, b).find_longest_match(0, len(a), 0, len(b))
    if found.size == 0:
        return ""
    stop = found.a + found.size
    return a[found.a:stop]
def longest_Substring(s1, s2):
    """Return the longest common substring of ``s1`` and ``s2``.

    When the strings share nothing, the message
    ``'Longest common sub-string not present.'`` is returned instead.
    """
    matcher = SequenceMatcher(None, s1, s2)
    offset, _, span = matcher.find_longest_match(0, len(s1), 0, len(s2))
    return s1[offset:offset + span] if span != 0 else 'Longest common sub-string not present.'
def longestSubstring(str1,str2):
    """Return the case-insensitive common substring if it covers 2/3 of ``str1``.

    The strings are compared lowercased. When the longest common run is at
    least two thirds of ``len(str1)``, the corresponding slice of the original
    ``str1`` is returned; otherwise ``None``.
    """
    matcher = SequenceMatcher(None, str1.lower(), str2.lower())
    hit = matcher.find_longest_match(0, len(str1), 0, len(str2))
    threshold = len(str1)*2/3
    if hit.size < threshold:
        return None
    return str1[hit.a: hit.a + hit.size]
def lcs1(str1: str, str2: str) -> str:
    """Return the longest common substring of two strings using difflib.

    Background on the LCS problem:
    https://www.youtube.com/watch?v=HgUOWB0StNE
    """
    begin, _, width = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    # A zero-width match slices to the empty string.
    return str1[begin: begin + width]
def longestSubstringNormalized(str1, str2):
    """Return the longest common substring length relative to the mean string length.

    The comparison is case-insensitive. The result is
    ``match size / ((len(str1) + len(str2)) / 2)``, or ``0`` when the strings
    share nothing.
    """
    lowered1, lowered2 = str1.lower(), str2.lower()
    # Result has the form Match(a=0, b=0, size=5).
    hit = SequenceMatcher(None, lowered1, lowered2).find_longest_match(
        0, len(lowered1), 0, len(lowered2))
    if hit.size == 0:
        return 0
    mean_len = (len(lowered1) + len(lowered2)) / 2
    return hit.size / mean_len
def grade_memo(self, memo_text):
    """Score a memo against the reference text, slide embedding and keywords.

    The score is
    ``(len(memo) - longest common substring with self.text)
    * semantic similarity * keyword factor`` where

    * semantic similarity is the cosine similarity between the memo embedding
      (``self.bert_model.encode``) and ``self.slides_embedding``, replaced by
      0.1 when it is not positive;
    * keyword factor is ``0.8 + matched keywords / total keywords``, using the
      ``jieba`` tokens of the memo.

    Returns:
        The score as a scalar.
    """
    # get length
    length = len(memo_text)

    # get longest common sequence
    seqMatch = SequenceMatcher(None, self.text, memo_text)
    match = seqMatch.find_longest_match(0, len(self.text), 0, len(memo_text))
    lcs = match.size

    # get semantic similarity
    memo_embedding = self.bert_model.encode([memo_text])
    # Take the single entry of the similarity matrix up front. Clamping used to
    # replace the whole matrix with a bare float, after which indexing the
    # final score with [0][0] raised TypeError.
    semantic_similarity = cosine_similarity(
        memo_embedding, self.slides_embedding)[0][0]
    if semantic_similarity <= 0:
        semantic_similarity = 0.1

    # get keyword hit ratio
    tokens = jieba.cut(memo_text)
    hit_kw = set(tokens) & set(self.keywords)
    kw_hit_ratio = 0.8 + len(hit_kw) / len(self.keywords)

    return (length - lcs) * semantic_similarity * kw_hit_ratio
# class ShortAnswerGrader(object):
def _longest_match_size(str1, str2): """ find the longest matching block, and return the string length """ sq = SequenceMatcher(lambda x: x == " ", str1, str2) match = sq.find_longest_match(0, len(str1), 0, len(str2)) return match.size
def common(str1, str2):
    """Return the longest common substring of the inputs, or ``-1`` if none exists."""
    longest = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    has_overlap = longest.size != 0
    if not has_overlap:
        return -1
    return str1[longest.a:longest.a + longest.size]
def get_prefix(lis):
    """Return the substring shared by every string in ``lis``.

    Starting from the first string, the running result is repeatedly replaced
    by its longest common substring with the next string. Despite the name,
    the shared part is not required to sit at the start of the strings.

    Args:
        lis: Sequence of strings. It is not modified.

    Returns:
        The shared substring; ``""`` when ``lis`` is empty or nothing is shared.
    """
    if not lis:
        return ""
    # Read the first element instead of popping it, so the caller's list
    # keeps all of its items.
    shared = lis[0]
    for item in lis[1:]:
        hit = SequenceMatcher(None, shared, item).find_longest_match(
            0, len(shared), 0, len(item))
        shared = shared[hit.a:hit.a + hit.size]
    return shared
def score(definition1, definition2):
    """Score the overlap of two definitions by repeatedly removing common runs.

    Both definitions are tokenised with ``tokenize_and_preprocess``. The
    longest common token run is found, its squared length is added to the
    score, and the run is cut out of both token sequences; this repeats until
    no common token remains.

    Returns:
        The sum of squared run lengths.
    """
    total = 0
    tokens_a = tokenize_and_preprocess(definition1)
    tokens_b = tokenize_and_preprocess(definition2)

    while True:
        hit = SequenceMatcher(None, tokens_a, tokens_b).find_longest_match(
            0, len(tokens_a), 0, len(tokens_b))
        if hit.size == 0:
            return total
        total += hit.size**2
        # Drop the matched run from each side before searching again.
        end_a = hit.a + hit.size
        end_b = hit.b + hit.size
        tokens_a = tokens_a[:hit.a] + tokens_a[end_a:]
        tokens_b = tokens_b[:hit.b] + tokens_b[end_b:]
def longestSubstring(str1, str2):
    """Print the longest common substring, or a notice when there is none."""
    start, _, length = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    if length != 0:
        message = str1[start:start + length]
    else:
        message = 'No longest common sub-string found'
    print(message)
def predict(args):
    """
    Handle view commands.

    Loads every model named in ``args.models``, predicts each structure file
    in ``args.structures`` with every model, and prints the results as a table
    with one row per structure and one column per model. The leading text
    common to all model names is removed from the column headers.

    :param args: Args from command.
    """
    # Function-scope import so this block does not rely on a file-level one.
    from os.path import commonprefix

    model_headers = []
    output = []
    models = []
    prefix = ""
    for i, mn in enumerate(args.models):
        model = MEGNetModel.from_file(mn)
        models.append(model)
        # A longest-match search can land anywhere in the names; what is
        # needed here is the text the names share at their start.
        prefix = mn if i == 0 else commonprefix([prefix, mn])
        # NOTE(review): str.strip('log10') strips the characters l/o/g/1/0
        # from both ends, not the literal "log10" -- confirm this is intended.
        model_headers.append(
            f"{mn} ({model.metadata.get('unit', '').strip('log10')}")
    # Every model header starts with the shared prefix, so slice it off.
    # str.lstrip(prefix) would treat the prefix as a set of characters and
    # could also eat into the "Filename" header.
    headers = ["Filename"] + [h[len(prefix):] for h in model_headers]

    for fn in args.structures:
        structure = Structure.from_file(fn)
        row = [fn]
        for model in models:
            val = model.predict_structure(structure).ravel()
            # Models trained on log10 targets are converted back to linear.
            if "log10" in str(model.metadata.get("unit", "")):
                val = 10**val
            row.append(val)
        output.append(row)
    print(tabulate(output, headers=headers))
def longestCommonSubsequence(str1, str2):
    """Return the longest contiguous substring shared by the two strings.

    When the strings share nothing, a notice is printed and ``None`` is
    returned.
    """
    hit = SequenceMatcher(None, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    if hit.size == 0:
        print("No common substring between two strings.")
        return None
    return str1[hit.a:hit.a + hit.size]
def _longest_match_ratio(str1, str2):
    """Return the longest matching block length over the shorter string's length.

    Spaces are treated as junk while matching. The division is delegated to
    ``np_utils._try_divide``.
    """
    def _is_space(ch):
        return ch == " "

    _, _, size = SequenceMatcher(_is_space, str1, str2).find_longest_match(
        0, len(str1), 0, len(str2))
    shorter_len = min(len(str1), len(str2))
    return np_utils._try_divide(size, shorter_len)
def lcs(s1, s2):
    """Return the longest common substring of ``s1`` and ``s2`` as a ``str``.

    Returns ``""`` when the inputs share nothing.
    """
    from difflib import SequenceMatcher

    start, _, length = SequenceMatcher(None, s1, s2).find_longest_match(
        0, len(s1), 0, len(s2))
    if length == 0:
        return ""
    return str(s1[start:start + length])
def colorize_match(t, field, matches=None):
    """Return ``t[field]`` with the part matching ``matches[field]`` in green.

    The highlighted part is the longest common substring between the field
    value and the match text; it is wrapped with ``TERM.green``.

    Returns:
        The decorated string, or ``None`` when the field value is empty.
    """
    value = t[field]
    if not value:
        return None

    wanted = ''
    if matches:
        wanted = matches.get(field, '')

    finder = SequenceMatcher(None, value, wanted)
    start, _, length = finder.find_longest_match(0, len(value), 0, len(wanted))
    stop = start + length
    return (value[:start] + TERM.green(value[start:stop]) + value[stop:])
def find_match(question, doc):
    """Find where ``question`` occurs in ``doc``, ignoring case.

    Returns the longest match as a named tuple ``(a, b, size)``, where ``a``
    is the offset in ``doc`` and ``b`` the offset in ``question``.
    """
    needle = question.lower()
    haystack = doc.lower()
    matcher = SequenceMatcher(a=haystack, b=needle)
    return matcher.find_longest_match(0, len(haystack), 0, len(needle))
def match_query(self, query_obj):
    """Checks whether this contact matches the given query

    For every query field the best ratio ``longest common run / len(query
    value)`` over the contact's values for that field is taken; the overall
    result is the product of those ratios.

    NOTE(review): an empty query value makes the ratio divide by zero.

    @param query_obj Dict containing key/value pairs of the required matches
    @return Accuracy of the match, ranging from 0.0 (no match) to 1.0 (complete match)"""
    overall_match = 1.0
    matcher = SequenceMatcher()

    for field_name in query_obj.keys():
        # Fields starting with "_" are only meaningful to the parser
        if field_name[:1] == "_":
            continue

        wanted = query_obj[field_name]
        wanted_len = len(wanted)
        best_for_field = 0.0

        # seq2 is cached inside the matcher, so the query value goes there
        matcher.set_seq2(wanted)

        try:
            for field_id in self._field_idx[field_name]:
                # A field is (Key,Value,Comp_Value,Source); prefer the
                # comparison value and fall back to the real value
                entry = self._fields[field_id]
                candidate = entry[2]
                if not candidate:
                    candidate = self._fields[field_id][1]

                matcher.set_seq1(candidate)
                found = matcher.find_longest_match(0, len(candidate), 0, wanted_len)
                field_match = float(found[2]) / wanted_len
                best_for_field = max(best_for_field, field_match)

                syslog(LOG_DEBUG, "Contacts: Field match for %s / %s: %f" % (candidate, wanted, field_match))
        except KeyError:
            # No data for this query field, so this contact cannot match
            return 0.0

        # Multiply rather than average: a single non-matching field
        # *must* force the final value to 0.0
        overall_match *= best_for_field

        # Stop comparing if there is too little similarity
        if overall_match < _MIN_MATCH_TRESHOLD:
            break

    return overall_match
def main(argv):
    # Reads the file named by argv[0]; every line must hold two
    # whitespace-separated strings (the unpacking below requires exactly two).
    # For each pair, the longest common block is found and the text following
    # it in each string is used to print either the common block itself or a
    # sequence of expanded ranges.
    # NOTE: this block uses Python 2 print statements.
    fname = argv[0]
    with open(fname, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        (string_1, string_2) = line.split()
        sequence_matcher = SequenceMatcher( None, string_1, string_2 )
        match = sequence_matcher.find_longest_match( 0, len(string_1), 0, len(string_2) )
        # Get next digit after the match to determine the regexp range
        start_digits = string_1[ match.a + match.size: None ]
        end_digits = string_2[ match.b + match.size: None ]
        # If the range begins with all zeros and ends with all nines
        # then include the entire range from the longest string match
        if re.match('^[0]+$', start_digits) and re.match('^[9]+$', end_digits):
            print string_1, string_2, string_1[match.a: match.a + match.size]
        # We need to generate custom ranges for everything else
        else:
            print 'Custom range START'
            try:
                # Take the leading part before the trailing zeros / nines;
                # re.search returns None when the pattern does not apply, and
                # the .group call then raises the AttributeError handled below.
                m = re.search('(.+?)[0]+$', start_digits)
                range_start = m.group(1)
                m = re.search('(.+?)[9]+$', end_digits)
                range_end = m.group(1)
                # Create ranges by appending the range to the longest match
                for r in range(int(range_start), int(range_end) + 1):
                    print string_1, string_2, string_1[match.a: match.a + match.size] + str(r), range_start, range_end
            except AttributeError:
                print 'FAILED', string_1, string_2, start_digits, end_digits
            finally:
                print 'Custom range END'
def diff(_str, candidate, term, as_error=False):
    """Render ``candidate`` coloured by how well it matches ``_str``.

    The longest case-insensitive common block is located in ``candidate``.
    Inside that block, each space-separated word is green when
    ``tags.is_match(word, _str)`` holds and yellow otherwise. The text before
    and after the block is left plain, or painted red when ``as_error`` is
    true.
    """
    matcher = SequenceMatcher(None, _str.lower(), candidate.lower())
    # The match is (offset in _str, offset in candidate, length).
    _, start, length = matcher.find_longest_match(0, len(_str), 0, len(candidate))
    stop = start + length
    head, body, tail = candidate[:start], candidate[start:stop], candidate[stop:]

    res = term.red(head) if as_error else head
    for w in body.split(' '):
        paint = term.green if tags.is_match(w, _str) else term.yellow
        res += paint(w) + ' '
    # The backspace cancels the space appended after the last word.
    res += '\b' + (term.red(tail) if as_error else tail)
    return res
def alignment_indices(template, primer):
    """Locate the longest common block of ``template`` and ``primer``.

    Inputs:
    =======
    - template, primer: (str)

    Returns ``(start, stop)`` on ``template`` followed by ``(start, stop)`` on
    ``primer``; both ranges have the same length.

    For the DNA case, the 5'->3' directionality of the two strings is assumed
    to be identical.
    """
    matcher = SequenceMatcher(None, template, primer)
    t_start, p_start, span = matcher.find_longest_match(
        0, len(template), 0, len(primer))
    template_range = (t_start, t_start + span)
    primer_range = (p_start, p_start + span)
    return template_range, primer_range
def find_longest_common_sequence(self, l, anywhere=False):
    """Reduce the candidates in ``l`` to the text they have in common.

    For each candidate, the text already present in the editor (from the
    candidate item's ``start_pos`` to the editor position) is looked up; when
    it is non-empty the candidate is cut to begin at that text. The running
    common text is then narrowed: with ``anywhere`` set and no typed text, to
    the longest block shared anywhere; otherwise via
    ``find_longest_match_at_start``.

    Returns:
        ``(common_text, insert_text)`` where ``insert_text`` is the common
        text minus the already-typed part. Both are ``""`` for an empty ``l``.
    """
    common_text = ""
    insert_text = ""
    for candidate in l:
        typed = self.editor.text()[self.items[candidate].start_pos:self.editor.pos]
        if typed:
            candidate = ''.join(candidate.partition(typed)[1:])

        if not common_text:
            common_text = candidate
        elif anywhere and not typed:
            hit = SequenceMatcher(None, common_text, candidate).find_longest_match(
                0, len(common_text), 0, len(candidate))
            common_text = candidate[hit.b:hit.b + hit.size]
        else:
            common_text = find_longest_match_at_start(common_text, candidate)

        insert_text = common_text[len(typed):]
        # Nothing in common any more; further candidates cannot help.
        if not common_text:
            break
    return common_text, insert_text
def longest_match_size(str1, str2):
    """Return the size of the longest block common to both strings.

    The space character is treated as junk while matching.
    """
    bounds = (0, len(str1), 0, len(str2))
    finder = SequenceMatcher(lambda x: x == " ", str1, str2)
    return finder.find_longest_match(*bounds)[2]
def remove_ref_punctuation(lines_list,grnd):
    """Strip punctuation from transcript lines and join them, trimming repeats.

    ``lines_list`` is the list of transcript file lines. ``grnd`` is only used
    for ``in`` membership tests: a suspected repetition is left in place when
    the repeated phrase occurs in ``grnd``.

    Returns the joined text, lowercased and stripped.

    NOTE(review): the statement nesting of this function was reconstructed
    from a whitespace-mangled source -- confirm against the original file.
    """
    # Patterns for the characters removed from every line: quotes, runs of
    # dots, commas, question marks and exclamation marks.
    reg123=['\'','\"','\.*','\,','\?','\!']
    reg_list=[]
    for r in reg123:
        reg_list.append(re.compile(r))
    paragraphs=lines_list
    filtered=[]
    for p in paragraphs:
        fil=p
        for regex in reg_list:
            fil=regex.sub('', fil)
        filtered.append(fil)
    # First pass: compare each non-empty line with the previous one and
    # collect the previous line in t, with its tail cut off when that tail is
    # repeated at the start of the current line.
    t=[]
    prev=''
    count=0
    for i in range(len(filtered)):
        f = filtered[i].lstrip().rstrip().rstrip('\n')
        if f == '':
            continue
        seq=SequenceMatcher(None,prev,f)
        a=seq.find_longest_match(0,len(prev),0,len(f))
        # a is (offset in prev, offset in f, length)
        x=a[0]
        y=a[1]
        size=a[2]
        str1=prev[x:x+size].lstrip().rstrip()
        str2=f[y:y+size].lstrip().rstrip()
        if y==0:
            # The match starts f; it is a repeat only if it also ends prev.
            if prev[x:x+size] == prev[-size:]:
                c=str1+' '+str2
                if c not in grnd:
                    # Drop the repeated tail plus the character before it.
                    t.append(prev[:-size-1])
                else:
                    t.append(prev)
            else:
                t.append(prev)
        else:
            t.append(prev)
        # NOTE(review): only prev is ever appended, so the final non-empty
        # line never reaches t -- confirm whether that is intended.
        prev=f
    # Not all repetition is removed the first time, because the longest match
    # isn't always at the beginning/end. The pass below catches some of the
    # smaller beginning/end repetitions in transcript lines, but not all.
    prev=''
    s=''
    count=0
    for i in range(len(t)):
        f = t[i].lstrip().rstrip()
        if f == '':
            continue
        ps=prev.split()
        fs=f.split()
        j=0
        w=fs[j]
        w=fs.pop(0)
        # Once a second word is appended, w contains a space and can no
        # longer be an element of ps, so this loop runs at most once.
        while w in ps and len(fs) !=0:
            w+=' '+fs[0]
        a=prev.find(w.strip())
        if a>=0:
            c=prev[a:]+' '+w
            # prev ends with w: cut the repeated phrase from the output so far
            if prev[a:].strip()==w.strip() and c not in grnd:
                s=s[:-len(w.strip())-1]
        s+=f+' '
        prev=f
    return s.lower().lstrip().rstrip()
from difflib import SequenceMatcher as SQMA

# Script: reads a file listing (one entry per line) named by the single
# command-line argument and, for every entry after the first, prints the part
# it shares with the first entry.
# NOTE: Python 2 syntax (print statements, xrange).
# NOTE(review): `sys` is used below but its import is not visible in this chunk.
argquan=len(sys.argv)
if argquan != 2:
    print "This script requires one argument: a file listing file"
    sys.exit(2)

with open(sys.argv[1]) as f:
    fl=f.read().splitlines()

flsz=len(fl)
f0sz=len(fl[0])
ssz=f0sz  # print width; grows to the longest entry seen so far
for i in xrange(1,flsz):
    fisz=len(fl[i])
    s = SQMA(None, fl[0], fl[i])
    # (offset in fl[0], offset in fl[i], match length)
    (ast, bst, csz)=s.find_longest_match(0, f0sz, 0, fisz)
    # print "*.s" % (fl[0], 3)
    # print ":.*".format(2, fl[0])
    if fisz > f0sz:
        ssz=fisz
        # width pads to ssz, precision truncates to the match length
        print '{:>{width}.{prec}}'.format(fl[0][ast:], width=ssz,prec=csz) # the > right aligns
        # print '{:>{width}.{prec}}'.format(fl[i][ast:], width=ssz,prec=csz)
        print "{}".format(fl[i])
    else:
        print '{:>{width}.{prec}}'.format(fl[0][ast:], width=ssz,prec=csz)
        # NOTE(review): fl[i] is sliced with ast (the offset into fl[0])
        # rather than bst -- confirm this is intended.
        print '{:>{width}.{prec}}'.format(fl[i][ast:], width=ssz,prec=csz)
# s = SQMA(None, " abcd", "abcd abcd")
# res=s.find_longest_match(0, 5, 0, 9)
# print res
def lcs(a, b):
    """Return the longest contiguous run shared by ``a`` and ``b``.

    The automatic junk heuristic is switched off, so frequent elements in
    long inputs are still matched.
    """
    finder = SequenceMatcher(None, a, b, False)
    start, _, length = finder.find_longest_match(0, len(a), 0, len(b))
    return a[start:start + length]
# This example is adapted from the source for difflib.py.
from difflib import SequenceMatcher


def show_results(match):
    """Print a match's coordinates and the slices of A and B it covers."""
    start_a, start_b, length = match
    report = [
        ' a = {}'.format(match.a),
        ' b = {}'.format(match.b),
        ' size = {}'.format(match.size),
        ' A[a:a+size] = {!r}'.format(A[start_a:start_a + length]),
        ' B[b:b+size] = {!r}'.format(B[start_b:start_b + length]),
    ]
    print('\n'.join(report))


A = " abcd"
B = "abcd abcd"

for shown in ('A = {!r}'.format(A), 'B = {!r}'.format(B)):
    print(shown)

# Run 1: every character may take part in a match.
print('\nWithout junk detection:')
s1 = SequenceMatcher(None, A, B)
match1 = s1.find_longest_match(0, len(A), 0, len(B))
show_results(match1)

# Run 2: spaces are junk, so a match cannot start on one.
print('\nTreat spaces as junk:')
s2 = SequenceMatcher(lambda x: x == " ", A, B)
match2 = s2.find_longest_match(0, len(A), 0, len(B))
show_results(match2)
def longest_match_ratio(str1, str2):
    """Return the longest common block length relative to the shorter string.

    Spaces count as junk during matching; the division itself is performed by
    ``MathUtil.try_divide``.
    """
    matcher = SequenceMatcher(lambda x: x == " ", str1, str2)
    longest = matcher.find_longest_match(0, len(str1), 0, len(str2))
    shorter_len = min(len(str1), len(str2))
    return MathUtil.try_divide(longest.size, shorter_len)
def longest_common_subseq(a, b):
    """Return the length of the longest contiguous run shared by ``a`` and ``b``."""
    matcher = SequenceMatcher(None, a, b)
    _, _, size = matcher.find_longest_match(0, len(a), 0, len(b))
    return size
def lcs(cleanString1, cleanString2):
    """Return the size of the longest common substring of the two inputs."""
    bounds = (0, len(cleanString1), 0, len(cleanString2))
    return SequenceMatcher(None, cleanString1, cleanString2).find_longest_match(*bounds).size
def LongestCommonSubstringSize(S1, S2):
    """Return the longest common substring size of the cleaned inputs.

    Both inputs are passed through ``cleanString`` before matching.
    """
    first = cleanString(S1)
    second = cleanString(S2)
    _, _, size = SequenceMatcher(None, first, second).find_longest_match(
        0, len(first), 0, len(second))
    return size
def calculate(base, term):
    """Return the length of the longest substring common to ``base`` and ``term``."""
    longest = SequenceMatcher(a=base, b=term).find_longest_match(
        0, len(base), 0, len(term))
    # Index 2 of the (a, b, size) match tuple is the length.
    return longest[2]
# OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, # NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. # """Using the junk filter feature. This example is taken from the source for difflib.py. """ __version__ = "$Id$" #end_pymotw_header from difflib import SequenceMatcher A = " abcd" B = "abcd abcd" print 'A = "%s"' % A print 'B = "%s"' % B s = SequenceMatcher(None, A, B) i, j, k = s.find_longest_match(0, 5, 0, 9) print 'isjunk=None :', (i, j, k), '"%s"' % A[i:i+k], '"%s"' % B[j:j+k] s = SequenceMatcher(lambda x: x==" ", A, B) i, j, k = s.find_longest_match(0, 5, 0, 9) print 'isjunk=(x==" ") :', (i, j, k), '"%s"' % A[i:i+k], '"%s"' % B[j:j+k]
def rf(inputLevelWithBFM, targetLevelName):
    """Train two random-forest models for label mapping and pickle them.

    Stage 1 fits a classifier that flags 'ALL OTHER' / 'ALL_OTHER' labels,
    using longest-common-substring dummies plus length / digit /
    special-character features. Stage 2 fits a classifier on the rows
    predicted as non-'All Other', using string-similarity features against
    the target labels.

    Args:
        inputLevelWithBFM: DataFrame with at least the columns 'inputLabel',
            'manualMappedLabel' and 'bfmMappedLabel'. Only the first 50 rows
            are used.
        targetLevelName: object exposing a ``targetLevel`` column; its unique
            values are the candidate target labels.

    Returns:
        None. The function seeds NumPy's global RNG, prints the second
        classifier, and pickles ``[features1, features2, forest,
        trainDataNAORF]`` to ``modelPath + '/' + Client + Category + 'model'``
        (``modelPath``, ``Client`` and ``Category`` are read from the
        enclosing module scope).
    """
    ## Read input datasets - input labels and target labels
    targetLevelName = pd.DataFrame({'targetLevel': pd.unique(targetLevelName.targetLevel)})

    ## Only the first 50 rows are used
    inputLevelWithBFM = inputLevelWithBFM[:50]

    ##########################################################################################
    ###### Predicting 'All Other' versus Non-All Other
    ##########################################################################################

    ## Define Train and Test data. Train data holds randomly chosen 70% records
    np.random.seed([101])
    trainData = inputLevelWithBFM.sample(frac=.7, replace=False)

    ## Store the data for prediction
    # NOTE(review): this is an alias of the 50-row slice, not a copy, so the
    # column assignments below write into that slice.
    predDataAllOther = inputLevelWithBFM

    ################# Prepare TRAIN data #####
    ## Train Data - Pre-processing
    inputLabel_df = pd.DataFrame(trainData.loc[:, 'inputLabel'])
    inputLabel_df.loc[:, 'key'] = 0

    ## Build every ordered pair of distinct input labels (cross join on a
    ## constant key) for which the longest common substring is needed
    labelCompare = pd.merge(inputLabel_df, inputLabel_df, how='outer', on='key').drop(['key'], axis=1)
    labelCompare.columns = ['inputLabel', 'inputLabel_compare']
    labelCompare = labelCompare.drop(
        labelCompare[labelCompare['inputLabel'] == labelCompare['inputLabel_compare']].index)
    labelCompare.loc[:, 'inputNoSplChr'] = labelCompare.loc[:, 'inputLabel'].str.replace('[^0-9A-Za-z]', '')
    labelCompare.loc[:, 'inputCompareNoSplChr'] = labelCompare.loc[:, 'inputLabel_compare'].str.replace(
        '[^0-9A-Za-z]', '')

    ## Collect, for each input label, the longest common substrings (longer
    ## than 3 characters) it shares with the other labels
    labelWithLCS = pd.DataFrame()
    labelCompareLength = len(labelCompare)

    for i in range(0, (labelCompareLength)):
        a = pd.DataFrame(labelCompare.iloc[i, [2]])
        a = pd.DataFrame.to_string(a, header=False, index=False)[1:]
        b = pd.DataFrame(labelCompare.iloc[i, [3]])
        b = pd.DataFrame.to_string(b, header=False, index=False)[1:]
        c = list(labelCompare.iloc[i, [0]])
        s = SequenceMatcher(None, a, b)
        result = s.find_longest_match(0, len(a), 0, len(b))
        if result.size > 3:
            lcs = a[(result.a):(result.a + result.size)]
            labelWithLCS_tmp = pd.DataFrame({'longestCommonString': lcs, 'inputLabel': c})
            labelWithLCS = labelWithLCS.append(labelWithLCS_tmp, ignore_index=True)

    ## Create dummy variables for all longest common substring categorical variables
    dummies = pd.get_dummies(labelWithLCS.loc[:, 'longestCommonString'])
    labelWithLCS_withdummy = pd.concat([labelWithLCS, dummies], axis=1)
    labelWithLCS_withdummy = labelWithLCS_withdummy.drop(['longestCommonString'], axis=1)
    labelWithLCSVariable = labelWithLCS_withdummy.groupby('inputLabel').max().reset_index()

    ## Prepare the train data for random forest model
    ## Define the class flag for 'ALL OTHER' and 'ALL_OTHER'
    trainDataWithLCS = pd.merge(labelWithLCSVariable, trainData, on='inputLabel', how='inner')
    trainDataWithLCS['inputLength'] = map(len, trainDataWithLCS['inputLabel'])
    trainDataWithLCS['numPresence'] = trainDataWithLCS['inputLabel'].str.contains(r'\d', na=False).astype(float)
    trainDataWithLCS['splChrPresence'] = trainDataWithLCS['inputLabel'].str.replace(' ', '').str.contains(
        '[^0-9A-Za-z]', na=False).astype(float)
    trainDataWithLCS.loc[:, 'AllOtherFlag'] = trainDataWithLCS.loc[:, 'manualMappedLabel'].apply(
        lambda x: 1 if x in ["ALL OTHER", "ALL_OTHER"] else 0)
    trainDataWithLCS = trainDataWithLCS.drop(['inputLabel', 'bfmMappedLabel', 'manualMappedLabel'], axis=1)

    ## Fit a random forest model; class imbalance is handled through
    ## class_weight="balanced". Every column except the last (the class
    ## flag) is used as a feature.
    features1 = trainDataWithLCS.columns[:-1]
    y, _ = pd.factorize(trainDataWithLCS.loc[:, 'AllOtherFlag'], sort=True)
    forest = RandomForestClassifier(n_estimators=1000, class_weight="balanced")
    forest.fit(trainDataWithLCS[features1], y)

    ############### Prepare TEST data / Prediction data
    ## Remove special characters from input labels
    predDataAllOther.loc[:, 'inputNoSplChr'] = predDataAllOther.loc[:, 'inputLabel'].str.replace(
        '[^0-9A-Za-z]', '')

    ## Define the class variable and make it as categorical/factor
    predDataAllOther.loc[:, 'AllOtherFlag'] = predDataAllOther.loc[:, 'manualMappedLabel'].apply(
        lambda x: 1 if x in ["ALL OTHER", "ALL_OTHER"] else 0)
    predDataAllOther.loc[:, 'AllOtherFlag'], _ = pd.factorize(predDataAllOther.loc[:, 'AllOtherFlag'], sort=True)

    ## Define longest common string dummies: one column per training feature,
    ## set when the feature name occurs in the cleaned label. The three
    ## non-substring features are overwritten with their real values below.
    impVars = features1
    for i in range(0, len(impVars)):
        dummyx = str(impVars[i])
        predDataAllOther[dummyx] = predDataAllOther['inputNoSplChr'].str.contains(dummyx, na=False).astype(float)

    ## Define input length, number presence flag in input, special character
    ## presence flag in input
    predDataAllOther['inputLength'] = map(len, predDataAllOther['inputLabel'])
    predDataAllOther['numPresence'] = predDataAllOther['inputLabel'].str.contains(r'\d', na=False).astype(float)
    predDataAllOther['splChrPresence'] = predDataAllOther['inputLabel'].str.replace(' ', '').str.contains(
        '[^0-9A-Za-z]', na=False).astype(float)

    ## Predict the class using the trained random forest model
    preds = forest.predict(predDataAllOther[impVars])
    predDataAllOther['predAllOtherFlag'] = preds
    predDataAllOther['shConfidence'] = 1 - forest.predict_proba(predDataAllOther[impVars])[:, 0]

    ## Define the mapped label as 'ALL OTHER' if the prediction class is 1
    predDataAllOther['shMappedLabel'] = predDataAllOther['predAllOtherFlag'].apply(
        lambda x: 'All OTHER' if x == 1 else 'Non All Other')

    ## Validate the results using confusion matrix
    # NOTE(review): the crosstab result is computed but never stored or shown.
    pd.crosstab(predDataAllOther['AllOtherFlag'], preds, rownames=['actual'], colnames=['preds'])

    ##########################################################################################
    ###### Predicting labels for all Non-All Other
    ##########################################################################################

    ## Extract data for only non-all other labels
    inputLevelNAOWithBFM = predDataAllOther.query('predAllOtherFlag == 0')[
        ['manualMappedLabel', 'inputLabel', 'bfmMappedLabel']]

    ## Define Train and Test data. Train data holds randomly chosen 70% records.
    trainData = inputLevelNAOWithBFM.sample(frac=.7, replace=False).reset_index()

    ##################################################################
    ##part2
    ##################################################################
    # NOTE(review): the query and sample below repeat the two statements just
    # above. They are kept because dropping either sample() call would change
    # the state of the seeded RNG and therefore which rows are drawn.
    inputLevelNAOWithBFM = predDataAllOther.query('predAllOtherFlag == 0')[
        ['manualMappedLabel', 'inputLabel', 'bfmMappedLabel']]
    trainData = inputLevelNAOWithBFM.sample(frac=.7, replace=False).reset_index()

    ############## Prepare the TRAIN data
    ## Create variable as input labels without any special character
    trainData['inputNoSplChr'] = trainData['inputLabel'].str.replace('[^0-9A-Za-z]', '')
    trainData = trainData.reset_index(drop=True)
    targetLevelName['targetNoSplChr'] = targetLevelName['targetLevel'].str.replace('[^0-9A-Za-z]', '')

    ## Similarity matrix: for every input label, one row per measure holding
    ## the target label with the maximum score for that measure
    similarMatrix = pd.DataFrame()

    lengthTrain = len(trainData)
    for i in range(0, lengthTrain):
        inputx = trainData.loc[i, 'inputNoSplChr']

        ## Measure Jaro-Winker similarity, equals to 1 - jw in R
        jarowinkerSim = targetLevelName['targetNoSplChr'].apply(
            lambda x: jf.jaro_winkler(unicode(inputx), unicode(x)))
        kmax = max(jarowinkerSim)
        targetmax = targetLevelName.loc[jarowinkerSim.idxmax(), 'targetNoSplChr']
        tempDF = pd.DataFrame({'targetNoSplChr': [targetmax]})
        tempDF['similarityScore'] = kmax
        tempDF['measures'] = 'JaroWinkler'
        tempDF['inputNoSplChr'] = inputx
        similarMatrix = similarMatrix.append(tempDF, ignore_index=True)

        ## Measure Damerau-Levenshtein similarity equals to dl in R
        # NOTE(review): this is an edit distance divided by the longer length,
        # so taking the maximum selects the target with the largest normalised
        # distance -- confirm that this is intended.
        dlDistance = targetLevelName['targetNoSplChr'].apply(
            lambda x: jf.damerau_levenshtein_distance(unicode(inputx), unicode(x)))
        longerLength = targetLevelName['targetNoSplChr'].apply(
            lambda x: max(len(inputx), len(x)))
        dlavenshteinSim = dlDistance / longerLength
        kmax = max(dlavenshteinSim)
        targetmax = targetLevelName.loc[dlavenshteinSim.idxmax(), 'targetNoSplChr']
        tempDF = pd.DataFrame({'targetNoSplChr': [targetmax]})
        tempDF['similarityScore'] = kmax
        tempDF['measures'] = 'DamerauLevenshtein'
        tempDF['inputNoSplChr'] = inputx
        similarMatrix = similarMatrix.append(tempDF, ignore_index=True)

        ## Measure Jaccard similarity equals to 1 - jaccard in R
        jaccardSim = targetLevelName['targetNoSplChr'].apply(lambda x: jaccard_similarity(inputx, x))
        # The score must come from the Jaccard series itself so that it
        # agrees with the target chosen by jaccardSim.idxmax() below.
        kmax = max(jaccardSim)
        targetmax = targetLevelName.loc[jaccardSim.idxmax(), 'targetNoSplChr']
        tempDF = pd.DataFrame({'targetNoSplChr': [targetmax]})
        tempDF['similarityScore'] = kmax
        tempDF['measures'] = 'Jaccard'
        tempDF['inputNoSplChr'] = inputx
        similarMatrix = similarMatrix.append(tempDF, ignore_index=True)

        # TODO: Cosine and LongestCommonSubstr measures are not implemented.

    similarMatrix = pd.merge(similarMatrix, targetLevelName, how='left', on='targetNoSplChr')
    similarMatrix = similarMatrix.rename(columns={'targetLevel': 'mappedLabel'})
    similarMatrix = similarMatrix.rename(columns={'targetNoSplChr': 'mappedNoSplChr'})

    ## Merge the similarity score with training data
    trainDataWithSimilarity = pd.merge(trainData, similarMatrix, how='inner', on='inputNoSplChr')

    ## Find the length of the longest common string (without the special
    ## characters) between the input label and the mapped label
    dataLength = len(trainDataWithSimilarity)
    for i in range(0, dataLength):
        a = trainDataWithSimilarity.loc[i, 'inputNoSplChr']
        b = trainDataWithSimilarity.loc[i, 'mappedNoSplChr']
        s = SequenceMatcher(None, a, b)
        result = s.find_longest_match(0, len(a), 0, len(b))
        trainDataWithSimilarity.loc[i, 'maxLengthOfLCS'] = result.size

    trainDataWithSimilarity['inputLength'] = map(len, trainDataWithSimilarity['inputNoSplChr'])
    trainDataWithSimilarity['lcsLengthRatio'] = (
        trainDataWithSimilarity['maxLengthOfLCS'] / trainDataWithSimilarity['inputLength'])

    ## Create similarity measure dummy variables
    dummies = pd.get_dummies(trainDataWithSimilarity['measures'])
    trainDataWithSimilarity = pd.concat([trainDataWithSimilarity, dummies], axis=1)

    ## Define the class variable: 1 when the mapped label equals the manually
    ## mapped label once spaces are removed, otherwise 0
    for i in range(0, len(trainDataWithSimilarity)):
        a = trainDataWithSimilarity.loc[i, 'mappedLabel'].replace(' ', '')
        b = trainDataWithSimilarity.loc[i, 'manualMappedLabel'].replace(' ', '')
        if a == b:
            trainDataWithSimilarity.loc[i, 'matchSimilarityClass'] = 1
        else:
            trainDataWithSimilarity.loc[i, 'matchSimilarityClass'] = 0

    ## Keep only the model inputs and the class column
    trainDataForVarImp = trainDataWithSimilarity.drop(
        ['measures', 'bfmMappedLabel', 'manualMappedLabel', 'inputLabel', 'inputNoSplChr',
         'mappedLabel', 'mappedNoSplChr', 'similarityScore', 'index'], axis=1)

    ## Fit the second random forest; every column except the last (the class
    ## column) is used as a feature
    features2 = trainDataForVarImp.columns[:-1]
    y, _ = pd.factorize(trainDataWithSimilarity.loc[:, 'matchSimilarityClass'], sort=True)
    trainDataNAORF = RandomForestClassifier(n_estimators=500)
    trainDataNAORF.fit(trainDataForVarImp[features2], y)
    print(trainDataNAORF)

    ## Persist both feature lists and both models together
    with open(modelPath + '/' + Client + Category + 'model', 'wb') as f:
        cPickle.dump([features1, features2, forest, trainDataNAORF], f)