Example No. 1
def run402_01():
    A = " abcd"
    B = "abcd abcd"

    def show_results(match):
        print('a       = {}'.format(match.a))
        print('b       = {}'.format(match.b))
        print('size    = {}'.format(match.size))

        i, j, k = match
        print('A[a:a+size] = {!r}'.format(A[i:i + k]))
        print('B[b:b+size] = {!r}'.format(B[j:j + k]))

    print('A = {!r}'.format(A))
    print('B = {!r}'.format(B))

    print('\nWithout junk detection:')
    s1 = SequenceMatcher(None, A, B)
    m1 = s1.find_longest_match(0, len(A), 0, len(B))
    show_results(m1)

    print('\nTreat spaces as junk:')
    s1 = SequenceMatcher(lambda x: x == " ", A, B)
    m1 = s1.find_longest_match(0, len(A), 0, len(B))
    show_results(m1)
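Calling run402_01() under Python 3 prints the following; note how the junk filter shifts the match off the leading space:

A = ' abcd'
B = 'abcd abcd'

Without junk detection:
a       = 0
b       = 4
size    = 5
A[a:a+size] = ' abcd'
B[b:b+size] = ' abcd'

Treat spaces as junk:
a       = 1
b       = 0
size    = 4
A[a:a+size] = 'abcd'
B[b:b+size] = 'abcd'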
Example No. 2
from difflib import SequenceMatcher

import numpy as np


def get_close_matches_indexes(word,
                              possibilities,
                              junk_seq=None,
                              n=3,
                              cutoff=0.5):
    if not n > 0:
        raise ValueError("n must be > 0: %r" % (n, ))
    if not 0.0 <= cutoff <= 1.0:
        raise ValueError("cutoff must be in [0.0, 1.0]: %r" % (cutoff, ))
    result_ratio = []
    result_idx = []
    # To exclude the "junk sequences" (here, the space), use SequenceMatcher(lambda x: x == " ")
    if junk_seq:
        s = SequenceMatcher(isjunk=lambda x: x == junk_seq)
    else:
        s = SequenceMatcher()
    s.set_seq2(word)

    for idx, x in enumerate(possibilities):
        s.set_seq1(x)

        longest_match = s.find_longest_match(0, len(x), 0, len(word))
        our_ratio = longest_match.size / len(word)
        if our_ratio >= cutoff:
            result_ratio.append(our_ratio)
            result_idx.append(idx)

    max_idx = []
    if result_ratio:
        for i in range(n):
            arg_max_idx = np.argmax(result_ratio)
            if result_ratio[arg_max_idx] < cutoff:
                # Fewer than n candidates cleared the cutoff; stop early
                break
            max_idx.append(
                (result_idx[arg_max_idx], result_ratio[arg_max_idx]))
            # Zero it out so the next iteration picks the next-largest ratio
            result_ratio[arg_max_idx] = 0

    # max_idx is a list of (match index, score) tuples:
    # [(match index, score), (match index, score), ...]
    final_result = []
    for i, s in max_idx:
        final_result.append((possibilities[i], s))
    return final_result
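A quick usage sketch (hypothetical word list):

# get_close_matches_indexes("appel", ["apple", "ape", "peach"], n=2)
# -> [("apple", 0.6)]
# ("apple" shares the 3-char block "app" with "appel", so 3/5 = 0.6;
#  "ape" and "peach" share only a 2-char block, 0.4 < cutoff)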
Example No. 3
def markBIO(annotated_text):
    text, span_n_label = annotated_text[0], annotated_text[1]
    span, label = span_n_label[0], span_n_label[1]

    span = span.strip()
    seq_match = SequenceMatcher(None, text, span, autojunk=False)
    match = seq_match.find_longest_match(0, len(text), 0, len(span))
    # Single match only
    start = match.a
    end = start + match.size
    print("start: {}, end: {}".format(start, end))

    temp_str = text[start:end]
    temp_str_tokens = temp_str.split()

    # Note: keying the dict by word means repeated words in the text collide.
    word_dict = {}
    pointer = 0
    for word in text.split():
        if pointer < start:
            word_dict[word] = 'O'
        elif start <= pointer < end:
            if len(temp_str_tokens) > 1:
                word_dict[temp_str_tokens[0]] = 'B-' + label
                for w in temp_str_tokens[1:]:
                    word_dict[w] = 'I-' + label
            else:
                word_dict[temp_str] = 'B-' + label
        else:
            word_dict[word] = 'O'
        pointer += (len(word) + 1)
    print(word_dict)
    return word_dict
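A quick usage sketch (hypothetical input shaped as (text, (span, label))):

# markBIO(("John lives in New York City", ("New York", "LOC")))
# -> {'John': 'O', 'lives': 'O', 'in': 'O',
#     'New': 'B-LOC', 'York': 'I-LOC', 'City': 'O'}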
Example No. 4
def main(str1: str, str2: str) -> str:

    sequence_matcher = SequenceMatcher(None, str1, str2)
    longest_match = sequence_matcher.find_longest_match(
        0, len(str1), 0, len(str2))

    # Match fields are (a, b, size); slice with a + size, not b.
    return str1[longest_match.a:longest_match.a + longest_match.size]
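A quick sanity check of the slicing (hypothetical strings):

# main("apple pie", "maple tree") -> "ple "
# (longest common block "ple ", at a=2 in str1 and b=2 in str2)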
Example No. 5
    def _process_internal(self, sentence_list: List) -> List:
        print('up to now this did not help a lot; due to the wrong implementation it condenses too much!')

        # split sentences first to a and b
        sent_list_a = sentence_list[:len(sentence_list) // 2]
        sent_list_b = sentence_list[len(sentence_list) // 2:]

        new_sentence_list_a = []
        new_sentence_list_b = []
        total_shortened_sentences = []
        for idx, (sent_a, sent_b) in enumerate(zip(sent_list_a, sent_list_b)):
            sent_a = [item for sublist in sent_a for item in sublist]
            sent_b = [item for sublist in sent_b for item in sublist]
            s = SequenceMatcher(None, sent_a, sent_b)
            m = s.find_longest_match(0, len(sent_a), 0, len(sent_b))
            if m.size > self.ngrams_num:
                to_add_a = sent_a[:m.a] + sent_a[m.a+m.size:]
                to_add_b = sent_b[:m.b] + sent_b[m.b+m.size:]
                if len(to_add_a) <= self.n_min or len(to_add_b) <= self.n_min:
                    to_add_a = sent_a
                    to_add_b = sent_b
                else:
                    total_shortened_sentences.append(idx)

            else:
                to_add_a = sent_a
                to_add_b = sent_b
            new_sentence_list_a.append([to_add_a])
            new_sentence_list_b.append([to_add_b])
        print(total_shortened_sentences)
        print(len(total_shortened_sentences))
        return new_sentence_list_a + new_sentence_list_b
Example No. 6
def is_acceptable_similar(name):
    # Assumes globals: df (with a 'names' column) and affordable_rate
    # (a percentage threshold).
    for comp_name in df['names']:
        sequence = SequenceMatcher(None, name, comp_name)
        longest_match = sequence.find_longest_match(0, len(name), 0,
                                                    len(comp_name))
        percent = (longest_match.size / len(name)) * 100
        if percent >= affordable_rate:
            return True
    return False
Example No. 7
def diff(fp1, fp2):
    reward = 0
    with open(fp1) as f1, open(fp2) as f2:
        for line1, line2 in zip(f1, f2):
            # Example scoring
            #line1 = 'ABC'
            #line2 = '1ABC' # score
            #line2 = 'A12' # score

            # Find longest matching block
            s = SequenceMatcher(None, line1.rstrip(), line2.rstrip())
            longest_matching_block = s.find_longest_match(
                0, len(line1.rstrip()), 0, len(line2.rstrip()))
            #            print(longest_matching_block)
            #            print(longest_matching_block.size)
            reward += longest_matching_block.size * 0.1

            # Find exact matching characters
            correct = 0
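            # t2a (defined elsewhere) presumably maps each line to character
            # codes, since chr() is applied to the elements below.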
            line1 = t2a(line1)
            line2 = t2a(line2)
            #shortest_length = min
            chars = zip_longest(line1, line2)
            for c, d in chars:
                #print(c, d)
                if c is None:
                    c = 0
                if d is None:
                    d = 0
                if c == d:
                    correct += 5
                else:
                    if chr(c) in alphas and chr(d) in alphas:
                        # Baseline
                        correct += 2

                        # Test distance from each other in the alphabet.
                        # Range [1, 3]
                        # indexa = alphas.index(chr(c))
                        # indexb = alphas.index(chr(d))
                        # ind_diff = abs(indexa - indexb)
                        # correct = (2/26) * ind_diff + 1
                    elif chr(c) in numbers and chr(d) in numbers:
                        correct += 2
                    elif chr(c) in space and chr(d) in space:
                        correct += 2
                    elif chr(c) in symbols and chr(d) in symbols:
                        correct += 2
            #print(correct)
            reward += correct * 0.1

            # Penalize extra characters
            penalty = abs(len(line2) - len(line1)) * 0.25
            if reward - penalty > 0.1:
                reward -= penalty
            else:
                reward = 0.1
    return reward
Example No. 8
def lcs(str1, str2):
    s = SequenceMatcher(None, str1, str2)
    m = s.find_longest_match(0, len(str1), 0, len(str2))
    if m.size > 0:
        return str1[m.a:m.a + m.size]
    else:
        return ''
Example No. 9
 def longest_sequence_match_with_query(self, query, passage):
     match = SequenceMatcher(None,
                             query.lower().split(),
                             passage.lower().replace(".", "").split())
     matching_block = match.find_longest_match(0, len(query.split()), 0,
                                               len(passage.split()))
     # The match is an (a, b, size) named tuple; return its size in tokens.
     return matching_block.size
Example No. 10
def get_best_match(first_name, possible_matches):
    """
    Takes in a string first_name and an array of strings possible_matches.

    Rules for determining the best match:
    - The first characters of the first name and possible match must match
    - the match should share the longest common substring

    Returns a string that is the best match to the name.
    """
    best_match = ""
    best_match_len = 0
    seqMatch = SequenceMatcher(None, first_name, '')

    # Loop through and find the longest substring match
    for pmatch in possible_matches:
        seqMatch.set_seq2(pmatch)
        gcs = seqMatch.find_longest_match(0, len(first_name), 0, len(pmatch))
        if (gcs.size == 0):
            continue
        elif (pmatch[0] == first_name[0] and gcs.size >= best_match_len):
            best_match = pmatch
            best_match_len = gcs.size

    return best_match
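A quick usage sketch (hypothetical names):

# get_best_match("jon", ["john", "jane", "jim"]) -> "john"
# ("john" shares the longest common substring "jo"; ties go to the
#  later candidate because of the >= comparison)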
Example No. 11
def longest_substring_mine(str1, str2):
    len1 = len(str1)
    len2 = len(str2)

    if len1 == 0 or len2 == 0:
        return 0, 0

    current_longest_match_len = ALLOWED_MATCH_LEN
    total_match_len = 0

    while current_longest_match_len >= ALLOWED_MATCH_LEN:
        seqMatch = SequenceMatcher(None, str1, str2)
        match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
        if match.size != 0:
            longest = str(str1[match.a:match.a + match.size])
            print(longest)
            current_longest_match_len = len(longest.strip())
            total_match_len += current_longest_match_len
            str1 = str1.replace(longest, '')
            str2 = str2.replace(longest, '')
        else:
            current_longest_match_len = 0

    print('str1 ratio', total_match_len / len1)
    print('str2 ratio', total_match_len / len2)

    return (total_match_len * 100) / len1, (total_match_len * 100) / len2
Example No. 12
def longestSubstring(str1, str2):
    LS = ""
    if str1 == str2:
        return True
    # initialize SequenceMatcher object with
    # input string
    seqMatch = SequenceMatcher(None, str1, str2)

    # find match of longest sub-string
    # output will be like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))

    # print longest substring
    if (match.size != 0):
        LS = str1[match.a:match.a + match.size]
    else:
        return False

    #if (str1.startswith(LS) and str2.startswith(LS) and len(LS) >= 5) or len(LS) >= 6 or ( LS == str1 or LS == str2):
    if len(LS) >= 5:
        if str1.startswith(LS) and str2.startswith(LS):
            return True
        if str1.startswith(LS) and str2.endswith(LS):
            return True
        if str1.endswith(LS) and str2.startswith(LS):
            return True
        if str1.endswith(LS) and str2.endswith(LS):
            return True
    if LS == str1 or LS == str2:
        return True
    return False
Example No. 13
 def compute_similarity_sentence(self, dst, src):
     dst, src = dst.lower(), src.lower()
     dst, src = str(ViUtils.remove_accents(dst.strip())), str(
         ViUtils.remove_accents(src.strip()))
     seq_match = SequenceMatcher(None, src, dst)
     match = seq_match.find_longest_match(0, len(src), 0, len(dst))
     return 0 if match.size == 0 else match.size / len(src)
Example No. 14
def longestSubstring(str1,str2): 
    seqMatch = SequenceMatcher(None,str1,str2) 
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) 
    if (match.size!=0): 
        print (str1[match.a: match.a + match.size]) 
    else: 
        print ("I can't find it.") 
Example No. 15
def match(str1, str2):
    matched = SequenceMatcher(isjunk=lambda x: x in " ",
                              a=str1.lower(),
                              b=str2.lower())
    pos1, pos2, size = matched.find_longest_match(0, len(str1), 0, len(str2))
    matched_string = str1[pos1:pos1 + size]
    return matched_string
Example No. 16
def common_longest(a, b):
    seqMatch = SequenceMatcher(None, a, b)
    match = seqMatch.find_longest_match(0, len(a), 0, len(b))
    if (match.size != 0):
        return a[match.a:match.a + match.size]
    else:
        return ""
Example No. 17
def longest_Substring(s1, s2):
    seq_match = SequenceMatcher(None, s1, s2)
    match = seq_match.find_longest_match(0, len(s1), 0, len(s2))
    if (match.size != 0):
        return (s1[match.a:match.a + match.size])
    else:
        return ('Longest common sub-string not present.')
Example No. 18
def longestSubstring(str1,str2): 
	seqMatch = SequenceMatcher(None,str1.lower(),str2.lower()) 
	match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
	if (match.size>=len(str1)*2/3):
		return (str1[match.a: match.a + match.size])
	else:
		return None
Example No. 19
def lcs1(str1: str, str2: str) -> str:
    """ difflib package solution """
    """ lcs problem explenation: https://www.youtube.com/watch?v=HgUOWB0StNE"""
    seqMatch = SequenceMatcher(None, str1, str2)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    res = str1[match.a: match.a + match.size]
    return res
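For example, with a classic pair of inputs:

# lcs1("GeeksforGeeks", "GeeksQuiz") -> "Geeks"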
Example No. 20
def longestSubstringNormalized(str1, str2):
    # initialize SequenceMatcher object with
    # input string

    str1 = str1.lower()
    str2 = str2.lower()

    seqMatch = SequenceMatcher(None, str1, str2)
    # find match of longest sub-string
    # output will be like Match(a=0, b=0, size=5)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    # print longest substring
    if match.size != 0:
        # Normalize the match length by the average length of the two strings
        avg_len = (len(str1) + len(str2)) / 2
        return match.size / avg_len

    return 0
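For instance, with two hypothetical strings:

# longestSubstringNormalized("hello", "help") -> 3 / ((5 + 4) / 2) = 0.666...
# (longest common block "hel", size 3)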
Example No. 21
    def grade_memo(self, memo_text):
        #get length
        length = len(memo_text) 

        #get longest common sequence
        seqMatch = SequenceMatcher(None, self.text, memo_text)
        match = seqMatch.find_longest_match(0, len(self.text), 0, len(memo_text))
        lcs = match.size 

        #get semantic similarity
        memo_embedding = self.bert_model.encode([memo_text])
        semantic_similarity = cosine_similarity(memo_embedding, self.slides_embedding)
        if semantic_similarity <= 0:
            semantic_similarity = 0.1

        #get keyword hit ratio
        tokens = jieba.cut(memo_text)
        hit_kw = list(set(tokens).intersection(set(self.keywords)))
        kw_hit_ratio = 0.8 + len(hit_kw) / len(self.keywords)

        memo_score = (length - lcs)*semantic_similarity*kw_hit_ratio
        return memo_score[0][0]


# class ShortAnswerGrader(object):
Example No. 22
def _longest_match_size(str1, str2):
    """
    find the longest matching block, and return the string length
    """
    sq = SequenceMatcher(lambda x: x == " ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return match.size
Example No. 23
def common(str1, str2):
    seqMatch = SequenceMatcher(None, str1, str2)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    if (match.size != 0):
        return str1[match.a:match.a + match.size]
    else:
        return -1
Example No. 24
def get_prefix(lis):
    first_el = lis.pop(0)
    for i in lis:
        match = SequenceMatcher(None, first_el, i)
        match = match.find_longest_match(0, len(first_el), 0, len(i))
        first_el = first_el[match.a:match.a + match.size]
    return first_el
Example No. 25
def score(definition1, definition2):
    result = 0

    definition1_tokens = tokenize_and_preprocess(definition1)

    definition2_tokens = tokenize_and_preprocess(definition2)

    while True:
        sequence_matcher = SequenceMatcher(None, definition1_tokens,
                                           definition2_tokens)
        match_results = sequence_matcher.find_longest_match(
            0, len(definition1_tokens), 0, len(definition2_tokens))

        if match_results.size == 0:
            break

        result += match_results.size**2

        definition1_tokens = (definition1_tokens[:match_results.a] +
                              definition1_tokens[match_results.a +
                                                 match_results.size:])
        definition2_tokens = (definition2_tokens[:match_results.b] +
                              definition2_tokens[match_results.b +
                                                 match_results.size:])

    return result
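A short worked example (assuming tokenize_and_preprocess simply splits on whitespace):

# score("a b c", "a b d c")
# pass 1: longest match ['a', 'b'] (size 2) -> result += 4
# pass 2: longest match ['c'] (size 1)      -> result += 1
# pass 3: nothing left to match -> returns 5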
Example No. 26
def longestSubstring(str1, str2):
    seqMatch = SequenceMatcher(None, str1, str2)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    if (match.size != 0):
        print(str1[match.a:match.a + match.size])
    else:
        print('No longest common sub-string found')
Example No. 27
def predict(args):
    """
    Handle view commands.

    :param args: Args from command.
    """
    headers = ["Filename"]
    output = []
    models = []
    prefix = ""
    for i, mn in enumerate(args.models):
        model = MEGNetModel.from_file(mn)
        models.append(model)
        if i == 0:
            prefix = mn
        else:
            sm = SequenceMatcher(None, prefix, mn)
            match = sm.find_longest_match(0, len(prefix), 0, len(mn))
            prefix = prefix[match.a:match.a + match.size]
        headers.append(
            f"{mn} ({model.metadata.get('unit', '').strip('log10')})")
    # str.lstrip(prefix) would treat prefix as a character set; strip the
    # literal common prefix instead.
    headers = [h[len(prefix):] if h.startswith(prefix) else h for h in headers]

    for fn in args.structures:
        structure = Structure.from_file(fn)
        row = [fn]
        for model in models:
            val = model.predict_structure(structure).ravel()
            if "log10" in str(model.metadata.get("unit", "")):
                val = 10**val
            row.append(val)
        output.append(row)
    print(tabulate(output, headers=headers))
Example No. 28
def longestCommonSubsequence(str1, str2):
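    # Note: despite the name, find_longest_match returns the longest
    # *contiguous* common substring, not a general (non-contiguous) subsequence.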
    seqMatch = SequenceMatcher(None, str1, str2)
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    if (match.size != 0):
        return str1[match.a:match.a + match.size]
    else:
        print("No common substring between two strings.")
Example No. 29
def _longest_match_ratio(str1, str2):
    """
    find the longest matching block between string1 and string2, 
    and then calculate the string length divide by min(string1, string2)
    """
    sq = SequenceMatcher(lambda x: x == " ", str1, str2)
    match = sq.find_longest_match(0, len(str1), 0, len(str2))
    return np_utils._try_divide(match.size, min(len(str1), len(str2)))
Example No. 30
def lcs(s1, s2):
    from difflib import SequenceMatcher
    seq = SequenceMatcher(None, s1, s2)
    lcs = seq.find_longest_match(0, len(s1), 0, len(s2))
    if (lcs.size != 0):
        return str(s1[lcs.a:lcs.a + lcs.size])
    else:
        return ""
Example No. 31
def colorize_match(t, field, matches=None):
    field_val = t[field]
    if not field_val:
        return None
    match = matches.get(field, '') if matches else ''
    seqmatch = SequenceMatcher(None, field_val, match)
    a, b, size = seqmatch.find_longest_match(0, len(field_val), 0, len(match))
    return (field_val[:a] + TERM.green(field_val[a:a + size]) +
            field_val[a + size:])
Example No. 32
def find_match(question, doc):
    """Finds occurences of question in docs

    Returns the longest match, where a match is a Named Tuple. Use
    unwrap_match(doc, match) to unwrap them or for more documentation.
    """
    question = question.lower()
    doc = doc.lower()
    sm = SequenceMatcher()
    sm.set_seqs(doc, question)
    return sm.find_longest_match(0, len(doc), 0, len(question))
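For example (note the set_seqs order: a is the doc, b is the question):

# find_match("Where is Paris", "Paris is in France")
# -> Match(a=0, b=9, size=5)   # the lowercased block "paris"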
Example No. 33
	def match_query(self, query_obj):
		"""Checks whether this contact matches the given query
		
		@param query_obj Dict containing key/value pairs of the required matches
		@return Accuracy of the match, ranging from 0.0 (no match) to 1.0 (complete match)"""
		
		overall_match = 1.0
		matcher = SequenceMatcher()
		
		for field_name in query_obj.keys():
			# Skip fields only meaningful to the parser
			if field_name[:1] == "_": continue
			
			field_value = query_obj[field_name]
			best_field_match = 0.0
			
			# The matcher internally caches details about seq2, so let's make use of that
			matcher.set_seq2(field_value)
			seq2_len = len(field_value)
			
			# Check if field value(s) of this contact match(es) the query field
			try:
				field_ids = self._field_idx[field_name]
				
				for field_id in field_ids:
					
					# A field is (Key,Value,Comp_Value,Source), so [2] is the value we usually use for comparison
					comp_value = self._fields[field_id][2]
					if not comp_value:
						# Use the real value if no comparison value given
						comp_value = self._fields[field_id][1]
					
					# Compare and determine the best match ratio
					matcher.set_seq1(comp_value)
					match = matcher.find_longest_match(0, len(comp_value), 0, seq2_len)
					match_len = match[2]
					field_match = float(match_len) / seq2_len
					
					if field_match > best_field_match: best_field_match = field_match
					syslog(LOG_DEBUG, "Contacts: Field match for %s / %s: %f" % (comp_value, field_value, field_match))
			
			except KeyError:
				# Contact has no data for this field contained in the query, so this entry cannot match
				return 0.0
			
			# Aggregate the field match value into the overall match
			# We don't use the average of all field matches as one
			# non-match *must* result in a final value of 0.0
			overall_match *= best_field_match
			
			# Stop comparing if there is too little similarity
			if overall_match < _MIN_MATCH_TRESHOLD: break
		
		return overall_match
Example No. 34
def main(argv):
	fname = argv[0]

	with open(fname, 'r') as f:
		lines = f.read().splitlines()

	for line in lines:
		(string_1, string_2) = line.split()

		sequence_matcher = SequenceMatcher(
			None, 
			string_1, 
			string_2
		)

		match = sequence_matcher.find_longest_match(
			0, 
			len(string_1), 
			0, 
			len(string_2)
		)

		# Get next digit after the match to determine the regexp range
		start_digits = string_1[
			match.a + match.size: None
		]
		end_digits = string_2[
			match.b + match.size: None
		]

		# If the range begins with all zeros and ends with all nines
		# then include the entire range from the longest string match
		if re.match('^[0]+$', start_digits) and re.match('^[9]+$', end_digits):
			print(string_1, string_2, string_1[match.a: match.a + match.size])

		# We need to generate custom ranges for everything else
		else:
			print('Custom range START')
			try:
				m = re.search('(.+?)[0]+$', start_digits)
				range_start = m.group(1) 

				m = re.search('(.+?)[9]+$', end_digits)
				range_end = m.group(1)

				# Create ranges by appending the range to the longest match
				for r in range(int(range_start), int(range_end) + 1):
					print(string_1, string_2, string_1[match.a: match.a + match.size] + str(r), range_start, range_end)

			except AttributeError:
				print('FAILED', string_1, string_2, start_digits, end_digits)

			finally:
				print('Custom range END')
Example No. 35
File: ui.py  Project: flywire/qifqif
def diff(_str, candidate, term, as_error=False):
    """ Return a string representing how well candidate matches str : matching
        words are green, partial matches (chars) are orange.
        If as_error is True, non matching chars are red.
    """

    match = SequenceMatcher(None, _str.lower(), candidate.lower())
    match_indexes = match.find_longest_match(0, len(_str), 0, len(candidate))
    _, beg, size = match_indexes
    match = candidate[beg:beg + size]
    words = match.split(' ')
    res = term.red(candidate[:beg]) if as_error else candidate[:beg]
    for w in words:
        res += (term.green(w) if tags.is_match(w, _str)
                else term.yellow(w)) + ' '
    res += '\b' + (term.red(candidate[beg + size:]) if as_error
                   else candidate[beg + size:])
    return res
Example No. 36
def alignment_indices(template, primer):
    """
    Finds the optimal alignment between template and primer.

    Inputs:
    =======
    - template, primer:  (str)

    Returns (int1, int2), (int3, int4), where:
    - int1, int2 = start, stop on template
    - int3, int4 = start, stop on primer

    For the DNA case, we are assuming that the 5'->3' directionality of the
    two strings are identical.
    """
    s = SequenceMatcher(None, template, primer)
    m = s.find_longest_match(0, len(template), 0, len(primer))
    return (m.a, m.a + m.size), (m.b, m.b + m.size)
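For example, with a toy template/primer pair:

# alignment_indices("GATTACA", "TTAC") -> ((2, 6), (0, 4))
# (the block "TTAC" spans template[2:6] and primer[0:4])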
Example No. 37
 def find_longest_common_sequence(self, l, anywhere=False):
     common_text = ""
     insert_text = ""
     for c in l:
         text = self.editor.text()[self.items[c].start_pos:self.editor.pos]
         if text:
             c = ''.join(c.partition(text)[1:])
             
         if common_text:
             if anywhere and (not text):
                 s = SequenceMatcher(None, common_text, c)
                 match = s.find_longest_match(0, len(common_text), 0, len(c))
                 common_text = c[match.b:match.b + match.size]
             else:
                 common_text = find_longest_match_at_start(common_text, c)            
         else:
             common_text = c
         
         insert_text = common_text[len(text):]
         if not common_text:
             break
 
             
     return common_text, insert_text
Example No. 38
 def longest_match_size(str1, str2):
     sq = SequenceMatcher(lambda x: x == " ", str1, str2)
     match = sq.find_longest_match(0, len(str1), 0, len(str2))
     return match.size
Example No. 39
def remove_ref_punctuation(lines_list,grnd): #lines_list is list of transcript file lines
    reg123=['\'','\"','\.*','\,','\?','\!']
    reg_list=[]
    for r in reg123:
        reg_list.append(re.compile(r))

    paragraphs=lines_list
    filtered=[]
    for p in paragraphs:
        fil=p
        for regex in reg_list:
            fil=regex.sub('', fil)
        filtered.append(fil)
    t=[]
    prev=''
    count=0
    for i in range(len(filtered)):

        f = filtered[i].strip()
        if f == '':
            continue
        seq=SequenceMatcher(None,prev,f)
        a=seq.find_longest_match(0,len(prev),0,len(f))
        x=a[0]
        y=a[1]
        size=a[2]
        str1=prev[x:x+size].lstrip().rstrip()
        str2=f[y:y+size].lstrip().rstrip()
##            print(str1,' ||| ', str2)
##            print(a)
        if y==0:
            if prev[x:x+size] == prev[-size:]:
                c=str1+' '+str2
##                    print(prev[x:x+size])
##                    print(prev[-size:])
##                    print(c, c not in grnd)
                if c not in grnd:
##                        print(a)
##                        print(prev)
##                        print(f)
##                        print(c)
                    t.append(prev[:-size-1])

                else:
                    t.append(prev)

            else:
                t.append(prev)

        else:
            t.append(prev)
        prev=f

    # Not all repetition is removed the first time because the longest match
    # isn't always at the beginning/end. The pass below catches some of the
    # smaller beginning/end repetitions in transcript lines, but not all;
    # I'm not sure how to fix it.
    prev=''
    s=''
    count=0
    for i in range(len(t)):
        f = t[i].lstrip().rstrip()
##            print('CURR:', f)
##            print('PREV:',prev)
        if f == '':
            continue
        ps=prev.split()
        fs=f.split()
        w = fs.pop(0)
        # Keep growing w while it still equals a word from the previous line;
        # once w contains a space it can no longer match a single word in ps.
        while w in ps and len(fs) != 0:
            w += ' ' + fs.pop(0)

        a=prev.find(w.strip())
        if a>=0:
##                print(prev)
##                print(f)
##                print(prev[a:],' |||| ',w)
##                print(a)
            c=prev[a:]+' '+w
            #print(c)
            if prev[a:].strip()==w.strip() and c not in grnd:
                #print(w,len(w))
                #print(a)
                #print(prev[a:],' |||| ',w)
                #print('repeat')
                #print(s)
                s=s[:-len(w.strip())-1]
                #print(s)
        s+=f+' '
        prev=f
    return s.lower().lstrip().rstrip()
Example No. 40
import sys

from difflib import SequenceMatcher as SQMA

argquan=len(sys.argv)
if argquan != 2:
    print "This script requires one argument: a file listing file"
    sys.exit(2)

with open(sys.argv[1]) as f: fl=f.read().splitlines()
flsz=len(fl)

f0sz=len(fl[0])
ssz=f0sz
for i in range(1, flsz):
    fisz=len(fl[i])
    s = SQMA(None, fl[0], fl[i])
    (ast, bst, csz)=s.find_longest_match(0, f0sz, 0, fisz)
    # print "*.s" % (fl[0], 3)
    # print ":.*".format(2, fl[0])
    if fisz > f0sz:
        ssz=fisz
        print('{:>{width}.{prec}}'.format(fl[0][ast:], width=ssz, prec=csz))
        # the > right aligns
        # print('{:>{width}.{prec}}'.format(fl[i][ast:], width=ssz, prec=csz))
        print("{}".format(fl[i]))
    else:
        print('{:>{width}.{prec}}'.format(fl[0][ast:], width=ssz, prec=csz))
        print('{:>{width}.{prec}}'.format(fl[i][ast:], width=ssz, prec=csz))

# s = SQMA(None, " abcd", "abcd abcd")
# res=s.find_longest_match(0, 5, 0, 9)
# print res
Example No. 41
 def lcs(a, b):
     sm = SequenceMatcher(None, a, b, False)
     match = sm.find_longest_match(0, len(a), 0, len(b))
     matchstr = a[match.a:match.a + match.size]
     return matchstr
Example No. 42
# This example is adapted from the source for difflib.py.

from difflib import SequenceMatcher


def show_results(match):
    print('  a    = {}'.format(match.a))
    print('  b    = {}'.format(match.b))
    print('  size = {}'.format(match.size))
    i, j, k = match
    print('  A[a:a+size] = {!r}'.format(A[i:i + k]))
    print('  B[b:b+size] = {!r}'.format(B[j:j + k]))


A = " abcd"
B = "abcd abcd"

print('A = {!r}'.format(A))
print('B = {!r}'.format(B))

print('\nWithout junk detection:')
s1 = SequenceMatcher(None, A, B)
match1 = s1.find_longest_match(0, len(A), 0, len(B))
show_results(match1)

print('\nTreat spaces as junk:')
s2 = SequenceMatcher(lambda x: x == " ", A, B)
match2 = s2.find_longest_match(0, len(A), 0, len(B))
show_results(match2)
Example No. 43
 def longest_match_ratio(str1, str2):
     sq = SequenceMatcher(lambda x: x == " ", str1, str2)
     match = sq.find_longest_match(0, len(str1), 0, len(str2))
     return MathUtil.try_divide(match.size, min(len(str1), len(str2)))
Example No. 44
def longest_common_subseq(a, b):
    s = SequenceMatcher(None, a, b)
    return s.find_longest_match(0, len(a), 0, len(b)).size
Example No. 45
def lcs(cleanString1, cleanString2):
    matcher = SequenceMatcher(None, cleanString1, cleanString2)
    match = matcher.find_longest_match(0, len(cleanString1), 0, len(cleanString2))
    return match.size
Example No. 46
def LongestCommonSubstringSize(S1, S2):
    cleanString1 = cleanString(S1)
    cleanString2 = cleanString(S2)
    matcher = SequenceMatcher(None, cleanString1, cleanString2)
    match = matcher.find_longest_match(0, len(cleanString1), 0, len(cleanString2))
    return match.size
Example No. 47
def calculate(base, term):
	sequence_matcher = SequenceMatcher(None, base, term)
	longest_common_substring_match = sequence_matcher.find_longest_match(0, len(base), 0, len(term))
	longest_common_substring_size = longest_common_substring_match.size
	return longest_common_substring_size
Example No. 48

"""Using the junk filter feature.

This example is taken from the source for difflib.py.

"""

__version__ = "$Id$"
#end_pymotw_header

from difflib import SequenceMatcher

A = " abcd"
B = "abcd abcd"

print('A = "%s"' % A)
print('B = "%s"' % B)

s = SequenceMatcher(None, A, B)
i, j, k = s.find_longest_match(0, 5, 0, 9)
print('isjunk=None     :', (i, j, k), '"%s"' % A[i:i+k], '"%s"' % B[j:j+k])

s = SequenceMatcher(lambda x: x==" ", A, B)
i, j, k = s.find_longest_match(0, 5, 0, 9)
print('isjunk=(x==" ") :', (i, j, k), '"%s"' % A[i:i+k], '"%s"' % B[j:j+k])

Example No. 49
def rf(inputLevelWithBFM,targetLevelName):
    #inputLevelWithBFM = pd.read_csv(inputFile, header=0)
    # inputLevelWithBFM = pd.read_csv('data/HierarchyInputWithBFMResult_brandFamily.csv', header=0)
    ## Read input datasets - input labels and target labels
    
    #targetLevelName = pd.read_csv(targetFile, header=0)
    # targetLevelName = pd.read_csv('data/target_brandFamily.csv', header=0)
    targetLevelName = pd.DataFrame({'targetLevel': pd.unique(targetLevelName.targetLevel)})
    ## Convert the labels to character strings
    #inputLevelWithBFM.astype(str)
    
    ## Take top 200 for test
    inputLevelWithBFM = inputLevelWithBFM[:50]
    ##########################################################################################
    ###### Predicting 'All Other' versus Non-All Other
    ##########################################################################################
    
    
    ## Define Train and Test data. Train data holds randomly chosen 70% records
    ## Rest of 30% records are considered as test data
    np.random.seed([101])
    trainData = inputLevelWithBFM.sample(frac=.7, replace=False)
    ## Store the data for prediction
    predDataAllOther = inputLevelWithBFM
    
    ################# Prepare TRAIN data #####
    
    ## Train Data - Pre-processing
    inputLabel_df = pd.DataFrame(trainData.loc[:, 'inputLabel'])
    # labelCompare = list(trainData['inputLabel'].str.replace('[^0-9A-Za-z]', ''))
    inputLabel_df.loc[:, 'key'] = 0
    
    ## Make tuple of input labels for which longest common substring needs
    ## to be identified
    labelCompare = pd.merge(inputLabel_df, inputLabel_df, how='outer', on='key').drop(['key'], axis=1)
    labelCompare.columns = ['inputLabel', 'inputLabel_compare']
    labelCompare = labelCompare.drop(labelCompare[labelCompare['inputLabel'] == labelCompare['inputLabel_compare']].index)
    labelCompare.loc[:, 'inputNoSplChr'] = labelCompare.loc[:, 'inputLabel'].str.replace('[^0-9A-Za-z]', '')
    labelCompare.loc[:, 'inputCompareNoSplChr'] = labelCompare.loc[:, 'inputLabel_compare'].str.replace('[^0-9A-Za-z]', '')
    ## Prepare an empty data frame to store the longest common substring
    ## corresponding to each input label
    ## Check whether any number is present in input label
    ## Determine the length of input label
    labelWithLCS = pd.DataFrame()
    labelCompareLength = len(labelCompare)
    # def getLongestCommonSubstring(x, y):
    #    s = SequenceMatcher(None, x, y)
    #    return s.find_longest_match(0, len(x), 0, len(y)).size
    
    ## Store the longest common substring, flag to determine whether a number is present and
    ## the length for each input label
    for i in range(0, (labelCompareLength)):
        a = pd.DataFrame(labelCompare.iloc[i, [2]])
        a = pd.DataFrame.to_string(a, header=False, index=False)[1:]
    
        b = pd.DataFrame(labelCompare.iloc[i, [3]])
        b = pd.DataFrame.to_string(b, header=False, index=False)[1:]
    
        c = list(labelCompare.iloc[i, [0]])
        # c = pd.DataFrame.to_string(c, header=False, index=False)
        # m = longest_common_substring(a,b)
        s = SequenceMatcher(None, a, b)
        result = s.find_longest_match(0, len(a), 0, len(b))
        if result.size > 3:
            lcs = a[(result.a):(result.a + result.size)]
            labelWithLCS_tmp = pd.DataFrame({'longestCommonString': lcs, 'inputLabel': c})
            labelWithLCS = labelWithLCS.append(labelWithLCS_tmp, ignore_index=True)
    ## Create dummy variables for all longest common substring categorical variables
    ## Remove 'longestCommonString' string from the dummy variable names
    
    dummies = pd.get_dummies(labelWithLCS.loc[:, 'longestCommonString'])
    labelWithLCS_withdummy = pd.concat([labelWithLCS, dummies], axis=1)
    labelWithLCS_withdummy = labelWithLCS_withdummy.drop(['longestCommonString'], axis=1)
    labelWithLCSVariable = labelWithLCS_withdummy.groupby('inputLabel').max().reset_index()
    
    ## Prepare the train data for random forest model
    ## Define the class flag for 'ALL OTHER' and 'ALL_OTHER'
    
    trainDataWithLCS = pd.merge(labelWithLCSVariable, trainData, on='inputLabel', how='inner')
    trainDataWithLCS['inputLength'] = trainDataWithLCS['inputLabel'].apply(len)
    trainDataWithLCS['numPresence'] = trainDataWithLCS['inputLabel'].str.contains('\d', na=False).astype(float)
    trainDataWithLCS['splChrPresence'] = trainDataWithLCS['inputLabel'].str.replace(' ', '').str.contains('[^0-9A-Za-z]',
                                                                                                          na=False).astype(
        float)
    trainDataWithLCS.loc[:, 'AllOtherFlag'] = trainDataWithLCS.loc[:, 'manualMappedLabel'].apply(
        lambda x: 1 if x in ["ALL OTHER", "ALL_OTHER"] else 0)
    trainDataWithLCS = trainDataWithLCS.drop(['inputLabel', 'bfmMappedLabel', 'manualMappedLabel'], axis=1)
    
    ## Fit a random forest model using down sampling techniques for unbalanced data
    ## Check the class for low frequency and use them for sample size
    features1 = trainDataWithLCS.columns[:-1]
    y, _ = pd.factorize(trainDataWithLCS.loc[:, 'AllOtherFlag'], sort=True)
    lowClassFreq = len(trainDataWithLCS) - trainDataWithLCS['AllOtherFlag'].sum(axis=0)
    forest = RandomForestClassifier(n_estimators=1000, class_weight="balanced")
    forest.fit(trainDataWithLCS[features1], y)
    
    
    ############### Prepare TEST data / Prediction data
    
    ## Remove special characters from input labels
    predDataAllOther.loc[:, 'inputNoSplChr'] = predDataAllOther.loc[:, 'inputLabel'].str.replace('[^0-9A-Za-z]', '')
    
    ## Define the class variable and make it as categorical/factor
    predDataAllOther.loc[:, 'AllOtherFlag'] = predDataAllOther.loc[:, 'manualMappedLabel'].apply(
        lambda x: 1 if x in ["ALL OTHER", "ALL_OTHER"] else 0)
    predDataAllOther.loc[:, 'AllOtherFlag'], _ = pd.factorize(predDataAllOther.loc[:, 'AllOtherFlag'], sort=True)
    
    ## Define longest common string dummies
    predDataLength = len(predDataAllOther)
    impVars = features1
    for i in range(0, len(impVars)):
        dummyx = str(impVars[i])
        predDataAllOther[dummyx] = predDataAllOther['inputNoSplChr'].str.contains(dummyx, na=False).astype(float)
    
    ## Define input length, number presence flag in input, special character
    ## presence flag in input
    predDataAllOther['inputLength'] = predDataAllOther['inputLabel'].apply(len)
    predDataAllOther['numPresence'] = predDataAllOther['inputLabel'].str.contains('\d', na=False).astype(float)
    predDataAllOther['splChrPresence'] = predDataAllOther['inputLabel'].str.replace(' ', '').str.contains('[^0-9A-Za-z]',
                                                                                                          na=False).astype(
        float)
    
    ## Predict the class using the trained random forest model
    preds = forest.predict(predDataAllOther[impVars])
    predDataAllOther['predAllOtherFlag'] = preds
    predDataAllOther['shConfidence'] = 1 - forest.predict_proba(predDataAllOther[impVars])[:, 0]
    
    ## Define the mapped label as 'ALL OTHER' if the prediction class is 1
    predDataAllOther['shMappedLabel'] = predDataAllOther['predAllOtherFlag'].apply(
        lambda x: 'All OTHER' if x == 1 else 'Non All Other')
    
    ## Validate the results using confusion matrix
    pd.crosstab(predDataAllOther['AllOtherFlag'], preds, rownames=['actual'], colnames=['preds'])
    
    ## Save the predicted results
    
    
    ##########################################################################################
    ###### Predicting labels for all Non-All Other
    ##########################################################################################
    
    ## Extract data for only non-all other labels
    # inputLevelNAOWithBFM = subset(inputLevelWithBFM, !(inputLevelWithBFM$manualMappedLabel %in% c("ALL OTHER", "ALL_OTHER") ) )
    
    inputLevelNAOWithBFM = predDataAllOther.query('predAllOtherFlag == 0')[
        ['manualMappedLabel', 'inputLabel', 'bfmMappedLabel']]
    
    ## Define Train and Test data. Train data holds randomly chosen 70% records.
    ## Rest of 30% records are considered as test data
    trainData = inputLevelNAOWithBFM.sample(frac=.7, replace=False).reset_index()
    # testData = inputLevelNAOWithBFM.drop(trainData.index)
    
    ##################################################################
    ##part2
    ##################################################################
    
    ## Extract data for only non-all other labels
    # inputLevelNAOWithBFM = subset(inputLevelWithBFM, !(inputLevelWithBFM$manualMappedLabel %in% c("ALL OTHER", "ALL_OTHER") ) )
    
    inputLevelNAOWithBFM = predDataAllOther.query('predAllOtherFlag == 0')[
        ['manualMappedLabel', 'inputLabel', 'bfmMappedLabel']]
    
    ## Define Train and Test data. Train data holds randomly chosen 70% records.
    ## Rest of 30% records are considered as test data
    trainData = inputLevelNAOWithBFM.sample(frac=.7, replace=False).reset_index()
    # testData = inputLevelNAOWithBFM.drop(trainData.index)
    
    ############## Prepare the TRAIN data
    
    ## Create variable as input labels without any special character
    trainData['inputNoSplChr'] = trainData['inputLabel'].str.replace('[^0-9A-Za-z]', '')
    trainData = trainData.reset_index(drop=True)
    targetLevelName['targetNoSplChr'] = targetLevelName['targetLevel'].str.replace('[^0-9A-Za-z]', '')
    
    ## Create the empty similarity matrix
    similarMatrix = pd.DataFrame()
    
    ## Update similarity matrix with all input labels and corresponding mapped labels
    ## having maximum similarity measures.
    ## The maximum similarity might return multiple mapped labels.
    ## Rules needs to be applied later to select the best mapped one
    ## Measure the similarity between the target label and the input label after removing the special characters
    lengthTrain = len(trainData)
    
    for i in range(0, lengthTrain):
        inputx = trainData.loc[i, 'inputNoSplChr']
        ## Measure Jaro-Winker similarity, equals to 1 - jw in R
        jarowinkerSim = targetLevelName['targetNoSplChr'].apply(lambda x: jf.jaro_winkler(str(inputx), str(x)))
        kmax = max(jarowinkerSim)
        targetmax = targetLevelName.loc[jarowinkerSim.idxmax(), 'targetNoSplChr']
        tempDF = pd.DataFrame({'targetNoSplChr': [targetmax]})
        tempDF['similarityScore'] = kmax
        tempDF['measures'] = 'JaroWinkler'
        tempDF['inputNoSplChr'] = inputx
        similarMatrix = similarMatrix.append(tempDF, ignore_index=True)
        ## Measure Damerau-Levenshtein similarity equals to dl in R
        dlavenshteinSim = targetLevelName['targetNoSplChr'].apply(
            lambda x: jf.damerau_levenshtein_distance(str(inputx), str(x))) / targetLevelName[
                              'targetNoSplChr'].apply(
            lambda x: max(len(inputx), len(x)))
        kmax = max(dlavenshteinSim)
        targetmax = targetLevelName.loc[dlavenshteinSim.idxmax(), 'targetNoSplChr']
        tempDF = pd.DataFrame({'targetNoSplChr': [targetmax]})
        tempDF['similarityScore'] = kmax
        tempDF['measures'] = 'DamerauLevenshtein'
        tempDF['inputNoSplChr'] = inputx
        similarMatrix = similarMatrix.append(tempDF, ignore_index=True)
        ## Measure Jaccard similarity equals to 1 - jaccard in R
        jaccardSim = targetLevelName['targetNoSplChr'].apply(lambda x: jaccard_similarity(inputx, x))
        kmax = max(jaccardSim)
        targetmax = targetLevelName.loc[jaccardSim.idxmax(), 'targetNoSplChr']
        tempDF = pd.DataFrame({'targetNoSplChr': [targetmax]})
        tempDF['similarityScore'] = kmax
        tempDF['measures'] = 'Jaccard'
        tempDF['inputNoSplChr'] = inputx
        similarMatrix = similarMatrix.append(tempDF, ignore_index=True)
        ## Measure Cosine similarity
        # cosineSimMatrix =
        # kmax = max(dlavenshteinSim)
        # tempDF = pd.DataFrame()
        # tempDF['targetNoSplChr'] =
        # tempDF['similarityScore'] = kmax
        # tempDF['measures'] = 'Cosine'
        # tempDF['inputNoSplChr'] = inputx
        # similarMatrix = similarMatrix.append(tempDF, ignore_index=True)
        #
        # ## Measure Longest Common Substring similarity
        # lcsSimMatrix =
        # kmax = max(dlavenshteinSim)
        # tempDF = pd.DataFrame()
        # tempDF['targetNoSplChr'] =
        # tempDF['similarityScore'] = kmax
        # tempDF['measures'] = 'LongestCommonSubstr'
        # tempDF['inputNoSplChr'] = inputx
        # similarMatrix = similarMatrix.append(tempDF, ignore_index=True)
    
    similarMatrix = pd.merge(similarMatrix, targetLevelName, how='left', on='targetNoSplChr')
    
    similarMatrix = similarMatrix.rename(columns={'targetLevel': 'mappedLabel'})
    similarMatrix = similarMatrix.rename(columns={'targetNoSplChr': 'mappedNoSplChr'})
    
    ## Merge the similarity score with training data
    trainDataWithSimilarity = pd.merge(trainData, similarMatrix, how='inner', on='inputNoSplChr')
    
    ## Find the maximum length of the longest common string (without the special character) between the
    ## input label and mapped label, corresponding to each similarity score##?mapped=target?
    dataLength = len(trainDataWithSimilarity)
    
    for i in range(0, dataLength):
        a = trainDataWithSimilarity.loc[i, 'inputNoSplChr']
        b = trainDataWithSimilarity.loc[i, 'mappedNoSplChr']
        s = SequenceMatcher(None, a, b)
        result = s.find_longest_match(0, len(a), 0, len(b))
        trainDataWithSimilarity.loc[i, 'maxLengthOfLCS'] = result.size
    
    trainDataWithSimilarity['inputLength'] = trainDataWithSimilarity['inputNoSplChr'].apply(len)
    trainDataWithSimilarity['lcsLengthRatio'] = trainDataWithSimilarity['maxLengthOfLCS'] / trainDataWithSimilarity[
        'inputLength']
    
    ## Create similarity measure dummy variables
    dummies = pd.get_dummies(trainDataWithSimilarity['measures'])
    trainDataWithSimilarity = pd.concat([trainDataWithSimilarity, dummies], axis=1)
    
    ## Define the class variables and make them categorical
    for i in range(0, len(trainDataWithSimilarity)):
        a = trainDataWithSimilarity.loc[i, 'mappedLabel'].replace(' ', '')
        b = trainDataWithSimilarity.loc[i, 'manualMappedLabel'].replace(' ', '')
        if a == b:
            trainDataWithSimilarity.loc[i, 'matchSimilarityClass'] = 1
        else:
            trainDataWithSimilarity.loc[i, 'matchSimilarityClass'] = 0
    ## Store data for measuring variable importance using random forest
    trainDataForVarImp = trainDataWithSimilarity.drop(['measures', 'bfmMappedLabel', 'manualMappedLabel', 'inputLabel',
                                                       'inputNoSplChr', 'mappedLabel', 'mappedNoSplChr', 'similarityScore',
                                                       'index'],
                                                      axis=1)
    
    ## Fit a random forest model for feature selection
    
    
    ## Train the random forest model using important vriables
    features2 = trainDataForVarImp.columns[:-1]
    y, _ = pd.factorize(trainDataWithSimilarity.loc[:, 'matchSimilarityClass'], sort=True)
    trainDataNAORF = RandomForestClassifier(n_estimators=500)
    trainDataNAORF.fit(trainDataForVarImp[features2], y)
    print(trainDataNAORF)
    
    with open(modelPath + '/' + Client + Category + 'model', 'wb') as f:
        pickle.dump([features1, features2, forest, trainDataNAORF], f)