Example #1
def issameword(word1, word2):
    WordDistance.generate_weight_table()
    dist = lev(word1, word2, substitute_costs=WordDistance.substitute_costs)
    # Treat the two words as the same spelling if the weighted distance is small.
    return dist < 0.3
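The WordDistance helper is not shown in this snippet; a minimal sketch of what it could look like (the class and attribute names come from the code above, the discounted character pair is purely an assumption):

import numpy as np
from weighted_levenshtein import lev

class WordDistance:
    substitute_costs = None

    @classmethod
    def generate_weight_table(cls):
        # Build the table once; start from unit costs and discount
        # substitutions between characters considered near-equivalent.
        if cls.substitute_costs is None:
            cls.substitute_costs = np.ones((128, 128), dtype=np.float64)
            cls.substitute_costs[ord('o'), ord('0')] = 0.1  # assumed example pair
            cls.substitute_costs[ord('0'), ord('o')] = 0.1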
Example #2
def check_word(corpus, term):
    '''
    Checks if the word exists in the dictionary; if so, returns that word.
    Otherwise, computes the weighted distance between the word and every
    dictionary word of at least equal length, and returns the one with the
    smallest weight.
    '''
    if corpus == 'uottawa':
        df = pd.read_csv('./uottawa_dictionary.csv', index_col=0)
    else:
        df = pd.read_csv('./reuters_dictionary.csv', index_col=0)
    
    res = {}

    insert_costs = np.ones(128, dtype=np.float64) 
    delete_costs = np.ones(128, dtype=np.float64)
    substitute_costs = np.ones((128, 128), dtype=np.float64) 

    for i in range(df.shape[0]):
        if term == df.iat[i,0]:
            return df.iat[i,0]
        else:
            if len(str(df.iat[i,0])) >= len(term):
                weight = lev(term, str(df.iat[i,0]), insert_costs, delete_costs, substitute_costs)
                res[df.iat[i,0]] = weight
    
    sortedWeights = {k: v for k, v in sorted(res.items(), key=lambda item: item[1])}
    print("new spelling: ", next(iter(sortedWeights)))
    return (next(iter(sortedWeights)))
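Since the three cost arrays above are all ones, this call is equivalent to the plain Levenshtein distance; the weights only matter once individual entries are overridden, for instance (the character choices here are just an illustration):

    insert_costs[ord('-')] = 0.2                 # make inserting '-' cheap
    substitute_costs[ord('e'), ord('a')] = 0.5   # discount e -> a substitutions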
Example #3
def edit_distance_normalized_cost(word, target):
    cost = lev(word,
               target,
               insert_costs=insert_costs,
               delete_costs=delete_costs,
               substitute_costs=substitute_costs)
    # alpha: module-level additive offset applied before length normalization
    return (cost + alpha) / len(target)
Example #4
    def match(self, string1, string2):
        # Testing
        return lev(string1.lower(),
                   string2.lower(),
                   substitute_costs=self.substitute_costs,
                   delete_costs=self.delete_costs,
                   insert_costs=self.insert_costs)
Example #5
    def calculate(self, wrong_word, words_dict):
        change_costs = self.get_change_costs()
        insert_costs = self.get_insert_cost()
        delete_costs = self.get_delete_cost()

        ### https://weighted-levenshtein.readthedocs.io/en/master/

        lev_dict = {
            cnd: lev(wrong_word,
                     cnd,
                     insert_costs=insert_costs,
                     delete_costs=delete_costs,
                     substitute_costs=change_costs)
            for cnd in words_dict
        }

        top_rated = sorted(lev_dict.items(), key=lambda kv: kv[1]
                           )[:300]  ### Dictionary -> word : Levenshtein metric
        ranking = [x[0] for x in top_rated]
        max_levenshtein = max(top_rated, key=lambda vector: vector[1])[1]
        Pwc = [(x[0], 1 - x[1] / max_levenshtein)
               for x in top_rated]  ### sorted probability -> word : p

        dict_count = [(x, words_dict[x] + 1) for x in ranking]
        max_count = max(dict_count, key=lambda v: v[1])[1]

        Pc = [(x[0], x[1] / max_count) for x in dict_count]

        Pcw_probability = self.calculate_probability(Pwc=Pwc, Pc=Pc)

        return Pcw_probability
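calculate_probability is not shown here; judging from Example #15 below, which blends the same two signals, it plausibly looks something like this (the 0.8/0.2 weights are borrowed from that example and are an assumption here):

    def calculate_probability(self, Pwc, Pc):
        # Blend the distance-based likelihood P(w|c) with the frequency
        # prior P(c), then rank candidates by the combined score.
        Pcw = [(Pwc[i][0], 0.8 * Pwc[i][1] + 0.2 * Pc[i][1])
               for i in range(len(Pwc))]
        return sorted(Pcw, key=lambda v: v[1], reverse=True)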
Example #6
def lev_similarity(aa: str, bb: str) -> float:
    """
    Get a Levenshtein similarity score.

    :param aa: first string
    :param bb: second string
    :return: The similarity of the two strings (0=bad, 1=match):
             1- lev(aa,bb)/max(len(aa), len(bb))
    """

    # Since weighted levenshtein can't handle unicode,
    # convert to ASCII first:

    def convert_to_ascii(text: str, label: str) -> str:
        try:
            # encode() yields bytes; decode back so lev() receives str
            return text.encode('ascii', 'ignore').decode('ascii')
        except Exception as ex:
            raise Exception(f'Could not encode {label}: {text}') from ex

    aa = convert_to_ascii(aa, 'aa')
    bb = convert_to_ascii(bb, 'bb')

    # TODO, consider penalizing whitespace alterations less
    return 1.0 - lev(aa, bb) / max(len(aa), len(bb))
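As a quick sanity check:

    lev_similarity('kitten', 'sitting')  # lev = 3.0, max length 7 -> 1 - 3/7 ≈ 0.571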
Example #7
    def test_lev(self):
        self.assertEqual(lev('1234', '1234'), 0.0)
        self.assertEqual(lev('', '1234'), 4.0)
        self.assertEqual(lev('1234', ''), 4.0)
        self.assertEqual(lev('', ''), 0.0)
        self.assertEqual(lev('1234', '12'), 2.0)
        self.assertEqual(lev('1234', '14'), 2.0)
        self.assertEqual(lev('1111', '1'), 3.0)
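weighted_levenshtein also exports osa and dam_lev, which count an adjacent transposition as a single edit; a companion test along the same lines (a sketch, not part of the original suite) could be:

    from weighted_levenshtein import osa, dam_lev

    def test_transpositions(self):
        self.assertEqual(lev('1234', '1243'), 2.0)      # plain lev: two substitutions
        self.assertEqual(osa('1234', '1243'), 1.0)      # one transposition
        self.assertEqual(dam_lev('1234', '1243'), 1.0)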
Example #9
    def edit_distance(self):
        if c_levenshtein:
            return min(
                self.max_threshold,
                int(
                    lev(self.word1,
                        self.word2,
                        insert_costs=insertion_costs,
                        delete_costs=deletion_costs,
                        substitute_costs=substitution_costs)))
        else:
            return self.levenshtein_distance()
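The module-level names used here (c_levenshtein and the three cost arrays) are not shown; a plausible setup, entirely assumed, would be:

    import numpy as np

    try:
        from weighted_levenshtein import lev
        c_levenshtein = True   # fast C implementation available
    except ImportError:
        c_levenshtein = False  # fall back to the pure-Python levenshtein_distance()

    insertion_costs = np.ones(128, dtype=np.float64)
    deletion_costs = np.ones(128, dtype=np.float64)
    substitution_costs = np.ones((128, 128), dtype=np.float64)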
Example #10
def EditDistance(str1, str2):
    alphabet = [
        u'\u02c0', u'b', u'g', u'd', u'h', u'w', u'z', u'\u1e25', u'\u1e6d',
        u'y', u'k', u'k', u'l', u'm', u'm', u'n', u'n', u's', u'\u02c1', u'p',
        u'p', u'\u00e7', u'\u00e7', u'q', u'r', u'\u0161', u't'
    ] + list(u'euioa*-') + list(u'qwertyuiopasdfghjklzxcvbnm')
    # Deduplicate; sort so the char <-> int mapping is stable across runs.
    alphabet = sorted(set(alphabet))

    # Map every symbol into the ASCII range that weighted_levenshtein requires.
    int2char = {i + 32: ch for i, ch in enumerate(alphabet)}
    char2int = {char: ind for ind, char in int2char.items()}

    str1_ = ''.join([chr(char2int[x]) for x in str1])
    str2_ = ''.join([chr(char2int[x]) for x in str2])
    return lev(str1_, str2_)
Example #11

def main():
    """Run Weighted Levenshtein on two inputted words."""
    df = pd.read_pickle(
        get_absolute_path('confusion_matrix') +
        '/confusion_matrix_base.pkl')  # use the base pickle
    df = df.drop('other', axis=1)  # drop the 'other' column
    df = normalise(df)

    substitution_costs = get_subsitution_costs(df)
    # get the distance of truth from read:

    truth = sys.argv[1]
    read = sys.argv[2]
    print(wl.lev(read, truth, substitute_costs=substitution_costs))
Example #12
def avg_edit2(assembled_f, labels_f, seqnum):
    avg = 0
    # Build cost arrays; the edit cost of each operation can be controlled
    # per character. Example: insert_costs[ord('D')] = 1.5 makes inserting
    # the character 'D' cost 1.5 (instead of 1).
    insert_costs = np.ones(128, dtype=np.float64) * 2
    delete_costs = np.ones(128, dtype=np.float64) * 2
    # Substitution costs can be specified independently in both directions,
    # i.e. a->b can have a different cost from b->a. Example:
    # substitute_costs[ord('H'), ord('B')] = 1.25 makes substituting 'H' with 'B' cost 1.25.
    subs_costs = np.ones((128, 128), dtype=np.float64)
    for (ass, lab) in zip(assembled_f, labels_f):
        avg += lev(ass,
                   lab,
                   insert_costs=insert_costs,
                   delete_costs=delete_costs,
                   substitute_costs=subs_costs) / len(lab)
    avg = avg / seqnum
    return avg
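A quick sanity check on toy inputs (hypothetical data):

    # One substitution (cost 1.0) normalized by len('ACGG') == 4:
    print(avg_edit2(['ACGT'], ['ACGG'], 1))  # 0.25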
Example #13
def calculate_levenshtein(gold_transcripts, silver_transcripts):
    average_accuracy = 0

    for key, value in tqdm.tqdm(gold_transcripts.items()):
        average_word_length = 0
        for token in value:
            average_word_length += len(token)
        average_word_length /= len(value)
        if key in silver_transcripts:
            distance = lev(" ".join(value), " ".join(silver_transcripts[key]))
            distance /= average_word_length
            error_rate = distance / len(value)
            accuracy = 1 - error_rate
            average_accuracy += accuracy
            print(key + " : " + str(accuracy))
    print("\n" + "Average Transcription Accuracy : " +
          str(average_accuracy / len(gold_transcripts)))
Example #14
    def distance(self, text1, text2):
        def clean(text):
            # Lowercase, map diacritics to their base characters (since
            # weighted_levenshtein doesn't support unicode, a diacritic-only
            # difference costs 0), and drop any remaining non-ASCII characters.
            text = text.lower()
            text = text.translate(diacritics)
            return ''.join(filter(self.onlyascii, text))

        return lev(clean(text1),
                   clean(text2),
                   insert_costs=insert_costs,
                   delete_costs=delete_costs,
                   substitute_costs=substitute_costs)
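Neither diacritics nor onlyascii is shown here; plausible definitions (assumed, with only a sample of accented characters) would be:

    diacritics = str.maketrans('àâäéèêëîïôöùûüç', 'aaaeeeeiioouuuc')

    def onlyascii(self, char):
        # Keep only characters weighted_levenshtein can handle.
        return ord(char) < 128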
Example #15
def bayes(wrong_word, words_dict, polish_chars_dict):
    insert_costs, delete_costs, substitute_costs = init_weighted_lev_dicts()
    polish_mistakes_dict = {
        'r': ['Z', 's'],
        'c': ['h'],
        'z': ['Z', 'X', 'r'],
        'n': ['ń'],
        's': ['S'],
        'l': ['L'],
        'o': ['O', 'u'],
        'u': ['O'],
        'h': ['c']
    }
    Pwc_dict = {
        cnd: lev(wrong_word,
                 cnd,
                 insert_costs=insert_costs,
                 delete_costs=delete_costs,
                 substitute_costs=substitute_costs)
        for cnd in words_dict if cnd.startswith(
            find_by_first_chars(wrong_word, polish_mistakes_dict))
    }
    top_rated = sorted(Pwc_dict.items(), key=lambda kv: kv[1])[:200]
    top_rated_words = [x[0] for x in top_rated]
    top_rated_words_max = max(top_rated, key=lambda v: v[1])[1]
    Pwc = [(x[0], 1 - x[1] / top_rated_words_max) for x in top_rated]  # P(w|c)

    occurrences_in_dict = [(x, words_dict[x] + 1) for x in top_rated_words]
    occurrences_in_dict_max = max(occurrences_in_dict, key=lambda v: v[1])[1]
    Pc = [(x[0], x[1] / occurrences_in_dict_max)
          for x in occurrences_in_dict]  # P(c)
    Pcw = []
    for p in range(len(Pwc)):
        Pcw.append(
            (handle_polish_word({v: k
                                 for k, v in polish_chars_dict.items()},
                                Pwc[p][0]),
             0.8 * Pwc[p][1] + 0.2 * Pc[p][1]))  # P(c|w)
    return [x[0] for x in sorted(Pcw, key=lambda v: v[1], reverse=True)[:5]]
Example #16
def check_word_against_term(input_word, given_term):
    # Replace non-ASCII characters with '?' so lev() accepts the input.
    new_str = ""
    for ch in input_word:
        if ord(ch) < 128:
            new_str += ch
        else:
            new_str += "?"
    return lev(new_str, given_term, substitute_costs=substitute_costs) < 2

#Tests (can be commented out after implementing)
# print ("DISPO and DISPO "+ str(lev('DISPO', 'DISPO', substitute_costs=substitute_costs)))
# print ("DISPO and DAMN "+str(lev('DISPO', 'DAMN', substitute_costs=substitute_costs)))
# print ("DISPO and D1SPO "+str(lev('DISPO', 'D1SPO', substitute_costs=substitute_costs))) 
# print ("DISPO and D15PO "+str(lev('DISPO', 'D15PO', substitute_costs=substitute_costs)))
# print ("DISPO and D15P0 "+str(lev('DISPO', 'D15P0', substitute_costs=substitute_costs)))

# print ("NAM 001 and NAM OO1 "+str(lev('NAM 001', 'NAM OO1', substitute_costs=substitute_costs)))

# print ("COURT and C0URT "+str(lev('COURT', 'C0URT', substitute_costs=substitute_costs)))

# print (check_word_similarity("D15P0"))
# print (check_word_similarity("FELONY"))
Example #17
    def handle(self, *args, **options):
        # websites = Website.objects.raw(
        #     "select w.id, w.domain, count(distinct wc.id) from website_contacts wc "
        #     "JOIN website_locations wl on wc.website_id = wl.website_id "
        #     "JOIN category_websites cw on wc.website_id = cw.website_id "
        #     "JOIN websites w on wc.website_id = w.id "
        #     "left join ( "
        #     "select distinct website_id from website_contacts wc "
        #     "JOIN ( "
        #     "select id from lawyer_dir_part1 "
        #     "union "
        #     "select id from lawyer_dir_part2 "
        #     "union "
        #     "select id from lawyer_dir_part3 "
        #     ") doo ON doo.id = wc.id "
        #     ") doo ON doo.website_id = w.id "
        #     "where country_code = 'us' and region_code in ('ca','tx','fl','ny','il') "
        #     "and category_id IN (10368) and doo.website_id is null and ((64 & wc.score) or (32 & wc.score)) "
        #     "group by wc.website_id "
        #     "order by count(distinct wc.id) desc"
        # )
        websites = Website.objects.raw(
            "select w.id, w.domain, count(distinct  wc.id) from websites w "
            "JOIN website_locations wl on w.id = wl.website_id "
            "JOIN category_websites cw on w.id = cw.website_id "
            "JOIN website_contacts wc on w.id = wc.website_id "
            "where country_code = 'us' and category_id IN (10368) and ((64 & wc.score) or (32 & wc.score)) "
            "group by w.id "
            "having count(distinct  wc.id) < 20 "
            "order by count(distinct  wc.id) desc")
        progress_bar = tqdm(desc="Processing", total=len(websites))
        for website in websites:

            progress_bar.update(1)

            filter_contact = Q(
                Q(score=WebsiteContact.score.has_matching_email)
                | Q(score=WebsiteContact.score.has_unique_email)
                | Q(score=WebsiteContact.score.has_unique_phone))

            website_contacts = WebsiteContact.objects. \
                filter(website_id=website.id). \
                filter(filter_contact)

            if len(website_contacts) < 2:
                continue

            query = Q()
            for website_contact in website_contacts:
                if website_contact.first_name and website_contact.last_name:
                    first_and_last_name_query = Q(
                        first_name=website_contact.first_name,
                        last_name=website_contact.last_name,
                        organization_key__isnull=False,
                    )
                    query.add(first_and_last_name_query, Q.OR)

            if len(query):
                organization_keys = DirectoryContact.objects.values_list('organization_key'). \
                                        annotate(dcount=Count('name', distinct=True)). \
                                        filter(query).exclude(organization_key__isnull=True).order_by('-dcount')[:3]
                delete_costs = np.zeros(128, dtype=np.float64)

                print("website_id: %s, domain: %s, website_contacts: %s" %
                      (website.id, website.domain, len(website_contacts)))

                for organization_key in organization_keys:

                    if organization_key[1] / len(website_contacts) < 0.03:
                        break

                    website_director = None
                    head = website.domain.partition('.')[0]
                    lev_cost = 10
                    try:
                        if len(head) > len(organization_key[0]):
                            lev_cost = lev(head,
                                           organization_key[0],
                                           delete_costs=delete_costs)
                        else:
                            lev_cost = lev(organization_key[0],
                                           head,
                                           delete_costs=delete_costs)
                    except Exception:
                        # e.g. non-ASCII input rejected by lev(); keep the default cost.
                        pass

                    if organization_key[1] > 5 and len(website_contacts) > 5 and \
                            (organization_key[1] / len(website_contacts)) > 0.1:
                        website_director = WebsiteDirector(
                            organization_key=organization_key[0],
                            website_id=website.id)
                        print(
                            "organization_key: %s, matching_contacts: %s, lev: %s, type: %s"
                            % (organization_key[0], organization_key[1],
                               lev_cost, 1))
                    elif (organization_key[1] /
                          len(website_contacts)) > 0.3 and lev_cost <= 2:
                        website_director = WebsiteDirector(
                            organization_key=organization_key[0],
                            website_id=website.id)
                        print(
                            "organization_key: %s, matching_contacts: %s, lev: %s, type: %s"
                            % (organization_key[0], organization_key[1],
                               lev_cost, 2))

                    if website_director:
                        WebsiteDirector.objects.bulk_create(
                            [website_director], ignore_conflicts=True)
                        # break

        progress_bar.close()
Example #18
    substitute_costs[i] = np.array([3] * 128)

output = []
correct_response = 0
attempted_response = 0
for i in range(0, len(misspell)):
    temp_dis = 6
    temp_word = ''
    count = 0
    for j in range(0, len(dictionary)):
        # Skip candidates whose length differs too much to be a match.
        if abs(len(misspell[i]) - len(dictionary[j])) > 5:
            continue
        dist = weighted_levenshtein.lev(misspell[i],
                                        dictionary[j],
                                        delete_costs=delete_costs,
                                        substitute_costs=substitute_costs)
        if dist < temp_dis:
            temp_dis = dist
            temp_word = str(dictionary[j])
            count = 1
        elif dist == temp_dis:
            temp_word = temp_word + ' ' + str(dictionary[j])
Example #19
def missingparameterstext(psflist, parameterlist):
    # Pass a list of psf files and a list of parameter files that contain analogous parameters, and return
    # a string containing all the parameters that need to be added for simulation.
    params = CharmmParameterSet()
    for p in parameterlist:
        params.read_parameter_file(p)
    returntext = "! This file was written by analogy from the following input parameter files: {}" \
                 "\n\n".format(str(prmlist)[1:-1])
    missingset = [set(), set(), set(), set()]
    for psf in psflist:
        mol = pmd.load_file(psf)
        newmissing = mol.findmissingparameters(params)
        for i in range(4):
            for el in newmissing[i]:
                missingset[i].add(el)
    # Eliminate duplicate dihedrals.
    if len(missingset[0]):
        print(
            "The following atomtypes are missing nonbonded terms from the input parameter files. This is probably an "
            "input problem, and so we are exiting.")
        print(missingset[0])
        exit()
    for i in range(1, 4):
        missingset[i] = removeduplicates(missingset[i])
    # Write bonds section
    returntext += "BONDS\n"
    for missingbondtype in missingset[1]:
        mindistance = 1000
        typekey = "-".join(missingbondtype)
        for k in params.bond_types:
            compkey = "-".join(k)
            ed = lev(typekey, compkey, substitute_costs=subcost, delete_costs=delcost)
            if ed < mindistance:
                replacementtype = k
                mindistance = ed
        returntext += "%-8s %-8s %.3f %.4f ! From %-8s %-8s\n" % (
        missingbondtype[0], missingbondtype[1], params.bond_types[replacementtype].k,
        params.bond_types[replacementtype].req, replacementtype[0], replacementtype[1])
    # Write angles section
    returntext += "\nANGLES\n"
    for missingangletype in missingset[2]:
        mindistance = 1000
        typekey = "-".join(missingangletype)
        for k in params.angle_types:
            compkey = "-".join(k)
            ed = lev(typekey, compkey, substitute_costs=subcost, delete_costs=delcost)
            if ed < mindistance:
                replacementtype = k
                mindistance = ed
        if params.urey_bradley_types[replacementtype].k == 0:
            returntext += "%-8s %-8s %-8s %.3f %.4f ! From %-8s %-8s %-8s\n" % (
            missingangletype[0], missingangletype[1], missingangletype[2], params.angle_types[replacementtype].k,
            params.angle_types[replacementtype].theteq, replacementtype[0], replacementtype[1], replacementtype[2])
        else:
            returntext += "%-8s %-8s %-8s %.3f %.4f %.2f %.4f ! From %-8s %-8s %-8s\n" % (
            missingangletype[0], missingangletype[1], missingangletype[2], params.angle_types[replacementtype].k,
            params.angle_types[replacementtype].theteq, params.urey_bradley_types[replacementtype].k,
            params.urey_bradley_types[replacementtype].req, replacementtype[0], replacementtype[1], replacementtype[2])
    # Write dihedrals section
    returntext += "\nDIHEDRALS\n"
    for missingangletype in missingset[3]:
        mindistance = 1000
        typekey = "-".join(missingangletype)
        for k in params.dihedral_types:
            compkey = "-".join(k)
            compkey2 = "-".join(k[::-1])
            ed = lev(typekey, compkey, substitute_costs=subcost, delete_costs=delcost)
            ed2 = lev(typekey, compkey2, substitute_costs=subcost, delete_costs=delcost)
            if ed < mindistance or ed2 < mindistance:
                replacementtype = k
                mindistance = min(ed, ed2)
        for prm in params.dihedral_types[replacementtype]:
            returntext += "%-8s %-8s %-8s %-8s %.4f %d %5.1f ! From %-8s %-8s %-8s %-8s\n" % (
            missingangletype[0], missingangletype[1], missingangletype[2], missingangletype[3], prm.phi_k, prm.per,
            prm.phase, replacementtype[0], replacementtype[1], replacementtype[2], replacementtype[3])
    return returntext + "\n"
Example #20
    def _lev(self, x, y):
        # iw / dw / sw: insert, delete, and substitute cost arrays.
        return lev(x, y, self.iw, self.dw, self.sw)
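A sketch of how such a wrapper class might be initialized (everything beyond the iw/dw/sw names is assumed):

    import numpy as np
    from weighted_levenshtein import lev

    class WeightedLev:
        def __init__(self):
            self.iw = np.ones(128, dtype=np.float64)          # insert weights
            self.dw = np.ones(128, dtype=np.float64)          # delete weights
            self.sw = np.ones((128, 128), dtype=np.float64)   # substitute weights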
Example #21
import csv
import configparser

import numpy as np
import weighted_levenshtein

config = configparser.ConfigParser()
config.read('config.ini')  # path assumed; the original snippet never loads a file

LEV_WEIGHTS = config['DEFAULT']['LevenshteinWeights']
LEV_THRESHOLD = 0.2

# Start from unit costs and override the pairs listed in the weights CSV.
substitute_costs = np.ones((128, 128), dtype=np.float64)

with open(LEV_WEIGHTS, 'r') as readFile:
    csvreader = csv.reader(readFile)
    lines = list(csvreader)[1:]  # skip the header row
    for l in lines:
        substitute_costs[ord(l[0]), ord(l[1])] = float(l[2])


def lev_distance(a, b, weights=substitute_costs):
    wlev_dist = weighted_levenshtein.lev(a, b, substitute_costs=weights)
    return wlev_dist
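The weights file is assumed to be a three-column CSV with a header row, giving a from-character, a to-character, and the substitution cost, e.g.:

    from,to,cost
    O,0,0.1
    0,O,0.1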
Example #22
    def __call__(self, input1, input2):
        return lev(input1,
                   input2,
                   substitute_costs=self.substitute_costs,
                   insert_costs=self.insertion_costs,
                   delete_costs=self.deletion_costs)
Example #24
def check_word_similarity(word):
    for term in term_list:
        if lev(term, word, substitute_costs=substitute_costs) < len(term):
            return term

    return None
Example #25
        elif hamming == hamming_result["Hamming distance"]:
            hamming_result["match_positions"][start] = ref_sub
    return hamming_result


letter_to_letter_matches_uppercase = seqdistance.make_letter_to_letter_matches(
    genealloy.ambiguity_code_to_nt_set)
letter_to_letter_matches = seqdistance.make_dict_both_case(
    letter_to_letter_matches_uppercase)
nt_substitute_costs = seqdistance.make_penalty_table(letter_to_letter_matches)

seq = "ATGGATCGGCGGGCG"
#            |||||||||||  ||
ref = "ggGGGCATGGATCGGCGAACGAGSCtgATAAGGTGCTAGCTAAAAAAAAAA"

lev(seq, ref, substitute_costs=nt_substitute_costs)
# 36.0
# Hamming distance with positions and sequences:
find_shortest_hamming(seq, ref, substitute_costs=nt_substitute_costs)
# {'Hamming distance': 2.0, 'match_positions': {6: 'ATGGATCGGCGAACG'}}
########################################################################################

# Calculate distance for complement sequences
from Examples.EpiJinn import epijinn

seq = "AAAAAAAAAACCC"
ref = "GGGTTTTTTTTTT"
print(lev(seq, ref, substitute_costs=nt_substitute_costs))
# 13.0
seq_rc = epijinn.Methylase.reverse_complement(seq)
print(lev(seq_rc, ref, substitute_costs=nt_substitute_costs))
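# 0.0 (the reverse complement of seq is identical to ref)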
Example #26
def merge_levenshtein(pool, buffers, cols, lens, matrix, *flags):
    """ pool contains offsets of buffers that are to be aligned """
    #print("merge_levenshtein(",pool,"buffers",cols,lens,matrix,flags,")")
    if len(buffers) != 2:
        raise Exception("no support for aligning more than two files, yet")

    result = []
    if len(pool[0]) == 0:
        for y in pool[1]:
            if "force" in flags and len(result) > 0:
                result[-1] = result[-1][0:lens[0]] + [
                    re.sub(r"(^[\?_\*]\+)?(.*)(\+[\?_\*])?$", r"\2",
                           val1 + "+" + val2)
                    for val1, val2 in zip(result[-1][lens[0]:], buffers[1][y])
                ]
            else:  # default mode
                if len(buffers[1][y]) > cols[1]:  # we skip empty lines
                    newrow = ["?"] * (cols[0]) + [
                        "*" + buffers[1][y][cols[1]] + "*"
                    ] + (["?"] * (lens[0] - cols[0] - 1)) + buffers[1][y]
                    result.append(newrow)

        return result

    if len(pool[1]) == 0:
        for x in pool[0]:
            if len(buffers[0][x]) == 1 and buffers[0][x][0] == "":
                result.append(buffers[0][x])
            else:
                result.append(buffers[0][x] + ["?"] * lens[1])
        return result

    if matrix is None:
        matrix = []
        for x in pool[0]:
            matrix.append([])
            src = buffers[0][x][0]
            try:
                src = buffers[0][x][cols[0]]
            except IndexError:
                pass
            for y in pool[1]:
                tgt = buffers[1][y][0]
                try:
                    tgt = buffers[1][y][cols[1]]
                except IndexError:
                    pass
                src = norm(src)
                tgt = norm(tgt)
                if max(len(src), len(tgt)) == 0:
                    matrix[-1].append(1.0)
                else:
                    #print(src,tgt,lev(src,tgt))
                    matrix[-1].append(
                        1.0 - lev(src, tgt) /
                        max(len(src), len(tgt)))  # Levenshtein similarity

    max_x = 0
    max_y = 0
    min_dist = 0
    max_sim = 0
    try:
        min_dist = abs(
            max_x / len(pool[0]) -
            max_y / len(pool[1]))  # secondary criterion for equal similarity
        max_sim = matrix[0][0]
    except (IndexError, ZeroDivisionError):
        pass

    #print("matrix:",matrix)
    for x in range(len(pool[0])):
        for y in range(len(pool[1])):
            sim = matrix[x][y]
            dist = abs(x / len(pool[0]) - y / len(pool[1]))
            #print(sim,max_sim,dist,min_dist)
            if (sim > max_sim or (sim == max_sim and min_dist > dist)):
                min_dist = dist
                max_sim = sim
                max_x = x
                max_y = y

    result = []

    # "i" and "r" before alignment
    if max_x == 0:  # "i"
        for y in pool[1][0:max_y]:
            if "force" in flags and len(result) > 0:
                result[-1] = result[-1][0:lens[0]] + [
                    re.sub(r"(^[\?_\*]\+)?(.*)(\+[\?_\*])?$", r"\2",
                           val1 + "+" + val2)
                    for val1, val2 in zip(result[-1][lens[0]:], buffers[1][y])
                ]
            else:  # default mode
                if len(buffers[1][y]) > cols[1]:
                    # print(buffers[1][y])
                    result.append(["?"] * cols[0] +
                                  ["*" + buffers[1][y][cols[1]] + "*"] +
                                  ["?"] * (lens[0] - cols[0] - 1) +
                                  buffers[1][y])
    elif max_y == 0:  # "r"
        for x in pool[0][0:max_x]:
            if len(buffers[0][x]) == 1 and buffers[0][x][0] == "":
                result.append(buffers[0][x])
            else:
                result.append(buffers[0][x] + ["?"] * lens[1])
    else:
        sub_pool = [pool[0][0:max_x], pool[1][0:max_y]]
        #print(pool,max_x,max_y,"=>",sub_pool)
        result = merge_levenshtein(sub_pool, buffers, cols, lens, matrix,
                                   *flags)

    # max alignment
    if len(pool[0]) > 0 and len(pool[1]) > 0:
        result.append(buffers[0][pool[0][max_x]] + buffers[1][pool[1][max_y]])

    # align final elements
    if max_x == len(pool[0]) - 1:  # "i"
        for y in pool[1][max_y + 1:]:
            if "force" in flags and len(result) > 0:
                result[-1] = result[-1][0:lens[0]] + [
                    re.sub(r"(^[\?_\*]\+)?(.*)(\+[\?_\*])?$", r"\2",
                           val1 + "+" + val2)
                    for val1, val2 in zip(result[-1][lens[0]:], buffers[1][y])
                ]
            else:  # default mode
                if len(buffers[1][y]) > cols[1]:
                    result.append(["?"] * cols[0] +
                                  ["*" + buffers[1][y][cols[1]] + "*"] +
                                  ["?"] * (lens[0] - cols[0] - 1) +
                                  buffers[1][y])
    elif max_y == len(pool[1]) - 1:  # "r"
        for x in pool[0][max_x + 1:]:
            if len(buffers[0][x]) == 1 and buffers[0][x][0] == "":
                result.append(buffers[0][x])
            else:
                result.append(buffers[0][x] + ["?"] * lens[1])
    else:
        # recursion for non final elements
        sub_pool = [pool[0][max_x + 1:], pool[1][max_y + 1:]]
        result = result + merge_levenshtein(sub_pool, buffers, cols, lens,
                                            matrix, *flags)

    return result