Python levenshtein 예제들, jellyfish.levenshtein Python 예제들

예제 #1

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def addWordOrNotChoice(orSent, gSentWds, googleSentenceWords):
    """Keep whichever of two candidate word lists is closer to the original.

    Each candidate is joined with single spaces and compared with the
    original sentence by Levenshtein distance. Per the original author's
    note, the two candidates differ by one added or removed word.
    On a tie ``googleSentenceWords`` wins.

    :param orSent: the original sentence
    :param gSentWds: first candidate, as a list of words
    :param googleSentenceWords: second candidate, as a list of words
    :return: ``(gSentWds, googleSentenceWords)`` where the losing list has
        been replaced by a shallow copy of the winning one
    """
    reference = unicode(orSent)
    dist_first = levenshtein(reference, unicode(' '.join(gSentWds)))
    dist_second = levenshtein(reference, unicode(' '.join(googleSentenceWords)))

    if dist_first < dist_second:
        # First candidate is strictly closer: it replaces the second one.
        return gSentWds, list(gSentWds)

    # Tie or second candidate closer: it replaces the first one.
    return list(googleSentenceWords), googleSentenceWords

예제 #2

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def addWordOrNotChoice(orSent, gSentWds, googleSentenceWords):
    """Pick the candidate word list nearest to the original sentence.

    Both candidates are joined with single spaces and measured against the
    original by Levenshtein distance (the original author's note says one of
    them has a word added or removed). The loser is overwritten with a
    shallow copy of the winner; ``googleSentenceWords`` wins ties.

    :param orSent: the original sentence
    :param gSentWds: first candidate, as a list of words
    :param googleSentenceWords: second candidate, as a list of words
    :return: tuple ``(gSentWds, googleSentenceWords)`` holding equal lists
    """
    target = unicode(orSent)

    def distance_to_target(words):
        # Distance between the original and the candidate as one string.
        return levenshtein(target, unicode(' '.join(words)))

    google_wins = (distance_to_target(gSentWds) >=
                   distance_to_target(googleSentenceWords))
    if google_wins:
        gSentWds = list(googleSentenceWords)
    else:
        googleSentenceWords = list(gSentWds)

    return gSentWds, googleSentenceWords

예제 #3

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def isSumma2WordsTheBest(baseWord, word1, word2, n, comSentWds):
    """Merge two adjacent words when their concatenation fits better.

    Checks whether ``word1 + word2`` is strictly closer (by Levenshtein
    distance) to ``baseWord`` than each of the two words on its own. If so,
    items ``n`` and ``n + 1`` of ``comSentWds`` are fused into one item.

    :param baseWord: word from the original sentence
    :param word1: first word being tested
    :param word2: second word being tested
    :param n: index in ``comSentWds`` of the first item to merge
    :param comSentWds: list of sentence words; modified in place on a merge
    :return: ``comSentWds`` (the same list object)
    """
    target = unicode(baseWord)
    first = unicode(word1)
    second = unicode(word2)
    merged_distance = levenshtein(target, first + second)

    # The merged form has to beat both words individually.
    if merged_distance >= levenshtein(target, first):
        return comSentWds
    if merged_distance >= levenshtein(target, second):
        return comSentWds

    comSentWds[n] += comSentWds[n + 1]
    comSentWds.pop(n + 1)
    return comSentWds

예제 #4

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def isSumma2WordsTheBest(baseWord, word1, word2, n, comSentWds):
    """Fuse two neighbouring words if together they match the original better.

    The concatenation ``word1 + word2`` is compared with ``baseWord`` by
    Levenshtein distance; when it is strictly closer than either word alone,
    ``comSentWds[n]`` absorbs ``comSentWds[n + 1]`` and the latter is removed.

    :param baseWord: word from the original sentence
    :param word1: first word being tested
    :param word2: second word being tested
    :param n: index in ``comSentWds`` of the first item to merge
    :param comSentWds: list of sentence words; modified in place on a merge
    :return: ``comSentWds`` (the same list object)
    """
    base = unicode(baseWord)
    parts = (unicode(word1), unicode(word2))
    joined_score = levenshtein(base, parts[0] + parts[1])

    # all() stops at the first part the joined form fails to beat.
    joined_is_best = all(joined_score < levenshtein(base, part)
                         for part in parts)
    if joined_is_best:
        comSentWds[n] += comSentWds[n + 1]
        del comSentWds[n + 1]

    return comSentWds

예제 #5

0

파일 보기

def get_best_newWord(pos, word, original_words, user_words, mb_words):
    '''
    Pick the best replacement for a word from a list of close candidates.

    Only the first two entries of ``mb_words`` are considered. Each one is
    substituted at ``pos`` in the user's sentence, and the resulting sentence
    is compared with the original sentence by Levenshtein distance.

    :param pos: int - position of the word in the user's sentence
    :param word: str - the word being replaced
    :param original_words: list - words of the original sentence
    :param user_words: list - words of the user's sentence
    :param mb_words: list - words close to the one being replaced
    :return: str - the word to use at ``pos``: ``word`` itself when no
        candidate helps, or ``''`` when ``mb_words`` is empty
    '''
    # The emptiness check must come before any indexing: previously
    # mb_words[0] was read first, so an empty list raised IndexError and the
    # intended '' result was unreachable.
    if not mb_words:
        return ''

    orSent = unicode(' '.join(original_words))

    def distance_with(candidate):
        # Distance to the original after putting `candidate` at `pos`.
        words = list(user_words)
        words[pos] = candidate
        return levenshtein(orSent, unicode(' '.join(words)))

    dist_user = levenshtein(orSent, unicode(' '.join(user_words)))
    dist_first = distance_with(mb_words[0])

    if len(mb_words) == 1:
        # A lone candidate must strictly improve on the user's sentence.
        return word if dist_first >= dist_user else mb_words[0]

    dist_second = distance_with(mb_words[1])
    if dist_second < dist_first:
        # NOTE(review): the second candidate is returned here even when the
        # user's own sentence is closer than both candidates. The earlier
        # code had a branch that chose `word` in that case, but a trailing
        # check always overrode it; that behaviour is preserved. Confirm
        # whether keeping `word` was the real intent.
        return mb_words[1]

    # The first candidate is at least as good as the second; it still has to
    # strictly beat the unchanged sentence to be used.
    return word if dist_user <= dist_first else mb_words[0]

예제 #6

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def wordsRightOrder(maxSentWds, sentWds):
    '''
    Align a shorter word list with the longest one by inserting '' gaps.

    Nothing happens unless ``sentWds`` is shorter than ``maxSentWds``.
    Otherwise the lists are walked position by position: when the words
    differ and the NYSIIS code of the ``sentWds`` word is closer (by
    Levenshtein distance) to the next reference word than to the current
    one, an empty string is inserted into ``sentWds`` at that position.

    :param maxSentWds: list of words of the longest sentence (reference)
    :param sentWds: list of words to align; modified in place
    :return: ``sentWds`` (the same list object)
    '''
    if len(maxSentWds) <= len(sentWds):
        return sentWds

    # The last reference word has no right-hand neighbour to compare with,
    # so the walk stops one position early.
    for idx in range(len(maxSentWds) - 1):
        if idx >= len(sentWds):
            # The candidate has run out; it grows by at most one item per
            # step, so no later position can become valid either.
            break

        current = sentWds[idx]
        if current == maxSentWds[idx]:
            continue

        code = nysiis(unicode(current))
        dist_here = levenshtein(code, nysiis(unicode(maxSentWds[idx])))
        dist_next = levenshtein(code, nysiis(unicode(maxSentWds[idx + 1])))
        if dist_here > dist_next:
            sentWds.insert(idx, '')

    return sentWds

예제 #7

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def wordsRightOrder(maxSentWds, sentWds):
    '''
    Pad a shorter word list with '' so it lines up with the longest one.

    Runs only when ``sentWds`` has fewer items than ``maxSentWds``. Each
    reference word is paired with its right-hand neighbour; when the word at
    the same position in ``sentWds`` differs and its NYSIIS code is closer
    (by Levenshtein distance) to the neighbour than to the reference word,
    '' is inserted into ``sentWds`` at that position.

    :param maxSentWds: list of words of the longest sentence (reference)
    :param sentWds: list of words to align; modified in place
    :return: ``sentWds`` (the same list object)
    '''
    needs_padding = len(maxSentWds) > len(sentWds)
    if needs_padding:
        # Pairing with the shifted list naturally skips the last reference
        # word, which has no following word to compare against.
        neighbours = zip(maxSentWds, maxSentWds[1:])
        for position, (ref_word, ref_next) in enumerate(neighbours):
            if position >= len(sentWds):
                continue
            word = sentWds[position]
            if word != ref_word:
                word_code = nysiis(unicode(word))
                to_current = levenshtein(word_code, nysiis(unicode(ref_word)))
                to_next = levenshtein(word_code, nysiis(unicode(ref_next)))
                if to_current > to_next:
                    sentWds.insert(position, '')

    return sentWds

예제 #8

0

파일 보기

파일: new_guess_inaccuracy.py 프로젝트: ogogol/best-choice-of-gsents

def get_best_newWord(pos, word, original_words, user_words, mb_words):
    '''
    Choose the best replacement for a word among close candidate words.

    At most the first two entries of ``mb_words`` are tried. A candidate is
    scored by putting it at ``pos`` in the user's sentence and taking the
    Levenshtein distance between that sentence and the original one.

    :param pos: int - position of the word in the user's sentence
    :param word: str - the word being replaced
    :param original_words: list - words of the original sentence
    :param user_words: list - words of the user's sentence
    :param mb_words: list - words close to the one being replaced
    :return: str - the word to use at ``pos``: ``word`` itself when no
        candidate helps, or ``''`` when ``mb_words`` is empty
    '''
    # Check for "no candidates" before touching mb_words[0]; with the lookup
    # first, an empty list raised IndexError and '' was never returned.
    if not mb_words:
        return ''

    orSent = unicode(' '.join(original_words))
    user_sent = unicode(' '.join(user_words))

    gSentWds1 = list(user_words)
    gSentWds1[pos] = mb_words[0]

    # Every distance is needed on all paths below, so compute each once.
    dist_user = levenshtein(orSent, user_sent)
    dist_first = levenshtein(orSent, unicode(' '.join(gSentWds1)))

    if len(mb_words) == 1:
        # A single candidate is used only if it strictly improves the match.
        if dist_first >= dist_user:
            return word
        return mb_words[0]

    gSentWds2 = list(user_words)
    gSentWds2[pos] = mb_words[1]
    dist_second = levenshtein(orSent, unicode(' '.join(gSentWds2)))

    if dist_second < dist_first:
        # NOTE(review): this returns the second candidate even if the user's
        # sentence is closer than both candidates. The previous code selected
        # `word` in that situation but then unconditionally overrode it in a
        # trailing check; the effective behaviour is kept. Confirm intent.
        return mb_words[1]

    # First candidate ties or beats the second: use it only when it is
    # strictly better than leaving the sentence unchanged.
    if dist_user <= dist_first:
        return word
    return mb_words[0]

예제 #9

0

파일 보기

파일: core.py 프로젝트: 00mjk/splitaake

def find_tag(seq, tags, read, match_type, result):
    """Look for a tag of the selected read in ``seq`` and record it in ``result``.

    With ``match_type == 'regex'`` the tags are tried in order and the first
    one whose ``regex.search(seq)`` succeeds is recorded. With
    ``match_type == 'fuzzy'`` the first ``tags.max_tag_length`` characters of
    ``seq`` are handed to ``align`` together with the tag group and
    ``tags.errors``. Any recorded match is then re-checked: if the
    Levenshtein distance between the upper-cased matched sequence and tag is
    greater than ``tags.errors``, ``result.reset()`` is called.

    :param seq: sequence to search
    :param tags: object providing ``r1``/``r2`` tag groups, ``errors``,
        ``max_tag_length`` and ``name_lookup``
    :param read: ``'r1'`` or ``'r2'``, selecting the tag group
    :param match_type: ``'regex'`` or ``'fuzzy'``
    :param result: result object that is filled in and returned
    :return: ``result``
    """
    if read == 'r1':
        tag_group = tags.r1
    elif read == 'r2':
        tag_group = tags.r2

    if match_type == 'regex':
        for candidate in tag_group:
            hit = candidate.regex.search(seq)
            if hit is None:
                continue
            result.match = True
            # by default, this is true
            assert hit.groups()[0] == candidate.tag
            result.match_type = 'regex'
            result.start, result.end = hit.start(), hit.end()
            result.tag = candidate.string
            result.seq = seq[result.start:result.end]
            result.name = tags.name_lookup(result.tag)
            result.offset = 0
            # Only the first matching tag is recorded.
            break
    elif match_type == 'fuzzy':
        aligned = align(seq[:tags.max_tag_length], tag_group, tags.errors)
        if aligned:
            result.match = True
            result.match_type = 'fuzzy'
            result.tag = aligned.tag.tag
            result.seq = aligned.seq_span
            result.name = tags.name_lookup(aligned.tag.string)
            result.start, result.end = get_align_match_position(
                aligned.seq_span, aligned.start, aligned.end)
            result.offset = aligned.offset

    if not result.match:
        return result

    # Every recorded match, whichever way it was found, must stay within the
    # allowed edit distance; otherwise the result is reset.
    distance = levenshtein(result.seq.upper(), result.tag.upper())
    within_budget = distance <= tags.errors
    if not within_budget:
        result.reset()
    return result

예제 #10

0

파일 보기

파일: core.py 프로젝트: faircloth-lab/splitaake

def find_tag(seq, tags, read, match_type, result):
    """Match ``seq`` against the tags of one read and populate ``result``.

    ``'regex'`` matching walks the tag group and records the first tag whose
    ``regex.search(seq)`` returns a match. ``'fuzzy'`` matching calls
    ``align`` on the leading ``tags.max_tag_length`` characters of ``seq``.
    After either mode, a recorded match is kept only if the Levenshtein
    distance between the upper-cased matched sequence and tag does not
    exceed ``tags.errors``; otherwise ``result.reset()`` is called.

    :param seq: sequence to search
    :param tags: object providing ``r1``/``r2`` tag groups, ``errors``,
        ``max_tag_length`` and ``name_lookup``
    :param read: ``'r1'`` or ``'r2'``, selecting the tag group
    :param match_type: ``'regex'`` or ``'fuzzy'``
    :param result: result object that is filled in and returned
    :return: ``result``
    """
    if read == 'r1':
        tag_group = tags.r1
    elif read == 'r2':
        tag_group = tags.r2

    if match_type == 'regex':
        for tag in tag_group:
            found = tag.regex.search(seq)
            if found is None:
                continue
            result.match = True
            # by default, this is true
            assert found.groups()[0] == tag.tag
            result.match_type = 'regex'
            result.start, result.end = found.start(), found.end()
            result.tag = tag.string
            result.seq = seq[result.start:result.end]
            result.name = tags.name_lookup(result.tag)
            result.offset = 0
            break
    elif match_type == 'fuzzy':
        prefix = seq[:tags.max_tag_length]
        alignment = align(prefix, tag_group, tags.errors)
        if alignment:
            result.match = True
            result.match_type = 'fuzzy'
            result.tag = alignment.tag.tag
            result.seq = alignment.seq_span
            result.name = tags.name_lookup(alignment.tag.string)
            result.start, result.end = get_align_match_position(
                alignment.seq_span, alignment.start, alignment.end)
            result.offset = alignment.offset

    # Check ALL resulting values for an acceptable Levenshtein distance, or
    # reset the match parameters.
    if result.match:
        edit_distance = levenshtein(result.seq.upper(), result.tag.upper())
        acceptable = edit_distance <= tags.errors
        if not acceptable:
            result.reset()
    return result

예제 #11

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def delAddWordOrNotChoice(orSent, gSentWds1, gSentWds2, googleSentenceWords):
    """Keep the one of three candidate word lists closest to the original.

    Each candidate is joined with single spaces and compared with the
    original sentence by Levenshtein distance. Per the original author's
    note, one candidate has a word added or removed and another has both.
    The two losing lists are replaced by shallow copies of the winner.

    Tie-breaking: ``gSentWds2`` must be strictly closer than ``gSentWds1``
    to be considered at all; ``googleSentenceWords`` wins ties against
    ``gSentWds1`` and loses ties against ``gSentWds2``.

    :param orSent: the original sentence
    :param gSentWds1: first candidate, as a list of words
    :param gSentWds2: second candidate, as a list of words
    :param googleSentenceWords: third candidate, as a list of words
    :return: ``(gSentWds1, gSentWds2, googleSentenceWords)`` holding three
        equal lists
    """
    reference = unicode(orSent)
    dist_first = levenshtein(reference, unicode(' '.join(gSentWds1)))
    dist_second = levenshtein(reference, unicode(' '.join(gSentWds2)))
    dist_google = levenshtein(reference, unicode(' '.join(googleSentenceWords)))

    if dist_first > dist_second:
        if dist_second > dist_google:
            return (list(googleSentenceWords), list(googleSentenceWords),
                    googleSentenceWords)
        return list(gSentWds2), gSentWds2, list(gSentWds2)

    if dist_first < dist_google:
        return gSentWds1, list(gSentWds1), list(gSentWds1)

    return (list(googleSentenceWords), list(googleSentenceWords),
            googleSentenceWords)

예제 #12

0

파일 보기

파일: words.py 프로젝트: ogogol/best-choice-of-gsents

def delAddWordOrNotChoice(orSent, gSentWds1, gSentWds2, googleSentenceWords):
    """Select the candidate word list nearest to the original out of three.

    Candidates are joined with single spaces and scored by Levenshtein
    distance to the original sentence (the original author's note says one
    has a word added or removed and another has both). The two non-winning
    lists are overwritten with shallow copies of the winner.

    Tie-breaking: ``gSentWds2`` is considered only when strictly closer than
    ``gSentWds1``; ``googleSentenceWords`` wins ties against ``gSentWds1``
    and loses ties against ``gSentWds2``.

    :param orSent: the original sentence
    :param gSentWds1: first candidate, as a list of words
    :param gSentWds2: second candidate, as a list of words
    :param googleSentenceWords: third candidate, as a list of words
    :return: ``(gSentWds1, gSentWds2, googleSentenceWords)`` holding three
        equal lists
    """
    target = unicode(orSent)

    def score(words):
        # Distance between the original and the candidate as one string.
        return levenshtein(target, unicode(' '.join(words)))

    first_score = score(gSentWds1)
    second_score = score(gSentWds2)
    google_score = score(googleSentenceWords)

    if first_score > second_score:
        second_wins = second_score <= google_score
        first_wins = False
    else:
        second_wins = False
        first_wins = first_score < google_score

    if first_wins:
        gSentWds2 = list(gSentWds1)
        googleSentenceWords = list(gSentWds1)
    elif second_wins:
        gSentWds1 = list(gSentWds2)
        googleSentenceWords = list(gSentWds2)
    else:
        gSentWds1 = list(googleSentenceWords)
        gSentWds2 = list(googleSentenceWords)

    return gSentWds1, gSentWds2, googleSentenceWords

예제 #13

0

파일 보기

파일: read_sents.py 프로젝트: ogogol/best-choice-of-gsents

def testingAndWriting():
    """Run the correction pipeline over ``test.txt`` and write a report.

    Test cases are loaded with ``readTest('test.txt')``. For every original
    sentence the best candidate is chosen with ``googleSentensBestChoice``
    and, if it still differs from the original, passed through
    ``replace_oldWord_to_newWord``. Levenshtein distances to the original
    are taken before the choice, after the choice and after the
    auto-replacement. One entry per case plus a final summary is written to
    ``test_result.txt``, which is overwritten on every run.

    :return: None
    """
    # The context manager closes the report even if one of the pipeline
    # steps raises part-way through; a bare open()/close() pair leaked the
    # file handle in that case.
    with open('test_result.txt', 'w') as f:
        originalSentences, lineSentences, sentss, rightAnswers = readTest('test.txt')
        transriptions_dict = make_trascriptions_dict(originalSentences, sentss)
        sourceLevensh_sum = 0
        resultLevensh_sum = 0
        sourceLeveWords_sum = 0
        resultLeveWords_sum = 0
        improvingCount = 0
        fineImprovingCount = 0
        improvingCount_after = 0
        resLevenInTheEnd_sum = 0
        wrongCases = []
        tt0 = time.time()
        for i, orS in enumerate(originalSentences):
            gSeBeCh = googleSentensBestChoice(orS, lineSentences[i], sentss[i])
            auto_changedSent = gSeBeCh
            result = []
            if gSeBeCh != orS:
                auto_changedSent, result = replace_oldWord_to_newWord(
                    gSeBeCh.split(), orS.split(), transriptions_dict)
            else:
                # The chosen sentence already equals the original.
                fineImprovingCount += 1

            sourceLevensh = levenshtein(unicode(orS), unicode(lineSentences[i]))
            resultLevensh = levenshtein(unicode(orS), unicode(gSeBeCh))
            resultLevensh_inTheEnd = levenshtein(unicode(orS), unicode(auto_changedSent))

            if sourceLevensh > resultLevensh:
                improvingCount += 1
            if resultLevensh_inTheEnd == 0:
                improvingCount_after += 1

            # NOTE(review): raises ZeroDivisionError if the chosen sentence
            # has no words; under Python 2 these per-word ratios are integer
            # divisions. Confirm both are acceptable.
            wordCount = len(gSeBeCh.split())
            sourceLevensh_sum += sourceLevensh
            sourceLeveWords_sum += sourceLevensh/wordCount
            resultLevensh_sum += resultLevensh
            resultLeveWords_sum += resultLevensh/wordCount
            resLevenInTheEnd_sum += resultLevensh_inTheEnd

            if rightAnswers[i] == gSeBeCh:
                f.write('%s. OK, Нач. Л-штэйн - %s, Итоговый - %s\n' % (i+1, sourceLevensh, resultLevensh))
                f.write("Оригинальное - %s\n" % originalSentences[i])
                f.write("В строке     - %s\n" % lineSentences[i])
                f.write("Итоговое     - %s\n" % gSeBeCh)
                f.write('\n')
            else:
                wrongCases.append(i+1)
                f.write('%s. !!!!!!!!!!---НЕПРАВИЛЬНО---!!!!!!!!!!!\n' % (i+1))
                f.write("Должно быть  - %s\n" % rightAnswers[i])
            f.write('После автозамены    - %s\n' % auto_changedSent)
            f.write('Итоговый Левенштэйн - %s\n' % resultLevensh_inTheEnd)
            f.write('Клиенту - %s\n' % result)
            f.write('\n')

            # The summary is emitted once, right after the last case.
            if i == len(originalSentences) - 1:
                tt1 = time.time()
                f.write('Неправильные проверки %s,  всего - %s\n' % (wrongCases, len(wrongCases)))
                f.write('Количество улучшений - %s, процент - %s, полностью исправленных - %s\nисправленных после автозамены - %s\n' %
                        (improvingCount, 100*improvingCount/(i+1), fineImprovingCount, improvingCount_after))
                f.write('Средний нач. Л-штейн - %s, средний итоговый - %s,\nпосле автозамены - %s\n' %
                        (round(sourceLevensh_sum/(i+1.0), 2),
                         round(resultLevensh_sum/(i+1.0), 2),
                         round(resLevenInTheEnd_sum/(i+1.0), 2)))

                f.write('Средний нач. Л-штейн к словам - %s, средний итоговый - %s\n' %
                        (round(sourceLeveWords_sum/(i+1.0), 2),
                         round(resultLeveWords_sum/(i+1.0), 2)))

                f.write('Время выполнения %s проверок - %s, %s sec на одну проверку\n'
                        'Дата -%s, Python 2.7 ' % (i+1, round(tt1-tt0, 2), round((tt1-tt0)/(i+1), 3), str(datetime.now())))

예제 #14

0

파일 보기

def testingAndWriting():
    """Evaluate the sentence-correction pipeline and log results to a file.

    Cases come from ``readTest('test.txt')``. Each original sentence gets a
    best candidate from ``googleSentensBestChoice``; a candidate that differs
    from the original is further processed by ``replace_oldWord_to_newWord``.
    Levenshtein distances to the original are measured for the source line,
    the chosen candidate and the auto-replaced sentence. Per-case entries
    and a closing summary are written to ``test_result.txt``, which is
    overwritten on every run.

    :return: None
    """
    # Opened through a context manager so the handle is released even when a
    # pipeline step raises; the explicit close() at the end was skipped on
    # any exception.
    with open('test_result.txt', 'w') as f:
        originalSentences, lineSentences, sentss, rightAnswers = readTest(
            'test.txt')
        transriptions_dict = make_trascriptions_dict(originalSentences, sentss)
        sourceLevensh_sum = 0
        resultLevensh_sum = 0
        sourceLeveWords_sum = 0
        resultLeveWords_sum = 0
        improvingCount = 0
        fineImprovingCount = 0
        improvingCount_after = 0
        resLevenInTheEnd_sum = 0
        wrongCases = []
        tt0 = time.time()
        for i, orS in enumerate(originalSentences):
            gSeBeCh = googleSentensBestChoice(orS, lineSentences[i], sentss[i])
            auto_changedSent = gSeBeCh
            result = []
            if gSeBeCh != orS:
                auto_changedSent, result = replace_oldWord_to_newWord(
                    gSeBeCh.split(), orS.split(), transriptions_dict)
            else:
                # Nothing left to fix: the candidate equals the original.
                fineImprovingCount += 1

            sourceLevensh = levenshtein(unicode(orS), unicode(lineSentences[i]))
            resultLevensh = levenshtein(unicode(orS), unicode(gSeBeCh))
            resultLevensh_inTheEnd = levenshtein(unicode(orS),
                                                 unicode(auto_changedSent))

            if sourceLevensh > resultLevensh:
                improvingCount += 1
            if resultLevensh_inTheEnd == 0:
                improvingCount_after += 1

            # NOTE(review): an empty chosen sentence makes the divisions
            # below raise ZeroDivisionError, and under Python 2 they are
            # integer divisions. Confirm both are acceptable.
            sourceLevensh_sum += sourceLevensh
            sourceLeveWords_sum += sourceLevensh / (len(gSeBeCh.split()))
            resultLevensh_sum += resultLevensh
            resultLeveWords_sum += resultLevensh / (len(gSeBeCh.split()))
            resLevenInTheEnd_sum += resultLevensh_inTheEnd

            if rightAnswers[i] == gSeBeCh:
                f.write('%s. OK, Нач. Л-штэйн - %s, Итоговый - %s\n' %
                        (i + 1, sourceLevensh, resultLevensh))
                f.write("Оригинальное - %s\n" % originalSentences[i])
                f.write("В строке     - %s\n" % lineSentences[i])
                f.write("Итоговое     - %s\n" % gSeBeCh)
                f.write('\n')
            else:
                wrongCases.append(i + 1)
                f.write('%s. !!!!!!!!!!---НЕПРАВИЛЬНО---!!!!!!!!!!!\n' % (i + 1))
                f.write("Должно быть  - %s\n" % rightAnswers[i])
            f.write('После автозамены    - %s\n' % auto_changedSent)
            f.write('Итоговый Левенштэйн - %s\n' % resultLevensh_inTheEnd)
            f.write('Клиенту - %s\n' % result)
            f.write('\n')

            # Summary block, written once after the final case.
            if i == len(originalSentences) - 1:
                tt1 = time.time()
                f.write('Неправильные проверки %s,  всего - %s\n' %
                        (wrongCases, len(wrongCases)))
                f.write(
                    'Количество улучшений - %s, процент - %s, полностью исправленных - %s\nисправленных после автозамены - %s\n'
                    % (improvingCount, 100 * improvingCount / (i + 1),
                       fineImprovingCount, improvingCount_after))
                f.write(
                    'Средний нач. Л-штейн - %s, средний итоговый - %s,\nпосле автозамены - %s\n'
                    % (round(sourceLevensh_sum / (i + 1.0), 2),
                       round(resultLevensh_sum / (i + 1.0), 2),
                       round(resLevenInTheEnd_sum / (i + 1.0), 2)))

                f.write(
                    'Средний нач. Л-штейн к словам - %s, средний итоговый - %s\n'
                    % (round(sourceLeveWords_sum / (i + 1.0), 2),
                       round(resultLeveWords_sum / (i + 1.0), 2)))

                f.write(
                    'Время выполнения %s проверок - %s, %s sec на одну проверку\n'
                    'Дата -%s, Python 2.7 ' %
                    (i + 1, round(tt1 - tt0, 2),
                     round((tt1 - tt0) / (i + 1), 3), str(datetime.now())))