Example #1
# NormalizedLevenshtein comes from the python-string-similarity package (see Example #8)
from similarity.normalized_levenshtein import NormalizedLevenshtein

def simDif(w1, w2):
  normalized_levenshtein = NormalizedLevenshtein()
  dis = normalized_levenshtein.distance(w1, w2)
  sim = normalized_levenshtein.similarity(w1, w2)
  print('distance: ' + str(dis) + ' similarity: ' + str(sim))
  # this line separates the comparison of one segment of the fixed text from the comparisons with each segment of the active text
  print('----------------------------')
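A minimal usage sketch (the two words are illustrative; NormalizedLevenshtein returns values in [0, 1], with distance equal to 1 minus similarity):

# prints the normalized edit distance and similarity of two words, followed by the separator line
simDif('kitten', 'sitting')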
Example #2
import numpy as np

from similarity.normalized_levenshtein import NormalizedLevenshtein

def get_norm_levenshtein(prediction, groundtruth):
    # `normalize_answer` is the project's own text-normalization helper, defined elsewhere
    normalized_levenshtein = NormalizedLevenshtein()
    if type(groundtruth) == list:
        if len(groundtruth) == 0:
            return 0
        # against a list of references, keep the best-matching one
        return np.max([get_norm_levenshtein(prediction, gt) for gt in groundtruth])
    return normalized_levenshtein.similarity(normalize_answer(prediction), normalize_answer(groundtruth))
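A hedged usage sketch; `normalize_answer` below is a hypothetical stand-in for the project's helper, defined only so the example runs:

def normalize_answer(s):
    # hypothetical stand-in: lower-case and strip surrounding whitespace
    return s.lower().strip()

print(get_norm_levenshtein('New York', ['new york', 'NYC']))  # -> 1.0, the best score over the reference list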
Example #3
import difflib

from similarity.normalized_levenshtein import NormalizedLevenshtein

def searchlike(query_str, s1):
    # literal (surface) similarity
    #print(difflib.SequenceMatcher(None, query_str, s1).quick_ratio())
    # edit distance
    normalized_levenshtein = NormalizedLevenshtein()
    #print(normalized_levenshtein.similarity(query_str, s1))
    # Jaccard similarity (`Jaccrad` is the project's own helper; the name is kept as in the source)
    jaccard_coefficient = Jaccrad(query_str, s1)
    #print(jaccard_coefficient)
    return difflib.SequenceMatcher(None, query_str, s1).quick_ratio(), normalized_levenshtein.similarity(query_str, s1), jaccard_coefficient
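A hedged usage sketch with a toy, hypothetical stand-in for the project's `Jaccrad` helper (a character-level Jaccard coefficient):

def Jaccrad(a, b):
    # hypothetical stand-in: Jaccard coefficient over character sets
    sa, sb = set(a), set(b)
    return len(sa & sb) / len(sa | sb) if sa | sb else 1.0

# returns (quick_ratio, normalized Levenshtein similarity, Jaccard coefficient)
print(searchlike('machine learning', 'deep learning'))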
Example #4
    def similarity(self, question, answer):

        # load the stop-word list; folder_path is defined elsewhere in the project
        # and '上证专用停用词.txt' is an SSE-specific stop-word file
        stopword = self.read_from(folder_path + '上证专用停用词.txt')
        stopwords = []
        for sw in stopword:
            sw = sw.strip('\n')
            sw = sw.strip(' ')
            stopwords.append(sw)
        # print(stopwords)

        meaningful_words1 = []
        meaningful_words2 = []

        # segment both texts with jieba (Chinese word segmentation) and drop stop words
        words2 = jieba.cut(str(question))
        words3 = jieba.cut(str(answer))
        for word in words2:
            if word not in stopwords:
                meaningful_words1.append(word)
        for word in words3:
            if word not in stopwords:
                meaningful_words2.append(word)
        s2 = ''.join(meaningful_words1)
        # print(s2)
        s3 = ''.join(meaningful_words2)
        # string-similarity metrics from python-string-similarity;
        # CharSub is the project's own character-substitution cost class
        a1 = Cosine(1)
        b1 = Damerau()
        c1 = Jaccard(1)
        d1 = JaroWinkler()
        e1 = Levenshtein()
        f1 = LongestCommonSubsequence()
        g1 = MetricLCS()
        h1 = NGram(2)
        i1 = NormalizedLevenshtein()
        j1 = OptimalStringAlignment()
        k1 = QGram(1)
        l1 = SorensenDice(2)
        m1 = WeightedLevenshtein(character_substitution=CharSub())

        # score the two filtered strings with every metric, in a fixed order;
        # note that the list mixes similarity scores and distance scores
        line_sim = [
            a1.similarity(s2, s3), a1.distance(s2, s3),
            b1.distance(s2, s3),
            c1.distance(s2, s3), c1.similarity(s2, s3),
            d1.distance(s2, s3), d1.similarity(s2, s3),
            e1.distance(s2, s3),
            f1.distance(s2, s3),
            g1.distance(s2, s3),
            h1.distance(s2, s3),
            i1.distance(s2, s3), i1.similarity(s2, s3),
            j1.distance(s2, s3),
            k1.distance(s2, s3),
            l1.distance(s2, s3), l1.similarity(s2, s3),
            m1.distance(s2, s3),
        ]

        return line_sim
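For reference, a sketch that pairs the 18 returned scores with readable labels; the label names are mine (they are not in the source) and simply follow the order in which line_sim is built:

METRIC_LABELS = [
    'cosine_sim', 'cosine_dist', 'damerau_dist', 'jaccard_dist', 'jaccard_sim',
    'jaro_winkler_dist', 'jaro_winkler_sim', 'levenshtein_dist', 'lcs_dist',
    'metric_lcs_dist', 'ngram_dist', 'norm_levenshtein_dist', 'norm_levenshtein_sim',
    'osa_dist', 'qgram_dist', 'sorensen_dice_dist', 'sorensen_dice_sim',
    'weighted_levenshtein_dist',
]
# e.g. dict(zip(METRIC_LABELS, line_sim)) maps each score to its metric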
Example #5
    pages = os.listdir(article_path)
    pages.sort()  # sort by page number

    result_path = os.path.join(args.out_dir, article + '.txt')
    result_file = open(result_path, 'a', encoding='utf8')

    for page in pages:
        if page.split('.')[-1] not in ['png', 'jpg', 'tiff']:
            continue

        print(page)
        # run the project's OCR extractor on the page image and append the text
        page_img = cv2.imread(os.path.join(article_path, page))
        text = ''.join(extractor.extract(page_img))
        result_file.write(text)
    result_file.close()

    # read the OCR output and the ground truth back in, stripping newlines and spaces
    result_text = open(result_path, 'r',
                       encoding='utf8').read().replace('\n', '').replace(' ', '')
    groudtruth_text = open(groudtruth_path, 'r',
                           encoding='utf8').read().replace('\n', '').replace(' ', '')

    # calculate the similarity
    similarity = normalized_levenshtein.similarity(result_text,
                                                   groudtruth_text)
    similarities[article] = similarity

    print('similarity of {} is {}'.format(article, similarity))

print(similarities)
Example #6
import tqdm
from collections import Counter

from similarity.normalized_levenshtein import NormalizedLevenshtein


def diff_lev(source, source_asr, target, source_pos, result_source,
             result_source_asr, result_target, result_source_pos, workdir,
             **kwargs):
    with open(source, 'r', encoding='utf-8') as source, \
            open(source_asr, 'r', encoding='utf-8') as source_asr, \
            open(source_pos, 'r', encoding='utf-8') as source_pos, \
            open(target, 'r', encoding='utf-8') as target, \
            open(result_source, 'w', encoding='utf-8') as result_source, \
            open(result_source_asr, 'w', encoding='utf-8') as result_source_asr, \
            open(result_source_pos, 'w', encoding='utf-8') as result_source_pos, \
            open(result_target, 'w', encoding='utf-8') as result_target, \
            open(workdir / 'distances.txt', 'w', encoding='utf-8') as distances:

        source = source.readlines()
        source_asr = source_asr.readlines()
        target = target.readlines()
        source_pos = source_pos.readlines()

        # different types to classify the sentences
        counter = Counter()

        normalized_levenshtein = NormalizedLevenshtein()
        norm_dist = []

        # Loop to analyze each pair of sentences and count the number of occurrences of each type
        for source_sent, source_asr_sent in tqdm.tqdm(zip(source, source_asr)):
            ratio = normalized_levenshtein.similarity(source_sent,
                                                      source_asr_sent)
            norm_dist.append(ratio)
            if ratio == 1:
                counter["equal"] += 1
            if ratio > 0.9:
                counter["close"] += 1
            if 0.9 >= ratio > 0.7:
                counter["medium"] += 1
            if 0.7 >= ratio > 0.5:
                counter["low"] += 1
            if 0.5 >= ratio:
                counter["different"] += 1

        # Write the results of the comparisons in output file
        for dist in norm_dist:
            distances.write(f"{dist}\n")

        # print statistics
        print(
            f"Equal count: {counter['equal']}, ratio: {counter['equal'] / len(source)}"
        )
        print(f"Close count: {counter['close']} {counter['close'] / len(source)}")
        print(f"Medium count: {counter['medium']} {counter['medium'] / len(source)}")
        print(f"Low count: {counter['low']} {counter['low'] / len(source)}")
        print(f"Different count: {counter['different']} {counter['different'] / len(source)}")

        # Loop to identify similar sentences and write in output files
        # Counting sentences with same number of tokens comparing clean and asr
        for ratio, source_sent, source_asr_sent, target_sent, source_pos_sent in tqdm.tqdm(
                zip(norm_dist, source, source_asr, target, source_pos)):
            if float(ratio) >= 0.9:
                result_source.write(source_sent)
                result_source_asr.write(source_asr_sent)
                result_target.write(target_sent)
                result_source_pos.write(source_pos_sent)
                counter["after_filter"] += 1
                if len(source_sent.split(" ")) == len(
                        source_asr_sent.split(" ")):
                    counter["equal_nb_tokens"] += 1

        print(f"Sentences after cleaning {counter['after_filter']}")
        print(
            f"Sentences with equal number of tokens: {counter['equal_nb_tokens']}"
        )
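A hedged invocation sketch (all paths are illustrative; workdir must be an existing directory passed as a pathlib.Path, because the function joins it with 'distances.txt' using the / operator):

from pathlib import Path

workdir = Path('work')  # illustrative; must already exist
diff_lev('clean.src', 'asr.src', 'clean.tgt', 'clean.pos',
         workdir / 'filtered.src', workdir / 'filtered.asr.src',
         workdir / 'filtered.tgt', workdir / 'filtered.pos',
         workdir)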
Example #7
from collections import defaultdict
from typing import List, Tuple

from gensim.models import FastText, Word2Vec
from similarity.normalized_levenshtein import NormalizedLevenshtein
from tqdm import tqdm


class PostOCRTextCorrection:
    '''
    This class helps to correct spelling errors introduced into texts by OCR.
    To correct errors it combines FastText, Word2vec and the normalized Levenshtein distance.
    '''
    def __init__(self, word2vec: Word2Vec, fasttext: FastText,
                 lexicon: set) -> None:
        '''Creates a new PostOCRTextCorrection object
        Args:
            word2vec (Word2Vec): Word2Vec pre-trained model: a gensim model trained on dirty data (text without cleaning)
            fasttext (FastText): FastText pre-trained model: a gensim model trained on dirty data (text without cleaning)
            lexicon (set): complete list of words belonging to the language of the corpus documents (including conjugations, ...)
        '''

        self.__n_levenshtein = NormalizedLevenshtein()

        # loads pretrained models
        self.word2vec = word2vec
        self.word2vec_vocab = set(
            [word for word in self.word2vec.wv.index_to_key if len(word) > 3])
        self.fasttext = fasttext

        self.lexicon = set(lexicon)

    def __word_in_word2vec__(self, word: str) -> bool:
        '''Returns True if there is a 'word'-vector in Word2vec Space, otherwise False'''
        return word in self.word2vec_vocab

    def __word_in_lexicon__(self, word: str) -> bool:
        '''Returns True if 'word' belongs to the language of the corpus documents'''
        return word in self.lexicon

    def normalized_levhenstein_similarity(self, word1: str,
                                          word2: str) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2' '''
        return self.__n_levenshtein.similarity(word1, word2)

    def weighted_normalized_levhenstein_similarity(self, word1: str,
                                                   word2: str,
                                                   weight: float) -> float:
        '''Returns the normalized Levenshtein similarity [0,1] between 'word1' and 'word2',
        weighted by 'weight' '''

        return weight * self.normalized_levhenstein_similarity(word1, word2)

    def ocr_correction(self,
                       word: str,
                       topn_we: int = 10,
                       topn_nwe=150,
                       weight_we=0.3,
                       weight_nwe=1.0,
                       topn_co: int = 1) -> list:
        '''Retrieves the most suitable words to correct the wrong input word
        Args:
            word (str): a wrong word to correct
            topn_we (int, default=10): top N most similar words if 'word' is embedded in the models
            topn_nwe (int, default=150): top N most similar words if 'word' is not embedded in the models
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is embedded in the models
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not embedded in the models
            topn_co (int, default=1): top N most suitable words to correct the wrong input word
        Returns:
            list: top N most suitable words to correct the wrong input word
        Examples:
          >>> ocr = PostOCRTextCorrection(...)
          >>> word = 'niziitlniente'
          >>> ocr.ocr_correction(word, topn_co=1)
          >>> [('nullatenente', 0.99)]  # output
        '''

        word = word.lower()

        # {corrected word: score}
        is_word2vec_embedded = self.__word_in_word2vec__(word)
        corrections_score = defaultdict(int)

        if is_word2vec_embedded:
            for similar_word, cosine_simalirity in self.word2vec.wv.most_similar(
                    word, topn=topn_we):
                if not self.__word_in_lexicon__(similar_word): continue
                corrections_score[similar_word] = \
                    cosine_simalirity + self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_we)
        else:
            # the word has no Word2vec vector, so we look at a larger
            # neighborhood in FastText instead
            topn_we = topn_nwe

        for similar_word, cosine_simalirity in self.fasttext.wv.most_similar(
                word, topn=topn_we):
            if not self.__word_in_word2vec__(similar_word): continue

            candidate = list()
            candidate.append(similar_word)

            if is_word2vec_embedded:
                candidate.extend(
                    list(
                        dict(
                            self.word2vec.wv.most_similar(similar_word,
                                                          topn=topn_we))))
            else:
                pass

            for similar_word in candidate:
                if not self.__word_in_lexicon__(similar_word): continue

                if is_word2vec_embedded:
                    sim = self.word2vec.wv.similarity(word, similar_word) + \
                          self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_we)
                else:
                    sim = cosine_simalirity + \
                          self.weighted_normalized_levhenstein_similarity(word, similar_word, weight_nwe)

                if sim > corrections_score[similar_word]:
                    corrections_score[similar_word] = sim

        return sorted(corrections_score.items(),
                      key=lambda x: x[1],
                      reverse=True)[:topn_co]

    def ocr_correction_corpus(
            self,
            data: List[dict],
            key: str = 'text',
            processing_text: callable = lambda x: x,
            processing_correction: callable = lambda x: x.lower(),
            min_len: int = 5,
            topn_we: int = 10,
            topn_nwe=150,
            weight_we=0.3,
            weight_nwe=1.0,
            paragraphs_level: bool = False) -> Tuple[List[dict], dict]:
        '''
        Given a corpus, retrieves OCR errors, estimates a correction for each error, and corrects the OCR errors with the estimated corrections.
        Args:
            data (List[dict]): a list of json documents
            key (str, default=text): key to the text field in each document
            processing_text (callable, default=lambda x:x): function to process texts
            processing_correction (callable, default=lambda x:x.lower()): function to process corrections
            min_len (int, default=5): words shorter than 'min_len' characters are not corrected
            topn_we (int, default=10): top N most similar words if 'word' is embedded in the models
            topn_nwe (int, default=150): top N most similar words if 'word' is not embedded in the models
            weight_we (float, default=0.3): normalized Levenshtein weight if 'word' is embedded in the models
            weight_nwe (float, default=1.0): normalized Levenshtein weight if 'word' is not embedded in the models
            paragraphs_level (bool, default=False): if True, correct the 'paragraphs' of each document instead of doc[key]
        Returns:
            Tuple[List[dict], dict]: the list of corrected json documents; the corrections applied
        Examples:
          >>> ocr = PostOCRTextCorrection(...)
          >>> key = 'text'
          >>> data = [{'text': '[...] niziitlniente [...]'}, ..., {'text': '[...] preliininari [...]'}]
          >>> ocr.ocr_correction_corpus(data, key)
          >>> [{'text': '[...] nullatenente [...]'}, ..., {'text': '[...] preliminari [...]'}]  # output
        '''
        ocr_errors = set()

        for doc in tqdm(data,
                        position=0,
                        leave=True,
                        desc='Retrieving ocr errors'):
            if paragraphs_level:
                for para_json in doc['paragraphs']:
                    words = set(
                        word
                        for word in processing_text(para_json[key]).split()
                        if len(word) >= min_len and not word.isdigit()
                        and not word[0].isupper())
                    ocr_errors.update(words.difference(self.lexicon))
            else:
                words = set(word for word in processing_text(doc[key]).split()
                            if len(word) >= min_len and not word.isdigit()
                            and not word[0].isupper())
                ocr_errors.update(words.difference(self.lexicon))

        ocr_corrections = dict()
        for ocr_error in tqdm(list(ocr_errors),
                              position=0,
                              leave=True,
                              desc='Retrieving ocr corrections'):
            ocr_correction = self.ocr_correction(ocr_error,
                                                 topn_we=topn_we,
                                                 topn_nwe=topn_nwe,
                                                 weight_we=weight_we,
                                                 weight_nwe=weight_nwe,
                                                 topn_co=1)

            if len(ocr_correction) == 0: continue

            ocr_correction = [
                processing_correction(ocr_corr[0])
                for ocr_corr in ocr_correction
            ]
            ocr_corrections[ocr_error] = ocr_correction[0]

        new_data = list()
        for doc in tqdm(data,
                        position=0,
                        leave=True,
                        desc='Correcting ocr errors'):
            if '_id' in doc:
                del doc["_id"]
            if paragraphs_level:
                new_para_json = list()
                for para_json in doc['paragraphs']:
                    para_json[key] = " " + processing_text(
                        para_json[key]) + " "
                    for token in para_json[key].split():
                        if token.lower() in ocr_corrections:
                            para_json[key] = para_json[key].replace(
                                " " + token.lower() + " ",
                                " " + ocr_corrections[token.lower()] + " ")
                    new_para_json.append(para_json)
                doc['paragraphs'] = new_para_json
            else:
                doc[key] = " " + processing_text(doc[key]) + " "
                for token in doc[key].split():
                    if token.lower() in ocr_corrections:
                        doc[key] = doc[key].replace(
                            " " + token.lower() + " ",
                            " " + ocr_corrections[token.lower()] + " ")
            new_data.append(doc)

        return new_data, ocr_corrections
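A hedged end-to-end usage sketch; the model paths, the lexicon file and the toy document are illustrative placeholders, not part of the original project:

from gensim.models import FastText, Word2Vec

# illustrative paths: the gensim models must have been trained on the (dirty) corpus beforehand
w2v = Word2Vec.load('word2vec_dirty.model')
ft = FastText.load('fasttext_dirty.model')
lexicon = set(open('lexicon.txt', encoding='utf-8').read().split())

ocr = PostOCRTextCorrection(w2v, ft, lexicon)
docs = [{'text': 'il preliininari accordo fu firmato'}]  # toy document
corrected_docs, corrections = ocr.ocr_correction_corpus(docs, key='text')
print(corrections)  # e.g. {'preliininari': 'preliminari'}, if the models support that correction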
Example #8
from similarity.levenshtein import Levenshtein
from similarity.normalized_levenshtein import NormalizedLevenshtein
from similarity.cosine import Cosine

lev = Levenshtein()
nolev = NormalizedLevenshtein()
cosine = Cosine(4)  # cosine similarity over 4-character shingles

str1 = 'I enjoy playing football'
str2 = 'I love to play soccer'

print('Levenshtein distance:')
print(lev.distance(str1, str2))
print('Normalized Levenshtein similarity:')
print(nolev.similarity(str1, str2))
print('Cosine similarity:')
print(cosine.similarity(str1, str2))
Example #9
                # normalize and split the comma-separated value lists from both files
                data_1[i][1] = data_1[i][1].replace(", ", ',')
                data_2[i][1] = data_2[i][1].replace(", ", ',')

                data_1[i][1] = data_1[i][1].replace(",;", '')
                data_2[i][1] = data_2[i][1].replace(",;", '')

                data_1[i][1] = data_1[i][1].split(",")
                data_2[i][1] = data_2[i][1].split(",")

            sim_file = open(str(file_i.replace('.csv', '')) + '-' +
                            str(file_j.replace('.csv', '')) + '.txt',
                            'w+',
                            encoding="utf8")
            print(
                str(file_i.replace('.csv', '')) + '-' +
                str(file_j.replace('.csv', '')) + '.txt: ' +
                str(files_increment))
            # iterate over the 60000 rows and record every non-empty value pair
            # whose normalized Levenshtein similarity is at least 0.85
            for i in tqdm(range(60000)):
                for w_1 in data_1[i][1]:
                    if w_1 not in ('NULL', 'null', ''):
                        for w_2 in data_2[i][1]:
                            if w_2 not in ('NULL', 'null', ''):
                                simVal = normalized_levenshtein.similarity(w_1, w_2)
                                if simVal >= 0.85:
                                    sim_file.writelines(
                                        str(i + 1) + ";" + w_1 + ";" + w_2 +
                                        ';' + str(simVal) + '\n')

            sim_file.close()
Example #10
    # tail of a small helper that returns difflib.SequenceMatcher's ratio for two strings
    return SequenceMatcher(None, a, b).ratio()


# Definition of the 5 different types to classify the sentences
equal = 0
close = 0
medium = 0
low = 0
different = 0

normalized_levenshtein = NormalizedLevenshtein()
norm_dist = []

# Loop to analyze each pair of sentences and count the number of occurrences of each type
for sent_nb in range(len(clean)):
    Ratio = normalized_levenshtein.similarity(clean[sent_nb],
                                              clean_asr[sent_nb])
    norm_dist.append(Ratio)
    if Ratio == 1:
        equal += 1
    if Ratio > 0.9:
        close += 1
    if 0.9 >= Ratio > 0.7:
        medium += 1
    if 0.7 >= Ratio > 0.5:
        low += 1
    if Ratio <= 0.5:
        different += 1

# Write the results of the comparisons in output file
for dist in norm_dist:
    result.write(str(dist) + "\n")