class SegmentText:
    def __init__(self, dictionary_path=None, bigram_path=None):
        self.name = "SegmentText"
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        if dictionary_path is not None:
            self.dictionary_path = dictionary_path
        else:
            self.dictionary_path = os.path.join(
                "./symspellfre_", "frequency_dictionary_en_82_765.txt")
        if bigram_path is not None:
            self.bigram_path = bigram_path
        else:
            self.bigram_path = os.path.join(
                "./symspellfre_", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def split(self, sentence):
        # lookup_compound suggests corrections for multi-word input strings
        # (supports compound splitting & merging); max_edit_distance applies
        # per single word, not to the whole input string
        suggestions = self.sym_spell.lookup_compound(sentence, max_edit_distance=2)
        # display suggestion term, edit distance, and term frequency
        for suggestion in suggestions:
            print(suggestion)
        return suggestions
class SpellCorrect:
    def __init__(self, dictionary_path=dictionary_path__, bigram_path=bigram_path__):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        if self.is_valid_path(dictionary_path) and self.is_valid_path(bigram_path):
            self.sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
            self.sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
            self.load_status = True
        else:
            self.load_status = False
        self.name = "Spell Corrector"

    def is_valid_path(self, path_file):
        if not os.path.exists(path_file):
            logging.error("The path {} does not exist".format(path_file))
            return False
        return True

    def correct(self, sentence):
        if self.load_status:
            # max edit distance per lookup (per single word, not per whole input string)
            suggestions = self.sym_spell.lookup_compound(sentence, max_edit_distance=2)
            # return the term of the first (best) suggestion
            for suggestion in suggestions:
                return suggestion.term
        return self.load_status
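# A minimal usage sketch for SpellCorrect, assuming the module-level defaults
# dictionary_path__ and bigram_path__ point at the standard symspellpy
# frequency files (the sample input below is hypothetical):
corrector = SpellCorrect()
if corrector.load_status:
    # lookup_compound returns a single best suggestion for the whole phrase
    print(corrector.correct("teh qick brown foks"))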
class SymSpellChecker(object):
    def __init__(self):
        self.checker = SymSpell(max_dictionary_edit_distance=2)
        self.checker.load_dictionary(
            '/home/citao/github/symspellpy/frequency_dictionary_en_82_765.txt', 0, 1)
        self.checker.load_bigram_dictionary(
            '/home/citao/github/symspellpy/frequency_bigramdictionary_en_243_342.txt', 0, 2)

    def correct(self, word):
        suggestions = self.checker.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
        for suggestion in suggestions:
            cor_word = suggestion.term
            logging.info('Spell check: [{}] -> [{}]'.format(word, cor_word))
            return cor_word
        return word

    def correct_text(self, text):
        cor_list = []
        for word in text.split(' '):
            suggestions = self.checker.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
            cor_flag = False
            for suggestion in suggestions:
                cor_list.append(suggestion.term)
                cor_flag = True
                break
            if not cor_flag:
                cor_list.append(word)
        return ' '.join(cor_list)
def load_name_corection(dictionary_path, bigram_path):
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, encoding='utf-8')
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2, encoding='utf-8')
    return sym_spell
def load_symspell():
    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
    return sym_spell
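# A short sketch of how the loader above is typically used; the noisy input
# phrase is one of the examples used by the tests further down:
sym_spell = load_symspell()
for suggestion in sym_spell.lookup_compound("whereis th elove", max_edit_distance=2):
    # each suggestion carries a term, an edit distance, and a frequency count
    print(suggestion.term, suggestion.distance, suggestion.count)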
def test_lookup_compound_ignore_non_words(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

    typo = ("whereis th elove 123 hehad dated forImuch of THEPAST who "
            "couqdn'tread in SIXTHgrade and ins pired him")
    correction = ("where is the love 123 he had dated for much of THEPAST "
                  "who couldn't read in sixth grade and inspired him")
    results = sym_spell.lookup_compound(typo, edit_distance_max, True)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)

    typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
    correction = ("in the DHIRD 1 quarter of last year he had learned "
                  "of a secret plan")
    results = sym_spell.lookup_compound(typo, edit_distance_max, True)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)

    typo = ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
            "of 12 funn")
    correction = ("the biggest players in the strong summer film slate "
                  "with PLETY of 12 fun")
    results = sym_spell.lookup_compound(typo, edit_distance_max, True)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)

    typo = ("Can yu readtHIS messa ge despite thehorible 1234 "
            "sppelingmsitakes")
    correction = ("can you read this message despite the horrible 1234 "
                  "spelling mistakes")
    results = sym_spell.lookup_compound(typo, edit_distance_max, True)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)

    typo = ("Can yu readtHIS messa ge despite thehorible AB1234 "
            "sppelingmsitakes")
    correction = ("can you read this message despite the horrible AB1234 "
                  "spelling mistakes")
    results = sym_spell.lookup_compound(typo, edit_distance_max, True)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)

    typo = "PI on leave, arrange Co-I to do screening"
    correction = "PI on leave arrange co i to do screening"
    results = sym_spell.lookup_compound(typo, edit_distance_max, True)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
def test_lookup_compound_transfer_casing(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

    typo = ("Whereis th elove hehaD Dated forImuch of thepast who "
            "couqdn'tread in sixthgrade AND ins pired him")
    correction = ("Where is the love he haD Dated for much of the past "
                  "who couldn't read in sixth grade AND inspired him")
    results = sym_spell.lookup_compound(typo, edit_distance_max,
                                        transfer_casing=True)
    self.assertEqual(correction, results[0].term)
def test_load_bigram_dictionary_invalid_path(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    self.assertEqual(
        False,
        sym_spell.load_bigram_dictionary("invalid/dictionary/path.txt", 0, 2))
def test_load_bigram_dictionary_bad_dict(self):
    dictionary_path = os.path.join(self.fortests_path, "bad_dict.txt")
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    self.assertEqual(
        True,
        sym_spell.load_bigram_dictionary(dictionary_path, 0, 2))
    self.assertEqual(2, len(sym_spell.bigrams))
    self.assertEqual(12, sym_spell.bigrams["rtyu tyui"])
    self.assertEqual(13, sym_spell.bigrams["yuio uiop"])
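# The bigram file format implied by the test above is one
# "<word1> <word2> <count>" entry per line: term_index points at the first
# word and count_index at the count column. A minimal sketch (the file name
# is hypothetical; the entries mirror the counts asserted above):
from symspellpy import SymSpell

with open("tiny_bigrams.txt", "w", encoding="utf-8") as f:
    f.write("rtyu tyui 12\n")
    f.write("yuio uiop 13\n")
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_bigram_dictionary("tiny_bigrams.txt", 0, 2)
print(sym_spell.bigrams)  # {'rtyu tyui': 12, 'yuio uiop': 13}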
class WordCorrector(LogicAdapter):
    def __init__(self, chatbot, **kwargs):
        super().__init__(chatbot, **kwargs)
        self.language = kwargs.get('language', languages.ENG)
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        # TODO: the dictionary contains no numbers; it needs modifying
        self.dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        self.bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def can_process(self, statement):
        try:
            if " " in statement.text.lower():
                return False
            response = self.process(statement)
            return response.confidence == 1
        except Exception:
            return False

    def process(self, statement, additional_response_selection_parameters=None):
        input_text = statement.text
        suggestions = self.sym_spell.lookup_compound(input_text, max_edit_distance=2)
        for suggestion in suggestions:
            suggested_term = str(suggestion).split(",")[0]
            expression = "Do you mean \"" + suggested_term + "\""
            if input_text == suggested_term:
                expression = ""
            response = Statement(text=expression)
            response.confidence = 1
            # TODO: confidence used to be 0 when the corrector output matched
            # the input and 1 when it differed; left as-is for now
            return response
class Spell_Checker:
    def __init__(self):
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        self.dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        self.bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(self.dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(self.bigram_path, term_index=0, count_index=2)

    def Correct_It(self, data):
        suggestions = self.sym_spell.lookup_compound(data, max_edit_distance=2,
                                                     transfer_casing=True)
        clean_data = [str(suggestion.term) for suggestion in suggestions]
        return " ".join(clean_data)
def test_load_bigram_dictionary_separator(self):
    dictionary_path = os.path.join(self.fortests_path, "separator_dict.txt")
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    self.assertEqual(
        True,
        sym_spell.load_bigram_dictionary(dictionary_path, 0, 1, "$"))
    self.assertEqual(5, len(sym_spell.bigrams))
    self.assertEqual(23135851162, sym_spell.bigrams["the"])
    self.assertEqual(13151942776, sym_spell.bigrams["of"])
    self.assertEqual(10956800, sym_spell.bigrams["abcs of"])
    self.assertEqual(10721728, sym_spell.bigrams["aaron and"])
    self.assertEqual(12997637966, sym_spell.bigrams["and"])
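# With a custom separator the key may itself contain a space, as the test
# above shows. A minimal sketch of such a file (the file name is
# hypothetical; the entries mirror two of the counts asserted above), with
# "$" between the term and its count, so count_index is 1:
from symspellpy import SymSpell

with open("tiny_separator_dict.txt", "w", encoding="utf-8") as f:
    f.write("abcs of$10956800\n")
    f.write("aaron and$10721728\n")
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
sym_spell.load_bigram_dictionary("tiny_separator_dict.txt", 0, 1, separator="$")
print(sym_spell.bigrams["abcs of"])  # 10956800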
def init_symspell():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 1  # this was 0 before
    prefix_length = 100
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionaries
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    # note: the bigram dictionary must be loaded from bigram_path,
    # not dictionary_path
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return
    return sym_spell
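# init_symspell returns None when a dictionary fails to load, so guard the
# call; lookups must also use max_edit_distance <= 1, the value the
# dictionary above was precalculated with (the sample word is hypothetical):
from symspellpy import Verbosity

sym_spell = init_symspell()
if sym_spell is not None:
    for s in sym_spell.lookup("qestion", Verbosity.CLOSEST, max_edit_distance=1):
        print(s.term, s.distance, s.count)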
from typing import List

from pythainlp.corpus import get_corpus_path
from pythainlp.corpus import path_pythainlp_corpus
from pythainlp.tokenize import word_tokenize
from symspellpy import SymSpell, Verbosity

_UNIGRAM = "tnc_freq.txt"
_BIGRAM = "tnc_bigram_word_freqs"

sym_spell = SymSpell()
sym_spell.load_dictionary(path_pythainlp_corpus(_UNIGRAM), 0, 1,
                          separator='\t', encoding="utf-8-sig")
sym_spell.load_bigram_dictionary(get_corpus_path(_BIGRAM), 0, 2,
                                 separator='\t', encoding="utf-8-sig")


def spell(text: str, max_edit_distance: int = 2) -> List[str]:
    return [
        str(i).split(',')[0]
        for i in sym_spell.lookup(text, Verbosity.CLOSEST,
                                  max_edit_distance=max_edit_distance)
    ]


def correct(text: str, max_edit_distance: int = 1) -> str:
    return spell(text, max_edit_distance=max_edit_distance)[0]
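# A quick usage sketch for the Thai helpers above; the sample word is
# hypothetical and the output depends on the TNC frequency corpora loaded:
print(spell("เหลน"))    # candidate corrections within edit distance 2
print(correct("เหลน"))  # the single best candidate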
def test_lookup_compound_replaced_words(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

    typo = ("whereis th elove hehad dated forImuch of thepast who "
            "couqdn'tread in sixthgrade and ins pired him")
    correction = ("where is the love he had dated for much of the past "
                  "who couldn't read in sixth grade and inspired him")
    replacement_1 = {
        "whereis": "where is",
        "th": "the",
        "elove": "love",
        "hehad": "he had",
        "forimuch": "for much",
        "thepast": "the past",
        "couqdn'tread": "couldn't read",
        "sixthgrade": "sixth grade",
        "ins": "in"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(len(replacement_1), len(sym_spell.replaced_words))
    for k, v in replacement_1.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)

    typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    correction = ("in the third quarter of last year he had learned of a "
                  "secret plan")
    replacement_2 = {
        "te": "the",
        "dhird": "third",
        "qarter": "quarter",
        "oflast": "of last",
        "jear": "year",
        "hadlearned": "had learned",
        "ofca": "of a",
        "sekretplan": "secret plan"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(len(replacement_1) + len(replacement_2),
                     len(sym_spell.replaced_words))
    for k, v in replacement_2.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)

    typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
            "of funn")
    correction = ("the biggest players in the strong summer film slate "
                  "with plenty of fun")
    replacement_3 = {
        "bigjest": "biggest",
        "playrs": "players",
        "strogsommer": "strong summer",
        "slatew": "slate",
        "ith": "with",
        "plety": "plenty",
        "funn": "fun"
    }
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(len(replacement_1) + len(replacement_2) + len(replacement_3),
                     len(sym_spell.replaced_words))
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    for k, v in replacement_3.items():
        self.assertEqual(v, sym_spell.replaced_words[k].term)
def export():
    import os
    import zipfile
    from glob import glob

    import torch
    import torchaudio
    from flask import Response  # assuming this runs inside a Flask app

    device = torch.device('cpu')  # GPU also works, but these models are fast enough on CPU
    model, decoder, utils = torch.hub.load('snakers4/silero-models',
                                           model='silero_stt', language='en')
    # see the function signatures for details
    (read_batch, split_into_batches, read_audio, prepare_model_input) = utils

    # extract the audio track and convert it to WAV
    os.system("ffmpeg -i 'video.mp4' -vn -acodec copy audio.aac")
    os.system("ffmpeg -i audio.aac audio.wav")

    # transcribe the audio (any format compatible with the TorchAudio
    # soundfile backend works)
    test_files = glob('audio.wav')
    batches = split_into_batches(test_files, batch_size=10)
    model_input = prepare_model_input(read_batch(batches[0]))
    text = ""
    output = model(model_input)
    for example in output:
        pred = decoder(example.cpu())
        text = text + pred

    # fetch the symspellpy frequency dictionaries
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_dictionary_en_82_765.txt")
    os.system("curl -LJO https://raw.githubusercontent.com/mammothb/symspellpy/master/symspellpy/frequency_bigramdictionary_en_243_342.txt")

    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # spell-correct the transcript (supports compound splitting & merging);
    # max edit distance applies per single word, not to the whole input string
    suggestions = sym_spell.lookup_compound(text, max_edit_distance=2)
    for suggestion in suggestions:
        print(suggestion)
        text = str(suggestion)

    # wrap the cleaned transcript at five words per line
    cnt = 0
    textlines = []
    while cnt < len(text.split(" ")):
        line = "\n" + " ".join(text.split(" ")[cnt:cnt + 5])
        textlines.append(line)
        cnt += 5
    with open("script_cleaned.txt", "a") as f:
        f.writelines(textlines)

    # force-align the transcript to the audio and emit SRT subtitles
    os.system("python -m aeneas.tools.execute_task audio.wav script_cleaned.txt "
              "'task_language=eng|os_task_file_format=srt|is_text_type=plain' "
              "subtitles.srt")
    with open("subtitles.srt") as f:
        srt = f.read()
    return Response(
        srt,
        mimetype="text/srt",
        headers={
            "Content-disposition": "attachment; filename=subtitles.srt"
        })
class MaskTextSpotter(object):
    def __init__(self, cfg, confidence_threshold=0.7, min_image_size=224,
                 output_polygon=True, spellfix=True):
        self.cfg = cfg.clone()
        self.model = build_detection_model(cfg)
        self.model.eval()
        self.device = torch.device(cfg.MODEL.DEVICE)
        self.model.to(self.device)
        self.min_image_size = min_image_size
        self.spellfix = spellfix
        self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        self.sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        self.sym_spell.load_bigram_dictionary(bigram_dictionary_path,
                                              term_index=0, count_index=2)
        checkpointer = DetectronCheckpointer(cfg, self.model)
        if len(cfg.MODEL.WEIGHT):
            import logging
            logging.info('loading MaskTextSpotter from %s' % cfg.MODEL.WEIGHT)
            _ = checkpointer.load(cfg.MODEL.WEIGHT)
        self.transforms = self.build_transform()
        self.cpu_device = torch.device("cpu")
        self.confidence_threshold = confidence_threshold
        self.output_polygon = output_polygon

    def build_transform(self):
        """Creates the basic transformation that was used to train the models."""
        cfg = self.cfg
        # Images are loaded with OpenCV, so they are already BGR. Normalize
        # by 255 for BGR255 format, or flip the channels for RGB in [0, 1].
        if cfg.INPUT.TO_BGR255:
            to_bgr_transform = T.Lambda(lambda x: x * 255)
        else:
            to_bgr_transform = T.Lambda(lambda x: x[[2, 1, 0]])
        normalize_transform = T.Normalize(mean=cfg.INPUT.PIXEL_MEAN,
                                          std=cfg.INPUT.PIXEL_STD)
        transform = T.Compose([
            T.ToPILImage(),
            T.Resize(self.min_image_size),
            T.ToTensor(),
            to_bgr_transform,
            normalize_transform,
        ])
        return transform

    def run_on_opencv_image(self, image):
        """
        Arguments:
            image (np.ndarray): an image as returned by OpenCV
        Returns:
            result_polygons (list): detection results
            result_words (list): recognition results
        """
        result_polygons, result_words, result_dict = self.compute_prediction(image)
        return result_polygons, result_words, result_dict

    def run_on_pillow_image(self, image):
        arr = np.array(image, dtype=np.uint8)
        result_polygons, result_words, result_dict = self.run_on_opencv_image(arr)
        return result_polygons, result_words, result_dict

    def compute_prediction(self, original_image):
        def spell_fix(wd):
            if self.spellfix:
                new_word = [s.term for s in self.sym_spell.lookup(
                    wd, Verbosity.CLOSEST, max_edit_distance=2,
                    include_unknown=True)][0]
            else:
                new_word = wd
            return new_word

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        def mk_direction(char_polygons):
            def centroid(char_polygon):
                centroid = Polygon(list(chunks(char_polygon, 2))).centroid.coords
                return list(centroid)[0]

            first, last = char_polygons[0], char_polygons[-1]
            start, end = centroid(first), centroid(last)
            if start[0] == end[0]:
                end = (end[0] + 1, end[1])
            return start, end

        def line_detection(dicts, char_ratio=1.5):
            # box is [x1, y1, x2, y2]
            sorted_res = sorted(dicts, key=lambda d: d["box"][0])
            lines = dict()

            def point_in_next_word(word):
                width = word["box"][2] - word["box"][0]  # width = x2 - x1
                avg_char_width = width / float(len(word["seq_word"]))
                last_right_border = word["box"][2]
                next_word_pos_x = last_right_border + char_ratio * avg_char_width
                next_word_pos_y = word["box"][1]
                direction = word["direction"]
                point = Point(next_word_pos_x, next_word_pos_y)
                # project the candidate point onto the word's direction line
                line = LineString(direction)
                x = np.array(point.coords[0])
                u = np.array(line.coords[0])
                v = np.array(line.coords[len(line.coords) - 1])
                n = v - u
                n /= np.linalg.norm(n, 2)
                P = u + n * np.dot(x - u, n)
                return (int(P[0]), int(P[1]))

            def distance_to_mid(word_point, word_box):
                point = Point(word_point["next_point"])
                box = word_box["box"]
                return abs(point.y - (box[1] + box[3]) / 2.0)  # abs(y - (y1 + y2) / 2)

            def find_next_word(word, index, sorted_words):
                next_point = Point(word["next_point"])
                next_words = [
                    other for other in sorted_words[index + 1:]
                    if Polygon(chunks(other["polygon"], 2)).contains(next_point)
                ]
                if next_words:
                    return min(next_words, key=lambda x: distance_to_mid(word, x))
                return None

            def find_previous_word(prev, word):
                if "previous_word" not in word.keys():
                    return prev
                return min(prev, word["previous_word"],
                           key=lambda x: distance_to_mid(x, word))

            for w in sorted_res:
                w["next_point"] = point_in_next_word(w)
            for i, w in enumerate(sorted_res):
                next_word = find_next_word(w, i, sorted_res)
                w["next_word"] = None
                if next_word:
                    better_previous = find_previous_word(w, next_word)
                    if better_previous == w:
                        w["next_word"] = next_word
                        if "previous_word" in next_word.keys():
                            next_word["previous_word"]["next_word"] = None
                        next_word["previous_word"] = w
            # walk each chain of linked words to build the text lines
            for w in sorted_res:
                if "previous_word" not in w.keys():
                    a = w
                    key_y = a["box"][1]
                    while key_y in lines.keys():
                        key_y = key_y + 1
                    lines[key_y] = [a]
                    while a["next_word"]:
                        a = a["next_word"]
                        lines[key_y].append(a)
            sorted_lines = sorted(lines.items(), key=lambda x: x[0])
            return ",".join([" ".join([w["seq_word"] for w in line])
                             for _, line in sorted_lines]), sorted_lines

        # apply pre-processing to the image
        image = self.transforms(original_image)
        # convert to an ImageList, padded so that it is divisible by
        # cfg.DATALOADER.SIZE_DIVISIBILITY
        image_list = to_image_list(image, self.cfg.DATALOADER.SIZE_DIVISIBILITY)
        image_list = image_list.to(self.device)
        # compute predictions
        with torch.no_grad():
            self.model.eval()
            predictions, _, _ = self.model(image_list)
        if not predictions or len(predictions) < 1:
            # no text detected
            return [], [], {'label': '', 'details': []}
        global_predictions = predictions[0]
        char_predictions = predictions[1]
        char_mask = char_predictions['char_mask']
        char_boxes = char_predictions['boxes']
        words, rec_scores, rec_char_scores, char_polygons = self.process_char_mask(
            char_mask, char_boxes)
        detailed_seq_scores = char_predictions['detailed_seq_scores']
        seq_words = char_predictions['seq_outputs']
        seq_scores = char_predictions['seq_scores']
        global_predictions = [o.to(self.cpu_device) for o in global_predictions]
        # always a single image is passed at a time
        global_prediction = global_predictions[0]
        # reshape prediction (a BoxList) into the original image size
        height, width = original_image.shape[:-1]
        test_image_width, test_image_height = global_prediction.size
        global_prediction = global_prediction.resize((width, height))
        resize_ratio = float(height) / test_image_height
        boxes = global_prediction.bbox.tolist()
        scores = global_prediction.get_field("scores").tolist()
        masks = global_prediction.get_field("mask").cpu().numpy()
        result_polygons = []
        result_words = []
        result_dicts = []
        for k, box in enumerate(boxes):
            score = scores[k]
            if score < self.confidence_threshold:
                continue
            box = list(map(int, box))
            mask = masks[k, 0, :, :]
            polygon = self.mask2polygon(mask, box, original_image.shape,
                                        threshold=0.5,
                                        output_polygon=self.output_polygon)
            if polygon is None:
                # fall back to the axis-aligned box corners
                polygon = [box[0], box[1], box[2], box[1],
                           box[2], box[3], box[0], box[3]]
            result_polygons.append(polygon)
            word = words[k]
            rec_score = rec_scores[k]
            char_score = rec_char_scores[k]
            seq_word = seq_words[k]
            seq_char_scores = seq_scores[k]
            seq_score = sum(seq_char_scores) / float(len(seq_char_scores))
            detailed_seq_score = detailed_seq_scores[k]
            detailed_seq_score = np.squeeze(np.array(detailed_seq_score), axis=1)
            if len(seq_word) > 0 and len(char_polygons[k]) > 0:
                # spell-fix only words of four or more characters
                d = {
                    "seq_word": seq_word if len(seq_word) < 4 else spell_fix(seq_word),
                    "seq_word_orig": seq_word,
                    "direction": mk_direction(
                        [[int(c * resize_ratio) for c in p]
                         for p in char_polygons[k]]),
                    "word": word if len(word) < 4 else spell_fix(word),
                    "word_orig": word,
                    "box": [int(x * 1.0) for x in box[:4]],
                    "polygon": polygon,
                    "prob": score * seq_score
                }
                result_words.append(d['seq_word'])
                result_dicts.append(d)
        label, details = line_detection(result_dicts)
        line_result = {'label': label, 'details': details}
        line_result_words = [a['seq_word'] for a in result_dicts]
        line_result_polygons = [a['polygon'] for a in result_dicts]
        return line_result_polygons, line_result_words, line_result

    def process_char_mask(self, char_masks, boxes, threshold=192):
        texts, rec_scores, rec_char_scores, char_polygons = [], [], [], []
        for index in range(char_masks.shape[0]):
            box = list(map(int, list(boxes[index])))
            text, rec_score, rec_char_score, char_polygon = getstr_grid(
                char_masks[index, :, :, :].copy(), box, threshold=threshold)
            texts.append(text)
            rec_scores.append(rec_score)
            rec_char_scores.append(rec_char_score)
            char_polygons.append(char_polygon)
        return texts, rec_scores, rec_char_scores, char_polygons

    def mask2polygon(self, mask, box, im_size, threshold=0.5, output_polygon=True):
        # mask is 32x128
        image_width, image_height = im_size[1], im_size[0]
        box_h = box[3] - box[1]
        box_w = box[2] - box[0]
        cls_polys = (mask * 255).astype(np.uint8)
        poly_map = np.array(Image.fromarray(cls_polys).resize((box_w, box_h)))
        poly_map = poly_map.astype(np.float32) / 255
        poly_map = cv2.GaussianBlur(poly_map, (3, 3), sigmaX=3)
        ret, poly_map = cv2.threshold(poly_map, 0.5, 1, cv2.THRESH_BINARY)
        if output_polygon:
            SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            poly_map = cv2.erode(poly_map, SE1)
            poly_map = cv2.dilate(poly_map, SE1)
            poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1)
            # OpenCV 3 returns three values from findContours, OpenCV 4 only two
            try:
                _, contours, _ = cv2.findContours(
                    (poly_map * 255).astype(np.uint8), cv2.RETR_LIST,
                    cv2.CHAIN_APPROX_NONE)
            except ValueError:
                contours, _ = cv2.findContours(
                    (poly_map * 255).astype(np.uint8), cv2.RETR_LIST,
                    cv2.CHAIN_APPROX_NONE)
            if len(contours) == 0:
                return None
            # keep the contour with the largest area
            max_area = 0
            max_cnt = contours[0]
            for cnt in contours:
                area = cv2.contourArea(cnt)
                if area > max_area:
                    max_area = area
                    max_cnt = cnt
            perimeter = cv2.arcLength(max_cnt, True)
            epsilon = 0.01 * perimeter
            approx = cv2.approxPolyDP(max_cnt, epsilon, True)
            pts = approx.reshape((-1, 2))
            pts[:, 0] = pts[:, 0] + box[0]
            pts[:, 1] = pts[:, 1] + box[1]
            polygon = list(map(int, pts.reshape((-1,))))
            if len(polygon) < 6:
                return None
        else:
            SE1 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
            poly_map = cv2.erode(poly_map, SE1)
            poly_map = cv2.dilate(poly_map, SE1)
            poly_map = cv2.morphologyEx(poly_map, cv2.MORPH_CLOSE, SE1)
            idy, idx = np.where(poly_map == 1)
            xy = np.vstack((idx, idy))
            xy = np.transpose(xy)
            hull = cv2.convexHull(xy, clockwise=True)
            if hull is None:
                return None
            # reverse the order of the points
            hull = hull[::-1]
            # find the minimum-area bounding box
            rect = cv2.minAreaRect(hull)
            corners = cv2.boxPoints(rect)
            corners = np.array(corners, dtype="int")
            pts = get_tight_rect(corners, box[0], box[1],
                                 image_height, image_width, 1)
            polygon = list(map(int, [x * 1.0 for x in pts]))
        return polygon

    def visualization(self, img, polygons, words):
        cur_img = copy.deepcopy(img)
        for polygon, word in zip(polygons, words):
            pts = np.array(polygon, np.int32).reshape((-1, 1, 2))
            xmin = min(pts[:, 0, 0])
            ymin = min(pts[:, 0, 1])
            r = random.randint(0, 255)
            g = random.randint(0, 255)
            b = random.randint(0, 255)
            cv2.polylines(cur_img, [pts], True, (b, g, r))
            cv2.putText(cur_img, word, (xmin, ymin),
                        cv2.FONT_HERSHEY_TRIPLEX, 0.5, (b, g, r), 1)
        return cur_img
def test_lookup_compound(self):
    edit_distance_max = 2
    prefix_length = 7
    sym_spell = SymSpell(edit_distance_max, prefix_length)
    sym_spell.load_dictionary(self.dictionary_path, 0, 1)
    sym_spell.load_bigram_dictionary(self.bigram_path, 0, 2)

    typo = "whereis th elove"
    correction = "where is the love"
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(2, results[0].distance)
    self.assertEqual(585, results[0].count)

    typo = "the bigjest playrs"
    correction = "the biggest players"
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(2, results[0].distance)
    self.assertEqual(34, results[0].count)

    typo = "Can yu readthis"
    correction = "can you read this"
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(3, results[0].distance)
    self.assertEqual(11440, results[0].count)

    typo = ("whereis th elove hehad dated forImuch of thepast who "
            "couqdn'tread in sixthgrade and ins pired him")
    correction = ("where is the love he had dated for much of the past "
                  "who couldn't read in sixth grade and inspired him")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(9, results[0].distance)
    self.assertEqual(0, results[0].count)

    typo = "in te dhird qarter oflast jear he hadlearned ofca sekretplan"
    correction = ("in the third quarter of last year he had learned of a "
                  "secret plan")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(9, results[0].distance)
    self.assertEqual(0, results[0].count)

    typo = ("the bigjest playrs in te strogsommer film slatew ith plety "
            "of funn")
    correction = ("the biggest players in the strong summer film slate "
                  "with plenty of fun")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(9, results[0].distance)
    self.assertEqual(0, results[0].count)

    typo = "Can yu readthis messa ge despite thehorible sppelingmsitakes"
    correction = ("can you read this message despite the horrible "
                  "spelling mistakes")
    results = sym_spell.lookup_compound(typo, edit_distance_max)
    self.assertEqual(1, len(results))
    self.assertEqual(correction, results[0].term)
    self.assertEqual(10, results[0].distance)
    self.assertEqual(0, results[0].count)
class spellchecker:
    def __init__(self, max_dictionary_edit_distance, prefix_length,
                 unigram_freq_file, bigram_freq_file=None, pickle_file=None):
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=max_dictionary_edit_distance,
            prefix_length=prefix_length,
        )
        if pickle_file is not None:
            self.sym_spell.load_pickle(pickle_file)
        else:
            self.sym_spell.load_dictionary(unigram_freq_file, term_index=0,
                                           count_index=1, encoding="utf-8")
            if bigram_freq_file:
                self.sym_spell.load_bigram_dictionary(bigram_freq_file,
                                                      term_index=0,
                                                      count_index=2,
                                                      encoding="utf-8")

    def suggest(self, word, max_edit_dist=None, include_unknown=True,
                verbosity=Verbosity.CLOSEST):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        # spellcheck a single word
        suggestions = self.sym_spell.lookup(word, verbosity,
                                            max_edit_distance=max_edit_dist,
                                            include_unknown=include_unknown)
        return {
            'original_term': word,
            'suggestions': suggestions,
        }

    def suggest_compound(self, phrase, max_edit_dist=None):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        # spellcheck a whole phrase with compound splitting & merging
        suggestions = self.sym_spell.lookup_compound(
            phrase, max_edit_distance=max_edit_dist)
        return {
            'original_term': phrase,
            'suggestions': suggestions,
        }

    def tokenize(self, phrases):
        return tokenize_sentence(phrases)

    # Tokenize into individual words and return a list of suggestions for each
    def suggest_tokenize(self, phrases, max_edit_dist=None,
                         include_unknown=True, verbosity=Verbosity.CLOSEST):
        if max_edit_dist is None:
            max_edit_dist = DEFAULT_MAX_EDIT_DISTANCE
        words = self.tokenize(phrases)
        sentence_suggestions = []
        for word in words:
            suggestions = self.sym_spell.lookup(word, verbosity,
                                                max_edit_distance=max_edit_dist,
                                                include_unknown=include_unknown)
            sentence_suggestions.append({
                'original_term': word,
                'suggestions': suggestions,
            })
        return sentence_suggestions
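# A usage sketch for the wrapper above, assuming DEFAULT_MAX_EDIT_DISTANCE
# and tokenize_sentence are defined elsewhere in the module and the standard
# symspellpy frequency files are on disk (the file names are assumptions):
sc = spellchecker(max_dictionary_edit_distance=2, prefix_length=7,
                  unigram_freq_file="frequency_dictionary_en_82_765.txt",
                  bigram_freq_file="frequency_bigramdictionary_en_243_342.txt")
print(sc.suggest("memebers"))
print(sc.suggest_compound("whereis th elove"))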
warnings.filterwarnings(action="ignore", category=UserWarning, module="gensim")

TROPICAL_PATH = "tropical_dic.json"
FREQ_DICT_PATH = "frequency_dictionary_es_82_765.txt"
BIGRAM_PATH = "frequency_bigramdictionary_es_1Mnplus.txt"

with open(TROPICAL_PATH, "r") as file:
    tropical_dic = json.load(file)

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(FREQ_DICT_PATH, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(BIGRAM_PATH, term_index=0, count_index=2)

FIRST_INT = 11111111111111
LAST_INT = 99999999999999
PLACEHOLDERS_DICT = {}


@InputSeries(TextSeries)
def fillna(s: TextSeries) -> TextSeries:
    """
    Replaces not assigned values with empty string.

    Examples
    --------
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename("symspellpy", "freq_name_dic.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "freq_name_bigram.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1, encoding='utf-8')
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2, encoding='utf-8')

# lookup suggestions for multi-word input strings (supports compound
# splitting & merging); max edit distance applies per single word,
# not to the whole input string
input_term = "Ngyễn tành nm"
suggestions = sym_spell.lookup_compound(input_term, max_edit_distance=2)
# display suggestion term, edit distance, and term frequency
for suggestion in suggestions:
    print(suggestion)
import cv2
import pkg_resources
import pytesseract
from flask import Flask
from symspellpy import SymSpell

# assuming the pyspellchecker package for SpellChecker and malaya for the
# probability-based corrector
from spellchecker import SpellChecker
import malaya

import main

spell = SpellChecker()
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

# prob_corrector = malaya.spell.symspell()
prob_corrector = malaya.spell.probability()

app = Flask(__name__)
app.config['DEBUG'] = True


class FilePaths:
    "filenames and paths to data"
    fnCharList = 'model/charList.txt'
    fnAccuracy = 'model/accuracy.txt'
    fnTrain = 'data/'
    fnCorpus = 'data/corpus.txt'
def test3():
    import re
    import sys

    import pkg_resources
    from symspellpy import SymSpell, Verbosity

    sys.path.append("treatyUtil")
    from treatyUtil import spellcheck_keep_punctuation

    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
    sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging); the inputs are OCR'd long-s era text, where "ſ"
    # is frequently misread as "f"
    input_term1 = (
        "tended by one againft another upon this account, fhall\n"
        "be bury'd in perpetual Oblivion.\n"
        "III. According to this Foundation of a general and un-\n"
        "limited Amnefty, all and every the Electors of the Sa-\n"
        "cred Roman Enmpire, the Princes and States therein inclu-\n"
        "ded, the Nobility that hold immediately of the Empire,\n"
        "their Vaffals, Subjects, Citizens and Inhabitants, who\n"
        "upon occafion of the Troubles of Bohemia and Germany,\n"
        "or upon the account of Alliances contracted on one fide\n"
        "and another, may have fuffer'd any Prejudice or Damage\n"
        "from either Party, in any manner, or under any pretext\n"
        "whatfoever, either in their Domains, Goods, Fees,\n"
        "Sub-Fees, Állodials, or in their Dignities, Immunities,\n"
        "Rights and Privileges, fhal be fully re-eftablifh'd on both\n"
        "fides, in the fame Štate, both as to Spirituals and Tem-\n"
        "porals, which they enjoy'd, or could of Right enjoy be-\n"
        "fore thofe Troubles, notwithftanding all the Changes\n"
        "made to the contrary, which fhall be annul'd and remain\n"
        "void.\n"
        "But as thefe and fuch like Reftitutions ought to be al\n"
        "underftood, faving whatfoever Rights, either of Domi-\n"
        "nium directum, or Dominium utile, go along with the\n"
        "Goods which are to be reftor'd, whether Secular or Ec-\n"
        "clefiaftical, and belong to him who makes Reftitution,\n"
        "or to him to whom Reftitution is made, or to any third\n"
        "Perfon; faving alfo the Rights which lie undeternin'd ei-\n"
        "ther in the Imperial Court, or in the Imperial Chamber,\n"
    )
    input_term = (
        "God, and Safety of the Chriſtian World (the Electors,\n"
        "Princes and States of the Sacred Roman Empire being\n"
        "preſent, approving and conſenting) the Articles of Peace\n"
        "and Anity, whereof the Tenour follows.\n"
        "1. That there be a Chriſtian, univerſal\n"
        "The Re-efta. and perpetual Peace, and a true and ſincere\n"
        "bliſhment of Friendſhip and Amity between his Sacred\n"
        "Peace and A. Imperial Majeſty, the Houſe of Austria,\n"
        "mity.\n"
        "and all his Allies and Adherents, and the\n"
        "Heirs and Succeffors of each of them, chiefly the King\n"
        "of Spain, and the Electors, Princes and States of the En-\n"
        "pire, of the one ſide, and her Sacred Royal Majeſty,\n"
        "and the Kingdom of Sweden, her Allies and Adherents,\n"
        "and the Heirs and Succeſſors of each of them, eſpecially\n"
        "the moſt Chriſtian King, the reſpective Electors, Princes\n"
        "and States of the Empire, of the other ſide ; and that this\n"
        "Peace be obſerv'd and cultivated ſincerely and ſeriouſly,\n"
        "ſo that each Party may procure the Benefit, Honour and\n"
        "Advantage of one another, and thereby the Fruits of this\n"
        "Peace and Amity may be ſeen to grow up and fouriſh a-\n"
        "new, by a ſure and reciprocal maintaining of a good\n"
        "and faithful Neighbourhood between the Roman Empire\n"
        "and the Kingdom of Sweden reciprocally,\n"
        "II. That there be on both ſides à perpe-\n"
        "An Amneſty\n"
        "tua) Oblivion and Amneſty of all that has\n"
        "from all Hoffi- been done Since the beginning of theſe\n"
        "lity.\n"
        "Troubles, in what Place or in what Man-\n"
    )
    input_term2 = "God, and Safety of the Chriſtian World (the Electors,\nPrinces"

    # normalize the OCR text: unwrap lines, rejoin hyphenated words, and
    # replace the long s with a modern s
    input_term = re.sub("\n", " ", input_term)
    input_term = re.sub("- ", "", input_term)
    input_term = re.sub("ſ", "s", input_term)

    # max edit distance per lookup (per single word, not per whole input
    # string); lookup_compound corrects the spelling but drops punctuation,
    # which spellcheck_keep_punctuation is meant to preserve
    suggestions = sym_spell.lookup_compound(input_term, transfer_casing=True,
                                            ignore_non_words=True,
                                            max_edit_distance=2)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print(suggestion)
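# lookup_compound corrects the spelling but loses the original punctuation;
# spellcheck_keep_punctuation (imported in test3 above) presumably restores
# it by mapping corrected words back word by word. A minimal standalone
# sketch of that idea, which falls back to the corrected text whenever
# compound splitting or merging changes the word count:
import re


def keep_punctuation(original: str, corrected: str) -> str:
    """Map corrected words back onto the original, punctuated text."""
    word_re = re.compile(r"(\w+)", re.U)
    in_words = word_re.findall(original)
    out_words = word_re.findall(corrected)
    if len(in_words) != len(out_words):
        # word counts no longer line up one-to-one, so punctuation
        # positions cannot be recovered reliably
        return corrected
    # splitting on a capture group keeps the separators, so words and
    # punctuation interleave in `tokens`
    tokens = word_re.split(original)
    it = iter(out_words)
    return "".join(next(it) if word_re.fullmatch(tok) else tok for tok in tokens)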