Exemplo n.º 1
0
    def __init__(self, dict_path=None, name='Spelling_Aug', aug_min=1, aug_max=10,
                 aug_p=0.3, stopwords=None, tokenizer=None, reverse_tokenizer=None,
                 include_reverse=True, stopwords_regex=None, verbose=0):
        """Set up a substitution augmenter backed by a spelling dictionary.

        ``name``, ``aug_min``, ``aug_max``, ``aug_p``, ``stopwords``,
        ``tokenizer``, ``reverse_tokenizer``, ``stopwords_regex`` and
        ``verbose`` are forwarded unchanged to the parent initializer.

        :param dict_path: Location of the spelling dictionary. When falsy, the
            bundled ``word/spelling/spelling_en.txt`` under the library
            resource directory is used instead.
        :param include_reverse: Stored on the instance as-is.
            NOTE(review): presumably consumed by ``get_model`` -- confirm.
        """
        # The parent always runs on CPU and never returns change details.
        parent_kwargs = {
            'action': Action.SUBSTITUTE,
            'name': name,
            'aug_min': aug_min,
            'aug_max': aug_max,
            'aug_p': aug_p,
            'stopwords': stopwords,
            'stopwords_regex': stopwords_regex,
            'tokenizer': tokenizer,
            'reverse_tokenizer': reverse_tokenizer,
            'verbose': verbose,
            'device': 'cpu',
            'include_detail': False,
        }
        super().__init__(**parent_kwargs)

        # Fall back to the dictionary shipped with the library.
        if not dict_path:
            dict_path = os.path.join(
                LibraryUtil.get_res_dir(), 'word', 'spelling', 'spelling_en.txt')

        self.include_reverse = include_reverse
        self.dict_path = dict_path
        # Both attributes above are set before the model is loaded.
        self.model = self.get_model(force_reload=False)
Exemplo n.º 2
0
    def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10,
                 aug_char_p=0.3, aug_word_p=0.3, aug_word_min=1, aug_word_max=10,
                 stopwords=None, tokenizer=None, reverse_tokenizer=None,
                 include_special_char=True, include_numeric=True,
                 include_upper_case=True, lang="en", verbose=0,
                 stopwords_regex=None, model_path=None, min_char=4,
                 include_detail=False):
        """Set up a substitution augmenter driven by a keyboard mapping.

        The ``aug_char_*``, ``aug_word_*``, ``min_char``, ``stopwords``,
        ``stopwords_regex``, ``tokenizer``, ``reverse_tokenizer``, ``verbose``,
        ``include_special_char`` and ``include_detail`` arguments are forwarded
        unchanged to the parent initializer.

        :param include_special_char: Stored on the instance and passed to
            ``get_model``.
        :param include_numeric: Stored on the instance and passed to
            ``get_model``.
        :param include_upper_case: Stored on the instance and passed to
            ``get_model``.
        :param lang: Language code. Only checked when ``model_path`` is None,
            in which case it must be ``'en'`` or ``'th'``.
        :param model_path: Explicit path of the keyboard mapping. When None,
            ``char/keyboard/<lang>.json`` under the library resource directory
            is used.
        :raises ValueError: If ``model_path`` is None and ``lang`` is neither
            ``'en'`` nor ``'th'``.
        """
        super().__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose,
            min_char=min_char,
            aug_char_min=aug_char_min, aug_char_max=aug_char_max, aug_char_p=aug_char_p,
            aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
            stopwords=stopwords, stopwords_regex=stopwords_regex,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer,
            include_special_char=include_special_char,
            include_detail=include_detail,
        )

        self.lang = lang
        # TODO: support other type of keyboard
        self.keyboard_type = 'qwerty'
        self.include_lower_case = True
        self.include_upper_case = include_upper_case
        self.include_numeric = include_numeric
        self.include_special_char = include_special_char

        if model_path is not None:
            # Caller supplied a mapping: no language check is performed.
            self.model_path = model_path
        else:
            if lang not in ('en', 'th'):
                unsupported_msg = (
                    'Only support en and th now. You may provide the keyboard mapping '
                    'such that we can support "{}"')
                raise ValueError(unsupported_msg.format(lang))
            bundled_file = lang + '.json'
            self.model_path = os.path.join(
                LibraryUtil.get_res_dir(), 'char', 'keyboard', bundled_file)

        self.model = self.get_model(
            include_special_char, include_numeric, include_upper_case,
            lang, self.model_path)
Exemplo n.º 3
0
    def __init__(self, name='Keyboard_Aug', aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
                 aug_word_p=0.3, aug_word_min=1, aug_word_max=10, stopwords=None,
                 tokenizer=None, reverse_tokenizer=None, include_special_char=True, include_numeric=True,
                 include_upper_case=True, lang="en", verbose=0, stopwords_regex=None, model_path=None,
                 min_char=4):
        """Set up a substitution augmenter driven by a keyboard mapping.

        The ``aug_char_*``, ``aug_word_*``, ``min_char``, ``stopwords``,
        ``stopwords_regex``, ``tokenizer``, ``reverse_tokenizer``, ``verbose``
        and ``include_special_char`` arguments are forwarded unchanged to the
        parent initializer.

        :param include_special_char: Stored on the instance and passed to
            ``get_model``.
        :param include_numeric: Stored on the instance and passed to
            ``get_model``.
        :param include_upper_case: Stored on the instance and passed to
            ``get_model``.
        :param lang: Language code. Only checked when ``model_path`` is None;
            it must then match a ``<lang>.json`` file in the ``char/keyboard``
            folder of the library resource directory.
        :param model_path: Explicit path of the keyboard mapping. When given,
            ``lang`` is not validated.
        :raises ValueError: If ``model_path`` is None and no bundled mapping
            exists for ``lang``.
        """
        super().__init__(
            action=Action.SUBSTITUTE, name=name, min_char=min_char, aug_char_min=aug_char_min, aug_char_max=aug_char_max,
            aug_char_p=aug_char_p, aug_word_min=aug_word_min, aug_word_max=aug_word_max, aug_word_p=aug_word_p,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, stopwords=stopwords, device='cpu',
            verbose=verbose, stopwords_regex=stopwords_regex, include_special_char=include_special_char,
            include_detail=False)

        # TODO: support other type of keyboard
        self.keyboard_type = 'qwerty'
        self.include_special_char = include_special_char
        self.include_numeric = include_numeric
        self.include_upper_case = include_upper_case
        self.include_lower_case = True
        self.lang = lang

        if model_path is None:
            keyboard_dir = os.path.join(LibraryUtil.get_res_dir(), "char", "keyboard")
            # Only real "<lang>.json" files count as supported languages.
            # splitext strips the suffix alone, so ".json" elsewhere in a name
            # is untouched and non-JSON entries are ignored.
            supported_langs = sorted(
                stem
                for stem, ext in map(os.path.splitext, os.listdir(keyboard_dir))
                if ext == ".json"
            )
            if lang not in supported_langs:
                # Report what is actually available instead of a fixed list.
                raise ValueError(
                    'Only support {} now. You may provide the keyboard mapping '
                    'such that we can support "{}"'.format(
                        ", ".join(supported_langs), lang)
                )
            self.model_path = os.path.join(keyboard_dir, lang + ".json")
        else:
            self.model_path = model_path
        self.model = self.get_model(include_special_char, include_numeric, include_upper_case, lang, self.model_path)
Exemplo n.º 4
0
    def get_model(cls, dict_of_path):
        """Build an ``nmc.Ocr`` model from a mapping, a JSON file or the default.

        :param dict_of_path: One of:
            - a falsy value (``None``, ``''``, empty dict): the bundled
              ``char/ocr/en.json`` mapping is loaded;
            - a ``dict`` (or any dict subclass): used directly as the mapping;
            - anything else: handed to ``ReadUtil.read_json`` as the location
              of a JSON mapping.
        :return: An ``nmc.Ocr`` instance wrapping the selected mapping.
        :raises ValueError: If ``ReadUtil.read_json`` yields a falsy result
            for the given location.
        """
        # Use default
        if not dict_of_path:
            default_path = os.path.join(LibraryUtil.get_res_dir(), 'char', 'ocr', 'en.json')
            model = ReadUtil.read_json(default_path)
            return nmc.Ocr(model=model)

        # Use dict. isinstance also accepts dict subclasses (e.g. OrderedDict),
        # which would otherwise be mistaken for a file path below.
        if isinstance(dict_of_path, dict):
            return nmc.Ocr(model=dict_of_path)

        # Use json from file
        model = ReadUtil.read_json(dict_of_path)
        if not model:
            raise ValueError('The dict_of_path does not exist. Please check "{}"'.format(dict_of_path))
        return nmc.Ocr(model=model)