def __init__(
        self,
        # List of documents or sentences, with preprocessing already done
        # (e.g. using nwae.lang.preprocessing.TxtPreprocessor).
        # Words however are not split into array yet, just separated by the word
        # separator specified by language in nwae.lang.preprocessing.BasicPreprocessor
        docs,
        # List of labels, one per document
        labels,
        # List of language codes, one per document. If None we assume all English
        langs = None,
):
    """Store docs/labels/langs and validate that all three have equal length.

    Raises Exception when len(docs) != len(labels) or len(docs) != len(langs).
    """
    self.docs = docs
    self.labels = labels
    self.langs = langs
    if self.langs is None:
        # Assume all English
        self.langs = [LangFeatures.LANG_EN] * len(self.docs)
    if (len(self.docs) != len(self.labels)) or (len(self.docs) != len(self.langs)):
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Length of docs ' + str(len(self.docs))
            + ' must equal labels shape ' + str(len(self.labels))
            # BUGFIX: was str(len(langs)) on the raw parameter — when langs is None
            # (defaulted above) a docs/labels mismatch raised TypeError from
            # len(None) while building this message, hiding the real error.
            + ' and langs shape ' + str(len(self.langs))
        )
    self.lang_features = LangFeatures()
    # The docs/sentences split into lists of words; populated later, not here
    self.docs_split = None
    return
def __init__(self):
    """Load language-detection resources: alphabet charsets, common-word models,
    per-language lemmatizers, and a profiler."""
    self.lang_features = LangFeatures()

    # Map alphabet name to its unicode character set array
    self.alphabet_dict = {
        alphabet: LangCharacters.get_alphabet_charset(alphabet=alphabet)
        for alphabet in self.TESTS_BY_ORDER
    }
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Alphabets used: ' + str(self.alphabet_dict.keys())
    )

    self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
    Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

    # Common-word models for the languages we support
    self.common_words = {
        LangFeatures.LANG_EN: English(),
        LangFeatures.LANG_ES: Spanish(),
        LangFeatures.LANG_FR: French(),
        LangFeatures.LANG_ID: Indonesian(),
        LangFeatures.LANG_VI: Vietnamese(),
    }

    # Stemmers/lemmatizers: only languages with verb conjugation get one;
    # all others map to None. A failed lemmatizer load is logged, not fatal.
    self.word_stemmer = {}
    for lang in self.SUPPORTED_LANGS:
        lang_have_verb_conj = self.lang_features.have_verb_conjugation(lang=lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
        )
        self.word_stemmer[lang] = None
        if not lang_have_verb_conj:
            continue
        try:
            self.word_stemmer[lang] = Lemmatizer(lang=lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
            )
        except Exception as ex_stemmer:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: ' \
                     + str(ex_stemmer) + '.'
            Log.warning(errmsg)

    self.profiler_detect_alp = ProfilingHelper(profiler_name=str(self.__class__))
    return
def __init__(
        self,
        lang,
        # This words list can be a full dictionary (for languages with a natural
        # space as word separator) or just a common-words list in our usage
        # application context for languages without a natural space separator.
        # For languages without space the word splitting itself might be wrong,
        # so the spelling correction algorithm might need to look at previous
        # or subsequent words.
        words_list,
        # Directory and identifier string for looking up EIDF files
        dir_path_model=None,
        identifier_string=None,
        # Option to pass in an EIDF DataFrame instead of dir/identifier lookup
        eidf_dataframe=None,
        do_profiling=False):
    """Initialize spelling correction for one language, wrapping SpellCheckWord."""
    # Normalize the language code to ISO 639-1 before storing
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    self.words_list = words_list
    self.dir_path_model = dir_path_model
    self.identifier_string = identifier_string
    self.eidf_dataframe = eidf_dataframe
    self.do_profiling = do_profiling
    # NOTE(review): this passes the raw `lang` argument rather than the mapped
    # self.lang — presumably both are accepted by LangFeatures; confirm intended.
    self.sep_type = LangFeatures().get_word_separator_type(lang=lang)
    # The underlying word-level spell checker does the actual correction work
    self.spell_check_word = SpellCheckWord(
        lang              = self.lang,
        words_list        = self.words_list,
        dir_path_model    = self.dir_path_model,
        identifier_string = self.identifier_string,
        eidf_dataframe    = self.eidf_dataframe,
        do_profiling      = self.do_profiling,
    )
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Initialize Spelling Correction for "' + str(lang)
        + '", separator type "' + str(self.sep_type) + '"'
    )
    return
def __init__(self, lang):
    """Store the normalized language code and, for languages with verb
    conjugation, try to load a lemmatizer (failure is logged, not fatal)."""
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    # Word collections are filled in later, not by the constructor
    self.raw_words = None
    self.common_words = None

    lfobj = LangFeatures()
    self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Lang "' + str(self.lang) + '" verb conjugation = '
        + str(self.lang_have_verb_conj) + '.'
    )

    self.word_stemmer = None
    if not self.lang_have_verb_conj:
        # No conjugation in this language, nothing to lemmatize
        return
    try:
        self.word_stemmer = Lemmatizer(lang=self.lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
        )
    except Exception as ex_stemmer:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + ' stemmer/lemmatizer failed to initialize: '
            + str(ex_stemmer) + '.'
        )
        self.word_stemmer = None
    return