def __init__(
        self,
        # We support a single lang (str) or a list of languages,
        # one for each sentence in text_segmented_list
        lang_str_or_list,
        # A list of sentences in str format, but split by words either with our
        # default word delimiter DEFAULT_WORD_SPLITTER or space or whatever.
        # Or can also be a list of sentences in already split list format
        text_segmented_list,
):
    """
    Keep the text list and normalize the language(s) to ISO 639-1 codes,
    producing one language code per sentence in self.lang_list.

    Raises Exception if a language list is given whose length differs from
    the number of sentences.
    """
    self.lang = lang_str_or_list
    self.text_segmented_list = text_segmented_list
    self.lang_list = None
    # isinstance is the idiomatic type check (also covers subclasses),
    # instead of the original "type(x) in (list, tuple)"
    if isinstance(self.lang, (list, tuple)):
        # One language per sentence, each mapped to its ISO 639-1 code
        self.lang_list = [
            LangFeatures.map_to_lang_code_iso639_1(lang_code=l) for l in self.lang
        ]
        if len(self.lang_list) != len(self.text_segmented_list):
            raise Exception(
                str(TextProcessor.__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Language list & text segmented list must have same length! '
            )
    else:
        self.lang = LangFeatures.map_to_lang_code_iso639_1(
            lang_code=self.lang
        )
        # Single language replicated for every sentence
        self.lang_list = [self.lang] * len(self.text_segmented_list)
    lg.Log.debugdebug(
        str(TextProcessor.__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Text segmented list: ' + str(self.text_segmented_list)
    )
    return
def __init__(self):
    """
    Load everything needed for language detection: the unicode charset of
    every alphabet we test for, common-word lists for the Latin-alphabet
    languages, and a lemmatizer per language that has verb conjugation.
    """
    self.lang_features = LangFeatures()
    # Map alphabet name to unicode character set array
    self.alphabet_dict = {}
    for alp in self.TESTS_BY_ORDER:
        self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(alphabet=alp)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Alphabets used: ' + str(self.alphabet_dict.keys())
    )
    self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
    Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))
    # Load common words (dict literal instead of repeated item assignment)
    self.common_words = {
        LangFeatures.LANG_EN: English(),
        LangFeatures.LANG_ES: Spanish(),
        LangFeatures.LANG_FR: French(),
        LangFeatures.LANG_ID: Indonesian(),
        LangFeatures.LANG_VI: Vietnamese(),
    }
    # Load stemmers/lemmatizers, only meaningful for languages with verb conjugation
    self.word_stemmer = {}
    for lang in self.SUPPORTED_LANGS:
        lang_have_verb_conj = self.lang_features.have_verb_conjugation(lang=lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
        )
        self.word_stemmer[lang] = None
        if lang_have_verb_conj:
            try:
                self.word_stemmer[lang] = Lemmatizer(lang=lang)
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                )
            except Exception as ex_stemmer:
                # Best-effort: detection still works without a stemmer.
                # Fixed: closing quote after the language code was missing in this message.
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
    self.profiler_detect_alp = ProfilingHelper(profiler_name=str(self.__class__))
    return
def __init__(
        self,
        # List of documents or sentences, with preprocessing already done
        # (e.g. using nwae.lang.preprocessing.TxtPreprocessor)
        # Words however are not split into array yet, just separated by word separator
        # specified by language in nwae.lang.preprocessing.BasicPreprocessor
        docs,
        # List of labels, one per doc
        labels,
        # One language code per doc. If None we assume all English
        # (and use space as word splitter).
        langs=None,
):
    """
    Keep docs/labels/langs, defaulting langs to English for every doc.

    Raises Exception when docs, labels and langs do not all have the same length.
    """
    self.docs = docs
    self.labels = labels
    self.langs = langs
    if self.langs is None:
        # Assume all English
        self.langs = [LangFeatures.LANG_EN] * len(self.docs)
    if (len(self.docs) != len(self.labels)) or (len(self.docs) != len(self.langs)):
        raise Exception(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Length of docs ' + str(len(self.docs))
            + ' must equal labels shape ' + str(len(self.labels))
            # Fixed bug: this used len(langs) on the raw argument, which raises
            # TypeError when langs is None (the defaulted self.langs must be used).
            + ' and langs shape ' + str(len(self.langs))
        )
    self.lang_features = LangFeatures()
    # We need to split the docs/sentences into a list of words (done lazily later)
    self.docs_split = None
    return
def __init__(
        self,
        lang,
        # This words list can be a full dictionary (for languages with natural space
        # as word separator) or just a common words list in our usage application context
        # for languages without a natural space as word separator.
        # This is because for languages without space, the word splitting itself might
        # be wrong, and the spelling correction algorithm might need to look at previous
        # or subsequent words.
        words_list,
        # Directory and identifier string for looking up EIDF files
        dir_path_model=None,
        identifier_string=None,
        # Option to pass in EIDF DataFrame instead of using directory and identifier string
        eidf_dataframe=None,
        do_profiling=False):
    """
    Set up spelling correction for one language, delegating word-level
    correction to SpellCheckWord.
    """
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    self.words_list = words_list
    self.dir_path_model = dir_path_model
    self.identifier_string = identifier_string
    self.eidf_dataframe = eidf_dataframe
    self.do_profiling = do_profiling
    # Fixed inconsistency: look up the separator type using the normalized
    # ISO 639-1 code (self.lang), like every other use in this class,
    # instead of the raw "lang" argument.
    self.sep_type = LangFeatures().get_word_separator_type(lang=self.lang)
    self.spell_check_word = SpellCheckWord(
        lang=self.lang,
        words_list=self.words_list,
        dir_path_model=self.dir_path_model,
        identifier_string=self.identifier_string,
        eidf_dataframe=self.eidf_dataframe,
        do_profiling=self.do_profiling)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Initialize Spelling Correction for "' + str(lang)
        + '", separator type "' + str(self.sep_type) + '"')
    return
def __init__(
        self,
        lang
):
    """
    Remember the language (normalized to ISO 639-1) and prepare a
    stemmer/lemmatizer if the language has verb conjugation.
    """
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    # Filled in later by the word-loading routines
    self.raw_words = None
    self.common_words = None
    lfobj = LangFeatures()
    self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Lang "' + str(self.lang) + '" verb conjugation = ' + str(self.lang_have_verb_conj) + '.'
    )
    self.word_stemmer = None
    if self.lang_have_verb_conj:
        try:
            self.word_stemmer = Lemmatizer(lang=self.lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
            )
        except Exception as ex_stemmer:
            # Best-effort: we continue without a stemmer.
            # Fixed: closing quote after the language code was missing in this message.
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer failed to initialize: ' \
                     + str(ex_stemmer) + '.'
            Log.warning(errmsg)
            self.word_stemmer = None
    return
def __init__(self, lang, dirpath_synonymlist, postfix_synonymlist, add_latin_equiv_words=False):
    """
    Remember where the synonym list lives and how to find it; the actual
    word -> root-word mapping starts empty and is populated later.
    """
    # Normalize whatever language variant the caller passed to ISO 639-1
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    # Directory holding the synonym list files, plus their filename postfix
    self.dirpath_synonymlist = dirpath_synonymlist
    self.postfix_synonymlist = postfix_synonymlist
    # Whether to also add Latin-equivalent forms of the words
    self.add_latin_equiv_words = add_latin_equiv_words
    # word -> root word, filled when the synonym list is loaded
    self.map_word_to_rootword = {}
    return
def __init__(self, lang=LangFeatures.LANG_EN):
    """
    Download the NLTK comtrans corpus needed by this class.

    Raises Exception (with the underlying cause in the message) when the
    download fails or reports failure.
    """
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    # Corpus servers may present certificates we cannot verify
    Ssl.disable_ssl_check()
    try:
        download_ok = nltk.download(Corpora.NLTK_COMTRANS)
        if not download_ok:
            raise Exception('Download "' + str(Corpora.NLTK_COMTRANS) + '" returned False')
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': NLTK download of "' + Corpora.NLTK_COMTRANS + '" OK.')
    except Exception as ex:
        # Wrap any failure (including the "returned False" case above)
        # into one uniform error message
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': NLTK download of "' + str(Corpora.NLTK_COMTRANS) + '" exception: ' \
                 + str(ex) + '.'
        Log.error(errmsg)
        raise Exception(errmsg)
    return
def __init__(
        self,
        lang,
        # List of words, either from a dictionary or any other source
        words_list,
        # Directory and identifier string for looking up EIDF files
        dir_path_model=None,
        identifier_string=None,
        # Option to pass in EIDF DataFrame instead of using directory and identifier string
        eidf_dataframe=None,
        # If True, EIDF word weights are loaded below and used for weighting
        use_word_weighting=True,
        do_profiling=False):
    """
    Build the word trie from words_list and, when use_word_weighting is True,
    load EIDF word weights (from the given DataFrame or from storage).

    Raises Exception when word weighting is requested but no EIDF data can
    be read.
    """
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    self.words_list = words_list
    self.dir_path_model = dir_path_model
    self.identifier_string = identifier_string
    self.use_word_weighting = use_word_weighting
    self.eidf_dataframe = eidf_dataframe
    self.do_profiling = do_profiling
    # Trie over the whole word list; TrieNode also tracks global word/node counts
    self.trie = TrieNode.build_trie_node(words=self.words_list)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Read ' + str(TrieNode.WORD_COUNT) + ' words, ' + str(TrieNode.NODE_COUNT)
        + ' trie nodes from wordlist ' + str(self.words_list[0:50])
        + ' (first 50 of ' + str(len(self.words_list)) + ')')
    if not self.use_word_weighting:
        # No weighting requested: leave EIDF arrays unset
        self.eidf_words = None
        self.eidf_value = None
    else:
        try:
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Initializing EIDF object.. try to read from file..')
            # Try to read from file
            df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                data_pd_dataframe=self.eidf_dataframe,
                dir_path_model=self.dir_path_model,
                identifier_string=self.identifier_string,
                # No need to reorder the words in EIDF file
                x_name=None)
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Successfully Read EIDF from file from directory "' + str(self.dir_path_model)
                + '" for model "' + str(self.identifier_string) + '".')
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': EIDF initialized as:' + str(df_eidf_file))
            # Parallel arrays: word i has EIDF weight eidf_value[i]
            self.eidf_words = np.array(
                df_eidf_file[eidf.Eidf.STORAGE_COL_X_NAME], dtype=str)
            self.eidf_value = np.array(
                df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF], dtype=float)
        except Exception as ex_eidf:
            # Weighting was explicitly requested, so failure to load is fatal
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': No EIDF from file available. Exception ' + str(ex_eidf) + '.'
            Log.error(errmsg)
            raise Exception(errmsg)
    return
def __init__(self,
             model_name,
             identifier_string,
             dir_path_model,
             lang,
             dirpath_synonymlist,
             postfix_synonymlist,
             dir_wordlist,
             postfix_wordlist,
             dir_wordlist_app,
             postfix_wordlist_app,
             word_freq_model=FeatureVector.COL_FREQUENCY,
             confidence_level_scores=None,
             do_spelling_correction=False,
             do_word_stemming=True,
             do_profiling=False,
             lang_additional=()):
    """
    Start loading the prediction model and record all settings needed to
    later initialize text processors (word lists, synonym lists) once the
    model is ready.

    lang is the main language; lang_additional lists extra languages
    (normalized to ISO 639-1, deduplicated, main language removed).
    """
    super(PredictClass, self).__init__()
    self.model_name = model_name
    self.identifier_string = identifier_string
    self.dir_path_model = dir_path_model
    self.lang_main = lang
    self.dirpath_synonymlist = dirpath_synonymlist
    self.postfix_synonymlist = postfix_synonymlist
    self.dir_wordlist = dir_wordlist
    self.postfix_wordlist = postfix_wordlist
    self.dir_wordlist_app = dir_wordlist_app
    self.postfix_wordlist_app = postfix_wordlist_app
    self.word_freq_model = word_freq_model
    self.do_spelling_correction = do_spelling_correction
    self.do_word_stemming = do_word_stemming
    self.do_profiling = do_profiling
    # Treat None the same as "no additional languages"
    if lang_additional is None:
        lang_additional = ()
    # Normalize all additional languages to ISO 639-1 codes
    self.lang_additional = [
        LangFeatures.map_to_lang_code_iso639_1(lang_code=l) for l in lang_additional
    ]
    # The main language must not also appear in the additional list
    try:
        self.lang_additional.remove(self.lang_main)
    except ValueError:
        pass
    # Deduplicate. NOTE(review): set() does not preserve order — confirm
    # the order of additional languages is irrelevant downstream.
    self.lang_additional = list(set(self.lang_additional))
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Predictor class initialization using model "' + str(self.identifier_string)
        + '", word freq model "' + str(self.word_freq_model)
        + '", main language "' + str(self.lang_main)
        + '", additional languages: ' + str(self.lang_additional))
    self.model = ModelHelper.get_model(
        model_name=self.model_name,
        # We will load the model params from file/etc from trained model
        model_params=None,
        identifier_string=self.identifier_string,
        dir_path_model=self.dir_path_model,
        training_data=None,
        confidence_level_scores=confidence_level_scores,
        do_profiling=self.do_profiling)
    # Model loads asynchronously in its own thread
    self.model.start()
    # Keep track if model reloaded. This counter is manually updated by this class.
    self.model_last_reloaded_counter = 0
    # Guards (re)initialization of the text processors below
    self.load_text_processor_mutex = threading.Lock()
    # After loading model, we still need to load word lists, etc.
    self.is_all_initializations_done = False
    #
    # We initialize word segmenter and synonym list after the model is ready
    # because it requires the model features so that root words of synonym lists
    # are only from the model features
    #
    self.predict_class_txt_processor = None
    self.lang_detect = None
    self.count_predict_calls = 0
    # Wait for model to be ready to load synonym & word lists.
    # NOTE(review): self.start() implies this class is itself a thread
    # (consistent with super().__init__() above) — confirm against the class header.
    self.start()
    return
class LangDetect:
    """
    Detect the language(s) of a text.

    Strategy: first detect which alphabet(s) the characters belong to; for
    alphabets used by a single language that already decides the answer,
    otherwise (Latin family) compare the words — and their lemmatized
    forms — against per-language common-word lists.
    """

    # Languages this detector can return
    SUPPORTED_LANGS = (
        LangFeatures.LANG_KO, LangFeatures.LANG_JA, LangFeatures.LANG_RU,
        LangFeatures.LANG_ZH, LangFeatures.LANG_TH, LangFeatures.LANG_EN,
        LangFeatures.LANG_ES, LangFeatures.LANG_FR, LangFeatures.LANG_VI,
        LangFeatures.LANG_ID,
    )

    # Minimum fraction of words that must hit a language's common-word list
    # for that language to be chosen
    THRESHOLD_PCT_WORDS_IN_MOST_COMMON = 0.15

    # We break text into these blocks
    TEXT_BLOCK_LEN = 10
    # Default covers 30% of blocks (e.g. if there are 10 blocks, we will randomly pick 3)
    DEFAULT_TEST_COVERAGE_PCT = 0.3
    # Not more than 5 blocks we will test to ensure speed
    DEFAULT_TEST_MAX_RANGE_BLOCKS = 5

    TEST_LATIN_BY_ORDER = [
        LangFeatures.ALPHABET_LATIN_AZ,
        # We also detect these special Vietnamese characters, to increase accuracy for Vietnamese
        LangFeatures.ALPHABET_LATIN_VI,
        # This Latin that covers all must be last to test
        LangFeatures.ALPHABET_LATIN
    ]
    TEST_CYRILLIC_BY_ORDER = [
        LangFeatures.ALPHABET_CYRILLIC
    ]
    TEST_HANGUL_BY_ORDER = [
        LangFeatures.ALPHABET_HANGUL
    ]
    TEST_JAPANESE_BY_ORDER = [
        # No point to test CJK
        LangFeatures.ALPHABET_HIRAGANA_KATAKANA,
    ]
    TEST_CJK_BY_ORDER = [
        LangFeatures.ALPHABET_CJK
    ]
    TEST_THAI_BY_ORDER = [
        LangFeatures.ALPHABET_THAI
    ]

    """
    Notes:
    - Need to test CJK first, then only Japanese that also contains CJK
    """
    TESTS_BY_ORDER = TEST_LATIN_BY_ORDER \
        + TEST_CYRILLIC_BY_ORDER \
        + TEST_HANGUL_BY_ORDER \
        + TEST_CJK_BY_ORDER \
        + TEST_JAPANESE_BY_ORDER \
        + TEST_THAI_BY_ORDER

    def __init__(self):
        """
        Load alphabet charsets, common-word lists for Latin-family languages,
        and lemmatizers for languages with verb conjugation.
        """
        self.lang_features = LangFeatures()
        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(alphabet=alp)
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )
        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))
        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()
        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(lang=lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(lang=lang)
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    # NOTE(review): closing quote after the language code is
                    # missing in this message (cosmetic log defect).
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)
        self.profiler_detect_alp = ProfilingHelper(profiler_name=str(self.__class__))
        return

    #
    # Only for languages with space as word separator
    # Or in the case of Vietnamese, it will split by syllables
    #
    def __segment_words(
            self,
            text
    ):
        # Lowercase, split on single spaces, then strip punctuation
        sent = StringUtils.trim(text)
        sent = sent.lower()
        sent = sent.split(' ')
        # Split out punctuations
        sent = BasicPreprocessor.clean_punctuations(sentence=sent)
        return sent

    #
    # Algorithm description
    # 1. Alphabet detection
    #    i)  If the text belongs to languages that use no space to split
    #        words/syllables, that immediately determines the language.
    #    ii) Then for Latin languages, compare the language's common words
    #        against the given text.
    #
    def detect(
            self,
            text,
            test_coverage_pct=DEFAULT_TEST_COVERAGE_PCT,
            max_test_coverage_len=DEFAULT_TEST_MAX_RANGE_BLOCKS * TEXT_BLOCK_LEN,
            detailed=False
    ):
        """
        Return a list of detected language codes for text (possibly empty),
        or None when the text is empty / no alphabet could be detected.
        """
        det_start_time = Profiling.start()
        text = str(text)
        if len(text) == 0:
            return None
        #
        # First step
        #
        alps = self.__detect_alphabet_type(
            text=text,
            test_coverage_pct=test_coverage_pct,
            max_test_coverage_len=max_test_coverage_len
        )
        # Either None type or empty dict
        if not alps:
            return None
        # Return value in this format ['hiragana_katakana', 'cjk'],
        # ordered most-frequent first (see __detect_alphabet_type)
        detected_top_alps = list(alps.keys())
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Top alphabets = ' + str(detected_top_alps)
        )
        # First, special algorithm: fully "manual" handling of exceptions
        # to the general rules
        """Первый специальный алгоритм - совсем "вручную" обрабатывает исключения из общих правил"""
        pos_alphabets_manual = self.detect_via_manual_rules(detected_top_alphabet_names=detected_top_alps)
        if pos_alphabets_manual is not None:
            self.profiler_detect_alp.profile_time(
                start_time=det_start_time,
                additional_info='Manual detect lang "' + str(pos_alphabets_manual) + '" for "' + str(text) + '"'
            )
            return pos_alphabets_manual
        # Second, general algorithm: loop over the most frequent alphabet types
        """Второй общий алгоритм - цикл по наиболее частым типам алфавита"""
        # Loop by the top detected alphabet types
        loop_top_x = 2
        loop_counter = 0
        while loop_counter < loop_top_x:
            if len(detected_top_alps) > loop_counter:
                loop_alp = detected_top_alps[loop_counter]
            else:
                break
            loop_counter += 1
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Loop ' + str(loop_counter) + ' alphabet "' + str(loop_alp) + '"'
            )
            # Get possible languages for this alphabet
            possible_langs_for_alphabet = self.lang_features.get_languages_for_alphabet_type(
                alphabet=loop_alp
            )
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Possible languages for alphabet "' + str(loop_alp)
                + '": ' + str(possible_langs_for_alphabet)
            )
            # No dispute when only 1 possible language for given alphabet
            if len(possible_langs_for_alphabet) == 1:
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Only 1 possible language for alphabet: ' + str(possible_langs_for_alphabet)
                )
                self.profiler_detect_alp.profile_time(
                    start_time=det_start_time,
                    additional_info='Detect lang "' + str(possible_langs_for_alphabet)
                                    + '" for "' + str(text) + '"'
                )
                return possible_langs_for_alphabet
            det_langs = []
            #
            # From alphabets detected, try to determine language
            #
            if loop_alp in self.TEST_HANGUL_BY_ORDER:
                det_langs = self.detect_lang_from_hangul(text=text, detailed=detailed)
            # Check Japanese first before CJK because CJK is a subset of Japanese
            elif loop_alp in self.TEST_JAPANESE_BY_ORDER:
                det_langs = self.detect_lang_from_japanese(text=text, detailed=detailed)
            elif loop_alp in self.TEST_CYRILLIC_BY_ORDER:
                det_langs = self.detect_lang_from_cyrillic(text=text, detailed=detailed)
            elif loop_alp in self.TEST_THAI_BY_ORDER:
                det_langs = self.detect_lang_from_thai_alphabet(text=text, detailed=detailed)
            #
            # Alphabet belongs to the Latin family
            #
            elif loop_alp in self.TEST_LATIN_BY_ORDER:
                # Almost all Latin Family languages will have LatinAZ come out tops first
                if loop_alp == LangFeatures.ALPHABET_LATIN_AZ:
                    det_langs = self.detect_lang_from_latin_az(
                        text=text,
                        detected_alphabets_present=detected_top_alps
                    )
                    if not det_langs:
                        # We extend the search to all Latin if can't find anything
                        det_langs = self.detect_lang_from_latin(text=text)
            elif loop_alp == LangFeatures.ALPHABET_CJK:
                det_langs = self.detect_lang_from_cjk(text=text)
            # If have result, return the result and quit the loop
            if det_langs:
                self.profiler_detect_alp.profile_time(
                    start_time=det_start_time,
                    additional_info='Detect lang "' + str(det_langs) + '" for "' + str(text) + '"'
                )
                return det_langs
        self.profiler_detect_alp.profile_time(
            start_time=det_start_time,
            additional_info='Detect lang "' + str([]) + '" for "' + str(text) + '"'
        )
        return []

    def detect_via_manual_rules(
            self,
            detected_top_alphabet_names,
    ):
        """
        Hand-written exceptions to the general rules. Currently: the
        combination of CJK + hiragana/katakana alphabets means Japanese.
        Returns a language list, or None when no manual rule applies.
        """
        # Don't change the original order of detected langs
        list_copy = detected_top_alphabet_names.copy()
        list_copy.sort()
        list_jap = [LangFeatures.ALPHABET_CJK, LangFeatures.ALPHABET_HIRAGANA_KATAKANA]
        list_jap.sort()
        # Order-insensitive comparison via the sorted copies
        if list_copy == list_jap:
            return [LangFeatures.LANG_JA]
        return None

    def detect_lang_from_hangul(
            self,
            text,
            detailed=False,
    ):
        # Hangul implies Korean; no finer distinction implemented
        if not detailed:
            return [LangFeatures.LANG_KO]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_japanese(
            self,
            text,
            detailed=False,
    ):
        # TODO Handle the whole cyrillic family
        if not detailed:
            return [LangFeatures.LANG_JA]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_cyrillic(
            self,
            text,
            detailed=False,
    ):
        # TODO Handle the whole cyrillic family
        if not detailed:
            return [LangFeatures.LANG_RU]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_cjk(
            self,
            text,
            detailed=False,
    ):
        # TODO Differentiate Chinese (simplified, traditional, etc.), Japanese, ..
        if not detailed:
            return [LangFeatures.LANG_ZH]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_thai_alphabet(
            self,
            text,
            detailed=False,
    ):
        # TODO Handle the different dialects
        if not detailed:
            return [LangFeatures.LANG_TH]
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not yet implemented'
            )

    def detect_lang_from_latin_az(
            self,
            text,
            detected_alphabets_present
    ):
        """
        Score each Latin-alphabet language by the fraction of the text's words
        found in that language's common-word list; fall back to lemmatized
        forms, then to the Vietnamese-specific alphabet hint.
        """
        sent = self.__segment_words(text=text)
        lang_codes = []
        lang_pct = []
        for lang in (
                LangFeatures.LANG_EN,
                LangFeatures.LANG_ES,
                LangFeatures.LANG_FR,
                LangFeatures.LANG_VI,
                LangFeatures.LANG_ID,
        ):
            lang_codes.append(lang)
            # Vietnamese words span 2 syllables, so compare 2-tuples of tokens
            max_word_n_tuple = 1
            if lang == LangFeatures.LANG_VI:
                max_word_n_tuple = 2
            lang_pct.append(self.common_words[lang].get_pct_intersection_with_common_words(
                word_list=sent,
                max_word_n_tuple=max_word_n_tuple
            ))
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For sentence ' + str(sent) + ' lang codes/pct: '
            + str(pd.DataFrame({'code': lang_codes, 'pct': lang_pct}).values)
        )
        if lang_codes:
            idx_max = np.argmax(lang_pct)
            idx_max = int(idx_max)
            if lang_pct[idx_max] > self.THRESHOLD_PCT_WORDS_IN_MOST_COMMON:
                return [lang_codes[idx_max]]
            else:
                # Check word stems: lemmatize and re-score, language by language
                for lang in lang_codes:
                    if self.word_stemmer[lang] is None:
                        continue
                    sent_stems = []
                    for w in sent:
                        w_stem = self.word_stemmer[lang].stem(word=w)
                        sent_stems.append(w_stem)
                    # Stemming changed nothing, re-scoring would be identical
                    if sent_stems == sent:
                        continue
                    Log.debug(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': For lang "' + str(lang) + '", trying stemmed words: ' + str(sent_stems)
                    )
                    pct_int = self.common_words[lang].get_pct_intersection_with_common_words(
                        word_list=sent_stems
                    )
                    if pct_int > self.THRESHOLD_PCT_WORDS_IN_MOST_COMMON:
                        return [lang]
        # Although French, Spanish could also have these characters, we favor Vietnamese
        if LangFeatures.ALPHABET_LATIN_VI in detected_alphabets_present:
            return [LangFeatures.LANG_VI]
        return []

    def detect_lang_from_latin(
            self,
            text
    ):
        # TODO This logic doesn't do anything
        # NOTE(review): lang_codes is always empty, so the if-branch below is
        # unreachable and this method always returns [].
        sent = self.__segment_words(text=text)
        lang_codes = []
        lang_pct = []
        if lang_codes:
            idx_max = np.argmax(lang_pct)
            idx_max = int(idx_max)
            if lang_pct[idx_max] > self.THRESHOLD_PCT_WORDS_IN_MOST_COMMON:
                return [lang_codes[idx_max]]
        return []

    #
    # Returns tuple of start/end (not inclusive)
    # E.g. [(0,10), (10,20), ..]
    #
    def __get_text_range_blocks(
            self,
            text
    ):
        # Break into ranges of TEXT_BLOCK_LEN characters; the last block may be shorter
        range_blocks = []
        i = 0
        len_text = len(text)
        while i < len_text:
            end_range = min(len_text, i + self.TEXT_BLOCK_LEN)
            range_blocks.append((i, end_range))
            i = i + self.TEXT_BLOCK_LEN
        return range_blocks

    def __detect_alphabet_type(
            self,
            text,
            # default coverage
            test_coverage_pct,
            max_test_coverage_len
    ):
        """
        Sample random blocks of the text, classify each sampled character by
        alphabet, and return {alphabet_name: count} ordered by descending
        count (or None when no character matched any alphabet).
        """
        alp_chars = []
        # Return the range blocks of the text
        range_blocks = self.__get_text_range_blocks(text=text)
        n_range = len(range_blocks)
        # Cap sampled blocks by both the coverage percentage and the max length
        how_many_range_to_check = max(1, min(
            math.ceil(test_coverage_pct * n_range),
            math.ceil(max_test_coverage_len / self.TEXT_BLOCK_LEN)
        ))
        Log.debugdebug('Range blocks: ' + str(range_blocks) + ' how many to check ' + str(how_many_range_to_check))
        # Randomly pick the ranges
        random_ranges_index = random.sample(range(n_range), how_many_range_to_check)
        random_ranges_index = sorted(random_ranges_index)
        total_len = 0
        for rg in random_ranges_index:
            start, end = range_blocks[rg]
            # NOTE(review): end is exclusive, so block length is (end - start);
            # the +1 overcounts by one per block — confirm intended.
            total_len += (end - start + 1)
        # Means we got the last truncated block
        if total_len < self.TEXT_BLOCK_LEN:
            if 0 not in random_ranges_index:
                random_ranges_index = [0] + random_ranges_index
        text_excerps = []
        for rg in random_ranges_index:
            start, end = range_blocks[rg]
            text_excerps.append(text[start:end])
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Random ranges index: ' + str(random_ranges_index) + ' or: ' + str(text_excerps)
        )
        # TODO
        #   How to check without looping, loop is slow
        #   One way is to build a reverse dictionary of all characters to alphabet name/type
        for rge_idx in random_ranges_index:
            start, end = range_blocks[rge_idx]
            for i in range(start, end, 1):
                c = text[i]
                for alp in self.TESTS_BY_ORDER:
                    if c in self.alphabet_dict[alp]:
                        alp_chars.append(alp)
                        # Go to next character when found alphabet type
                        break
        if len(alp_chars) == 0:
            return None
        ser = pd.Series(alp_chars)
        vals, counts = np.unique(ser, return_counts=True)
        # We must map count as key, so that when we sort the paired items later,
        # python will sort by the first index which is the count.
        # NOTE(review): two alphabets with the SAME count collide on the same
        # dict key here, silently dropping one of them — confirm acceptable.
        results = dict(zip(counts, vals))
        # Sort descending by count
        results_list = sorted(results.items(), reverse=True)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabet detection: ' + str(results_list) + ' details: ' + str(results)
        )
        # Reverse back the mapping: alphabet name -> count, most frequent first
        results_rev = {kv[1]: kv[0] for kv in results_list}
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabet detection results: ' + str(results_rev)
        )
        return results_rev