def wait_for_all_initializations_to_be_done(self, wait_max_time=10):
    """Block until the model reports all initializations done.

    Polls self.is_all_initializations_done every 0.1s, logging a warning on
    each poll, and raises Exception once the cumulative wait exceeds
    wait_max_time seconds. Returns immediately if already initialized.
    """
    if self.is_all_initializations_done:
        return
    poll_interval = 0.1
    n_polls = 1
    while not self.is_all_initializations_done:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model not yet fully initialized, sleep for '
            + str(n_polls * poll_interval) + ' secs now..')
        if n_polls * poll_interval > wait_max_time:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Waited too long ' + str(n_polls * poll_interval) \
                     + ' secs. Raising exception..'
            raise Exception(errmsg)
        time.sleep(poll_interval)
        n_polls += 1
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Initializations all done for model "' + str(self.identifier_string)
        + '" READY.')
    return
def send(self, user, password, recipients_list, message):
    """Send *message* from *user* to *recipients_list* through self.server.

    Logs in first only when a non-empty password is given. The SMTP
    connection is now closed on failure as well as on success (previously
    the connection leaked when login/sendmail raised).

    Raises:
        Exception: on any login/send error, with the original exception
        chained via ``from ex``.
    """
    try:
        if password not in [None, '']:
            self.server.login(user=user, password=password)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Login for user "' + str(user) + '" successful.')
        else:
            # If no password passed in, no need to do login
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not doing login for user "' + str(user)
                + '", no password given "' + str(password) + '"')
        self.server.sendmail(from_addr=user, to_addrs=recipients_list, msg=message)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Message from ' + str(user) + ' to ' + str(recipients_list)
            + ' sent successfully. Closing server..')
        self.server.close()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Mail server "' + str(self.mail_server_url) + '" closed')
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list) \
                 + '. Got exception ' + str(ex) + '.'
        Log.error(errmsg)
        # Fix: don't leak the server connection on failure; best-effort close.
        try:
            self.server.close()
        except Exception:
            pass
        raise Exception(errmsg) from ex
def __init__(self, lang):
    """Map *lang* to its ISO 639-1 code and, when the language conjugates
    verbs, try to create a Lemmatizer for it (left as None on failure)."""
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    self.raw_words = None
    self.common_words = None
    self.lang_have_verb_conj = LangFeatures().have_verb_conjugation(lang=self.lang)
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Lang "' + str(self.lang) + '" verb conjugation = '
        + str(self.lang_have_verb_conj) + '.')
    self.word_stemmer = None
    if not self.lang_have_verb_conj:
        return
    try:
        self.word_stemmer = Lemmatizer(lang=self.lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.')
    except Exception as ex_stemmer:
        # NOTE(review): the log message below lacks a closing quote after the
        # lang name in the original; kept byte-identical on purpose.
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + ' stemmer/lemmatizer failed to initialize: '
            + str(ex_stemmer) + '.')
        self.word_stemmer = None
    return
def add_intent_name_to_training_data(self):
    """Add every unique (intent ID, intent name) pair to the training data as
    an extra sample whose text is the intent name itself.

    Returns:
        The updated self.df_training_data.

    Raises:
        Exception: if a row cannot be built or appended for some intent ID.
    """
    # Unique mapping of intent ID -> intent name from the current training data
    df_intent_id_name = pd.DataFrame({
        DaehuaTrainDataModel.COL_TDATA_INTENT_ID:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_ID],
        DaehuaTrainDataModel.COL_TDATA_INTENT_NAME:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_NAME]
    })
    # Make unique by dropping duplicate intent IDs
    df_intent_id_name.drop_duplicates(inplace=True)

    for idx in df_intent_id_name.index:
        intId = df_intent_id_name[
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx]
        try:
            int_name = str(df_intent_id_name[
                DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx])
            # Arguments must be in list form, otherwise the DataFrame row
            # cannot be created
            row_to_append = pd.DataFrame(
                data=self.__get_row_to_append_to_training_data(
                    intent_id=[intId],
                    intent_name=[int_name],
                    text=[int_name],
                    text_id=[TrDataPreprocessor.TRDATA_ID_INTENT_NAME],
                    # Make sure to write back this value with processed text
                    processed_text=[None],
                    lang_detected=[None],
                    internal_counter=[self.df_training_data.shape[0]]))
            # Fix: DataFrame.append() was deprecated in pandas 1.4 and removed
            # in pandas 2.0; pd.concat() is the supported equivalent.
            # sort=True sorts the column axis so both frames join identically
            # regardless of their original column ordering (same intent as the
            # old append(..., sort=True)).
            self.df_training_data = pd.concat(
                [self.df_training_data, row_to_append],
                sort=True)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Appended intent name "' + str(int_name) + '" with intent ID ' + str(intId)
                + ' to list of training data. Row appended = ' + str(row_to_append))
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Could not append to dataframe or could not get intent name for intent ID ' \
                     + str(intId) + '. Exception ' + str(ex)
            Log.warning(errmsg)
            raise Exception(errmsg)
    self.__process_training_data_index()
    return self.df_training_data
def __init__(self):
    """Load per-language resources for detection: alphabet charsets, the set
    of languages without word separators, common-word lists, lemmatizers for
    conjugating languages, and a profiler for alphabet detection."""
    self.lang_features = LangFeatures()
    # Map alphabet name to unicode character set array
    self.alphabet_dict = {
        alp: LangCharacters.get_alphabet_charset(alphabet=alp)
        for alp in self.TESTS_BY_ORDER
    }
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Alphabets used: ' + str(self.alphabet_dict.keys())
    )
    self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
    Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))
    # Common word lists per supported language
    self.common_words = {
        LangFeatures.LANG_EN: English(),
        LangFeatures.LANG_ES: Spanish(),
        LangFeatures.LANG_FR: French(),
        LangFeatures.LANG_ID: Indonesian(),
        LangFeatures.LANG_VI: Vietnamese(),
    }
    # Lemmatizers, created only for languages with verb conjugation
    self.word_stemmer = {}
    for lang in self.SUPPORTED_LANGS:
        have_conj = self.lang_features.have_verb_conjugation(lang=lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang) + '" verb conjugation = ' + str(have_conj) + '.'
        )
        self.word_stemmer[lang] = None
        if have_conj:
            try:
                self.word_stemmer[lang] = Lemmatizer(lang=lang)
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                )
            except Exception as ex_stemmer:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: '
                    + str(ex_stemmer) + '.'
                )
    self.profiler_detect_alp = ProfilingHelper(profiler_name=str(self.__class__))
    return
def transform_input_for_model(
        self,
        # For the model to interpret and transform into x usable for model input
        # (e.g. map using one-hot dictionaries)
        x_input,
        word_freq_model=None,
):
    """Map a list/tuple/ndarray of words to their one-hot integer codes,
    left-pad with zeros up to the network's input length, and return the
    result as a 2-dimensional numpy array.

    Unknown words (not in self.x_one_hot_dict_inverse) are dropped with a
    warning. Raises Exception on bad input type, uninitialized one-hot
    dictionary, or any transformation error.
    """
    try:
        Log.debugdebug('***** x input: ' + str(x_input))
        # We expect x_input to be an np array of words
        if type(x_input) is np.ndarray:
            x_input = x_input.tolist()
        if type(x_input) not in (list, tuple):
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '". Expect list/tuple type, got type "' + str(type(x_input))
                + '" for x input: ' + str(x_input)
            )
        if self.x_one_hot_dict_inverse is None:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" x one hot not yet initialized!'
            )
        x = []
        for word in x_input:
            if word in self.x_one_hot_dict_inverse.keys():
                x.append(self.x_one_hot_dict_inverse[word])
            else:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.identifier_string)
                    + '", could not map input value "' + str(word)
                    + '" to code x. Not in x one hot dictionary.'
                )
        # TODO Pad with 0's to satisfy neural network input length
        input_shape = self.network.layers[0].input_shape
        input_len = input_shape[1]
        Log.debugdebug('***** INPUT SHAPE ' + str(input_shape)
                       + ', len ' + str(input_len) + ', x = ' + str(x))
        while len(x) < input_len:
            x = [0] + x
        Log.debugdebug(' ***** padded x: ' + str(x))
        x = np.array(x)
        x_transformed = NumpyUtil.convert_dimension(arr=x, to_dim=2)
        Log.debugdebug(' ***** transformed x: ' + str(x_transformed))
        return x_transformed
    except Exception as ex:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.identifier_string) + '", exception tranforming '
            + str(x_input) + '. Exception: ' + str(ex)
        )
def build_tree(self, dict_parent_childs):
    """Build (or extend) the tree from a {parent_key: child_keys} mapping.

    Nodes are created on demand and cached in self.tree_nodes; each child is
    linked to its parent via add_parent(). A child equal to its own parent is
    skipped with a warning. Returns self.tree_nodes.
    """
    self.reset_tree()
    for parent_key in dict_parent_childs.keys():
        child_keys = dict_parent_childs[parent_key]
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Doing for line ' + str(parent_key) + ': ' + str(child_keys)
        )
        # Get or create the parent node
        if parent_key not in self.tree_nodes.keys():
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Create new parent ' + str(parent_key)
            )
            parent = MultiTreeNode(name=parent_key, dead_node=False)
            self.tree_nodes[parent_key] = parent
        else:
            parent = self.tree_nodes[parent_key]
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Retrieved parent ' + str(parent.name)
            )
        for child_key in child_keys:
            # A node cannot be its own child
            if child_key == parent_key:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Child "' + str(child_key) + '" same as parent "' + str(parent_key)
                    + '" Ignoring...'
                )
                continue
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Doing child ' + str(child_key) + ' for parent ' + str(parent_key)
            )
            # Get or create the child node
            if child_key not in self.tree_nodes.keys():
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Create new child ' + str(child_key)
                )
                child = MultiTreeNode(name=child_key, dead_node=False)
                self.tree_nodes[child_key] = child
            else:
                child = self.tree_nodes[child_key]
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Retrieved child ' + str(child.name)
                )
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Child ' + str(child.name) + ' adding parent ' + str(parent.name)
            )
            child.add_parent(parent=parent)
    self.build_tree_roots()
    return self.tree_nodes
def __init__(self, noun_case_endings=NOUN_PARTICLES, verb_case_endings=()):
    """Initialize with Korean noun particles as case endings and verify that
    the jamo library is importable (raises Exception otherwise)."""
    super().__init__(
        noun_case_endings=noun_case_endings,
        verb_case_endings=verb_case_endings)
    try:
        # Split Hangul (한글) syllables into individual letters (자모)
        # https://github.com/JDongian/python-jamo, https://python-jamo.readthedocs.io/en/latest/
        from jamo import h2j, j2hcj
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error importing jamo library: ' + str(ex)
        Log.warning(errmsg)
        raise Exception(errmsg)
    return
def slice_str(x, maxlen):
    """Return x unchanged when str(x) fits in maxlen characters; otherwise
    return the first maxlen characters of str(x), logging a warning about
    the truncation. Note the untruncated case returns the original object,
    not its string form."""
    as_text = str(x)
    if len(as_text) <= maxlen:
        return x
    truncated = as_text[0:maxlen]
    Log.warning(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Cut from length ' + str(len(as_text)) + ' to ' + str(maxlen)
        + ' characters. From "' + str(x) + '" to "' + str(truncated) + '"'
    )
    return truncated
def __send_email(self, text_subject, text_msg, files, ignore_limit):
    """Build and send one alert email, enforcing a per-hour send limit
    (bypassed when ignore_limit is True). In fake_send mode the message is
    only printed. Send errors are logged, never raised."""
    email_msg = SendMail.prepare_message(
        from_addr=self.from_addr,
        to_addrs_list=self.alert_recipients,
        subject=text_subject,
        text=text_msg,
        files=files)
    try:
        # Reset the counter when the clock hour rolls over
        if datetime.now().hour != self.current_hour:
            self.current_hour = datetime.now().hour
            self.emails_sent_this_hour = 0
        if not ignore_limit:
            if self.emails_sent_this_hour >= self.limit_per_hour:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Send email alert limit ' + str(self.limit_per_hour)
                    + ' per hour hit. Not sending subject: "' + str(text_subject)
                    + '", message: ' + str(text_msg))
                return
        else:
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Ignoring send limit of ' + str(self.limit_per_hour) + ' per hour.')
        if self.fake_send:
            print('Fake send email from "' + str(self.from_addr) + '" to: '
                  + str(self.alert_recipients) + ' Message:\n\r' + str(email_msg))
        else:
            SendMail(
                mode=self.mail_mode,
                mail_server_url=self.mail_server_url,
                mail_server_port=self.mail_server_port
            ).send(
                user=self.from_addr,
                password=self.password,
                recipients_list=self.alert_recipients,
                message=email_msg)
        self.emails_sent_this_hour += 1
    except Exception as ex_mail:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error sending email: ' + str(ex_mail)
            + '. Could not send message: ' + str(email_msg))
def print_tree(self, level, tnode, max_levels=8, newline='\n\r', tabchar='\t'):
    """Recursively render tnode and its descendants, one line per node,
    indented by tree depth. Returns '' (with a warning) once level exceeds
    max_levels, bounding the recursion."""
    if level > max_levels:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Level too much > ' + str(max_levels) + ' for "' + str(tnode.name) + '"'
        )
        return ''
    indent = tabchar * level
    rendered = str(indent) + 'Level ' + str(level) + ': ' + str(tnode.name) + str(newline)
    for child in tnode.children:
        rendered += self.print_tree(
            level=level + 1,
            tnode=child,
            max_levels=max_levels,
            newline=newline,
            tabchar=tabchar
        )
    return rendered
def wait_for_model(self):
    """Poll self.is_model_ready() every 0.1s, warning on each poll, and
    raise Exception once the cumulative wait exceeds 10 seconds."""
    poll_secs = 0.1
    max_wait_secs = 10
    n_polls = 1
    while not self.is_model_ready():
        Log.warning(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Waiting for model with identifier "' + str(self.identifier_string)
            + ', sleep for ' + str(n_polls * poll_secs) + ' secs now..')
        if n_polls * poll_secs > max_wait_secs:
            errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Waited too long for model "' + str(self.identifier_string) \
                     + '" total wait time ' + str(n_polls * poll_secs) + ' secs. Raising exception..'
            raise Exception(errmsg)
        time.sleep(poll_secs)
        n_polls += 1
def is_higher_level(self, node, supposed_child_node):
    """Return True if supposed_child_node is already an ancestor of node
    (directly in node.parent_names, or recursively via node.parents) —
    i.e. adding it as a child would create a cycle."""
    Log.debug(
        '***** check if "' + str(supposed_child_node.name)
        + '" is higher level than "' + str(node.name)
        + '", parents: ' + str(node.parent_names)
    )
    if supposed_child_node.name in node.parent_names:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Node "' + str(self.name) + '" cannot add "' + str(supposed_child_node.name)
            + '" as child. Node "' + str(supposed_child_node.name)
            + '" is already a higher level parent node to "' + str(self.name) + '"'
        )
        return True
    # Walk upward through every parent; any() short-circuits on first True,
    # matching the original early-return behavior.
    return any(
        self.is_higher_level(node=ancestor, supposed_child_node=supposed_child_node)
        for ancestor in node.parents
    )
def __sanity_check(
        self,
        sentences_list,
):
    """Validate that sentences_list is an iterable of list/tuple sentences
    whose elements are all str; log a warning and raise Exception on the
    first violation found."""
    for sent in sentences_list:
        if type(sent) not in (list, tuple):
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Warning line ' + str(sent) + ', sentence not list type but type "' \
                     + str(type(sent)) + '": ' + str(sent)
            Log.warning(errmsg)
            raise Exception(errmsg)
        for w in sent:
            if type(w) is not str:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Warning line ' + str(sent) + ', have non string type words "' \
                         + str(type(w)) + '": ' + str(w)
                Log.warning(errmsg)
                raise Exception(errmsg)
    return
def wait_for_model_to_be_ready(self, wait_max_time=10):
    """Ensure the underlying model is ready before use.

    If the model was reloaded without our knowledge (its reload counter no
    longer matches the one we recorded), first reload the text processor,
    then poll every 0.1s until the model reports ready.

    Args:
        wait_max_time: maximum seconds to wait for readiness.

    Raises:
        Exception: when the total wait exceeds wait_max_time seconds.
    """
    #
    # Model reloaded without us knowing, e.g. user trained it, etc.
    #
    if self.model_last_reloaded_counter != self.model.get_model_reloaded_counter():
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + 'Model "' + str(self.identifier_string)
            + '" last counter ' + str(self.model_last_reloaded_counter)
            + ' not equal to model counter ' + str(self.model.get_model_reloaded_counter())
            + '. Model updated, thus we must update our text processor.'
        )
        #
        # Must load again because the TxtPreprocessor class needs data from the model
        #
        self.load_text_processor()
    if self.model.is_model_ready():
        return
    count = 1
    sleep_time_wait_model = 0.1
    # Busy-wait in 0.1s steps until ready or timed out
    while not self.model.is_model_ready():
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.identifier_string)
            + '" not yet ready, sleep for ' + str(count * sleep_time_wait_model) + ' secs now..')
        if count * sleep_time_wait_model > wait_max_time:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                     + ': Waited for model "' + str(self.identifier_string) \
                     + '" too long ' + str(count * sleep_time_wait_model) + ' secs. Raising exception..'
            raise Exception(errmsg)
        time.sleep(sleep_time_wait_model)
        count = count + 1
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Model "' + str(self.identifier_string) + '" READY.')
    return
def __attach_file_check_validity_and_size(
        files_attachment_list,
        max_total_files_size=MAX_TOTAL_FILES_SIZE_MB_EMAIL_ATTCH):
    """Filter attachment paths down to existing files whose cumulative size
    stays strictly under max_total_files_size (in MB).

    Returns the allowed subset of paths ([] when the input is None).
    Non-existent paths and files that would breach the limit are skipped
    with a log entry.
    """
    if files_attachment_list is None:
        return []
    allowed_files = []
    cum_size_mb = 0.0
    for filepath in files_attachment_list:
        if not os.path.isfile(filepath):
            Log.error('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                      + ': Invalid attachment file "' + str(filepath) + '", not attaching to email')
            continue
        Log.info('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                 + ': Attachment file path "' + str(filepath) + '" OK')
        fsize_mb = round(os.path.getsize(filepath) / (1024 * 1024), 2)
        if fsize_mb + cum_size_mb < max_total_files_size:
            allowed_files.append(filepath)
            cum_size_mb += fsize_mb
            Log.info('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                     + ': Appended file "' + str(filepath) + '" as email attachment size '
                     + str(fsize_mb) + 'MB, total cumulative ' + str(cum_size_mb) + 'MB')
        else:
            Log.warning('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                        + ': File "' + str(filepath) + '" too big ' + str(fsize_mb)
                        + 'MB. Cumulative = ' + str(fsize_mb + cum_size_mb) + ' Not attaching to email')
    return allowed_files
def confirm_form(self, answer):
    """Handle the user's confirmation answer for a completed form.

    Returns True when the trimmed answer is one of the confirm words (form
    is marked completed & confirmed). Otherwise attempts a strict field
    update from the answer; a failed update increments the continuous error
    count, and hitting the error threshold resets the whole form. Returns
    False in all non-confirmation cases.
    """
    answer = StringUtils.trim(answer)
    if answer.lower() in self.text_list_confirm_words:
        self.set_state_form_completed_and_confirmed()
        self.reset_continuous_error_count()
        return True
    # Not a confirmation - maybe the user wants to change a field value
    result = self.set_all_field_value_from_answer(answer=answer)
    if result.is_updated:
        self.reset_continuous_error_count()
    else:
        self.increment_continuous_error_count()
        if self.is_error_threshold_hit():
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Reset form after ' + str(self.fill_form_continuous_err_count)
                + ' error counts.')
            self.reset()
    # No form confirmation
    return False
def preprocess_list_all_langs(
        self,
        sentences_list,
        # The output required may differ for different further processings
        # Some may require POS Tagging, some may require lemmatization, some may require to remove
        # stop words, etc
        algorithm=None,
):
    """Detect the language of every sentence, lazily create a TxtPreprocessor
    per detected language, and return the processed sentences in input order.

    The default language is the most common non-empty detection result,
    falling back to English when nothing was detected. Sentences whose
    language has no loaded preprocessor are processed with the default
    language's preprocessor instead.
    """
    langs_list = self.detect_lang(sentences_list=sentences_list, method='nwae')
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Done detecting ' + str(len(sentences_list)) + ' sentence languages: ' + str(langs_list))
    # Get default lang as most common language detected
    langs_counter = collections.Counter(langs_list).most_common()
    self.lang_default = None
    for lang_count in langs_counter:
        if lang_count[0] != '':
            self.lang_default = lang_count[0]
            break
    # If still no default language (nothing detected at all from sentences passed in)
    if self.lang_default is None:
        self.lang_default = LangFeatures.LANG_EN
        langs_list = langs_list + [self.lang_default]
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unable to determine default language from langs ' + str(langs_counter)
            + ' Using default lang "' + str(self.lang_default) + '"')
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Most common language detected "' + str(self.lang_default)
        + '" from ' + str(langs_counter))
    # Drop empty-string (undetected) entries
    unique_langs = [l for l in list(set(langs_list)) if l]
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Unique langs found: ' + str(unique_langs))
    # Build one text preprocessor per unique detected language
    for lang in unique_langs:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Creating lang "' + str(lang) + '" word segmenter.')
        try:
            obj_tmp = TxtPreprocessor(
                identifier_string=lang,
                dir_path_model=None,
                model_features_list=None,
                lang=lang,
                dir_wordlist=self.dir_wordlist,
                postfix_wordlist=self.postfix_wordlist,
                dir_wordlist_app=self.dir_app_wordlist,
                postfix_wordlist_app=self.postfix_app_wordlist,
                dirpath_synonymlist=self.dir_synlist,
                postfix_synonymlist=self.postfix_synlist,
                stopwords_list=self.stopwords_list,
            )
            self.txt_preprcsr_by_lang[lang] = obj_tmp
        except Exception as ex_load_txtprcsr:
            # Load failure is non-fatal: the sentence will fall back to the
            # default language's preprocessor below.
            Log.error(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Failed to load text processor all lang for lang "' + str(lang)
                + '": ' + str(ex_load_txtprcsr))
    sentences_list_processed = []
    for i in range(len(sentences_list)):
        sent = sentences_list[i]
        lang = langs_list[i]
        if lang not in self.txt_preprcsr_by_lang.keys():
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" not in keys ' + str(self.txt_preprcsr_by_lang.keys()))
            lang = self.lang_default
        sent_processed = self.txt_preprcsr_by_lang[lang].process_text(
            inputtext=sent,
        )
        # commented out this part for now
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Preprocessed sentence "' + str(sent) + '" to "' + str(sent_processed) + '"')
        sentences_list_processed.append(sent_processed)
    return sentences_list_processed
def train(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Option to train a single y ID/label
        y_id=None,
        # To keep training logs here for caller's reference
        log_list_to_populate=None):
    """Train the metric-space model from self.training_data.

    Pipeline: (1) compute or load feature weights (EIDF or defaults),
    (2) re-weigh the training x, (3) cluster points per class, (4) derive a
    normalized representative feature vector (RFV) per class plus its max
    radius, then optionally persist model/training data. Thread-safe via
    self.mutex_training.

    Returns:
        self.logs_training (the populated training log list).

    Raises:
        Exception: when training data is missing or any training step fails.
    """
    prf_start = prf.Profiling.start()
    if self.training_data is None:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot train without training data for identifier "' + self.identifier_string + '"')
    self.mutex_training.acquire()
    try:
        if type(log_list_to_populate) is list:
            self.logs_training = log_list_to_populate
        else:
            self.logs_training = []
        Log.important(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                      + ': Training for identifier=' + self.identifier_string
                      + ', y_id ' + str(y_id)
                      + '. Using key features remove quartile = ' + str(self.key_features_remove_quartile)
                      + ', stop features = [' + str(self.stop_features) + ']'
                      + ', weigh by EIDF = ' + str(self.weigh_idf),
                      log_list=self.logs_training)
        #
        # Here training data must be prepared in the correct format already
        # (meaning the feature set has already been unified into one)
        #
        # Log.debugdebug(
        #     str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        #     + '\n\r\tTraining data:\n\r' + str(self.training_data.get_x().tolist())
        #     + '\n\r\tx names: ' + str(self.training_data.get_x_name())
        #     + '\n\r\ty labels: ' + str(self.training_data.get_y())
        # )
        #
        # Get IDF first
        # The function of these weights are nothing more than dimension reduction
        # TODO: IDF may not be the ideal weights, design an optimal one.
        #
        if self.weigh_idf:
            if MetricSpaceModel.USE_OPIMIZED_IDF:
                try:
                    Log.info(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Initializing EIDF object.. try to read from file first',
                        log_list=self.logs_training)
                    # Try to read from file
                    df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                        dir_path_model=self.dir_path_model,
                        identifier_string=self.identifier_string,
                        x_name=self.training_data.get_x_name())
                    Log.info(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                             + ': Successfully Read EIDF from file.',
                             log_list=self.logs_training)
                    self.model_data.idf = np.array(
                        df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
                except Exception as ex_eidf:
                    # No stored EIDF: optimize from scratch (slower path)
                    Log.critical(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': No EIDF from file available. Recalculating EIDF..',
                        log_list=self.logs_training)
                    idf_opt_obj = eidf.Eidf(
                        x=self.training_data.get_x(),
                        y=self.training_data.get_y(),
                        x_name=self.training_data.get_x_name())
                    idf_opt_obj.optimize(initial_w_as_standard_idf=True)
                    self.model_data.idf = idf_opt_obj.get_w()
            else:
                # Sum x by class
                self.model_data.idf = eidf.Eidf.get_feature_weight_idf_default(
                    x=self.training_data.get_x(),
                    y=self.training_data.get_y(),
                    x_name=self.training_data.get_x_name())
        else:
            # No weighting requested: unit weights for every feature
            self.model_data.idf = np.array(
                [1.0] * self.training_data.get_x_name().shape[0], dtype=float)
        # Standardize to at least 2-dimensional, easier when weighting x
        self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
            arr=self.model_data.idf, to_dim=2)
        Log.debugdebug(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                       + '\n\r\tEIDF values:\n\r' + str(self.model_data.idf),
                       log_list=self.logs_training)
        #
        # Re-weigh again. This will change the x in self.training data
        #
        self.training_data.weigh_x(w=self.model_data.idf[0])
        #
        # Initialize model data
        #
        # Refetch again after weigh
        x = self.training_data.get_x()
        y = self.training_data.get_y()
        self.model_data.x_name = self.training_data.get_x_name()
        # Unique y or classes
        # We do this again because after weighing, it will remove bad rows, which might cause some y
        # to disappear
        self.model_data.y_unique = np.array(list(set(y)))
        Log.debugdebug(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                       + '\n\r\tx weighted by idf and renormalized:\n\r' + str(x.tolist())
                       + '\n\r\ty\n\r' + str(y)
                       + '\n\r\tx_name\n\r' + str(self.model_data.x_name),
                       log_list=self.logs_training)
        #
        # Get RFV for every command/intent, representative feature vectors by command type
        #
        # 1. Cluster training data of the same intent.
        # Instead of a single RFV to represent a single intent, we should have multiple.
        xy_clstr = MetricSpaceModel.get_clusters(
            x=x,
            y=y,
            x_name=self.model_data.x_name,
            log_training=self.logs_training)
        self.model_data.x_clustered = xy_clstr.x_cluster
        self.model_data.y_clustered = xy_clstr.y_cluster
        self.model_data.y_clustered_radius = xy_clstr.y_cluster_radius
        #
        # RFV Derivation
        #
        m = np.zeros(
            (len(self.model_data.y_unique), len(self.model_data.x_name)))
        # Temporary only this data frame
        df_x_ref = pd.DataFrame(m, columns=self.model_data.x_name,
                                index=list(self.model_data.y_unique))
        #print('***** y unique type: ' + str(type(self.model_data.y_unique)) + ', df_x_ref: ' + str(df_x_ref))
        self.model_data.df_y_ref_radius = pd.DataFrame(
            {
                MetricSpaceModel.TERM_CLASS: list(self.model_data.y_unique),
                MetricSpaceModel.TERM_RADIUS: [MetricSpaceModel.HPS_MAX_EUCL_DIST] * len(self.model_data.y_unique),
            },
            index=list(self.model_data.y_unique))
        #print('***** df_x_ref: ' + str(self.model_data.df_y_ref_radius))
        #
        # Derive x_ref and y_ref
        #
        for cs in self.model_data.y_unique:
            Log.debug(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                      + ': Doing class [' + str(cs) + ']',
                      log_list=self.logs_training)
            # Extract class points
            class_points = x[y == cs]
            #
            # Reference feature vector for the command is the average of all feature vectors
            #
            rfv = np.sum(class_points, axis=0) / class_points.shape[0]
            # Renormalize it again
            # At this point we don't have to check if it is a 0 vector, etc. as it was already done in TrainingDataModel
            # after weighing process
            normalize_factor = np.sum(np.multiply(rfv, rfv)) ** 0.5
            if normalize_factor < const.Constants.SMALL_VALUE:
                raise Exception(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Normalize factor for rfv in class "' + str(cs) + '" is 0.')
            rfv = rfv / normalize_factor
            # A single array will be created as a column dataframe, thus we have to name the index and not columns
            df_x_ref.at[cs] = rfv
            check_normalized = np.sum(np.multiply(rfv, rfv)) ** 0.5
            if abs(check_normalized - 1) > const.Constants.SMALL_VALUE:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Warning! RFV for class [' + str(cs) + '] not 1, but [' + str(check_normalized) + '].'
                Log.warning(errmsg, log_list=self.training_data)
                raise Exception(errmsg)
            else:
                Log.debug(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                          + ': Check RFV class "' + str(cs) + '" normalized ok [' + str(check_normalized) + '].',
                          log_list=self.logs_training)
            #
            # Get furthest point of classification to rfv
            # This will be used to accept or reject a classified point to a particular class,
            # once the nearest class is found (in which no class is found then).
            #
            # Minimum value of threshold, don't allow 0's
            radius_max = -1
            for i in range(0, class_points.shape[0], 1):
                p = class_points[i]
                dist_vec = rfv - p
                dist = np.sum(np.multiply(dist_vec, dist_vec)) ** 0.5
                Log.debugdebug(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                               + ' Class ' + str(cs) + ' check point ' + str(i)
                               + ', distance= ' + str(dist) + '. Point ' + str(class_points[i])
                               + ' with RFV ' + str(rfv),
                               log_list=self.logs_training)
                if dist > radius_max:
                    radius_max = dist
                    self.model_data.df_y_ref_radius[
                        MetricSpaceModel.TERM_RADIUS].at[cs] = dist
            Log.debug(str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                      + ': Class "' + str(cs) + '". Max Radius = '
                      + str(self.model_data.df_y_ref_radius[
                                MetricSpaceModel.TERM_RADIUS].loc[cs]),
                      log_list=self.logs_training)
        df_x_ref.sort_index(inplace=True)
        self.model_data.y_ref = np.array(df_x_ref.index)
        self.model_data.x_ref = np.array(df_x_ref.values)
        Log.debug('**************** ' + str(self.model_data.y_ref))
        if self.do_profiling:
            Log.important(str(self.__class__) + str(getframeinfo(currentframe()).lineno)
                          + ' PROFILING train(): '
                          + prf.Profiling.get_time_dif_str(prf_start, prf.Profiling.stop()),
                          log_list=self.logs_training)
        if write_model_to_storage:
            self.persist_model_to_storage()
        if write_training_data_to_storage or (self.is_partial_training):
            self.persist_training_data_to_storage(td=self.training_data)
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Training exception for identifier "' + str(self.identifier_string) + '".' \
                 + ' Exception message ' + str(ex) + '.'
        Log.error(errmsg)
        raise ex
    finally:
        self.mutex_training.release()
    return self.logs_training
# -*- coding: utf-8 -*- from nwae.utils.Log import Log from inspect import getframeinfo, currentframe import nwae.lang.LangFeatures as lf import nwae.utils.UnitTest as ut try: import hanzidentifier as hz except Exception as ex: Log.warning( str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) + ': Cannot import hanzidentifier: ' + str(ex)) pass # # Class LangCharacters: # This class lays the fundamentals for dealing with characters & strings of multiple languages. # We define Unicode blocks for the relevant language characters, including punctuations, etc. # Every alphabet or character has a Unicode value (max value is 2^32) # # But when required to store as a string variable, it has to undergo a transformation into say # UTF-8. This is purely for compression so we don't store each character as 4 bytes. # chr() converts a Unicode value to a Unicode string, e.g. the Unicode value 0x9a6c or 39532 # is converted to '马' (either stored as UTF-8 or some encoding). # # Another difference with R is that in Python, we always need to convert strings to Unicode form # for the above functions to work. In R this is handled transparently. # # The Python function ord() does the opposite, converts '马' back to it's integer Unicode value. # # Supports:
def add_latin_form_to_training_data(self):
    """
    Augment the training data with the Latin equivalent form of each sentence.

    For every row of ``self.df_training_data``, converts the already-segmented
    text word by word via ``LatinEquivalentForm`` and, when the Latin form
    differs from the segmented text, appends a new training-data row holding
    the Latin form. Finally rebuilds the DataFrame index.

    Only the main language is handled; additional languages are skipped to
    reduce complexity.

    Raises:
        Exception: if a converted row cannot be appended to the DataFrame.
    """
    #
    # We only support this complication if the main language has a Latin Equivalent Form
    # We ignore if it is only an additional language, to reduce complexity
    #
    if not latinEqForm.LatinEquivalentForm.have_latin_equivalent_form(
            lang=self.language_main):
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For "' + str(self.model_identifier)
            + '", language "' + str(self.language_main)
            + '", nothing to do for latin equivalent form.')
        return

    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': For "' + str(self.model_identifier)
        + '", language "' + str(self.language_main)
        + '", adding to training data, the latin equivalent form.')

    for idx in self.df_training_data.index:
        # Original (raw) text and its segmented/processed counterpart for this row
        text = str(self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT].loc[idx])
        text_processed = str(self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].loc[idx])
        internal_counter = self.df_training_data[
            TrDataPreprocessor.TD_INTERNAL_COUNTER].loc[idx]

        #
        # Process the sentence, word by word
        #
        word_sep = BasicPreprocessor.get_word_separator(
            lang=self.language_main)
        latin_form_sentence_arr = []
        for word in text_processed.split(sep=word_sep):
            word_latin = latinEqForm.LatinEquivalentForm.get_latin_equivalent_form(
                lang=self.language_main,
                word=word)
            latin_form_sentence_arr.append(word_latin)
        latin_form_sentence_txt = word_sep.join(latin_form_sentence_arr)

        # Nothing to add when the Latin form is identical to the segmented text
        if latin_form_sentence_txt == text_processed:
            continue

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Processing latin equivalent form "' + str(latin_form_sentence_txt)
            + '" for sentence "' + str(text_processed) + '".')
        int_id = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx]
        int_name = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx]
        row_to_append = None
        try:
            # Arguments must be in list form, otherwise pandas will not be able
            # to create this one-row DataFrame
            row_to_append = pd.DataFrame(
                data=self.__get_row_to_append_to_training_data(
                    intent_id=[int_id],
                    intent_name=[int_name],
                    text=[text],
                    text_id=[TrDataPreprocessor.TRDATA_ID_LATIN_FORM],
                    processed_text=[latin_form_sentence_txt],
                    lang_detected=[self.language_main],
                    internal_counter=[internal_counter]))
            #
            # We are appending to a dataframe that might have different columns ordering
            # So we make sure they are in the same order, to avoid all the sort=False/True
            # warning messages by pandas due to required join() operation.
            # If in same order, then we avoid the join().
            # NOTE(review): DataFrame.append() was removed in pandas 2.0 — confirm
            # the pinned pandas version, or migrate to pd.concat().
            #
            self.df_training_data = self.df_training_data.append(
                row_to_append,
                sort=True)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Appended latin equivalent form "' + str(latin_form_sentence_txt)
                + '" with intent ID ' + str(int_id)
                + ' to list of training data. Row appended = ' + str(row_to_append))
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + ': Could not append row ' + str(row_to_append) + ' to dataframe for intent ID ' \
                + str(int_id) + '. Exception ' + str(ex)
            Log.warning(errmsg)
            raise Exception(errmsg)

    # Rebuild the training-data index after the appends above
    self.__process_training_data_index()
    return
def process_text_training_data(self, ):
    """
    Segment & stem every training-data text row, updating the DataFrame in place.

    The algorithm to segment words works as follows:
    If segmented text returned from DB is None or shorter than text, we will process the text.
    However if the flag self.reprocess_all_text == True, we segment no matter what.

    Side effects: writes the detected language and (when changed) the processed
    text back into ``self.df_training_data``, and appends every changed row to
    ``self.list_of_rows_with_changed_processed_text``.
    """
    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': START SEGMENT & STEM DB TRAINING DATA, FORCE RESEGMENT ALL = '
        + str(self.reprocess_all_text))
    td_total_rows = self.df_training_data.shape[0]
    count = 0
    for idx_row in self.df_training_data.index:
        count = count + 1
        # Pull out the row's raw text, previously-processed text and identifiers
        text_from_db = str(self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT].loc[idx_row])
        text_processed_from_db = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].loc[idx_row]
        intent_td_id = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TRAINING_DATA_ID].loc[idx_row]
        intent_id = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx_row]
        intent_name = self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx_row]
        # Internal Counter
        internal_counter = self.df_training_data[
            TrDataPreprocessor.TD_INTERNAL_COUNTER].loc[idx_row]

        Log.debugdebug('Processing index row "' + str(idx_row) + '" '
                       + str(self.df_training_data.loc[idx_row]) + '"')

        # Defensive: coerce non-string DB values (e.g. NaN/None) to str
        if type(text_from_db) is not str:
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Text from DB "' + str(text_from_db) + '" not string type.')
            text_from_db = str(text_from_db)

        # When a text is updated in DB/storage, this field should be cleared in DB to NULL
        if text_processed_from_db is None:
            text_processed_from_db = ''

        possible_langs = self.lang_detect.detect(text=text_from_db)
        # Empty list means detection failed — fall back to the main language
        if not possible_langs:
            lang_detected = self.language_main
        else:
            lang_detected = possible_langs[0]

        # If detected language not supported, fall back to the main language
        if lang_detected not in [self.language_main] + self.languages_additional:
            Log.warning(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': For "' + str(self.model_identifier)
                + '", detected lang "' + str(lang_detected)
                + '" not in languages supported')
            lang_detected = self.language_main

        # Update data frame with language detected
        self.df_training_data[DaehuaTrainDataModel.COL_TDATA_TEXT_LANG].at[idx_row] = \
            lang_detected
        #if lang_detected != self.language_main:
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang_detected)
            + '" main lang "' + str(self.language_main)
            + '" for text "' + str(text_from_db) + '".')

        #
        # Sanity check only. Should not happen since after every training data update,
        # NULL would be written back to the TextSegmented column.
        # Because we don't want to reprocess all text which takes time, so we guess first
        #
        is_likely_processed_text_changed = len(
            text_processed_from_db) < len(text_from_db)
        # If a language has verb conjugation, we cannot just compare length as the original text could be longer
        if self.lang_have_verb_conj[lang_detected]:
            # So we just hardcode
            # NOTE(review): magic threshold 8 — segmented text of 8 chars or fewer
            # is assumed stale for conjugating languages; confirm this heuristic.
            is_likely_processed_text_changed = len(
                text_processed_from_db) <= 8

        if is_likely_processed_text_changed:
            if (intent_td_id is not None) and (intent_td_id > 0):
                # Warn only if it is not our own inserted data
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Text "' + str(text_from_db)
                    + '" likely has incorrect segmentation "'
                    + str(text_processed_from_db) + '".')

        #
        # We only reprocess the text if there is some likelihood of change
        #
        if self.reprocess_all_text or is_likely_processed_text_changed:
            processed_text_str = self.txt_preprocessor[
                lang_detected].process_text(inputtext=text_from_db,
                                            return_as_string=True)
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Text "' + str(text_from_db)
                + '" processed text "' + str(processed_text_str) + '".')
            is_text_processed_changed = not (text_processed_from_db == processed_text_str)
            Log.info(
                str(self.__class__) + ' '
                + str(getframeinfo(currentframe()).lineno)
                + ': No ' + str(count) + ' of ' + str(td_total_rows)
                + ': Tr Data ID "' + str(intent_td_id)
                + '". Force segment = ' + str(self.reprocess_all_text)
                + '\n\r Text "' + str(text_from_db)
                + '". Processed to "' + str(processed_text_str) + '"'
                + ', changed = ' + str(is_text_processed_changed))

            # Training ID 0 are those we inserted ourselves so no need to update anything
            if is_text_processed_changed:
                # Update the column
                self.df_training_data[DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED].at[idx_row] = \
                    processed_text_str
                # For intent name we inserted, no need to warn
                if (intent_td_id is not None) and (intent_td_id > 0):
                    Log.warning(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Processed text different. Text "' + str(text_from_db)
                        + '\n\r new processed text "' + str(processed_text_str) + '"'
                        + '\n\r old processed text "' + str(text_processed_from_db) + '"')
                # Record the changed row so the caller can persist it back to storage
                row_changed = self.__get_row_to_append_to_training_data(
                    intent_id=intent_id,
                    intent_name=intent_name,
                    text=text_from_db,
                    text_id=intent_td_id,
                    processed_text=processed_text_str,
                    lang_detected=lang_detected,
                    internal_counter=internal_counter)
                self.list_of_rows_with_changed_processed_text.append(
                    row_changed)
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Appended changed row: ' + str(row_changed))
            else:
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Processed text ' + str(count)
                    + ' ok "' + str(processed_text_str)
                    + '" from "' + str(text_from_db) + '"')
        else:
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Training data ID ' + str(intent_td_id)
                + ': No ' + str(count) + ' of ' + str(td_total_rows)
                + ': Nothing to do, OK segmented/processed from DB "'
                + str(text_processed_from_db) + '"')
    return
def warn_korean(self):
    """Log a warning that the Korean tokenizer (kkma) is too slow for production use."""
    # Assemble the message first, then emit it; the runtime string matches the
    # original warning byte-for-byte.
    message = (
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Korean splitting currently uses kkma which is super slow and unusable for production purposes'
    )
    Log.warning(message)
def preprocess_training_data_text(self):
    """
    Run the full text preprocessing pipeline over the training data.

    Steps: add intent names as training rows, segment/stem all texts, add
    Latin-equivalent-form rows, then (best-effort) convert the corpus to
    padded/encoded docs for embedding-based models.

    Returns:
        tuple: ``(self.df_training_data, self.embedding_params)``.
        ``embedding_params`` is left unchanged if the conversion fails —
        the failure is logged, not raised.
    """
    # Just add intent names into the training data, no text processing
    self.add_intent_name_to_training_data()
    self.process_text_training_data()
    self.add_latin_form_to_training_data()
    try:
        # Imported lazily so the pipeline above still works when the ML
        # extras (keras/TF stack behind TxtTransform) are not installed
        from nwae.ml.text.TxtTransform import TxtTransform
        # Conversion to padded docs
        res = TxtTransform(docs=list(self.df_training_data[
            DaehuaTrainDataModel.COL_TDATA_TEXT_SEGMENTED]),
                           labels=list(self.df_training_data[
                               DaehuaTrainDataModel.COL_TDATA_INTENT_ID]),
                           langs=list(self.df_training_data[
                               DaehuaTrainDataModel.COL_TDATA_TEXT_LANG])
                           ).create_padded_docs()
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Padded Docs: ' + str(res.padded_encoded_docs)
            + ', Labels: ' + str(res.encoded_labels))
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Labels Categorical: ' + str(res.encoded_labels_categorical))
        # Bundle everything an embedding-layer network needs about this corpus
        self.embedding_params = EmbeddingParams(
            x=res.padded_encoded_docs,
            x_original=res.original_docs,
            y=np.array(res.encoded_labels),
            y_original=res.y_original,
            x_one_hot_dict=res.x_one_hot_dict,
            y_one_hot_dict=res.y_one_hot_dict,
            max_sent_len=res.max_x_length,
            max_label_val=max(res.encoded_labels),
            vocab_size=res.vocabulary_dimension)
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converted ' + str(len(self.embedding_params.x))
            + ' rows padded docs. Max sentence length = '
            + str(self.embedding_params.max_sent_len)
            + ', max label value = ' + str(self.embedding_params.max_label_val)
            + ', vocabulary size = ' + str(self.embedding_params.vocab_size)
            + ', x one hot dict: ' + str(self.embedding_params.x_one_hot_dict))
        Log.debugdebug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Original docs:\n\r' + str(self.embedding_params.x_original)
            + '\n\rEncoded padded docs\n\r:' + str(self.embedding_params.x)
            + '\n\rOriginal labels\n\r' + str(self.embedding_params.y_original)
            + '\n\rEncoded labels\n\r' + str(self.embedding_params.y))
    except Exception as ex_embed:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
            + ': Error converting to training text to embed params: ' + str(ex_embed)
        Log.warning(errmsg)
        # Don't raise error — embedding params are optional for non-NN models
        # raise Exception(errmsg)
    return (self.df_training_data, self.embedding_params)
class LangCharacters(object):
    """
    Unicode code-point blocks for multiple languages (Latin, CJK, Cyrillic,
    Hangul, Japanese kana, Thai) plus separators/numbers/punctuation, and
    helpers to look up the charset for a language or alphabet, filter strings
    to an allowed charset, classify Vietnamese text, and hash a string to a
    number.

    Each ``UNICODE_BLOCK_ORDINAL_*`` constant is a tuple of int code points;
    the matching ``UNICODE_BLOCK_*`` constant is the same block as 1-char
    strings.
    """

    # Default text encoding used when converting between bytes and str
    encoding = 'utf-8'

    def __init__(self, encoding='utf-8'):
        """Create an instance with the given default encoding (default 'utf-8')."""
        self.encoding = encoding
        return

    #
    # Latin
    #
    # Latin Unicode Block as 'int' list: A-Z (0x41-0x5A) and a-z (0x61-0x7A)
    UNICODE_BLOCK_ORDINAL_LATIN_BASIC = tuple( range(0x0041, 0x005A+1, 1) ) +\
        tuple( range(0x0061, 0x007A+1, 1) )
    # Convert to Python Unicode (str) list
    UNICODE_BLOCK_LATIN_BASIC = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_LATIN_BASIC])
    # Can be used interchangeably
    UNICODE_BLOCK_LATIN_AZ = UNICODE_BLOCK_LATIN_BASIC

    # Latin Extended (accented/diacritic Latin letters)
    UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED = tuple( range(0x00C0, 0x00F6+1, 1) ) +\
        tuple( range(0x00F8, 0x01BF+1, 1) ) +\
        tuple( range(0x01C4, 0x024F+1, 1) )
    UNICODE_BLOCK_LATIN_EXTENDED = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED])

    # All Latin
    UNICODE_BLOCK_ORDINAL_LATIN_ALL = UNICODE_BLOCK_ORDINAL_LATIN_BASIC + UNICODE_BLOCK_ORDINAL_LATIN_EXTENDED
    UNICODE_BLOCK_LATIN_ALL = UNICODE_BLOCK_LATIN_BASIC + UNICODE_BLOCK_LATIN_EXTENDED

    # Just Latin specific to Vietnamese (actually, also French, Spanish, etc.)
    # It is actually a subset of the Latin Extended
    UNICODE_BLOCK_LATIN_VIETNAMESE =\
        tuple('ăâàằầảẳẩãẵẫáắấạặậêèềẻểẽễéếẹệìỉĩíịôơòồờỏổởõỗỡóốớọộợưùừủửũữúứụựđýỳỷỹỵ')
    # Can be used interchangeably
    UNICODE_BLOCK_LATIN_VI = UNICODE_BLOCK_LATIN_VIETNAMESE
    UNICODE_BLOCK_LATIN_VI_AZ = UNICODE_BLOCK_LATIN_VI + UNICODE_BLOCK_LATIN_AZ

    #
    # CJK
    #
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS = tuple(
        range(0x4E00, 0x9FFF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS =\
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS]
        )
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A = tuple(
        range(0x3400, 0x4DBF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_A =\
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A]
        )
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B = tuple(
        range(0x20000, 0x2A6DF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_B =\
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B]
        )
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C = tuple(
        range(0x2A700, 0x2B73F + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_C =\
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C]
        )
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D = tuple(
        range(0x2B740, 0x2B81F + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_D =\
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D]
        )
    UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E = tuple(
        range(0x2B820, 0x2CEAF + 1, 1))
    UNICODE_BLOCK_CJK_UNIFIED_IDEOGRAPHS_EXT_E = \
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E]
        )
    UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS = tuple(
        range(0xF900, 0xFAFF + 1, 1))
    UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS = \
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS]
        )
    UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP = tuple(
        range(0x2F800, 0x2FA1F + 1, 1))
    UNICODE_BLOCK_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP = \
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP]
        )
    # NOTE(review): _EXT_D appears twice in this concatenation, so its code
    # points are duplicated in the combined block — looks like a copy-paste
    # slip; confirm and deduplicate if unintended.
    UNICODE_BLOCK_ORDINAL_CJK = UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS + UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_A +\
        UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS +\
        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_B + UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_C +\
        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D + UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_D +\
        UNICODE_BLOCK_ORDINAL_CJK_UNIFIED_IDEOGRAPHS_EXT_E +\
        UNICODE_BLOCK_ORDINAL_CJK_COMPATIBILITY_IDEOGRAPHS_SUPP
    UNICODE_BLOCK_CJK = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK])
    # This UNICODE_BLOCK_CJK is not a unique set, there are character repeats
    # import collections
    # c = collections.Counter(UNICODE_BLOCK_CJK)
    # char_repeats = [x for x in c.keys() if c[x]>1]
    # char_repeats_unicode = [hex(ord(x)) for x in char_repeats]
    # print(char_repeats)
    # print(char_repeats_unicode)
    # TODO
    # Some interesting notes below
    # Case 1: Simplified Chinese takes Precedence (all characters in Simplified Chinese are surely "simplified")
    #   Historically never-simplified characters still appear in "traditional" Hanja/Kanji/Chinese:
    #   hanzidentifier.is_simplified('入') = True
    #   hanzidentifier.is_simplified('口') = True
    #   meaning there is no traditional version of these characters at all.
    #   For example, in Japan you will see the Kanji '入口' (entrance) everywhere, which is the same in
    #   simplified Chinese (China/Malaysia/Singapore), traditional Chinese (Taiwan/HK) and Hanja (Hangul 입구),
    #   with exactly the same meanings.
    #   This means we cannot use these Unicode blocks alone to decide the language, as Japanese Kanji, Hanja,
    #   and simplified/traditional Chinese will all point to "simplified".
    # Case 2: Combination vs Individual Characters
    #   Take the traditional word '辭退' (citui), and the simplified version '辞退'. If this is fed into code,
    #   hanzidentifier.is_simplified('辭退') = False
    #   hanzidentifier.is_simplified('辞退') = True
    #   But the interesting thing is the 2nd character '退' is the same in both traditional and simplified:
    #   hanzidentifier.is_simplified('退') = True
    #   So this means that without first "tokenizing" the sentence, there is no way to tell.
    #   But this is chicken & egg: how to tokenize without first knowing the language?
    #   However if every character in a text is labeled "simplified", then it should be simplified Chinese
    #   & nothing else. But even this is not applicable to very short sentences like '入口'.
    # Looping over 80k symbols is fast enough, no need to worry
    try:
        UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED = tuple(
            [u for u in UNICODE_BLOCK_ORDINAL_CJK if hz.is_simplified(chr(u))])
        UNICODE_BLOCK_CJK_SIMPLIFIED = tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED])
        UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL = tuple([
            u for u in UNICODE_BLOCK_ORDINAL_CJK if not hz.is_simplified(chr(u))
        ])
        UNICODE_BLOCK_CJK_TRADITIONAL = tuple([
            chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL
        ])
        # Taking set difference will result in smaller set due to repeats in CJK
        # UNICODE_BLOCK_CJK_TRADITIONAL = tuple( set(UNICODE_BLOCK_CJK) - set(UNICODE_BLOCK_CJK_SIMPLIFIED) )
    except Exception as ex:
        # hanzidentifier may be missing (optional import at top of file) —
        # leave the simplified/traditional blocks as None in that case
        Log.warning(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot get CJK simplified/traditional: ' + str(ex))
        UNICODE_BLOCK_ORDINAL_CJK_SIMPLIFIED = None
        UNICODE_BLOCK_CJK_SIMPLIFIED = None
        UNICODE_BLOCK_ORDINAL_CJK_TRADITIONAL = None
        UNICODE_BLOCK_CJK_TRADITIONAL = None

    #
    # Cyrillic
    #
    UNICODE_BLOCK_ORDINAL_CYRILLIC = tuple(range(0x0400, 0x04FF + 1, 1))
    UNICODE_BLOCK_CYRILLIC = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_CYRILLIC])

    # Cyrillic Supplement
    # (Cyrillic letters for writing several minority languages,
    # including Abkhaz, Kurdish, Komi, Mordvin, Aleut, Azerbaijani,
    # and Jakovlev's Chuvash orthography)
    UNICODE_BLOCK_SUPPL_CYRILLIC = tuple(range(0x0500, 0x052F + 1, 1))
    UNICODE_BLOCK_SUPPL_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_SUPPL_CYRILLIC])

    # Cyrillic Extended-A
    # (Cyrillic letters used in Old Church Slavonic texts)
    UNICODE_BLOCK_EXT_A_CYRILLIC = tuple(range(0x2DE0, 0x2DFF + 1, 1))
    UNICODE_BLOCK_EXT_A_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_A_CYRILLIC])

    # Cyrillic Extended-B
    # (Cyrillic characters for writing Old Cyrillic and Old Abkhazian,
    # and combining numeric signs)
    UNICODE_BLOCK_EXT_B_CYRILLIC = tuple(range(0xA640, 0xA69F + 1, 1))
    UNICODE_BLOCK_EXT_B_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_B_CYRILLIC])

    # Cyrillic Extended-C
    # (Cyrillic numerals)
    UNICODE_BLOCK_EXT_C_CYRILLIC = tuple(range(0x1C80, 0x1C8F + 1, 1))
    UNICODE_BLOCK_EXT_C_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_EXT_C_CYRILLIC])

    # Cyrillic Phonetic Extensions
    UNICODE_BLOCK_PHON_CYRILLIC = tuple(range(0x1D2B, 0x1D78 + 1, 1))
    UNICODE_BLOCK_EXT_PHON_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_PHON_CYRILLIC])

    # Cyrillic Combining Half Marks
    # (Unicode block containing diacritic mark parts for spanning multiple characters)
    UNICODE_BLOCK_HALF_MARKS_CYRILLIC = tuple(range(0xFE2E, 0xFE2F + 1, 1))
    UNICODE_BLOCK_HALF_MARKS_CYR = tuple(
        [chr(supl) for supl in UNICODE_BLOCK_HALF_MARKS_CYRILLIC])

    # Unicode block for ALL Cyrillic characters
    UNICODE_BLOCK_CYRILLIC_ALL = UNICODE_BLOCK_CYRILLIC + UNICODE_BLOCK_HALF_MARKS_CYR + \
        UNICODE_BLOCK_EXT_PHON_CYR + UNICODE_BLOCK_EXT_C_CYR + \
        UNICODE_BLOCK_EXT_B_CYR + UNICODE_BLOCK_EXT_A_CYR + \
        UNICODE_BLOCK_SUPPL_CYR

    #
    # Hangul
    #
    # This is the 11xx jamo code block; when a computer sees a sequence of these jamos, they combine
    # them into Hangul syllables (or just Hangul) in the block below.
    # print(chr(0x110c) + chr(0x1161) + chr(0x1106) + chr(0x1169))
    UNICODE_BLOCK_ORDINAL_HANGUL_JAMO = tuple(range(0x1100, 0x11FF + 1, 1))
    UNICODE_BLOCK_HANGUL_JAMO = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_JAMO])
    # This is the 31xx hangul compatibility jamo block;
    # when a computer sees a sequence of these jamos, they print out individually, without combining into Hangul syllables
    # print(chr(0x3148) + chr(0x314f) + chr(0x3141) + chr(0x3157))
    UNICODE_BLOCK_ORDINAL_HANGUL_COMPATIBILITY_JAMO = tuple(
        range(0x3130, 0x318F + 1, 1))
    UNICODE_BLOCK_HANGUL_COMPATIBILITY_JAMO = tuple([
        chr(ordinal)
        for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_COMPATIBILITY_JAMO
    ])
    # This block is for Hangul syllables (or just Hangul). E.g. '한', '굴', '자' '모'
    # whereas the above blocks are for single 자모 (字母 or alphabet).
    UNICODE_BLOCK_ORDINAL_HANGUL_SYLLABLE = tuple(range(0xAC00, 0xD7AF + 1, 1))
    UNICODE_BLOCK_HANGUL_SYLLABLE = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HANGUL_SYLLABLE])
    UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE = \
        UNICODE_BLOCK_HANGUL_JAMO + UNICODE_BLOCK_HANGUL_COMPATIBILITY_JAMO + UNICODE_BLOCK_HANGUL_SYLLABLE

    #
    # Japanese Hiragana/Katakana
    #
    UNICODE_BLOCK_ORDINAL_HIRAGANA = tuple(range(0x3040, 0x309F + 1, 1))
    UNICODE_BLOCK_HIRAGANA = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_HIRAGANA])
    UNICODE_BLOCK_ORDINAL_KATAKANA = tuple(range(0x30A0, 0x30FF + 1, 1))
    UNICODE_BLOCK_KATAKANA = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_KATAKANA])
    UNICODE_BLOCK_HIRAGANA_KATAKANA = UNICODE_BLOCK_HIRAGANA + UNICODE_BLOCK_KATAKANA
    UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI = \
        UNICODE_BLOCK_HIRAGANA + UNICODE_BLOCK_KATAKANA + UNICODE_BLOCK_CJK

    #
    # Thai
    # From http://sites.psu.edu/symbolcodes/languages/asia/thai/thaichart/
    #
    UNICODE_BLOCK_ORDINAL_THAI_CONSONANTS = tuple(range(0x0E01, 0x0E2E + 1, 1))
    UNICODE_BLOCK_THAI_CONSONANTS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_CONSONANTS])
    # The character ' ็' or chr(0x0E47) is unique, a consonant must appear before it, and another consonant after it
    # ['ะ', 'ั', 'า', 'ำ', 'ิ', 'ี', 'ึ', 'ื', 'ุ', 'ู', 'ฺ', '็']
    UNICODE_BLOCK_ORDINAL_THAI_VOWELS_AFTER_CONSONANT = \
        tuple( range(0x0E30, 0x0E3A+1, 1) ) + tuple( range(0x0E47, 0x0E47+1, 1) )
    UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT =\
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_VOWELS_AFTER_CONSONANT]
        )
    # The character ' ็' or chr(0x0E47) is unique, a consonant must appear before it, and another consonant after it
    # ['เ', 'แ', 'โ', 'ใ', 'ไ', '็']
    UNICODE_BLOCK_ORDINAL_THAI_VOWELS_BEFORE_CONSONANT = \
        tuple( range(0x0E40, 0x0E44+1, 1) ) + tuple( range(0x0E47, 0x0E47+1, 1) )
    UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT = \
        tuple(
            [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_VOWELS_BEFORE_CONSONANT]
        )
    # Tone marks cannot be start of word (same with "vowels-after-consonant")
    # ['่', '้', '๊', '๋']
    UNICODE_BLOCK_ORDINAL_THAI_TONEMARKS = tuple(range(0x0E48, 0x0E4B + 1, 1))
    UNICODE_BLOCK_THAI_TONEMARKS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_TONEMARKS])
    UNICODE_BLOCK_ORDINAL_THAI_NUMBERS = tuple(range(0x0E50, 0x0E59 + 1, 1))
    UNICODE_BLOCK_THAI_NUMBERS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_THAI_NUMBERS])
    UNICODE_BLOCK_ORDINAL_THAI_SIGNS_PUNCTUATIONS = tuple( range(0x0E2F, 0x0E2F+1, 1) ) +\
        tuple( range(0x0E45, 0x0E46+1, 1) ) +\
        tuple( range(0x0E4C, 0x0E4F+1, 1) ) +\
        tuple( range(0x0E5A, 0x0E5B+1, 1) )
    UNICODE_BLOCK_THAI_SIGNS_PUNCTUATIONS = tuple([
        chr(ordinal)
        for ordinal in UNICODE_BLOCK_ORDINAL_THAI_SIGNS_PUNCTUATIONS
    ])
    UNICODE_BLOCK_THAI = UNICODE_BLOCK_THAI_CONSONANTS +\
        UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT +\
        UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT +\
        UNICODE_BLOCK_THAI_TONEMARKS +\
        UNICODE_BLOCK_THAI_NUMBERS +\
        UNICODE_BLOCK_THAI_SIGNS_PUNCTUATIONS

    #
    # Punctuations, etc.
    #
    # Word separators: ASCII punctuation + CJK punctuation + fullwidth forms
    UNICODE_BLOCK_WORD_SEPARATORS =\
        tuple(u' ,!.?()[]:;"«»\'') + tuple(u'?。,()') + tuple([chr(0xFF0C),chr(0xFF01),chr(0xFF0E),chr(0xFF1F)])
    UNICODE_BLOCK_SENTENCE_SEPARATORS =\
        tuple(u' !.?') + tuple([chr(0xFF01),chr(0xFF0E),chr(0xFF1F)])

    #
    # Numbers: normal Latin and CJK halfwidth/fullwidth
    #
    UNICODE_BLOCK_ORDINAL_NUMBERS = tuple(range(
        0x0030, 0x0039 + 1, 1)) + tuple(range(0xFF10, 0xFF19 + 1, 1))
    UNICODE_BLOCK_NUMBERS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_NUMBERS])

    #
    # Punctuations Only (Half-Width & Full-Width Forms)
    #
    UNICODE_BLOCK_ORDINAL_PUNCTUATIONS = tuple(range(0x0000, 0x007F+1, 1)) +\
        tuple(range(0x2000, 0x206F+1, 1)) +\
        tuple(range(0x3000, 0x303F+1, 1)) +\
        tuple(range(0xFF00, 0xFF0F+1, 1)) +\
        tuple(range(0xFF1A, 0xFF20+1, 1)) +\
        tuple(range(0xFF3B, 0xFF40+1, 1)) +\
        tuple(range(0xFF5B, 0xFF65+1, 1))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        [chr(ordinal) for ordinal in UNICODE_BLOCK_ORDINAL_PUNCTUATIONS])
    # Remove non-punctuations from original list of punctuations
    # (the 0x0000-0x007F range above includes Latin letters and digits)
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_LATIN_ALL))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_WORD_SEPARATORS))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_SENTENCE_SEPARATORS))
    UNICODE_BLOCK_PUNCTUATIONS = tuple(
        set(UNICODE_BLOCK_PUNCTUATIONS) - set(UNICODE_BLOCK_NUMBERS))

    #
    # Get the valid Unicode Block for a given language
    #
    @staticmethod
    def get_language_charset(lang):
        """Return the character tuple for a language code, or [] if unsupported."""
        # lang_std = lf.LangFeatures.map_to_lang_code_iso639_1(
        #    lang_code = lang
        # )
        lang_std = lf.LangFeatures.map_to_correct_lang_code_iso_639_1_or_3(
            lang_code=lang)
        # NOTE(review): the EN/VI check uses the standardized code (lang_std)
        # but the checks below use the raw lang argument — confirm whether the
        # later comparisons should also use lang_std.
        if lang_std in [lf.LangFeatures.LANG_EN, lf.LangFeatures.LANG_VI]:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        if lang == lf.LangFeatures.LANG_ZH:
            return LangCharacters.UNICODE_BLOCK_CJK
        elif lang == lf.LangFeatures.LANG_TH:
            return LangCharacters.UNICODE_BLOCK_THAI
        elif lang == lf.LangFeatures.LANG_KO:
            return LangCharacters.UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE
        elif lang == lf.LangFeatures.LANG_JA:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI
        else:
            return []

    @staticmethod
    def get_alphabet_charset(alphabet):
        """Return the character tuple for an alphabet constant.

        NOTE(review): falls through with an implicit None for unknown
        alphabets — callers must handle None.
        """
        #
        # Latin Type Blocks (English, Spanish, French, Vietnamese, etc.)
        # TODO Break into other language variants (done)
        #
        if alphabet == lf.LangFeatures.ALPHABET_LATIN_AZ:
            return LangCharacters.UNICODE_BLOCK_LATIN_AZ
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_VI:
            # NOTE(review): UNICODE_BLOCK_LATIN_VI is an alias of
            # UNICODE_BLOCK_LATIN_VIETNAMESE, so this sum duplicates every
            # character — confirm whether LATIN_AZ was intended instead.
            return LangCharacters.UNICODE_BLOCK_LATIN_VI + LangCharacters.UNICODE_BLOCK_LATIN_VIETNAMESE
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_VI_AZ:
            return LangCharacters.UNICODE_BLOCK_LATIN_VI + LangCharacters.UNICODE_BLOCK_LATIN_AZ
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # Latin type blocks:
        # French
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_FR:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # Czech
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_CZECH:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # German
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_GERMAN:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # Spanish
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_SPANISH:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # English
        elif alphabet == lf.LangFeatures.ALPHABET_LATIN_ENG:
            return LangCharacters.UNICODE_BLOCK_LATIN_ALL
        # CJK Type Blocks (Korean, Chinese, Japanese)
        # TODO Break into Chinese variants (simplified, traditional, etc.),
        # Japanese, Hanja, etc. (done)
        elif alphabet == lf.LangFeatures.ALPHABET_HANGUL:
            return LangCharacters.UNICODE_BLOCK_HANGUL_ALL_INCLUDING_SYLLABLE
        elif alphabet == lf.LangFeatures.ALPHABET_CJK:
            return LangCharacters.UNICODE_BLOCK_CJK
        elif alphabet == lf.LangFeatures.ALPHABET_CJK_SIMPLIFIED:
            return LangCharacters.UNICODE_BLOCK_CJK_SIMPLIFIED
        elif alphabet == lf.LangFeatures.ALPHABET_CJK_TRADITIONAL:
            return LangCharacters.UNICODE_BLOCK_CJK_TRADITIONAL
        elif alphabet == lf.LangFeatures.ALPHABET_HIRAGANA_KATAKANA:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA
        elif alphabet == lf.LangFeatures.ALPHABET_JAPANESE:
            return LangCharacters.UNICODE_BLOCK_HIRAGANA_KATAKANA_KANJI
        #
        # Cyrillic Blocks (Russian, Belarusian, Ukrainian, etc.)
        #
        elif alphabet == lf.LangFeatures.ALPHABET_CYRILLIC:
            return LangCharacters.UNICODE_BLOCK_CYRILLIC_ALL
        #
        # Other Blocks
        #
        elif alphabet == lf.LangFeatures.ALPHABET_THAI:
            return LangCharacters.UNICODE_BLOCK_THAI

    @staticmethod
    def get_alphabet_charset_all():
        """Return a dict mapping every known alphabet constant to its charset."""
        alphabet_dict = {}
        for alp in lf.LangFeatures.ALPHABETS_ALL:
            alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet=alp)
        return alphabet_dict

    #
    # Given a string with allowed Unicode block, returns a string with only the allowed Unicode values
    #
    def filter_allowed_characters(self,
                                  unicode_list,
                                  s,
                                  include_word_separators=True,
                                  include_sentence_separators=True,
                                  include_numbers=True,
                                  include_punctuations=True):
        """Strip from ``s`` every character not in ``unicode_list`` (plus the
        optionally-included separator/number/punctuation blocks)."""
        # Just in case user passes in the immutable tuples
        allowed_list = list(unicode_list).copy()
        if include_word_separators:
            allowed_list += LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS
        if include_sentence_separators:
            allowed_list += LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS
        if include_numbers:
            allowed_list += [c for c in '0123456789']
        if include_punctuations:
            allowed_list += LangCharacters.UNICODE_BLOCK_PUNCTUATIONS
        str_new = [c for c in s if (c in allowed_list)]
        return ''.join(str_new)

    #
    # This function returns whether the written language is normal Vietnamese (a mix of basic and extended Latin)
    # or purely using basic Latin (it is cultural of Vietnamese to leave out all the diacritics and use purely basic
    # Latin)
    #
    def get_vietnamese_type(self, s):
        """Classify ``s`` as 'latin.basic', 'latin.viet', 'latin.mix' or 'other'.

        NOTE(review): divides by len(ss) after removing punctuation/numbers —
        a string containing only such characters yields ZeroDivisionError;
        confirm callers never pass one.
        """
        # Must convert string to unicode string
        #if type(s) != unicode:
        #    s = unicode(s, encoding=self.encoding)
        # First we remove the punctuations, numbers, etc.
        remove_block = LangCharacters.UNICODE_BLOCK_PUNCTUATIONS + LangCharacters.UNICODE_BLOCK_NUMBERS + \
            LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS + LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS
        ss = u''
        for i in range(0, len(s), 1):
            if s[i] not in remove_block:
                ss = ss + s[i]

        is_latin_basic_count = 0
        is_latin_extended_viet_count = 0
        for i in range(0, len(ss), 1):
            latin_basic = ss[i] in LangCharacters.UNICODE_BLOCK_LATIN_BASIC
            latin_extended = ss[
                i] in LangCharacters.UNICODE_BLOCK_LATIN_EXTENDED
            # print ( ss[i] + " Latin Basic = " + str(latin_basic) + ", Latin Extended = " + str(latin_extended) )
            # Booleans count as 1/0 when multiplied
            is_latin_basic_count += 1 * latin_basic
            is_latin_extended_viet_count += 1 * latin_extended

        latin_basic_percent = float(is_latin_basic_count / float(len(ss)))
        latin_extended_viet_percent = float(is_latin_extended_viet_count /
                                            float(len(ss)))
        if (latin_basic_percent + latin_extended_viet_percent) > 0.5:
            if latin_basic_percent > 0.98:
                return "latin.basic"
            else:
                if latin_extended_viet_percent > 0.1:
                    return "latin.viet"
                else:
                    return "latin.mix"
        else:
            return "other"

    #
    # Converts a string into a single number for various purposes when dealing with numbers are more convenient.
    # This single number is not necessarily unique.
    #
    def convert_string_to_number(self, s, verbose=0):
        """Fold ``s`` into an int: sum of position-weighted code points.

        For Thai input, syllable-ending consonants are excluded from the sum
        since they are prone to spelling variation (this doubles as a crude
        Thai spelling-tolerant hash).
        """
        lang = None
        syllable_end = [False] * len(s)
        if s[0] in self.UNICODE_BLOCK_THAI and len(s) > 1:
            # For Thai, we don't calculate the last syllable character, since that character is highly prone
            # to error. E.g. ส-วัด-ดี (สวัสดี) or ปัน-หา (ปัญหา). This is also our method of Thai spelling correction.
            # Characters that can never be start of syllable
            not_start_syllable_char = self.UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT + \
                self.UNICODE_BLOCK_THAI_TONEMARKS
            lang = lf.LangFeatures.LANG_TH
            char_prev = s[0]
            for i in range(1, len(s) - 1, 1):
                char_prev = s[i - 1]
                char_cur = s[i]
                # This character can never be start of syllable
                if char_cur not in not_start_syllable_char:
                    continue
                char_next = s[i + 1]
                # Case of 'เดือน', 'เมื่อ', 'เลข', etc.
                if char_cur in self.UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT:
                    syllable_end[i - 1] = True
                elif char_cur in self.UNICODE_BLOCK_THAI_CONSONANTS:
                    # Case of 'การ', 'เดือน', 'ดารา' etc.
                    if (char_next in self.UNICODE_BLOCK_THAI_VOWELS_AFTER_CONSONANT) and \
                            (char_prev not in self.UNICODE_BLOCK_THAI_VOWELS_BEFORE_CONSONANT):
                        syllable_end[i - 1] = True
                # Case of 'งง', 'สด', etc.
                # elif ( char_prev in LangCharacters.UNICODE_BLOCK_THAI_TONEMARKS ):
                #     syllable_end[i-1] = True
            # Last character is always end of syllable
            syllable_end[len(s) - 1] = True
            if verbose >= 1:
                sylsepstring = ''
                for i in range(0, len(s), 1):
                    sylsepstring = sylsepstring + s[i]
                    if syllable_end[i]:
                        sylsepstring = sylsepstring + ' '
                print(sylsepstring)

        x = 0
        index = 1
        # A string "abc" will be calculated as (97 + 2*98 + 3*99), Unicode for 'a' is 97, 'b' is 98, 'c' is 99
        for i in range(0, len(s), 1):
            # We don't include a syllable ending consonant for Thai in the measure, since this character is prone
            # to spelling mistakes
            ignore = False
            if lang == lf.LangFeatures.LANG_TH:
                if s[i] in LangCharacters.UNICODE_BLOCK_THAI_CONSONANTS and syllable_end[
                        i]:
                    # print('Ignore ' + s[i])
                    ignore = True
            if not ignore:
                un = ord(s[i])
                # print('Index ' + str(index) + ', ' + s[i] + ', ' + str(un))
                x = x + index * un
                index = index + 1
        return x
import nwae.lang.nlp.SynonymList as slist from nwae.lang.preprocessing.BasicPreprocessor import BasicPreprocessor from nwae.utils.Log import Log from inspect import currentframe, getframeinfo # Library to convert Traditional Chinese to Simplified Chinese import hanziconv as hzc import nwae.utils.Profiling as prf import re try: """ Japanese word segmentation """ import nagisa except Exception as ex: Log.warning( str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) + ': Error importing libraries for japanese tokenization: ' + str(ex) ) try: """ Korean word segmentation There are many problems with this library kkma, firstly it requires external JVM, it is quite slow and will also split wrong (e.g. '탈레반이' will be split wrongly to '탈', '레', '반이') or not in our desired application way (e.g. '장악한' split to '장악', '하', 'ㄴ') We should write our own, korean language is quite systematic, and we could control the following - by default a whole word '탈레반이' if not recognized should just keep as is, and split out only particles like '이' - naturally in most application the word '장악한' (verb) should not be split to ('장악', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD') and only stemming or lemmatization should bring it to '장악하다' (verb) and not '장악' (noun) """ from konlpy.tag import Kkma
import nwae.ml.networkdesign.NetworkDesign as nwdesign import pandas as pd import numpy as np from datetime import datetime import os import sys # TODO Don't rely on buggy TF/Keras, write our own try: from keras.utils import to_categorical from tensorflow.keras.models import load_model # This one will not work in a multi-threaded environment #from keras.models import load_model except Exception as ex_keras: Log.warning( str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) + ': Exception importing Keras modules: ' + str(ex_keras) ) class NnDenseModel(ModelInterface): MODEL_NAME = 'nn_dense' CONFIDENCE_LEVEL_SCORES_DEFAULT = {1: 10, 2: 15, 3: 20, 4:30, 5:40} def __init__( self, # NN layer configurations, etc. model_params, # Unique identifier to identify this set of trained data+other files after training identifier_string,
def convert_product_to_attributes(
        self,
        # Dataframe of customers and products (one row per purchase record)
        df_product,
        # Columns that identify a unique customer
        unique_human_key_columns,
        unique_product_key_column,
        # Either product price or quantity
        unique_product_value_column,
        normalize_method = NORMALIZE_METHOD_NONE,
        # Reduce the number of attribute columns
        max_attribute_columns = 0,
        # By percentile: filter out products below this quantile
        filter_out_quantile_byvalue = 0.0,
        filter_out_quantile_bycount = 0.0,
        # Before any processing
        transform_prd_values_method = TRANSFORM_PRD_VALUES_METHOD_NONE,
        transform_logbase = 10.0,
        transform_after_aggregate = True, # By default transform only AFTER aggregation
        # Careful here: this unknown product will (possibly) be assigned the 0-vector,
        # so with the "euclidean" metric it will look "close" to other vectors
        add_unknown_product = False,
):
    """
    Pivot the purchase dataframe into one row per customer and one column
    per (kept) product, each cell holding the aggregated product value,
    optionally transformed (unity/log) and normalized.

    Returns:
        (df_converted, unique_product_list) - the customer-by-product
        attribute dataframe and the list of product columns it contains.
    """
    # If transforming BEFORE aggregation, pass the transform down to
    # aggregate_products(); otherwise aggregate raw values first.
    transform_before = self.TRANSFORM_PRD_VALUES_METHOD_NONE
    if not transform_after_aggregate:
        transform_before = transform_prd_values_method

    df_prd_agg, unique_product_list, unique_human_list = self.aggregate_products(
        df_product = df_product,
        unique_human_key_columns = unique_human_key_columns,
        unique_product_key_column = unique_product_key_column,
        unique_product_value_column = unique_product_value_column,
        transform_prd_values_method = transform_before,
        transform_logbase = transform_logbase,
    )

    #
    # This step will lead to some problems, since some customers will be
    # turned into all-zero vectors.
    #
    # NOTE(review): is_reduced is never set to True anywhere below, so the
    # 'if is_reduced' branch near the end is dead code and
    # np_remaining_attributes stays None - confirm intent.
    is_reduced = False
    unique_remaining_products = None
    np_remaining_attributes = None

    if (max_attribute_columns > 0) | (filter_out_quantile_byvalue > 0.0) | (filter_out_quantile_bycount > 0.0):
        # Top products by total value (sum of price/quantity)
        byval_unique_top_products_by_order, byval_unique_remaining_products = self.find_top_products(
            df_product = df_product,
            unique_product_key_column = unique_product_key_column,
            unique_product_value_column = unique_product_value_column,
            # Do max filtering later
            top_x = 0,
            filter_out_quantile = filter_out_quantile_byvalue,
            aggregate_method = 'sum',
        )
        # Top products by purchase count
        bycnt_unique_top_products_by_order, bycnt_unique_remaining_products = self.find_top_products(
            df_product = df_product,
            unique_product_key_column = unique_product_key_column,
            unique_product_value_column = unique_product_value_column,
            # Do max filtering later
            top_x = 0,
            filter_out_quantile = filter_out_quantile_bycount,
            aggregate_method = 'count',
        )
        # A product is kept only if it passes BOTH the value and count filters
        filtered_out_products_by_count_condition = [
            prd for prd in byval_unique_top_products_by_order if prd not in bycnt_unique_top_products_by_order
        ]
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Products filtered out by 2nd count condition ' + str(filtered_out_products_by_count_condition)
        )
        # Remove products that fail the second (count) condition
        unique_top_products_by_order = [
            prd for prd in byval_unique_top_products_by_order if prd in bycnt_unique_top_products_by_order
        ]
        unique_remaining_products = filtered_out_products_by_count_condition + byval_unique_remaining_products
        assert len(unique_top_products_by_order) + len(unique_remaining_products) == len(unique_product_list), \
            'Length of unique top products and remaining products must equal length of product list'
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Remaining ' + str(len(unique_remaining_products))
            + ' least products: ' + str(unique_remaining_products)
        )
        # Optionally cap the number of attribute columns
        if max_attribute_columns > 0:
            max_final = min(len(unique_top_products_by_order), max_attribute_columns)
            if max_final < len(unique_top_products_by_order):
                filtered_out_by_max_attribute_products = unique_top_products_by_order[max_final:]
                unique_top_products_by_order = unique_top_products_by_order[0:max_final]
                unique_remaining_products = filtered_out_by_max_attribute_products + unique_remaining_products
        assert len(unique_top_products_by_order) + len(unique_remaining_products) == len(unique_product_list), \
            'Length of unique top products and remaining products must equal length of product list'
        # Change the removed product names to one name
        def change_name(prdname):
            if prdname in unique_remaining_products:
                return self.COLNAME_PRODUCTS_NOT_INCLUDED
            else:
                return prdname
        df_prd_agg[unique_product_key_column] = df_prd_agg[unique_product_key_column].apply(func=change_name)
        unique_product_list = unique_top_products_by_order + [self.COLNAME_PRODUCTS_NOT_INCLUDED]
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': After truncation, total unique products as attributes = ' + str(len(unique_product_list))
            + '. Products: ' + str(unique_product_list)
        )
        # Need to regroup again, since each pair member-COLNAME_PRODUCTS_NOT_INCLUDED will appear on multiple lines
        shape_ori = df_prd_agg.shape
        df_prd_agg = df_prd_agg.groupby(
            by=unique_human_key_columns + [unique_product_key_column],
            as_index=False,
        ).sum()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': After second round grouping by human/product columns, from shape ' + str(shape_ori)
            + ' to new shape ' + str(df_prd_agg.shape)
        )

    if transform_after_aggregate:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': After aggregation, transform product values by "' + str(transform_prd_values_method) + '"'
        )
        if transform_prd_values_method == self.TRANSFORM_PRD_VALUES_METHOD_UNITY:
            # Binarize: 1.0 if the customer bought the product at all, else 0.0
            df_prd_agg[unique_product_value_column] = 1.0 * (df_prd_agg[unique_product_value_column] > 0.0)
        elif transform_prd_values_method == self.TRANSFORM_PRD_VALUES_METHOD_LOG:
            # log_base(1 + value), via change of base
            df_prd_agg[unique_product_value_column] = np.log(1 + df_prd_agg[unique_product_value_column]) / np.log(transform_logbase)
        else:
            pass

    # Dataframe with only the customer column(s)
    """Датафрейм лишь с столбцом(ами) покупателей"""
    df_converted = df_prd_agg[unique_human_key_columns]
    df_converted = df_converted.drop_duplicates()
    Log.debugdebug(df_converted)
    assert len(df_converted) == len(unique_human_list), \
        'Length of final dataframe ' + str(len(df_converted)) + ' must be equal ' + str(len(unique_human_list))

    # Add the "unknown product" column
    if add_unknown_product:
        if self.NAN_PRODUCT in unique_product_list:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Name clash with nan product name "' + str(self.NAN_PRODUCT) + '"'
            )
        unique_product_list.append(self.NAN_PRODUCT)

    columns_running = list(df_converted.columns)
    # Product by product, create the product column
    """Продукт за продуктом, создавать столбец продукта"""
    for prd in unique_product_list:
        condition_only_this_product = df_prd_agg[unique_product_key_column] == prd
        df_prd_agg_part = df_prd_agg[condition_only_this_product]
        if len(df_prd_agg_part) == 0:
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': For product "' + str(prd) + '", there are 0 sales, adding 0 column'
            )
            df_converted[prd] = 0.0
        else:
            columns_keep = unique_human_key_columns + [unique_product_value_column]
            df_prd_agg_part = df_prd_agg_part[columns_keep].reset_index(drop=True)
            Log.debugdebug(prd)
            Log.debugdebug(df_prd_agg_part)
            # Join the product price/quantity onto the customer rows
            """Соединить цену/количество продукта с человеком"""
            df_converted = df_converted.merge(
                df_prd_agg_part,
                on = unique_human_key_columns,
                how = 'left'
            )
            assert len(df_converted) == len(unique_human_list), \
                'After merge column "' + str(prd) + '" Length of final dataframe ' + str(
                    len(df_converted)) + ' must be equal ' + str(len(unique_human_list))
        # Rename the newly added value column to the product name
        """Новые имена столбцев"""
        columns_running = columns_running + [prd]
        df_converted.columns = columns_running
        # Customers who never bought this product get 0.0 instead of NaN
        df_converted[prd] = df_converted[prd].fillna(0.0)
        Log.debugdebug(df_converted)

    assert len(df_converted) == len(unique_human_list), \
        'Length of final dataframe ' + str(len(df_converted)) + ' must be equal ' + str(len(unique_human_list))

    # NOTE(review): dead branch - is_reduced is never set True above.
    if is_reduced:
        df_converted[self.COLNAME_PRODUCTS_NOT_INCLUDED] = np_remaining_attributes

    Log.important(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Final human-product attributes shape: ' + str(df_converted.shape)
    )
    Log.debugdebug(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Final mapped human-product attributes: '
    )
    Log.debugdebug(df_converted)

    # Attribute columns = all columns except the customer key column(s)
    original_cols = list(df_converted.columns)
    attr_cols = original_cols.copy()
    for col in unique_human_key_columns:
        attr_cols.remove(col)

    df_converted = self.normalize(
        df = df_converted,
        name_columns = unique_human_key_columns,
        attribute_columns = attr_cols,
        normalize_method = normalize_method,
    )
    return df_converted, unique_product_list
def load_text_processor(self):
    """
    Load (or reload) the per-language TxtPreprocessor objects and the
    language detector for this model.

    No-op when the model's reload counter has not changed since the last
    call. Serialized via load_text_processor_mutex; sets
    is_all_initializations_done on success. Raises Exception (after
    logging critical) on any failure.
    """
    # FIX: acquire the mutex BEFORE entering the try block. In the original
    # the acquire() sat inside the try, so a failed acquire would still run
    # the finally clause and release() a lock this thread never held.
    self.load_text_processor_mutex.acquire()
    try:
        # Don't allow to load again if the model was not reloaded since last time
        if self.model_last_reloaded_counter == self.model.get_model_reloaded_counter():
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Model "' + str(self.identifier_string)
                + '" not reloading PredictClassTxtProcessor.')
            return
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model "' + str(self.model_name)
            + '" ready. Loading synonym & word lists..')
        self.lang_detect = LangDetect()
        self.predict_class_txt_processor = {}
        # One text processor per language: main language plus any additional ones
        for uh in [self.lang_main] + self.lang_additional:
            try:
                model_features_list = self.model.get_model_features().tolist()
            except Exception as ex_feature_list:
                Log.error(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Model "' + str(self.model_name)
                    + '" identifier "' + str(self.identifier_string)
                    + '" model feature list empty!')
                model_features_list = None
            self.predict_class_txt_processor[uh] = TxtPreprocessor(
                identifier_string=self.identifier_string,
                dir_path_model=self.dir_path_model,
                model_features_list=model_features_list,
                lang=uh,
                dirpath_synonymlist=self.dirpath_synonymlist,
                postfix_synonymlist=self.postfix_synonymlist,
                dir_wordlist=self.dir_wordlist,
                postfix_wordlist=self.postfix_wordlist,
                dir_wordlist_app=self.dir_wordlist_app,
                postfix_wordlist_app=self.postfix_wordlist_app,
                # TODO For certain languages like English, it is essential to include this
                # But at the same time must be very careful. By adding manual rules, for
                # example we include words 'it', 'is'.. But "It is" could be a very valid
                # training data that becomes excluded wrongly.
                stopwords_list=None,
                do_spelling_correction=self.do_spelling_correction,
                do_word_stemming=self.do_word_stemming,
                do_profiling=self.do_profiling)
        self.is_all_initializations_done = True
        # Manually update this model last reloaded counter
        self.model_last_reloaded_counter = self.model.get_model_reloaded_counter()
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Model name "' + str(self.model_name)
            + '", identifier "' + str(self.identifier_string)
            + '" All initializations done for model "' + str(self.identifier_string)
            + '". Model Reload counter = ' + str(self.model_last_reloaded_counter))
    except Exception as ex:
        errmsg = \
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
            + ': Model name "' + str(self.model_name) \
            + '", identifier "' + str(self.identifier_string) \
            + '" Exception initializing synonym & word lists: ' + str(ex)
        Log.critical(errmsg)
        raise Exception(errmsg)
    finally:
        self.load_text_processor_mutex.release()
def __init__(
        self,
        # Also dump the feature table to 'lang_features.csv' in the working directory
        write_lang_features_to_csv = False
):
    """
    Build the static language-feature table (self.langs / self.langfeatures):
    for each supported language, flags for alphabet, character type,
    syllable separator, word separator and verb conjugation, plus ISO 639
    codes looked up via pycountry (blank strings when pycountry is
    unavailable).
    """
    #
    # Language followed by flag for alphabet boundary, syllable boundary (either as one
    # character as in Chinese or space as in Korean), then word boundary (space)
    # The most NLP-inconvenient languages are those without word boundary, obviously.
    # Name, Code, Alphabet, CharacterType, SyllableSeparator, SyllableSeparatorType, WordSeparator, WordSeparatorType
    #
    # We need to define our own properties as even ISO 15924 specification does not contain them
    #
    # Hangul/CJK Language Family
    #
    try:
        self.PYCLANG = pycountry.languages
    except Exception as ex:
        Log.warning(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Cannot load pycountry languages: ' + str(ex)
        )
        # Fallback: with PYCLANG None, ISO 639 fields below are filled with ''
        self.PYCLANG = None
    lang_index = 0
    lang_ko = {
        self.C_LANG_ID: self.LANG_KO,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Hangul',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_HANGUL,
        self.C_HAVE_SYL_SEP: True,
        # TODO Not really right to say it is char but rather a "syllable_character"
        self.C_SYL_SEP_TYPE: self.T_CHAR,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    #
    # CJK Alphabet Family
    #
    lang_index += 1
    lang_zh = {
        self.C_LANG_ID: self.LANG_ZH,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Chinese',
        self.C_HAVE_ALPHABET: False,
        self.C_CHAR_TYPE: self.ALPHABET_CJK,
        self.C_HAVE_SYL_SEP: True,
        self.C_SYL_SEP_TYPE: self.T_CHAR,
        self.C_HAVE_WORD_SEP: False,
        self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: False
    }
    #
    # Japanese Hiragana/Katakana
    #
    lang_index += 1
    lang_ja = {
        self.C_LANG_ID: self.LANG_JA,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Japanese',
        self.C_HAVE_ALPHABET: False,
        self.C_CHAR_TYPE: self.ALPHABET_JAPANESE,
        self.C_HAVE_SYL_SEP: True,
        self.C_SYL_SEP_TYPE: self.T_CHAR,
        self.C_HAVE_WORD_SEP: False,
        self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: True
    }
    #
    # Cyrillic Alphabet Family
    #
    lang_index += 1
    lang_ru = {
        self.C_LANG_ID: self.LANG_RU,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Russian',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_CYRILLIC,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    #
    # Thai Alphabet Family
    #
    lang_index += 1
    lang_th = {
        self.C_LANG_ID: self.LANG_TH,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Thai',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_THAI,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: False,
        self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: False
    }
    #
    # Latin Alphabet Family
    #
    lang_index += 1
    lang_en = {
        self.C_LANG_ID: self.LANG_EN,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'English',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN_AZ,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    lang_index += 1
    lang_es = {
        self.C_LANG_ID: self.LANG_ES,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Spanish',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    lang_index += 1
    lang_fr = {
        self.C_LANG_ID: self.LANG_FR,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'French',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    lang_index += 1
    lang_de = {
        self.C_LANG_ID: self.LANG_DE,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'German',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    lang_index += 1
    lang_it = {
        self.C_LANG_ID: self.LANG_IT,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Italian',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    lang_index += 1
    lang_nl = {
        self.C_LANG_ID: self.LANG_NL,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Dutch',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    lang_index += 1
    lang_vi = {
        self.C_LANG_ID: self.LANG_VI,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Vietnamese',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN_VI_AZ,
        self.C_HAVE_SYL_SEP: True,
        self.C_SYL_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_WORD_SEP: False,
        self.C_WORD_SEP_TYPE: self.T_NONE,
        self.C_HAVE_VERB_CONJ: False
    }
    lang_index += 1
    lang_id = {
        self.C_LANG_ID: self.LANG_ID,
        self.C_LANG_NUMBER: lang_index,
        self.C_LANG_NAME: 'Indonesian',
        self.C_HAVE_ALPHABET: True,
        self.C_CHAR_TYPE: self.ALPHABET_LATIN_AZ,
        self.C_HAVE_SYL_SEP: False,
        self.C_SYL_SEP_TYPE: self.T_NONE,
        self.C_HAVE_WORD_SEP: True,
        self.C_WORD_SEP_TYPE: self.T_SPACE,
        self.C_HAVE_VERB_CONJ: True
    }
    self.langs = {
        # Hangul/CJK
        self.LANG_KO: lang_ko,
        self.LANG_JA: lang_ja,
        # CJK
        self.LANG_ZH: lang_zh,
        # Cyrillic
        self.LANG_RU: lang_ru,
        # Thai
        self.LANG_TH: lang_th,
        # Latin
        self.LANG_EN: lang_en,
        self.LANG_ES: lang_es,
        self.LANG_FR: lang_fr,
        self.LANG_DE: lang_de,
        self.LANG_IT: lang_it,
        self.LANG_NL: lang_nl,
        self.LANG_VI: lang_vi,
        self.LANG_ID: lang_id,
    }
    # Sanity check: every lang dict defined above made it into self.langs.
    # NOTE(review): assert is stripped under 'python -O'.
    assert lang_index+1 == len(self.langs)

    # Add ISO 639-2 definitions
    for lang in self.langs.keys():
        if self.PYCLANG is not None:
            lang_639 = self.PYCLANG.get(alpha_2=lang)
            # NOTE(review): pycountry get() returns None for an unknown
            # alpha_2 code, which would raise AttributeError below -
            # presumably all codes above are valid ISO 639-1; confirm.
            self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_3] = lang_639.alpha_3
            self.langs[lang][LangFeatures.C_LANG_639_2_NAME] = lang_639.name
            self.langs[lang][LangFeatures.C_LANG_639_2_SCOPE] = lang_639.scope
            self.langs[lang][LangFeatures.C_LANG_639_2_TYPE] = lang_639.type
            # alpha_2 / bibliographic are optional attributes on pycountry records
            try:
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = lang_639.alpha_2
            except Exception:
                self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = ''
            try:
                self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = lang_639.bibliographic
            except Exception:
                self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = ''
        else:
            self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_3] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_NAME] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_SCOPE] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_TYPE] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_ALPHA_2] = ''
            self.langs[lang][LangFeatures.C_LANG_639_2_BIBLIO] = ''

    # Copy 2-letter keys (ISO 639-1) to also 3-letter keys (ISO 639-3)
    # Means we can access the language structure using either ISO 639-1 or ISO 639-3
    # If engineering standard ISO had been more far-sighted (after all 26*26=676 only)
    # we would not have to do this
    new_items = {}
    for key in self.langs.keys():
        lang_iso_699_3 = self.langs[key][LangFeatures.C_LANG_639_2_ALPHA_3]
        if key != lang_iso_699_3:
            lang_dict = self.langs[key].copy()
            # Change lang id to 3-letter ISO 639-1
            lang_dict[self.C_LANG_ID] = lang_iso_699_3
            new_items[lang_iso_699_3] = lang_dict
    for lang_id3 in new_items:
        self.langs[lang_id3] = new_items[lang_id3]

    self.langfeatures = pd.DataFrame( self.langs.values() )
    # Of course it would be more convenient to keep the data in a csv file..
    # but the problem of file path etc. would be very unpleasant for users
    if write_lang_features_to_csv:
        self.langfeatures = self.langfeatures.sort_values(by=[self.C_LANG_NAME], ascending=True)
        self.langfeatures.to_csv('lang_features.csv', sep=',', index=False)
    return