# Standard-library imports needed by the code below; the project-specific names
# (Log, MatchExpression, MexBuiltInTypes, BasicPreprocessor, langfeatures,
# langchar, langhelper, spellcor, lmtz, su) are assumed to come from the
# surrounding package.
import re
from inspect import currentframe, getframeinfo


# The class name `Form` is an assumption; this __init__ fragment arrived
# without its enclosing class declaration.
class Form:

    def __init__(
            self,
            title,
            instruction,
            if_need_confirm,
            # List of FormFields
            form_fields,
            # MEX form model
            mex_form_model,
            # For deserializing old objects so the old state is maintained
            form_completed=False):
        self.title = title
        self.instruction = instruction
        self.if_need_confirm = if_need_confirm
        # List of FormFields
        self.form_fields = form_fields
        # Form MEX model
        self.mex_form_model = mex_form_model
        self.mex_obj = MatchExpression(
            pattern=self.mex_form_model,
            lang=None,
        )
        self.form_completed = form_completed
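# A minimal usage sketch, not from the original source. The MEX form-model
# pattern is a hypothetical example of the 'varname,type,expressions' syntax
# inferred from patterns used elsewhere in this code (e.g. 'u,<type>,').
def _demo_form():
    form = Form(
        title = 'Top-Up',
        instruction = 'Please provide the top-up amount',
        if_need_confirm = True,
        form_fields = [],                    # to be filled with FormField objects
        mex_form_model = 'amt,float,amount', # hypothetical MEX pattern
    )
    print(form.form_completed)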
class FormField:

    KEY_NAME = 'name'
    KEY_VALUE = 'value'
    KEY_IF_REQUIRED = 'ifRequired'
    KEY_IF_MASKED = 'ifMasked'
    KEY_MEX_EXPR = 'mexExpr'
    # For deserializing old objects so the old state is maintained
    KEY_VALUE_JUST_UPDATED = 'valueJustUpdated'
    KEY_COMPLETED = 'completed'

    @staticmethod
    def deserialize(json_obj):
        return FormField.import_form_field(json_obj=json_obj)

    @staticmethod
    def import_form_field(json_obj):
        # Defaults for non-compulsory keys
        if_required = True
        if_masked = False
        completed = False
        value_just_updated = False

        # Non-compulsory keys
        if FormField.KEY_IF_REQUIRED in json_obj.keys():
            if_required = json_obj[FormField.KEY_IF_REQUIRED]
        if FormField.KEY_IF_MASKED in json_obj.keys():
            if_masked = json_obj[FormField.KEY_IF_MASKED]
        if FormField.KEY_VALUE_JUST_UPDATED in json_obj.keys():
            value_just_updated = json_obj[FormField.KEY_VALUE_JUST_UPDATED]
        if FormField.KEY_COMPLETED in json_obj.keys():
            completed = json_obj[FormField.KEY_COMPLETED]

        return FormField(
            # Compulsory keys
            name=json_obj[FormField.KEY_NAME],
            value=json_obj[FormField.KEY_VALUE],
            if_required=if_required,
            if_masked=if_masked,
            mex_expr=json_obj[FormField.KEY_MEX_EXPR],
            value_just_updated=value_just_updated,
            completed=completed)

    def __init__(
            self,
            name,
            value,
            if_required,
            if_masked,
            # MEX expression to extract param from human sentence
            mex_expr,
            # For deserializing old objects so the old state is maintained
            value_just_updated=False,
            completed=False):
        self.name = name
        self.value = value
        self.if_required = if_required
        self.if_masked = if_masked
        # Field MEX
        self.mex_expr = mex_expr
        self.value_just_updated = value_just_updated
        # Already obtained the parameter from user conversation?
        self.completed = completed

        try:
            self.mex_obj = MatchExpression(pattern=self.mex_expr, lang=None)
            self.mex_var_name = self.mex_obj.get_mex_var_names()[0]
            # Same variable, but without the var expressions, to match a bare value
            self.mex_obj_no_var_expressions = MatchExpression.create_mex_obj_from_object_vars(
                var_name_str=self.mex_var_name,
                var_type_str=self.mex_obj.get_mex_var_type(
                    var_name=self.mex_var_name),
                var_expressions_str='',
                var_len_range_list2=self.mex_obj.get_mex_var_length_range(
                    var_name=self.mex_var_name),
                var_preferred_dir_str=self.mex_obj.get_mex_var_pref_dir(
                    var_name=self.mex_var_name))
        except Exception as ex_mex:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Failed to get mex var name for mex expr "' + str(self.mex_expr)
                + '", got exception "' + str(ex_mex) + '".')

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Field initialized: ' + str(self.to_json()))
        return

    def set_value_just_updated(self):
        self.value_just_updated = True

    def reset_value_just_updated(self):
        self.value_just_updated = False

    def set_field_value(
            self,
            user_text,
            # Strict means match only with var expressions
            strict_var_expressions=True):
        # Try with var expressions first
        res = self.__set_field_value_from_text(text=user_text, exclude_var_expressions=False)
        if res is True:
            return True
        elif not strict_var_expressions:
            # Try to match without var expressions, as the user may type the value alone
            return self.__set_field_value_from_text(
                text=user_text,
                exclude_var_expressions=True)
        else:
            return False

    def __set_field_value_from_text(self, text, exclude_var_expressions=False):
        if exclude_var_expressions:
            params_dict = self.mex_obj_no_var_expressions.get_params(
                sentence=text,
                return_one_value=True)
        else:
            params_dict = self.mex_obj.get_params(
                sentence=text,
                # No need to return both (left, right) values
                return_one_value=True)
        if params_dict[self.mex_var_name] is not None:
            self.value = params_dict[self.mex_var_name]
            self.set_value_just_updated()
            self.completed = True
            return True
        else:
            return False

    # So that we can serialize state to file
    def to_json(self):
        return {
            FormField.KEY_NAME: self.name,
            FormField.KEY_VALUE: self.value,
            FormField.KEY_IF_REQUIRED: self.if_required,
            FormField.KEY_IF_MASKED: self.if_masked,
            FormField.KEY_MEX_EXPR: self.mex_expr,
            FormField.KEY_VALUE_JUST_UPDATED: self.value_just_updated,
            FormField.KEY_COMPLETED: self.completed
        }
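# A minimal sketch of the FormField lifecycle, not from the original source.
# The MEX expression 'amount,float,amount' is a hypothetical example of the
# 'varname,type,expressions' syntax inferred from the code above; substitute a
# pattern valid for your MatchExpression version.
def _demo_form_field():
    fld = FormField.deserialize(json_obj={
        FormField.KEY_NAME:     'amount',
        FormField.KEY_VALUE:    None,
        FormField.KEY_MEX_EXPR: 'amount,float,amount',   # hypothetical MEX expression
    })
    # Try strict extraction first, falling back to matching a bare value
    if fld.set_field_value(user_text='the amount is 55.8', strict_var_expressions=False):
        print(fld.to_json())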
class TxtPreprocessor:

    def __init__(
            self,
            identifier_string,
            # If None, will not do spelling correction
            dir_path_model,
            # If None, will not replace any word with unknown symbol W_UNK
            model_features_list,
            lang,
            dirpath_synonymlist,
            postfix_synonymlist,
            dir_wordlist,
            postfix_wordlist,
            dir_wordlist_app,
            postfix_wordlist_app,
            # For certain languages like English it is essential to include stopwords,
            # otherwise prediction accuracy will drop drastically.
            # At the same time we must be very careful: by adding manual rules we might,
            # for example, include the words 'it' and 'is', yet "It is" could be
            # perfectly valid training data that then gets wrongly excluded.
            stopwords_list = None,
            do_spelling_correction = False,
            do_word_stemming = True,
            do_profiling = False
    ):
        self.identifier_string = identifier_string
        self.dir_path_model = dir_path_model
        self.model_features_list = model_features_list
        self.lang = langfeatures.LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )
        self.dirpath_synonymlist = dirpath_synonymlist
        self.postfix_synonymlist = postfix_synonymlist
        self.dir_wordlist = dir_wordlist
        self.postfix_wordlist = postfix_wordlist
        self.dir_wordlist_app = dir_wordlist_app
        self.postfix_wordlist_app = postfix_wordlist_app
        # Allowed root words are just the model features list
        self.allowed_root_words = self.model_features_list
        self.stopwords_list = stopwords_list
        self.do_spelling_correction = do_spelling_correction
        self.do_word_stemming = do_word_stemming
        self.do_profiling = do_profiling

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Using wordlist dir "' + str(self.dir_wordlist)
            + '", app wordlist dir "' + str(self.dir_wordlist_app)
            + '", synonym dir "' + str(self.dirpath_synonymlist) + '"'
        )

        self.words_no_replace_with_special_symbols = \
            list(langchar.LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS) + \
            list(langchar.LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS) + \
            list(langchar.LangCharacters.UNICODE_BLOCK_PUNCTUATIONS) + \
            list(BasicPreprocessor.ALL_SPECIAL_SYMBOLS)
        self.words_no_replace_with_special_symbols = list(set(self.words_no_replace_with_special_symbols))
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For model "' + str(self.identifier_string)
            + '", words that will not be replaced with special symbols: '
            + str(self.words_no_replace_with_special_symbols)
        )

        #
        # We initialize the word segmenter and synonym list only after the model is
        # ready, because they require the model features so that the root words of
        # the synonym lists come only from the model features.
        #
        self.wseg = None
        self.synonymlist = None
        self.spell_correction = None
        # Stemmer/Lemmatizer
        self.lang_have_verb_conj = False
        self.word_stemmer_lemmatizer = None

        ret_obj = langhelper.LangHelper.get_word_segmenter(
            lang = self.lang,
            dirpath_wordlist = self.dir_wordlist,
            postfix_wordlist = self.postfix_wordlist,
            dirpath_app_wordlist = self.dir_wordlist_app,
            postfix_app_wordlist = self.postfix_wordlist_app,
            dirpath_synonymlist = self.dirpath_synonymlist,
            postfix_synonymlist = self.postfix_synonymlist,
            # We can only allow root words that come from the model features
            allowed_root_words = self.model_features_list,
            do_profiling = self.do_profiling
        )
        self.wseg = ret_obj.wseg
        self.synonymlist = ret_obj.snnlist

        #
        # For spelling correction
        #
        if self.do_spelling_correction:
            try:
                self.spell_correction = spellcor.SpellCheckSentence(
                    lang = self.lang,
                    words_list = self.model_features_list,
                    dir_path_model = self.dir_path_model,
                    identifier_string = self.identifier_string,
                    do_profiling = self.do_profiling
                )
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Spelling correction for model "' + str(self.identifier_string)
                    + '" initialized successfully.'
                )
            except Exception as ex_spellcor:
                self.spell_correction = None
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                    + ': Error initializing spelling correction for model "' \
                    + str(self.identifier_string) \
                    + '", got exception "' + str(ex_spellcor) + '".'
                Log.error(errmsg)

        #
        # For stemming / lemmatization
        #
        if self.do_word_stemming:
            lfobj = langfeatures.LangFeatures()
            self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '" verb conjugation = '
                + str(self.lang_have_verb_conj) + '.'
            )
            self.word_stemmer_lemmatizer = None
            if self.lang_have_verb_conj:
                try:
                    self.word_stemmer_lemmatizer = lmtz.Lemmatizer(
                        lang = self.lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(self.lang)
                        + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                        + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer failed to initialize: ' \
                        + str(ex_stemmer) + '.'
                    Log.error(errmsg)
                    self.word_stemmer_lemmatizer = None

        self.mex_username_nonword = MatchExpression(
            pattern = 'u,' + MexBuiltInTypes.MEX_TYPE_USERNAME_NONWORD + ',',
            lang = None
        )
        return

    def __is_username_nonword_type(
            self,
            word
    ):
        params_dict = self.mex_username_nonword.get_params(
            sentence = word,
            # If False, will return both (left, right) values
            return_one_value = True
        )
        is_username_nonword_type = params_dict['u'] is not None
        return is_username_nonword_type

    def __remove_stopwords(
            self,
            word_list
    ):
        if self.stopwords_list:
            word_list_remove = []
            for w in word_list:
                if w not in self.stopwords_list:
                    word_list_remove.append(w)
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '", Word list "' + str(word_list)
                + '", removed stopwords to "' + str(word_list_remove) + '".'
            )
            return word_list_remove
        else:
            return word_list

    def preprocess_list(
            self,
            sentences_list,
    ):
        return [self.process_text(inputtext=s, return_as_string=False) for s in sentences_list]

    #
    # The things we do here:
    #   1. Replace special tokens like URLs with special symbols
    #   2. Segment text (word tokenization)
    #   3. Convert to lowercase, clean/separate common punctuations from words
    #   4. Normalize text, replacing synonyms with a single root word
    #   5. Spelling correction
    #   6. Remove stopwords
    #   7. Stemming or lemmatization
    #
    def process_text(
            self,
            inputtext,
            return_as_string = False,
            use_special_symbol_username_nonword = False
    ):
        #
        # 1st round: replace with very special symbols first; this must be done
        # before word segmentation or cleaning.
        # Be careful here, don't simply replace things.
        # Symbols that can wait until after word segmentation, like numbers and
        # unknown words, we handle later.
        #
        pat_rep_list = [
            {
                'pattern': MexBuiltInTypes.REGEX_URI,
                'repl': ' ' + BasicPreprocessor.W_URI + ' '
            },
        ]
        inputtext_sym = su.StringUtils.trim(str(inputtext))
        for pat_rep in pat_rep_list:
            inputtext_sym = re.sub(
                pattern = pat_rep['pattern'],
                repl = pat_rep['repl'],
                string = inputtext_sym
            )
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" special pattern replacement to: ' + str(inputtext_sym)
        )

        #
        # Segment words
        #
        # Returns a word array, e.g. ['word1', 'word2', 'x', 'y', ...]
        text_segmented_arr = self.wseg.segment_words(
            text = inputtext_sym,
            return_array_of_split_words = True
        )
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" segmented to: ' + str(text_segmented_arr)
        )

        #
        # Remove basic punctuations stuck to words
        #
        # Will return None on error
        tmp_arr = BasicPreprocessor.clean_punctuations(
            sentence = text_segmented_arr
        )
        if type(tmp_arr) in [list, tuple]:
            text_segmented_arr = tmp_arr
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" clean punctuations to: ' + str(text_segmented_arr)
        )

        #
        # Replace words with root words.
        # This step uses synonyms, replacing e.g. "красивая", "милая", "симпатичная"
        # all with "красивая".
        # This reduces training data without needing to enumerate all versions of
        # the same thing.
        #
        text_normalized_arr = self.synonymlist.normalize_text_array(
            text_segmented_array = text_segmented_arr
        )
        text_normalized_arr_lower = [s.lower() for s in text_normalized_arr]
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '", normalized to "' + str(text_normalized_arr_lower) + '"'
        )

        #
        # Spelling correction
        #
        if self.do_spelling_correction:
            if self.spell_correction is not None:
                text_normalized_arr_lower = self.spell_correction.check(
                    text_segmented_arr = text_normalized_arr_lower
                )
                Log.info(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
                    + '", corrected spelling to "' + str(text_normalized_arr_lower) + '".'
                )

        #
        # Remove stopwords before stemming
        #
        text_normalized_arr_lower = self.__remove_stopwords(
            word_list = text_normalized_arr_lower
        )

        #
        # Stemming / lemmatization
        #
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '" do stemming = ' + str(self.do_word_stemming)
            + ', have verb conjugation = ' + str(self.lang_have_verb_conj)
        )
        if self.do_word_stemming and self.lang_have_verb_conj:
            if self.word_stemmer_lemmatizer:
                for i in range(len(text_normalized_arr_lower)):
                    text_normalized_arr_lower[i] = self.word_stemmer_lemmatizer.stem(
                        word = text_normalized_arr_lower[i]
                    )
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
                    + '", stemmed to "' + str(text_normalized_arr_lower) + '".'
                )

        #
        # 2nd round: replace with special symbols for numbers, unrecognized
        # vocabulary, etc.
        # MUST NOT accidentally replace our earlier special symbols like _uri, etc.
        #
        for i in range(len(text_normalized_arr_lower)):
            word = text_normalized_arr_lower[i]
            # Punctuations, the special symbols themselves, etc. do not undergo
            # this process
            if word in self.words_no_replace_with_special_symbols:
                continue
            # Check numbers first, re.match() is fast enough.
            # Replace numbers with a separate symbol.
            if re.match(pattern='^[0-9]+$', string=word):
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Found number in word "' + str(word) + '"'
                )
                text_normalized_arr_lower[i] = BasicPreprocessor.W_NUM
            elif self.model_features_list is not None:
                if word not in self.model_features_list:
                    text_normalized_arr_lower[i] = BasicPreprocessor.W_UNK
                    if use_special_symbol_username_nonword:
                        # Check if it is of the username_nonword form
                        if self.__is_username_nonword_type(word=word):
                            text_normalized_arr_lower[i] = BasicPreprocessor.W_USERNAME_NONWORD
            else:
                if use_special_symbol_username_nonword:
                    if self.__is_username_nonword_type(word=word):
                        text_normalized_arr_lower[i] = BasicPreprocessor.W_USERNAME_NONWORD

        #
        # Remove stopwords again after stemming
        #
        text_normalized_arr_lower = self.__remove_stopwords(
            word_list = text_normalized_arr_lower
        )

        #
        # Finally remove empty words from the array
        #
        text_normalized_arr_lower = [x for x in text_normalized_arr_lower if x != '']

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Done text processing to: '
            + str(text_normalized_arr_lower) + ' from "' + str(inputtext) + '".'
        )

        if return_as_string:
            print_separator = BasicPreprocessor.get_word_separator(
                lang = self.lang
            )
            return print_separator.join(text_normalized_arr_lower)
        else:
            return text_normalized_arr_lower
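# A minimal usage sketch, not from the original source. All directory paths and
# postfixes below are placeholders; real values depend on where the wordlist and
# synonym-list data files live in your installation.
if __name__ == '__main__':
    txt_preprocessor = TxtPreprocessor(
        identifier_string = 'demo-model',
        dir_path_model = None,        # None => no spelling-correction model dir
        model_features_list = None,   # None => no replacement with W_UNK
        lang = 'en',
        dirpath_synonymlist = '/path/to/synonymlist',   # placeholder
        postfix_synonymlist = '.synonymlist.txt',       # placeholder
        dir_wordlist = '/path/to/wordlist',             # placeholder
        postfix_wordlist = '-wordlist.txt',             # placeholder
        dir_wordlist_app = '/path/to/wordlist/app',     # placeholder
        postfix_wordlist_app = '.wordlist.app.txt',     # placeholder
        do_spelling_correction = False,
        do_word_stemming = True,
        do_profiling = False
    )
    # The URL should become W_URI and the number W_NUM in the output
    print(txt_preprocessor.process_text(
        inputtext = 'Check out https://example.com for 100 free credits',
        return_as_string = True
    ))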