Example #1
File: Form.py Project: nwae/nwae
def __init__(
        self,
        title,
        instruction,
        if_need_confirm,
        # List of FormFields
        form_fields,
        # MEX pattern describing the whole form
        mex_form_model,
        # For deserializing old objects so the old state is maintained
        form_completed=False):
    self.title = title
    self.instruction = instruction
    self.if_need_confirm = if_need_confirm
    # List of FormFields
    self.form_fields = form_fields
    # Form MEX
    self.mex_form_model = mex_form_model
    self.mex_obj = MatchExpression(
        pattern=self.mex_form_model,
        lang=None,
    )
    self.form_completed = form_completed
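
A minimal construction sketch for this Form, using the FormField class shown in the next example. The MEX expression strings below are hypothetical placeholders, not verified MEX syntax, and imports are omitted since they depend on the nwae package layout.

# Hypothetical usage sketch. The MEX expression strings are
# placeholders, not verified MEX syntax.
name_field = FormField(
    name='name',
    value=None,
    if_required=True,
    if_masked=False,
    mex_expr='name,str,name',  # placeholder MEX expression
)
form = Form(
    title='Registration',
    instruction='Please provide your details',
    if_need_confirm=True,
    form_fields=[name_field],
    mex_form_model='name,str,name',  # placeholder form MEX model
)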
Example #2
class FormField:

    KEY_NAME = 'name'
    KEY_VALUE = 'value'
    KEY_IF_REQUIRED = 'ifRequired'
    KEY_IF_MASKED = 'ifMasked'
    KEY_MEX_EXPR = 'mexExpr'
    # For deserializing old objects so the old state is maintained
    KEY_VALUE_JUST_UPDATED = 'valueJustUpdated'
    KEY_COMPLETED = 'completed'

    @staticmethod
    def deserialize(json_obj):
        return FormField.import_form_field(json_obj=json_obj)

    @staticmethod
    def import_form_field(json_obj):
        if_required = True
        if_masked = False
        completed = False
        value_just_updated = False

        # Non-compulsory keys
        if FormField.KEY_IF_REQUIRED in json_obj.keys():
            if_required = json_obj[FormField.KEY_IF_REQUIRED]
        if FormField.KEY_IF_MASKED in json_obj.keys():
            if_masked = json_obj[FormField.KEY_IF_MASKED]
        if FormField.KEY_VALUE_JUST_UPDATED in json_obj.keys():
            value_just_updated = json_obj[FormField.KEY_VALUE_JUST_UPDATED]
        if FormField.KEY_COMPLETED in json_obj.keys():
            completed = json_obj[FormField.KEY_COMPLETED]

        return FormField(
            # Compulsory key
            name=json_obj[FormField.KEY_NAME],
            # Compulsory key
            value=json_obj[FormField.KEY_VALUE],
            if_required=if_required,
            if_masked=if_masked,
            mex_expr=json_obj[FormField.KEY_MEX_EXPR],
            value_just_updated=value_just_updated,
            completed=completed)

    def __init__(
            self,
            name,
            value,
            if_required,
            if_masked,
            # MEX expression to extract param from human sentence
            mex_expr,
            # For deserializing old objects so the old state is maintained
            value_just_updated=False,
            completed=False):
        self.name = name
        self.value = value
        self.if_required = if_required
        self.if_masked = if_masked
        # Field MEX
        self.mex_expr = mex_expr
        self.value_just_updated = value_just_updated
        # Already obtained the parameter from user conversation?
        self.completed = completed
        try:
            self.mex_obj = MatchExpression(pattern=self.mex_expr, lang=None)
            self.mex_var_name = self.mex_obj.get_mex_var_names()[0]
            self.mex_obj_no_var_expressions = MatchExpression.create_mex_obj_from_object_vars(
                var_name_str=self.mex_var_name,
                var_type_str=self.mex_obj.get_mex_var_type(
                    var_name=self.mex_var_name),
                var_expressions_str='',
                var_len_range_list2=self.mex_obj.get_mex_var_length_range(
                    var_name=self.mex_var_name),
                var_preferred_dir_str=self.mex_obj.get_mex_var_pref_dir(
                    var_name=self.mex_var_name))
        except Exception as ex_mex:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Failed to get mex var name for mex expr "' +
                str(self.mex_expr) + '", got exception "' + str(ex_mex) + '".')
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Field initialized: ' + str(self.to_json()))
        return

    def set_value_just_updated(self):
        self.value_just_updated = True

    def reset_value_just_updated(self):
        self.value_just_updated = False

    def set_field_value(
        self,
        user_text,
        # Strict means match only with expressions
        strict_var_expressions=True):
        # Try with var expressions first
        res = self.__set_field_value_from_text(text=user_text,
                                               exclude_var_expressions=False)
        if res is True:
            return True
        elif not strict_var_expressions:
            # Try to match with no text expressions, as user may just type the value alone
            res = self.__set_field_value_from_text(
                text=user_text, exclude_var_expressions=True)
            return res
        else:
            return False

    def __set_field_value_from_text(self, text, exclude_var_expressions=False):
        if exclude_var_expressions:
            params_dict = self.mex_obj_no_var_expressions.get_params(
                sentence=text, return_one_value=True)
        else:
            params_dict = self.mex_obj.get_params(
                sentence=text,
                # No need to return 2 sides
                return_one_value=True)
        if params_dict[self.mex_var_name] is not None:
            self.value = params_dict[self.mex_var_name]
            self.set_value_just_updated()
            self.completed = True
            return True
        else:
            return False

    # So that we can serialize state to file
    def to_json(self):
        return {
            FormField.KEY_NAME: self.name,
            FormField.KEY_VALUE: self.value,
            FormField.KEY_IF_REQUIRED: self.if_required,
            FormField.KEY_IF_MASKED: self.if_masked,
            FormField.KEY_MEX_EXPR: self.mex_expr,
            FormField.KEY_VALUE_JUST_UPDATED: self.value_just_updated,
            FormField.KEY_COMPLETED: self.completed
        }
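
A minimal round-trip sketch for FormField: deserialize from a JSON dict using the KEY_* constants defined above, then extract a value from a user sentence. The 'mexExpr' value and the user sentence are hypothetical placeholders.

# Hypothetical usage sketch. The 'mexExpr' value is a placeholder,
# not verified MEX syntax.
field = FormField.deserialize(json_obj={
    'name': 'amount',                   # compulsory (KEY_NAME)
    'value': None,                      # compulsory (KEY_VALUE)
    'mexExpr': 'amount,float,amount',   # compulsory (KEY_MEX_EXPR), placeholder
    # Non-compulsory keys fall back to the defaults when omitted
    'ifRequired': True,
    'ifMasked': False,
})
# Try to extract the value from a user sentence; with
# strict_var_expressions=False a bare value like "500" may also match
if field.set_field_value(user_text='amount is 500', strict_var_expressions=False):
    print(field.to_json())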
Example #3
class TxtPreprocessor:

    def __init__(
            self,
            identifier_string,
            # If None, will not do spelling correction
            dir_path_model,
            # If None, will not replace any word with unknown symbol W_UNK
            model_features_list,
            lang,
            dirpath_synonymlist,
            postfix_synonymlist,
            dir_wordlist,
            postfix_wordlist,
            dir_wordlist_app,
            postfix_wordlist_app,
            # For certain languages like English it is essential to include
            # stopwords, otherwise prediction accuracy will drop drastically.
            # But at the same time we must be very careful: if manual rules
            # include words like 'it' and 'is', then a perfectly valid training
            # sentence such as "It is" would be wrongly excluded.
            stopwords_list = None,
            do_spelling_correction = False,
            do_word_stemming = True,
            do_profiling = False
    ):
        self.identifier_string = identifier_string
        self.dir_path_model = dir_path_model
        self.model_features_list = model_features_list
        
        self.lang = langfeatures.LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )
        self.dirpath_synonymlist = dirpath_synonymlist
        self.postfix_synonymlist = postfix_synonymlist
        self.dir_wordlist = dir_wordlist
        self.postfix_wordlist = postfix_wordlist
        self.dir_wordlist_app = dir_wordlist_app
        self.postfix_wordlist_app = postfix_wordlist_app
        # Allowed root words are just the model features list
        self.allowed_root_words = self.model_features_list
        self.stopwords_list = stopwords_list
        self.do_spelling_correction = do_spelling_correction
        self.do_word_stemming = do_word_stemming
        self.do_profiling = do_profiling

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Using wordlist dir "' + str(self.dir_wordlist)
            + '", app wordlist dir "' + str(self.dir_wordlist_app)
            + '", synonym dir "' + str(self.dirpath_synonymlist) + '"'
        )

        self.words_no_replace_with_special_symbols = \
            list(langchar.LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS) + \
            list(langchar.LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS) + \
            list(langchar.LangCharacters.UNICODE_BLOCK_PUNCTUATIONS) + \
            list(BasicPreprocessor.ALL_SPECIAL_SYMBOLS)

        self.words_no_replace_with_special_symbols = list(set(self.words_no_replace_with_special_symbols))
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For model "' + str(self.identifier_string)
            + '", words that will not replace with special symbols: '
            + str(self.words_no_replace_with_special_symbols)
        )

        #
        # We initialize the word segmenter and synonym list only after the model
        # is ready, because they require the model features, so that the root
        # words of the synonym lists come only from the model features
        #
        self.wseg = None
        self.synonymlist = None
        self.spell_correction = None
        # Stemmer/Lemmatizer
        self.lang_have_verb_conj = False
        self.word_stemmer_lemmatizer = None

        ret_obj = langhelper.LangHelper.get_word_segmenter(
            lang                 = self.lang,
            dirpath_wordlist     = self.dir_wordlist,
            postfix_wordlist     = self.postfix_wordlist,
            dirpath_app_wordlist = self.dir_wordlist_app,
            postfix_app_wordlist = self.postfix_wordlist_app,
            dirpath_synonymlist  = self.dirpath_synonymlist,
            postfix_synonymlist  = self.postfix_synonymlist,
            # We can only allow root words to be words from the model features
            allowed_root_words   = self.model_features_list,
            do_profiling         = self.do_profiling
        )
        self.wseg = ret_obj.wseg
        self.synonymlist = ret_obj.snnlist

        #
        # For spelling correction
        #
        if self.do_spelling_correction:
            try:
                self.spell_correction = spellcor.SpellCheckSentence(
                    lang              = self.lang,
                    words_list        = self.model_features_list,
                    dir_path_model    = self.dir_path_model,
                    identifier_string = self.identifier_string,
                    do_profiling      = self.do_profiling
                )
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Spelling Correction for model "' + str(self.identifier_string)
                    + '" initialized successfully.'
                )
            except Exception as ex_spellcor:
                self.spell_correction = None
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Error initializing spelling correction for model "' \
                         + str(self.identifier_string) \
                         + '", got exception "' + str(ex_spellcor) + '".'
                Log.error(errmsg)

        #
        # For stemmer / lemmatization
        #
        if self.do_word_stemming:
            lfobj = langfeatures.LangFeatures()
            self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '" verb conjugation = ' + str(self.lang_have_verb_conj) + '.'
            )
            self.word_stemmer_lemmatizer = None
            if self.lang_have_verb_conj:
                try:
                    self.word_stemmer_lemmatizer = lmtz.Lemmatizer(
                        lang=self.lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(self.lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.error(errmsg)
                    self.word_stemmer_lemmatizer = None

        self.mex_username_nonword = MatchExpression(
            pattern = 'u,' + MexBuiltInTypes.MEX_TYPE_USERNAME_NONWORD + ',',
            lang    = None
        )
        return

    def __is_username_nonword_type(
            self,
            word
    ):
        params_dict = self.mex_username_nonword.get_params(
            sentence=word,
            return_one_value=True  # if False will return both (left,right) values
        )
        is_username_nonword_type = params_dict['u'] is not None
        return is_username_nonword_type

    def __remove_stopwords(
            self,
            word_list
    ):
        if self.stopwords_list:
            word_list_remove = []
            for w in word_list:
                if w not in self.stopwords_list:
                    word_list_remove.append(w)
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '", Word list "' + str(word_list)
                + '", removed stopwords to "' + str(word_list_remove) + '".'
            )
            return word_list_remove
        else:
            return word_list

    def preprocess_list(
            self,
            sentences_list,
    ):
        return [self.process_text(inputtext=s, return_as_string=False) for s in sentences_list]

    #
    # Some things we do
    #   1. Replace special tokens like URLs with special symbols
    #   2. Segment text or word tokenization
    #   3. Convert to lowercase, clean/separate common punctuations from words
    #   4. Normalize text, replacing synonyms with single word
    #   5. Spelling correction
    #   6. Remove stopwords
    #   7. Stemming or Lemmatization
    #
    def process_text(
            self,
            inputtext,
            return_as_string = False,
            use_special_symbol_username_nonword = False
    ):
        #
        # 1st round: replace very special symbols first; this must be done before
        # word segmentation or cleaning.
        # Be careful here, don't simply replace things.
        # Symbols that can wait until after word segmentation, such as numbers
        # and unknown words, are handled later.
        #
        pat_rep_list = [
            {
                'pattern': MexBuiltInTypes.REGEX_URI,
                'repl': ' ' + BasicPreprocessor.W_URI + ' '
            },
        ]
        inputtext_sym = su.StringUtils.trim(str(inputtext))
        for pat_rep in pat_rep_list:
            inputtext_sym = re.sub(
                pattern = pat_rep['pattern'],
                repl    = pat_rep['repl'],
                string  = inputtext_sym
            )
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" special pattern replacement to: ' + str(inputtext_sym)
        )

        #
        # Segment words
        #
        # Returns a word array, e.g. ['word1', 'word2', 'x', 'y',...]
        text_segmented_arr = self.wseg.segment_words(
            text = inputtext_sym,
            return_array_of_split_words = True
        )
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" segmented to: ' + str(text_segmented_arr)
        )

        #
        # Remove basic punctuations stuck to word
        #
        # Will return None on error
        tmp_arr = BasicPreprocessor.clean_punctuations(
            sentence = text_segmented_arr
        )
        if type(tmp_arr) in [list, tuple]:
            text_segmented_arr = tmp_arr
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" clean punctuations to: ' + str(text_segmented_arr)
        )

        #
        # Replace words with root words
        # This step uses synonyms to replace, say, "красивая", "милая", "симпатичная"
        # (Russian near-synonyms of "pretty") all with "красивая"
        # This reduces the training data without needing to list every variant of the same thing.
        #
        text_normalized_arr = self.synonymlist.normalize_text_array(
            text_segmented_array = text_segmented_arr
        )

        text_normalized_arr_lower = [s.lower() for s in text_normalized_arr]

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '", normalized to "' + str(text_normalized_arr_lower) + '"'
        )

        #
        # Spelling correction
        #
        if self.do_spelling_correction:
            if self.spell_correction is not None:
                text_normalized_arr_lower = self.spell_correction.check(
                    text_segmented_arr = text_normalized_arr_lower
                )
                Log.info(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
                    + '", corrected spelling to "' + str(text_normalized_arr_lower) + '".'
                )

        #
        # Remove stopwords before stemming
        #
        text_normalized_arr_lower = self.__remove_stopwords(
            word_list = text_normalized_arr_lower
        )

        #
        # Stemming / Lemmatization
        #
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang)
            + '" do stemming = ' + str(self.do_word_stemming)
            + ', have verb conjugation = ' + str(self.lang_have_verb_conj)
        )
        if self.do_word_stemming and self.lang_have_verb_conj:
            if self.word_stemmer_lemmatizer:
                for i in range(len(text_normalized_arr_lower)):
                    text_normalized_arr_lower[i] = self.word_stemmer_lemmatizer.stem(
                        word = text_normalized_arr_lower[i]
                    )
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
                    + '", stemmed to "' + str(text_normalized_arr_lower) + '".'
                )

        #
        # 2nd round replace with special symbols for numbers, unrecognized vocabulary, etc.
        # MUST NOT accidentally replace our earlier special symbols like _uri, etc.
        #
        for i in range(len(text_normalized_arr_lower)):
            word = text_normalized_arr_lower[i]

            #
            # Punctuations, special symbols themselves, etc, will not undergo this process
            #
            if word in self.words_no_replace_with_special_symbols:
                continue

            # Check numbers first, re.match() is fast enough
            # Replace numbers with separate symbol
            if re.match(pattern='^[0-9]+$', string=word):
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Found number in word "' + str(word) + '"'
                )
                text_normalized_arr_lower[i] = BasicPreprocessor.W_NUM
            elif self.model_features_list is not None:
                if word not in self.model_features_list:
                    text_normalized_arr_lower[i] = BasicPreprocessor.W_UNK
                    if use_special_symbol_username_nonword:
                        # Check if it is a username_nonword form
                        if self.__is_username_nonword_type(word=word):
                            text_normalized_arr_lower[i] = BasicPreprocessor.W_USERNAME_NONWORD
            else:
                if use_special_symbol_username_nonword:
                    if self.__is_username_nonword_type(word=word):
                        text_normalized_arr_lower[i] = BasicPreprocessor.W_USERNAME_NONWORD

        #
        # Remove stopwords again after stemming
        #
        text_normalized_arr_lower = self.__remove_stopwords(
            word_list = text_normalized_arr_lower
        )

        #
        # Finally remove empty words in array
        #
        text_normalized_arr_lower = [x for x in text_normalized_arr_lower if x != '']

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang)
            + '", Done text processing to: ' + str(text_normalized_arr_lower)
            + ' from "' + str(inputtext) + '".'
        )

        if return_as_string:
            print_separator = BasicPreprocessor.get_word_separator(
                lang = self.lang
            )
            return print_separator.join(text_normalized_arr_lower)
        else:
            return text_normalized_arr_lower
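
A minimal usage sketch for TxtPreprocessor. All directory paths, file postfixes and the stopword list below are hypothetical placeholders; the real values depend on the nwae installation.

# Hypothetical usage sketch. Paths, postfixes and stopwords are placeholders.
txt_preprocessor = TxtPreprocessor(
    identifier_string='demo_model',
    dir_path_model=None,          # None: no spelling correction model
    model_features_list=None,     # None: unknown words are not replaced with W_UNK
    lang='en',
    dirpath_synonymlist='/usr/share/nlp/synonymlist',  # placeholder
    postfix_synonymlist='.synonymlist.txt',            # placeholder
    dir_wordlist='/usr/share/nlp/wordlist',            # placeholder
    postfix_wordlist='-wordlist.txt',                  # placeholder
    dir_wordlist_app='/usr/share/nlp/wordlist.app',    # placeholder
    postfix_wordlist_app='.wordlist.app.txt',          # placeholder
    stopwords_list=['the', 'a'],  # placeholder; beware of valid words like 'it', 'is'
    do_spelling_correction=False,
    do_word_stemming=True,
    do_profiling=False,
)
# Returns a list of processed tokens; return_as_string=True would join them
tokens = txt_preprocessor.process_text(
    inputtext='Visit https://example.com for 100 great offers',
)
print(tokens)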