Example #1
File: Form.py Project: nwae/nwae
def __init__(
        self,
        title,
        instruction,
        if_need_confirm,
        # List of FormFields
        form_fields,
        # MEX pattern describing the whole form
        mex_form_model,
        # For deserializing old objects so the old state is maintained
        form_completed=False):
    self.title = title
    self.instruction = instruction
    self.if_need_confirm = if_need_confirm
    # List of FormFields
    self.form_fields = form_fields
    # Form MEX
    self.mex_form_model = mex_form_model
    self.mex_obj = MatchExpression(
        pattern=self.mex_form_model,
        lang=None,
    )
    self.form_completed = form_completed
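
A minimal construction sketch for this Form, using the FormField class shown in the next example. The MEX expression strings below are hypothetical placeholders, not verified MEX syntax, and imports are omitted since they depend on the nwae package layout.

# Hypothetical usage sketch. The MEX expression strings are
# placeholders, not verified MEX syntax.
name_field = FormField(
    name='name',
    value=None,
    if_required=True,
    if_masked=False,
    mex_expr='name,str,name',  # placeholder MEX expression
)
form = Form(
    title='Registration',
    instruction='Please provide your details',
    if_need_confirm=True,
    form_fields=[name_field],
    mex_form_model='name,str,name',  # placeholder form MEX model
)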
Example #2
class FormField:

    KEY_NAME = 'name'
    KEY_VALUE = 'value'
    KEY_IF_REQUIRED = 'ifRequired'
    KEY_IF_MASKED = 'ifMasked'
    KEY_MEX_EXPR = 'mexExpr'
    # For deserializing old objects so the old state is maintained
    KEY_VALUE_JUST_UPDATED = 'valueJustUpdated'
    KEY_COMPLETED = 'completed'

    @staticmethod
    def deserialize(json_obj):
        return FormField.import_form_field(json_obj=json_obj)

    @staticmethod
    def import_form_field(json_obj):
        if_required = True
        if_masked = False
        completed = False
        value_just_updated = False

        # Non-compulsory keys
        if FormField.KEY_IF_REQUIRED in json_obj.keys():
            if_required = json_obj[FormField.KEY_IF_REQUIRED]
        if FormField.KEY_IF_MASKED in json_obj.keys():
            if_masked = json_obj[FormField.KEY_IF_MASKED]
        if FormField.KEY_VALUE_JUST_UPDATED in json_obj.keys():
            value_just_updated = json_obj[FormField.KEY_VALUE_JUST_UPDATED]
        if FormField.KEY_COMPLETED in json_obj.keys():
            completed = json_obj[FormField.KEY_COMPLETED]

        return FormField(
            # Compulsory key
            name=json_obj[FormField.KEY_NAME],
            # Compulsory key
            value=json_obj[FormField.KEY_VALUE],
            if_required=if_required,
            if_masked=if_masked,
            mex_expr=json_obj[FormField.KEY_MEX_EXPR],
            value_just_updated=value_just_updated,
            completed=completed)

    def __init__(
            self,
            name,
            value,
            if_required,
            if_masked,
            # MEX expression to extract param from human sentence
            mex_expr,
            # For deserializing old objects so the old state is maintained
            value_just_updated=False,
            completed=False):
        self.name = name
        self.value = value
        self.if_required = if_required
        self.if_masked = if_masked
        # Field MEX
        self.mex_expr = mex_expr
        self.value_just_updated = value_just_updated
        # Already obtained the parameter from user conversation?
        self.completed = completed
        try:
            self.mex_obj = MatchExpression(pattern=self.mex_expr, lang=None)
            self.mex_var_name = self.mex_obj.get_mex_var_names()[0]
            self.mex_obj_no_var_expressions = MatchExpression.create_mex_obj_from_object_vars(
                var_name_str=self.mex_var_name,
                var_type_str=self.mex_obj.get_mex_var_type(
                    var_name=self.mex_var_name),
                var_expressions_str='',
                var_len_range_list2=self.mex_obj.get_mex_var_length_range(
                    var_name=self.mex_var_name),
                var_preferred_dir_str=self.mex_obj.get_mex_var_pref_dir(
                    var_name=self.mex_var_name))
        except Exception as ex_mex:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Failed to get mex var name for mex expr "' +
                str(self.mex_expr) + '", got exception "' + str(ex_mex) + '".')
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Field initialized: ' + str(self.to_json()))
        return

    def set_value_just_updated(self):
        self.value_just_updated = True

    def reset_value_just_updated(self):
        self.value_just_updated = False

    def set_field_value(
        self,
        user_text,
        # Strict means match only with expressions
        strict_var_expressions=True):
        # Try with var expressions first
        res = self.__set_field_value_from_text(text=user_text,
                                               exclude_var_expressions=False)
        if res is True:
            return True
        elif not strict_var_expressions:
            # Try to match with no text expressions, as user may just type the value alone
            res = self.__set_field_value_from_text(
                text=user_text, exclude_var_expressions=True)
            return res
        else:
            return False

    def __set_field_value_from_text(self, text, exclude_var_expressions=False):
        if exclude_var_expressions:
            params_dict = self.mex_obj_no_var_expressions.get_params(
                sentence=text, return_one_value=True)
        else:
            params_dict = self.mex_obj.get_params(
                sentence=text,
                # No need to return 2 sides
                return_one_value=True)
        if params_dict[self.mex_var_name] is not None:
            self.value = params_dict[self.mex_var_name]
            self.set_value_just_updated()
            self.completed = True
            return True
        else:
            return False

    # So that we can serialize state to file
    def to_json(self):
        return {
            FormField.KEY_NAME: self.name,
            FormField.KEY_VALUE: self.value,
            FormField.KEY_IF_REQUIRED: self.if_required,
            FormField.KEY_IF_MASKED: self.if_masked,
            FormField.KEY_MEX_EXPR: self.mex_expr,
            FormField.KEY_VALUE_JUST_UPDATED: self.value_just_updated,
            FormField.KEY_COMPLETED: self.completed
        }
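
A minimal round-trip sketch for FormField: deserialize from a JSON dict using the KEY_* constants defined above, then extract a value from a user sentence. The 'mexExpr' value and the user sentence are hypothetical placeholders.

# Hypothetical usage sketch. The 'mexExpr' value is a placeholder,
# not verified MEX syntax.
field = FormField.deserialize(json_obj={
    'name': 'amount',                   # compulsory (KEY_NAME)
    'value': None,                      # compulsory (KEY_VALUE)
    'mexExpr': 'amount,float,amount',   # compulsory (KEY_MEX_EXPR), placeholder
    # Non-compulsory keys fall back to the defaults when omitted
    'ifRequired': True,
    'ifMasked': False,
})
# Try to extract the value from a user sentence; with
# strict_var_expressions=False a bare value like "500" may also match
if field.set_field_value(user_text='amount is 500', strict_var_expressions=False):
    print(field.to_json())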
Example #3
class TxtPreprocessor:

    def __init__(
            self,
            identifier_string,
            # If None, will not do spelling correction
            dir_path_model,
            # If None, will not replace any word with unknown symbol W_UNK
            model_features_list,
            lang,
            dirpath_synonymlist,
            postfix_synonymlist,
            dir_wordlist,
            postfix_wordlist,
            dir_wordlist_app,
            postfix_wordlist_app,
            # For certain languages like English it is essential to include
            # stopwords, otherwise prediction accuracy will drop drastically.
            # But at the same time we must be very careful: if manual rules
            # include words like 'it' and 'is', then a perfectly valid training
            # sentence such as "It is" would be wrongly excluded.
            stopwords_list = None,
            do_spelling_correction = False,
            do_word_stemming = True,
            do_profiling = False
    ):
        self.identifier_string = identifier_string
        self.dir_path_model = dir_path_model
        self.model_features_list = model_features_list
        
        self.lang = langfeatures.LangFeatures.map_to_lang_code_iso639_1(
            lang_code = lang
        )
        self.dirpath_synonymlist = dirpath_synonymlist
        self.postfix_synonymlist = postfix_synonymlist
        self.dir_wordlist = dir_wordlist
        self.postfix_wordlist = postfix_wordlist
        self.dir_wordlist_app = dir_wordlist_app
        self.postfix_wordlist_app = postfix_wordlist_app
        # Allowed root words are just the model features list
        self.allowed_root_words = self.model_features_list
        self.stopwords_list = stopwords_list
        self.do_spelling_correction = do_spelling_correction
        self.do_word_stemming = do_word_stemming
        self.do_profiling = do_profiling

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Using wordlist dir "' + str(self.dir_wordlist)
            + '", app wordlist dir "' + str(self.dir_wordlist_app)
            + '", synonym dir "' + str(self.dirpath_synonymlist) + '"'
        )

        self.words_no_replace_with_special_symbols = \
            list(langchar.LangCharacters.UNICODE_BLOCK_WORD_SEPARATORS) + \
            list(langchar.LangCharacters.UNICODE_BLOCK_SENTENCE_SEPARATORS) + \
            list(langchar.LangCharacters.UNICODE_BLOCK_PUNCTUATIONS) + \
            list(BasicPreprocessor.ALL_SPECIAL_SYMBOLS)

        self.words_no_replace_with_special_symbols = list(set(self.words_no_replace_with_special_symbols))
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': For model "' + str(self.identifier_string)
            + '", words that will not replace with special symbols: '
            + str(self.words_no_replace_with_special_symbols)
        )

        #
        # We initialize the word segmenter and synonym list only after the model
        # is ready, because they require the model features, so that the root
        # words of the synonym lists come only from the model features
        #
        self.wseg = None
        self.synonymlist = None
        self.spell_correction = None
        # Stemmer/Lemmatizer
        self.lang_have_verb_conj = False
        self.word_stemmer_lemmatizer = None

        ret_obj = langhelper.LangHelper.get_word_segmenter(
            lang                 = self.lang,
            dirpath_wordlist     = self.dir_wordlist,
            postfix_wordlist     = self.postfix_wordlist,
            dirpath_app_wordlist = self.dir_wordlist_app,
            postfix_app_wordlist = self.postfix_wordlist_app,
            dirpath_synonymlist  = self.dirpath_synonymlist,
            postfix_synonymlist  = self.postfix_synonymlist,
            # We can only allow root words to be words from the model features
            allowed_root_words   = self.model_features_list,
            do_profiling         = self.do_profiling
        )
        self.wseg = ret_obj.wseg
        self.synonymlist = ret_obj.snnlist

        #
        # For spelling correction
        #
        if self.do_spelling_correction:
            try:
                self.spell_correction = spellcor.SpellCheckSentence(
                    lang              = self.lang,
                    words_list        = self.model_features_list,
                    dir_path_model    = self.dir_path_model,
                    identifier_string = self.identifier_string,
                    do_profiling      = self.do_profiling
                )
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Spelling Correction for model "' + str(self.identifier_string)
                    + '" initialized successfully.'
                )
            except Exception as ex_spellcor:
                self.spell_correction = None
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Error initializing spelling correction for model "' \
                         + str(self.identifier_string) \
                         + '", got exception "' + str(ex_spellcor) + '".'
                Log.error(errmsg)

        #
        # For stemmer / lemmatization
        #
        if self.do_word_stemming:
            lfobj = langfeatures.LangFeatures()
            self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '" verb conjugation = ' + str(self.lang_have_verb_conj) + '.'
            )
            self.word_stemmer_lemmatizer = None
            if self.lang_have_verb_conj:
                try:
                    self.word_stemmer_lemmatizer = lmtz.Lemmatizer(
                        lang=self.lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(self.lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(self.lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.error(errmsg)
                    self.word_stemmer_lemmatizer = None

        self.mex_username_nonword = MatchExpression(
            pattern = 'u,' + MexBuiltInTypes.MEX_TYPE_USERNAME_NONWORD + ',',
            lang    = None
        )
        return

    def __is_username_nonword_type(
            self,
            word
    ):
        params_dict = self.mex_username_nonword.get_params(
            sentence=word,
            return_one_value=True  # if False will return both (left,right) values
        )
        is_username_nonword_type = params_dict['u'] is not None
        return is_username_nonword_type

    def __remove_stopwords(
            self,
            word_list
    ):
        if self.stopwords_list:
            word_list_remove = []
            for w in word_list:
                if w not in self.stopwords_list:
                    word_list_remove.append(w)
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(self.lang) + '", Word list "' + str(word_list)
                + '", removed stopwords to "' + str(word_list_remove) + '".'
            )
            return word_list_remove
        else:
            return word_list

    def preprocess_list(
            self,
            sentences_list,
    ):
        return [self.process_text(inputtext=s, return_as_string=False) for s in sentences_list]

    #
    # Some things we do
    #   1. Replace special tokens like URLs with special symbols
    #   2. Segment text or word tokenization
    #   3. Convert to lowercase, clean/separate common punctuations from words
    #   4. Normalize text, replacing synonyms with single word
    #   5. Spelling correction
    #   6. Remove stopwords
    #   7. Stemming or Lemmatization
    #
    def process_text(
            self,
            inputtext,
            return_as_string = False,
            use_special_symbol_username_nonword = False
    ):
        #
        # 1st round: replace very special symbols first; this must be done before
        # word segmentation or cleaning.
        # Be careful here, don't simply replace things.
        # Symbols that can wait until after word segmentation, such as numbers
        # and unknown words, are handled later.
        #
        pat_rep_list = [
            {
                'pattern': MexBuiltInTypes.REGEX_URI,
                'repl': ' ' + BasicPreprocessor.W_URI + ' '
            },
        ]
        inputtext_sym = su.StringUtils.trim(str(inputtext))
        for pat_rep in pat_rep_list:
            inputtext_sym = re.sub(
                pattern = pat_rep['pattern'],
                repl    = pat_rep['repl'],
                string  = inputtext_sym
            )
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" special pattern replacement to: ' + str(inputtext_sym)
        )

        #
        # Segment words
        #
        # Returns a word array, e.g. ['word1', 'word2', 'x', 'y',...]
        text_segmented_arr = self.wseg.segment_words(
            text = inputtext_sym,
            return_array_of_split_words = True
        )
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" segmented to: ' + str(text_segmented_arr)
        )

        #
        # Remove basic punctuations stuck to word
        #
        # Will return None on error
        tmp_arr = BasicPreprocessor.clean_punctuations(
            sentence = text_segmented_arr
        )
        if type(tmp_arr) in [list, tuple]:
            text_segmented_arr = tmp_arr
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '" clean punctuations to: ' + str(text_segmented_arr)
        )

        #
        # Replace words with root words
        # This step uses synonyms to replace, say, "красивая", "милая", "симпатичная"
        # (Russian near-synonyms of "pretty") all with "красивая"
        # This reduces the training data without needing to list every variant of the same thing.
        #
        text_normalized_arr = self.synonymlist.normalize_text_array(
            text_segmented_array = text_segmented_arr
        )

        text_normalized_arr_lower = [s.lower() for s in text_normalized_arr]

        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
            + '", normalized to "' + str(text_normalized_arr_lower) + '"'
        )

        #
        # Spelling correction
        #
        if self.do_spelling_correction:
            if self.spell_correction is not None:
                text_normalized_arr_lower = self.spell_correction.check(
                    text_segmented_arr = text_normalized_arr_lower
                )
                Log.info(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
                    + '", corrected spelling to "' + str(text_normalized_arr_lower) + '".'
                )

        #
        # Remove stopwords before stemming
        #
        text_normalized_arr_lower = self.__remove_stopwords(
            word_list = text_normalized_arr_lower
        )

        #
        # Stemming / Lemmatization
        #
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang)
            + '" do stemming = ' + str(self.do_word_stemming)
            + ', have verb conjugation = ' + str(self.lang_have_verb_conj)
        )
        if self.do_word_stemming and self.lang_have_verb_conj:
            if self.word_stemmer_lemmatizer:
                for i in range(len(text_normalized_arr_lower)):
                    text_normalized_arr_lower[i] = self.word_stemmer_lemmatizer.stem(
                        word = text_normalized_arr_lower[i]
                    )
                Log.debug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(self.lang) + '", Text "' + str(inputtext)
                    + '", stemmed to "' + str(text_normalized_arr_lower) + '".'
                )

        #
        # 2nd round replace with special symbols for numbers, unrecognized vocabulary, etc.
        # MUST NOT accidentally replace our earlier special symbols like _uri, etc.
        #
        for i in range(len(text_normalized_arr_lower)):
            word = text_normalized_arr_lower[i]

            #
            # Punctuations, special symbols themselves, etc, will not undergo this process
            #
            if word in self.words_no_replace_with_special_symbols:
                continue

            # Check numbers first, re.match() is fast enough
            # Replace numbers with separate symbol
            if re.match(pattern='^[0-9]+$', string=word):
                Log.debugdebug(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Found number in word "' + str(word) + '"'
                )
                text_normalized_arr_lower[i] = BasicPreprocessor.W_NUM
            elif self.model_features_list is not None:
                if word not in self.model_features_list:
                    text_normalized_arr_lower[i] = BasicPreprocessor.W_UNK
                    if use_special_symbol_username_nonword:
                        # Check if it is a username_nonword form
                        if self.__is_username_nonword_type(word=word):
                            text_normalized_arr_lower[i] = BasicPreprocessor.W_USERNAME_NONWORD
            else:
                if use_special_symbol_username_nonword:
                    if self.__is_username_nonword_type(word=word):
                        text_normalized_arr_lower[i] = BasicPreprocessor.W_USERNAME_NONWORD

        #
        # Remove stopwords again after stemming
        #
        text_normalized_arr_lower = self.__remove_stopwords(
            word_list = text_normalized_arr_lower
        )

        #
        # Finally remove empty words in array
        #
        text_normalized_arr_lower = [x for x in text_normalized_arr_lower if x != '']

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(self.lang)
            + '", Done text processing to: ' + str(text_normalized_arr_lower)
            + ' from "' + str(inputtext) + '".'
        )

        if return_as_string:
            print_separator = BasicPreprocessor.get_word_separator(
                lang = self.lang
            )
            return print_separator.join(text_normalized_arr_lower)
        else:
            return text_normalized_arr_lower
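
A minimal usage sketch for TxtPreprocessor. All directory paths, file postfixes and the stopword list below are hypothetical placeholders; the real values depend on the nwae installation.

# Hypothetical usage sketch. Paths, postfixes and stopwords are placeholders.
txt_preprocessor = TxtPreprocessor(
    identifier_string='demo_model',
    dir_path_model=None,          # None: no spelling correction model
    model_features_list=None,     # None: unknown words are not replaced with W_UNK
    lang='en',
    dirpath_synonymlist='/usr/share/nlp/synonymlist',  # placeholder
    postfix_synonymlist='.synonymlist.txt',            # placeholder
    dir_wordlist='/usr/share/nlp/wordlist',            # placeholder
    postfix_wordlist='-wordlist.txt',                  # placeholder
    dir_wordlist_app='/usr/share/nlp/wordlist.app',    # placeholder
    postfix_wordlist_app='.wordlist.app.txt',          # placeholder
    stopwords_list=['the', 'a'],  # placeholder; beware of valid words like 'it', 'is'
    do_spelling_correction=False,
    do_word_stemming=True,
    do_profiling=False,
)
# Returns a list of processed tokens; return_as_string=True would join them
tokens = txt_preprocessor.process_text(
    inputtext='Visit https://example.com for 100 great offers',
)
print(tokens)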