Example #1
    def wait_for_all_initializations_to_be_done(self, wait_max_time=10):
        if self.is_all_initializations_done:
            return

        count = 1
        sleep_time_wait_initializations = 0.1
        while not self.is_all_initializations_done:
            Log.warning(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Model not yet fully initialized, sleep for ' +
                str(count * sleep_time_wait_initializations) + ' secs now..')
            if count * sleep_time_wait_initializations > wait_max_time:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Waited too long ' + str(count * sleep_time_wait_initializations)\
                         + ' secs. Raising exception..'
                raise Exception(errmsg)
            time.sleep(sleep_time_wait_initializations)
            count = count + 1
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Initializations all done for model "' +
            str(self.identifier_string) + '" READY.')
        return
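
This is a plain poll-and-sleep wait with a timeout. The same pattern, reduced to a standalone sketch (names here are illustrative, not from the nwae codebase):

    import time

    def wait_until(predicate, wait_max_time=10, poll_interval=0.1):
        # Poll until predicate() returns True, raising once the timeout is exceeded
        waited = 0.0
        while not predicate():
            if waited > wait_max_time:
                raise TimeoutError('Waited too long: ' + str(waited) + ' secs')
            time.sleep(poll_interval)
            waited += poll_interval

    # e.g. wait_until(lambda: model.is_all_initializations_done, wait_max_time=5)
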
Example #2
 def send(self, user, password, recipients_list, message):
     try:
         if password not in [None, '']:
             self.server.login(user=user, password=password)
             Log.important(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Login for user "' + str(user) + '" successful.')
         else:
             # If no password passed in, no need to do login
             Log.warning(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Not doing login for user "' + str(user) +
                 '", no password given "' + str(password) + '"')
         self.server.sendmail(from_addr=user,
                              to_addrs=recipients_list,
                              msg=message)
         Log.important(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Message from ' +
             str(user) + ' to ' + str(recipients_list) +
             ' sent successfully. Closing server..')
         self.server.close()
         Log.info(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Mail server "' +
             str(self.mail_server_url) + '" closed')
     except Exception as ex:
         errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                  + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list)\
                  + '. Got exception ' + str(ex) + '.'
         Log.error(errmsg)
         raise Exception(errmsg)
Example #3
 def __init_smtp(self):
     Log.important(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
         + ': Trying to initialize mail server "' + str(self.mail_server_url)
         + '" port ' + str(self.mail_server_port) + ' using mode "' + str(self.mode) + '"...'
     )
     if self.mode == self.MAIL_MODE_SSL:
         # Create a secure SSL context
         # self.context = ssl.create_default_context()
         self.server = smtplib.SMTP_SSL(
             host=self.mail_server_url,
             port=self.mail_server_port,
             # context=self.context
         )
         self.server.ehlo()
     elif self.mode == self.MAIL_MODE_SMTP:
         self.server = smtplib.SMTP(host=self.mail_server_url,
                                    port=self.mail_server_port)
         self.server.ehlo()
     else:
         self.server = smtplib.SMTP(host=self.mail_server_url,
                                    port=self.mail_server_port)
         self.server.ehlo()
         self.server.starttls()
     Log.important(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
         + ': SMTP mode "' + str(self.mode) + '" successfully initialized.'
     )
     return
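
A hedged usage sketch for the send() and __init_smtp() methods above. The class name and constructor arguments are assumptions (they do not appear in these snippets); only the methods and mode constants are shown above:

    # Hypothetical usage; the constructor signature is assumed, not shown above
    mailer = Mailer(
        mail_server_url='smtp.example.com',   # illustrative server
        mail_server_port=465,
        mode=Mailer.MAIL_MODE_SSL,
    )
    mailer.send(
        user='sender@example.com',
        password='secret',
        recipients_list=['rcpt@example.com'],
        message='Subject: Test\n\nHello.',
    )
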
Example #4
    def add_parent(self, parent):
        # Type check before touching any of the parent's attributes
        assert type(parent) is MultiTreeNode
        if parent.dead_node:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Parent "' + str(parent.name)
                + '" is dead node (cant have children), not adding parent for node "' + str(self.name) + '"'
            )
            return

        if parent.name in self.parent_names:
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': For node "' + str(self.name) + '" parent "' + str(parent.name) + '" already exists'
            )
        else:
            # Don't add if already exists as parent, anywhere higher up the tree hierarchy
            if self.is_higher_level(node=parent, supposed_child_node=self):
                return
            # Update for both parent and child
            self.parents.append(parent)
            self.update()
            parent.children.append(self)
            parent.update()
            Log.debug(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': For node "' + str(self.name) + '" successfully added parent "' + str(parent.name) + '"'
            )
Example #5
 def check_if_model_updated(self):
     updated_time = os.path.getmtime(self.fpath_updated_file)
     Log.debugdebug(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) + ': Model identifier "' +
         str(self.identifier_string) + '" last updated time ' +
         str(self.model_updated_time) + ', updated "' + str(updated_time) +
         '".')
      if updated_time > self.model_updated_time:
         Log.important(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Model update time for identifier "' +
             str(self.identifier_string) + '" - "' +
             str(datetime.fromtimestamp(updated_time)) +
             '" is newer than "' +
             str(datetime.fromtimestamp(self.model_updated_time)) +
             '". Reloading model...')
         try:
             self.mutex_training.acquire()
             # Reset model flags to not ready
             self.model_loaded = False
             self.model_updated_time = updated_time
         finally:
             self.mutex_training.release()
         return True
     else:
         return False
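
The reload trigger is just a comparison of the file's modification time against a cached value; a minimal standalone sketch of that check:

    import os

    def file_changed_since(fpath, last_mtime):
        # Compare the filesystem mtime against the cached timestamp
        mtime = os.path.getmtime(fpath)
        return mtime > last_mtime, mtime

    # changed, last_mtime = file_changed_since('model.updated.txt', last_mtime)  # path illustrative
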
Example #6
    def __init__(self,
                 lang=LangFeatures.LANG_KO,
                 audio_source=SOURCE_MIC,
                 audio_file=None,
                 engine=ENGINE_GOOGLE,
                 auth_info=None):
        self.lang = lang
        self.audio_source = audio_source
        self.audio_file = audio_file
        self.engine = engine
        self.auth_info = auth_info

        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Language "' +
            str(self.lang) + '" audio source "' + str(self.audio_source) + '"')

        if self.audio_source == self.SOURCE_MIC:
            self.sr_source = sr.Microphone()
        elif self.audio_source == self.SOURCE_FILE:
            self.sr_source = sr.AudioFile(audio_file)
        else:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Unsupported audio source "' + str(self.audio_source) + '"')
        return
Example #7
    def __init__(self, lang):
        self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
        self.raw_words = None
        self.common_words = None

        lfobj = LangFeatures()
        self.lang_have_verb_conj = lfobj.have_verb_conjugation(lang=self.lang)
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Lang "' +
            str(self.lang) + '" verb conjugation = ' +
            str(self.lang_have_verb_conj) + '.')
        self.word_stemmer = None
        if self.lang_have_verb_conj:
            try:
                self.word_stemmer = Lemmatizer(lang=self.lang)
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) + ': Lang "' +
                    str(self.lang) +
                    '" stemmer/lemmatizer initialized successfully.')
            except Exception as ex_stemmer:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(self.lang) + ' stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)
                self.word_stemmer = None

        return
Example #8
    def __load_sample_model(self,
                            embed_input_dim=1000,
                            embed_output_dim=64,
                            embed_input_len=20,
                            lstm_units=128):
        #
        # Layers Design
        #
        lstm_model = keras.Sequential()
        # Add an Embedding layer expecting input vocab of size 1000, and
        # output embedding dimension of size 64.
        lstm_model.add(
            keras.layers.Embedding(input_dim=embed_input_dim,
                                   output_dim=embed_output_dim,
                                   input_length=embed_input_len))
        # Add a LSTM layer with 128 internal units.
        lstm_model.add(keras.layers.LSTM(lstm_units))
        # Add a Dense layer with 10 units and softmax activation.
        lstm_model.add(keras.layers.Dense(10, activation='softmax'))

        # Finally compile the model
        lstm_model.compile(optimizer='rmsprop',
                           loss='categorical_crossentropy',
                           metrics=['accuracy'])
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Model compiled successfully.')

        lstm_model.summary()
        return lstm_model
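
Since the method above is private (name-mangled), a standalone smoke test would rebuild the same stack. A sketch with random data matching the default shapes (32 sequences of length 20, vocab 1000, 10 classes; imports assume the tensorflow-bundled keras):

    import numpy as np
    from tensorflow import keras

    model = keras.Sequential([
        keras.layers.Embedding(input_dim=1000, output_dim=64, input_length=20),
        keras.layers.LSTM(128),
        keras.layers.Dense(10, activation='softmax'),
    ])
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy',
                  metrics=['accuracy'])

    X = np.random.randint(0, 1000, size=(32, 20))   # integer token ids
    Y = keras.utils.to_categorical(np.random.randint(0, 10, size=32),
                                   num_classes=10)
    model.fit(X, Y, epochs=1, verbose=0)
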
Example #9
    def run(self):
        try:
            self.__mutex_training.acquire()
            self.bot_training_start_time = dt.datetime.now()
            self.log_training = []

            self.__pre_process_training_data()

            self.train()

            self.bot_training_end_time = dt.datetime.now()
        except Exception as ex:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Training Identifier "' + str(self.identifier_string) + '" training exception: ' + str(ex) + '.'
            Log.critical(errmsg)
            raise Exception(errmsg)
        finally:
            self.is_training_done = True
            self.__mutex_training.release()

        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Train mode "' + str(self.train_mode)
            + '". Training Identifier ' + str(self.identifier_string) + '" trained successfully.'
        )
        return self.log_training
Example #10
 def check_prediction_stats(
     self,
     X,
     Y,
     y_predicted,
 ):
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Checking prediction stats..')
     # print(y_predicted)
     # print(type(y_predicted))
     # print(y_predicted.shape)
     # print(np.sum(y_predicted, axis=1).tolist())
     # Compare some data
     count_correct = 0
     for i in range(X.shape[0]):
         data_i = X[i]
         label_i = Y[i]
         prob_distribution = y_predicted[i]
         top_x = NumpyUtil.get_top_indexes(data=prob_distribution,
                                           ascending=False,
                                           top_x=5)
         if top_x[0] == label_i:
             count_correct += 1
         Log.debug(
             str(i) + '. ' + str(data_i) + ': Label=' + str(label_i) +
             ', predicted=' + str(top_x))
     Log.important('Boosting Accuracy = ' +
                   str(100 * count_correct / X.shape[0]) + '%.')
     return
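
NumpyUtil.get_top_indexes is not shown in this listing; judging from how it is called here, a plain-numpy equivalent would look like this (an assumption about its behavior, not the project's code):

    import numpy as np

    def get_top_indexes(data, top_x, ascending=False):
        # Indexes of the top_x largest (or smallest) values, best first
        idx = np.argsort(data)
        if not ascending:
            idx = idx[::-1]
        return idx[:top_x]

    # get_top_indexes(np.array([0.1, 0.7, 0.2]), top_x=2)  ->  array([1, 2])
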
Example #11
    def __recognize_file(self):
        need_convert_format = re.sub(pattern='(.*[.])([a-zA-Z0-9]+$)',
                                     repl='\\2',
                                     string=self.audio_file).lower() != 'wav'
        audio_filepath_wav = self.audio_file

        if need_convert_format:
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Converting "' +
                str(self.audio_file) + '" to wav format..')
            audio_filepath_wav = AudioUtils().convert_format(
                filepath=self.audio_file)

        # Initialize recognizer class (for recognizing the speech)
        r = sr.Recognizer()

        # Read the audio file as source: listen to it and
        # store the captured audio data in the audio_text variable

        with sr.AudioFile(audio_filepath_wav) as source:
            audio_text = r.listen(source)

            # The recognize_*() methods throw a request error if the API is unreachable, hence the exception handling
            try:

                if self.engine == SpeechRecognition.ENGINE_GOOGLE:
                    text = r.recognize_google(audio_text, language=self.lang)
                elif self.engine == SpeechRecognition.ENGINE_GOOGLE_CLOUD:
                    text = r.recognize_google_cloud(
                        audio_text,
                        credentials_json=self.auth_info,
                        language=self.lang)
                elif self.engine == SpeechRecognition.ENGINE_BING:
                    text = r.recognize_bing(audio_text,
                                            key=self.auth_info,
                                            language=self.lang)
                else:
                    raise Exception(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Unsupported engine "' + str(self.engine) + '".')
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Converting audio transcripts into text ...')
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Recognized "' + str(self.lang) + '" text "' +
                    str(text) + '" from audio file "' + str(self.audio_file) +
                    '"')
                return text
            except Exception as ex:
                Log.error(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Exception converting audio transcript from "' +
                    str(self.audio_file) + '": ' + str(ex))
                # Nothing recognized
                return None
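
For reference, the recognize_*() calls above come from the speech_recognition package. A minimal standalone transcription of a wav file with the same library (file path illustrative; the Google API needs network access):

    import speech_recognition as sr

    r = sr.Recognizer()
    with sr.AudioFile('sample.wav') as source:
        audio = r.record(source)   # read the entire file
    text = r.recognize_google(audio, language='en-US')
    print(text)
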
Example #12
    def add_intent_name_to_training_data(self):
        #
        # We need to add intent name into the training data also
        #
        df_intent_id_name = pd.DataFrame({
            DaehuaTrainDataModel.COL_TDATA_INTENT_ID:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_ID],
            DaehuaTrainDataModel.COL_TDATA_INTENT_NAME:
            self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_NAME]
        })
        # Make unique by dropping duplicate intent IDs
        df_intent_id_name.drop_duplicates(inplace=True)

        for idx in df_intent_id_name.index:
            intId = df_intent_id_name[
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[idx]
            try:
                int_name = str(df_intent_id_name[
                    DaehuaTrainDataModel.COL_TDATA_INTENT_NAME].loc[idx])

                # Arguments must be in list form, otherwise this DataFrame cannot be created
                row_to_append = pd.DataFrame(
                    data=self.__get_row_to_append_to_training_data(
                        intent_id=[intId],
                        intent_name=[int_name],
                        text=[int_name],
                        text_id=[TrDataPreprocessor.TRDATA_ID_INTENT_NAME],
                        # Make sure to write back this value with processed text
                        processed_text=[None],
                        lang_detected=[None],
                        internal_counter=[self.df_training_data.shape[0]]))

                #
                # We are appending to a dataframe that might have different columns ordering
                # So we make sure they are in the same order, to avoid all the sort=False/True
                # warning messages by pandas due to required join() operation.
                # If in same order, then we avoid the join().
                #
                self.df_training_data = self.df_training_data.append(
                    row_to_append, sort=True)
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Appended intent name "' + str(int_name) +
                    '" with intent ID ' + str(intId) +
                    ' to list of training data. Row appended = ' +
                    str(row_to_append))
            except Exception as ex:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                    + ': Could not append to dataframe or could not get intent name for intent ID ' \
                    + str(intId) + '. Exception ' + str(ex)
                Log.warning(errmsg)
                raise Exception(errmsg)

        self.__process_training_data_index()

        return self.df_training_data
Example #13
    def __init__(
            self
    ):
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
Example #14
File: Form.py  Project: nwae/nwae
 def reset_fields_to_incomplete(self):
     Log.important(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Reset form fields to incomplete for form ' +
         str(self.to_json()))
     for i in range(len(self.form_fields)):
         fld = self.form_fields[i]
         fld.completed = False
         fld.value = None
Example #15
 def join(self, timeout=None):
     Log.important(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Model Identifier "' + str(self.identifier_string) +
         '" join called..')
     self.stoprequest.set()
     super(ModelInterface, self).join(timeout=timeout)
     Log.important(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Model Identifier "' + str(self.identifier_string) +
         '" Background Thread ended..')
Example #16
 def reset(self):
     Log.important('Form reset')
     self.set_state_none()
     # The current field we are trying to extract from user
     self.conv_current_field_index = None
     self.conv_current_field_name = None
     # Previous field set by user
     # self.conv_completed_fields = []
     # Reset fields
     self.form.reset_fields_to_incomplete()
     return
Example #17
    def preprocess_training_data(self):
        if not self.is_training_data_ready:
            try:
                #
                # The external interface must pass back 2 parameters, a DataFrame of preprocessed training data
                # and Embedding Layer params
                #
                self.df_training_data_pp, self.embedding_params = \
                    self.training_data_source.fetch_and_preprocess_data()

                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Successfully preprocessed training data. Max label val = '
                    + str(self.embedding_params.max_label_val) +
                    ', max sentence length = ' +
                    str(self.embedding_params.max_sent_len) +
                    ', vocabulary size = ' +
                    str(self.embedding_params.vocab_size) +
                    ', x one hot dict: ' +
                    str(self.embedding_params.x_one_hot_dict))

                self.training_data = TextTrainer.convert_preprocessed_text_to_training_data_model(
                    model_name=self.model_name,
                    training_dataframe=self.df_training_data_pp,
                    embedding_x=self.embedding_params.x,
                    embedding_y=self.embedding_params.y,
                    embedding_x_one_hot_dict=self.embedding_params.x_one_hot_dict,
                    embedding_y_one_hot_dict=self.embedding_params.y_one_hot_dict,
                    word_freq_model=self.word_freq_model,
                )
            except Exception as ex:
                errmsg = \
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                    + ': Exception calling external object type "' + str(type(self.training_data_source)) \
                    + '" method fetch_and_preprocess_data(), exception msg: ' + str(ex)
                Log.error(errmsg)
                raise Exception(errmsg)

        if type(self.training_data) is not tdm.TrainingDataModel:
            raise Exception(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': "' +
                str(self.identifier_string) + '": Wrong training data type "' +
                str(type(self.training_data)) + '".')

        # Train a single y/label ID only, regardless of train mode
        if self.y_id is not None:
            # Filter by this y/label only
            self.training_data.filter_by_y_id(y_id=self.y_id)

        return
Example #18
 def persist_model_to_storage(self):
     prf_start = prf.Profiling.start()
     self.model_data.persist_model_to_storage(
         log_training=self.logs_training)
     if self.do_profiling:
          Log.important(str(self.__class__) + ' ' +
                       str(getframeinfo(currentframe()).lineno) +
                       ' PROFILING persist_model_to_storage(): ' +
                       prf.Profiling.get_time_dif_str(
                           prf_start, prf.Profiling.stop()),
                       log_list=self.logs_training)
     return
Example #19
    def run_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        res = self.test_textcluster_english()
        res_final.update(other_res_obj=res)

        res = self.test_textcluster_chinese()
        res_final.update(other_res_obj=res)

        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': PASSED ' +
            str(res_final.count_ok) + ', FAILED ' + str(res_final.count_fail))

        return res_final
Example #20
    def check(
            self,
            # Text array e.g. ['это', 'мое', 'предложение']
            text_segmented_arr,
            max_cost=1):
        start_prf = prf.Profiling.start()

        #
        # 1. Spelling of individual words, without context
        #

        len_text = len(text_segmented_arr)
        corrected_text_arr = []
        # Get the list of words in the model
        for i in range(len_text):
            w = text_segmented_arr[i]
            if (w is None) or (len(w) == 0):
                continue

            w_corrected = w

            if w not in self.words_list:
                df_correction_matches = self.spell_check_word.search_close_words(
                    word=w, max_cost=max_cost)
                # Take the first word, the one with the highest weight
                if df_correction_matches is not None:
                    # In case the indexes are not in order
                    top_loc = df_correction_matches.index[0]
                    w_corrected = df_correction_matches[
                        SpellCheckWord.COL_CORRECTED_WORD].loc[top_loc]

            corrected_text_arr.append(w_corrected)

        #
        #   2. Spelling of the sentence, with context
        #
        # TODO

        if self.do_profiling:
            ms = 1000 * prf.Profiling.get_time_dif_secs(
                start=start_prf, stop=prf.Profiling.stop())
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Spelling correction for ' + str(text_segmented_arr) +
                ' to ' + str(corrected_text_arr) + ' took ' +
                str(round(ms, 2)) + 'ms')
        return corrected_text_arr
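
SpellCheckWord.search_close_words is external to this snippet. As a rough stand-in (not the project's edit-distance search), the standard library's difflib does a similar closest-match lookup:

    import difflib

    def search_close_words(word, words_list, n=3):
        # Closest dictionary words by similarity ratio, best match first
        return difflib.get_close_matches(word, words_list, n=n)

    # search_close_words('helo', ['hello', 'help', 'world'])  ->  ['hello', 'help']
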
Example #21
    def train(
            self,
            X,
            Y
    ):
        # Defining the size of the embedding
        embed_size = 2

        # Defining the neural network
        inp = Input(shape=(X.shape[1],))
        Log.debug('Input shape: ' + str(X.shape))
        # Middle layer is the embedding vector we seek to extract
        # "linear" because this will serve as the word definition, to be input to other neural networks
        x = Dense(units=embed_size, activation='linear')(inp)
        # Standard softmax final layer
        x = Dense(units=Y.shape[1], activation='softmax')(x)
        model = Model(inputs=inp, outputs=x)
        Log.debug('Output shape: ' + str(Y.shape))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        model.summary()

        # Optimizing the network weights
        model.fit(
            x=X,
            y=Y,
            batch_size=256,
            epochs=100
        )

        # Obtaining the weights from the neural network.
        # These are the so called word embeddings

        # The input layer (embedding weights)
        weights = model.get_weights()[0]
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Weights extracted as embedding layer: ' + str(weights)
        )
        print(len(weights))

        # Creating a dictionary to store the embeddings in. The key is a unique word and
        # the value is the numeric vector
        embedding_dict = {}
        for word in self.word_index_dict.keys():
            embedding_dict.update({
                word: weights[self.word_index_dict.get(word)]
            })
        return embedding_dict
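
A hedged usage sketch: from the way the weights are looked up, X and Y appear to be one-hot encodings of (word, context-word) pairs, with word_index_dict mapping each word to its row index. A toy setup (all names and data illustrative):

    import numpy as np

    word_index_dict = {'king': 0, 'queen': 1, 'man': 2, 'woman': 3}
    vocab_size = len(word_index_dict)
    # Toy (word, context-word) pairs, each encoded as a one-hot row
    pairs = [(0, 1), (2, 3), (1, 0), (3, 2)]
    X = np.eye(vocab_size)[[p[0] for p in pairs]]
    Y = np.eye(vocab_size)[[p[1] for p in pairs]]
    # trainer.word_index_dict = word_index_dict
    # embedding_dict = trainer.train(X=X, Y=Y)   # word -> 2-dim vector
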
Example #22
    def fit_gradient_boosting(
        self,
        X_train,
        Y_train,
        num_class,
        feature_names,
        num_round=10,
        # Alternative classtype for binary problems: 'binary:logistic'
        classtype='multi:softprob',
        save_model_path=None,
    ):
        # Convert labels to categorical one-hot encoding
        # (computed here but not used below; xgb.train takes the raw labels)
        labels_categorical = kerasutils.to_categorical(Y_train,
                                                       num_classes=num_class)
        dtrain = self.convert_to_xgboost_data_format(
            data=X_train,
            labels=Y_train,
            feature_names=feature_names,
        )
        param = {
            'max_depth': 3,
            'eta': 1,
            'objective': classtype,
            'num_class': num_class
        }
        param['nthread'] = 4
        param['eval_metric'] = 'auc'

        # evallist = [(dtest, 'test')]

        self.model = xgb.train(
            param,
            dtrain,
            num_round,
            # evallist
        )
        model_dump = self.model.get_dump()
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Boosting model class type "' + str(classtype) +
            '" trained successfully.' + ' Number of trees = ' +
            str(len(model_dump)) + ', Feature names: ' +
            str(self.model.feature_names))
        if save_model_path is not None:
            pickle.dump(self.model, open(save_model_path, "wb"))
        return self.model
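
A minimal usage sketch with random data; convert_to_xgboost_data_format is assumed to wrap xgb.DMatrix, which is the input type xgb.train expects:

    import numpy as np
    import xgboost as xgb

    X_train = np.random.rand(100, 6)      # toy 3-class problem, 6 features
    Y_train = np.random.randint(0, 3, size=100)
    dtrain = xgb.DMatrix(X_train, label=Y_train,
                         feature_names=['f' + str(i) for i in range(6)])
    params = {'max_depth': 3, 'eta': 1,
              'objective': 'multi:softprob', 'num_class': 3}
    booster = xgb.train(params, dtrain, num_boost_round=10)
    print(booster.predict(dtrain).shape)  # (100, 3) class probabilities
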
Example #23
    def run_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        from nwae.lang.config.Config import Config
        config = Config.get_cmdline_params_and_init_config_singleton(
            Derived_Class=Config,
            default_config_file=Config.CONFIG_FILE_PATH_DEFAULT)

        # DATAFRAME = pd.read_csv('task1.3.csv', sep=";")
        df = pd.read_csv(
            '/usr/local/git/nwae/nwae.lang/data/sample.intents.csv', sep=";")
        print(df)
        # Convert from pandas.core.series.Series to list (all sentences, same as in EXAMPLE_TEXTS):
        sents = pd.Series(df['sentence'], dtype="string").tolist()
        langs = pd.Series(df['lang'], dtype="string").tolist()
        print('Langs ' + str(langs))

        tp = TextPreprscrAllLang(
            dir_wordlist=config.get_config(
                param=Config.PARAM_NLP_DIR_WORDLIST),
            postfix_wordlist=config.get_config(
                param=Config.PARAM_NLP_POSTFIX_WORDLIST),
            dir_app_wordlist=config.get_config(
                param=Config.PARAM_NLP_DIR_APP_WORDLIST),
            postfix_app_wordlist=config.get_config(
                param=Config.PARAM_NLP_POSTFIX_APP_WORDLIST),
            dir_synlist=config.get_config(
                param=Config.PARAM_NLP_DIR_SYNONYMLIST),
            postfix_synlist=config.get_config(
                param=Config.PARAM_NLP_POSTFIX_SYNONYMLIST),
        )

        tp.preprocess_list_all_langs(sentences_list=sents)

        langs_detected = tp.detect_lang(sentences_list=sents)
        correct_pct, correct_count, total_count = self.get_stats_lang_detect(
            sentences_list=sents,
            langs_real=langs,
            langs_detected=langs_detected,
        )
        Log.important('Method language detection. Correct ' +
                      str(correct_pct) + '%, ' + str(correct_count) + '/' +
                      str(total_count))

        return res_final
Example #24
 def build_tree_roots(self):
     # Find root tree nodes
     self.tree_roots = {}
     for name in self.tree_nodes.keys():
         Log.debug(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Checking if ' + str(name) + ' is a tree root...'
         )
         node = self.tree_nodes[name]
         if not node.is_dead_node():
             if node.is_tree_root():
                 self.tree_roots[name] = node
                 self.tree_roots_depth[name] = self.calculate_tree_depth(node=node)
     Log.important(
         str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
         + ': Found ' + str(len(self.tree_roots)) + ' tree roots'
     )
     return
Example #25
 def clean_nan_values(
         data,
         colnames,
         nan_string = DEFAULT_NAN_STRING
 ):
     # MUST convert column to string, so all NAs, N/As become string
     for name in colnames:
         # Create new copy
         col_series = np.array(data[name])
         condition_null = np.array(data[name].isnull())
         # Count nan rows
         count_nan = np.sum(condition_null * 1)
         Log.important(
             str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Total ' + str(name) + ' rows with NULL = ' + str(count_nan)
         )
         # Replace nan
         col_series[condition_null] = nan_string
         data[name] = col_series.astype(dtype=str)
     return data
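
A usage sketch (assuming the Log and getframeinfo imports used inside the function are in scope): nulls in the selected columns are replaced with the placeholder string, then the column is cast to str:

    import pandas as pd

    df = pd.DataFrame({'city': ['Seoul', None, 'Tokyo'], 'n': [1, 2, 3]})
    df = clean_nan_values(data=df, colnames=['city'], nan_string='-')
    print(df['city'].tolist())   # ['Seoul', '-', 'Tokyo']
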
Example #26
    def __run_lang_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        for txt_expected in UtTxtPreprocessor.TESTS[self.lang]:
            txt = txt_expected[0]
            expected = txt_expected[1]
            observed = self.txt_preprocessor.process_text(
                inputtext=txt,
                return_as_string=False,
                use_special_symbol_username_nonword=True)
            res_final.update_bool(
                res_bool=ut.UnitTest.assert_true(observed=observed,
                                                 expected=expected,
                                                 test_comment='test "' +
                                                 str(txt) + '"'))

        Log.important('***** ' + str(self.lang) + ' PASSED ' +
                      str(res_final.count_ok) + ', FAILED ' +
                      str(res_final.count_fail) + ' *****')
        return res_final
Example #27
    def wait_for_model_to_be_ready(self, wait_max_time=10):
        #
        # Model reloaded without us knowing, e.g. user trained it, etc.
        #
        if self.model_last_reloaded_counter != self.model.get_model_reloaded_counter(
        ):
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + 'Model "' + str(self.identifier_string) + '" last counter '
                + str(self.model_last_reloaded_counter) + ' not equal to model counter '
                + str(self.model.get_model_reloaded_counter())
                + '. Model updated, thus we must update our text processor.'
            )
            #
            # Must reload because the TxtPreprocessor class needs data from the model
            #
            self.load_text_processor()

        if self.model.is_model_ready():
            return

        count = 1
        sleep_time_wait_model = 0.1
        while not self.model.is_model_ready():
            Log.warning(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Model "' +
                str(self.identifier_string) + '" not yet ready, sleep for ' +
                str(count * sleep_time_wait_model) + ' secs now..')
            if count * sleep_time_wait_model > wait_max_time:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Waited for model "' + str(self.identifier_string)\
                         + '" too long ' + str(count * sleep_time_wait_model) + ' secs. Raising exception..'
                raise Exception(errmsg)
            time.sleep(sleep_time_wait_model)
            count = count + 1
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Model "' +
            str(self.identifier_string) + '" READY.')
        return
Example #28
    def __process_training_data_index(self):
        # Sort by Intent ID and reset index
        self.df_training_data = self.df_training_data.sort_values(
            # By sorting also the internal counter, means we keep the original order within an
            # intent class, and the added Intent Name row will be last within the class
            [
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID,
                TrDataPreprocessor.TD_INTERNAL_COUNTER
            ],
            ascending=True)
        self.df_training_data = self.df_training_data.reset_index(drop=True)

        # Now derive the training data index
        Log.important(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Assigning numbers to training data based on intent...')
        # Add intent index
        self.df_training_data[DaehuaTrainDataModel.COL_TDATA_INTENT_INDEX] =\
            [0]*self.df_training_data.shape[0]
        prev_cat_int = ''
        prev_cat_int_index = 0
        for i in range(0, self.df_training_data.shape[0], 1):
            cur_cat_int = self.df_training_data[
                DaehuaTrainDataModel.COL_TDATA_INTENT_ID].loc[i]
            if cur_cat_int != prev_cat_int:
                prev_cat_int = cur_cat_int
                prev_cat_int_index = 0
            prev_cat_int_index = prev_cat_int_index + 1
            self.df_training_data[
                DaehuaTrainDataModel.
                COL_TDATA_INTENT_INDEX].at[i] = prev_cat_int_index

        Log.debug(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': After process training data index, 10 Lines training data:\n\r'
            + str(self.df_training_data.columns) + '\n\r' +
            str(self.df_training_data[1:10].values) + '\n\r: Shape: ' +
            str(self.df_training_data.shape))
        return
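
The per-intent running index derived by the loop above is equivalent to a pandas groupby cumcount; a compact restatement (not the project's code):

    import pandas as pd

    df = pd.DataFrame({'intent_id': [5, 5, 7, 7, 7]})
    # 1-based position of each row within its intent class
    df['intent_index'] = df.groupby('intent_id').cumcount() + 1
    # -> [1, 2, 1, 2, 3]
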
Example #29
    def run(self):
        Log.important(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
            ': Model Identifier "' + str(self.identifier_string) +
            '" Background Thread started..')
        if not self.is_model_ready():
            self.load_model_parameters()
            self.model_reload_counter += 1

        sleep_time = 10
        while True:
            if self.stoprequest.is_set():
                Log.important(
                    str(__name__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Model Identifier "' + str(self.identifier_string) +
                    '" Breaking from forever thread...')
                break
            if self.check_if_model_updated():
                try:
                    self.__mutex_load_model.acquire()
                    self.load_model_parameters()
                    if not self.is_model_ready():
                        Log.important(
                            str(__name__) + ' ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': Model "' + self.identifier_string +
                            '" failed to load. Try again in ' +
                            str(sleep_time) + ' secs..')
                    else:
                        self.model_reload_counter += 1
                finally:
                    self.__mutex_load_model.release()
            time.sleep(sleep_time)
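
Together with the join() in Example #15, this is the standard Event-based stoppable-thread pattern; stripped of the model-reload logic, it reduces to:

    import threading
    import time

    class StoppableWorker(threading.Thread):
        def __init__(self):
            super().__init__()
            self.stoprequest = threading.Event()

        def run(self):
            # Loop until join() signals the stop request
            while not self.stoprequest.is_set():
                time.sleep(1)   # periodic work goes here

        def join(self, timeout=None):
            self.stoprequest.set()
            super().join(timeout=timeout)
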
Example #30
 def get_model_file_prefix(dir_path_model, model_name, identifier_string,
                           is_partial_training):
     # Prefix or dir
     prefix_or_dir = dir_path_model + '/' + model_name + '.' + identifier_string
     if is_partial_training:
         # Check if directory exists
         if not os.path.isdir(prefix_or_dir):
             Log.important(
                 str(__name__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) + ': Path "' +
                 str(prefix_or_dir) +
                 '" does not exist. Trying to create this directory...')
             try:
                 os.mkdir(path=prefix_or_dir)
                 Log.important(
                     str(__name__) + ' ' +
                     str(getframeinfo(currentframe()).lineno) + ': Path "' +
                     str(prefix_or_dir) + '" successfully created.')
             except Exception as ex:
                 errmsg =\
                     str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Error creating directory "' + str(prefix_or_dir) + '". Exception ' + str(ex) + '.'
                 Log.error(errmsg)
                 raise Exception(errmsg)
         return prefix_or_dir
     else:
         Log.important(
             str(__name__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Using path prefix "' + str(prefix_or_dir) + '"')
         return prefix_or_dir