Example #1
 def profile_time(self, start_time, additional_info=''):
     total_time = Profiling.get_time_dif_secs(start=start_time, stop=Profiling.stop(), decimals=5)
     self.__mutex.acquire()
     try:
         self.profiler_times = np.append(self.profiler_times, [total_time])
         if self.algorithm == self.ALGORITHM_STANDARD:
             l = len(self.profiler_times)
             self.running_median = np.round(np.median(self.profiler_times), 5)
             self.running_average = np.round(np.average(self.profiler_times), 5)
             if l > self.max_list_len:
                 self.profiler_times = self.profiler_times[1:l]
         elif self.algorithm == self.ALGORITHM_EMA:
             if self.ema is None:
                 # First sample seeds the EMA; later samples are blended in
                 self.ema = total_time
             else:
                 self.ema = ((1 - self.EMA_ALPHA) * self.ema) + (self.EMA_ALPHA * total_time)
             self.ema = np.round(self.ema, 5)
             l = None
             self.running_median = None
             self.running_average = self.ema
         else:
             raise Exception('Algorithm "' + str(self.algorithm) + '" not implemented')
         Log.info(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Profiling "' + str(self.profiler_name)  + ' ' + str(additional_info)
             + '" took ' + str(total_time) + 's, running average '
             + str(self.running_average) + 's, running median = ' + str(self.running_median)
             + 's (total len=' + str(l) + ')'
         )
         # print(self.profiler_times)
     finally:
         self.__mutex.release()
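For reference, a minimal standalone sketch of the exponential moving average update used in the EMA branch above (the alpha and sample values are assumed, not taken from the class):

import numpy as np

EMA_ALPHA = 0.3  # assumed smoothing factor, not from the class above

def update_ema(ema, sample, alpha=EMA_ALPHA):
    # First sample seeds the average; later samples are blended in:
    # ema = (1 - alpha) * ema + alpha * sample
    if ema is None:
        return sample
    return (1 - alpha) * ema + alpha * sample

ema = None
for t in [0.010, 0.012, 0.009, 0.050]:
    ema = update_ema(ema, t)
print(np.round(ema, 5))  # weighted toward the most recent samples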
Example #2
    def run_unit_test(self):
        dt = LangDetect()
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        start_all_time = Profiling.start()

        for text, expected in LangDetectUnitTest.TEST_TEXT_LANG:
            start_time = Profiling.start()
            observed = dt.detect(text=text)
            ms = round(
                1000 * Profiling.get_time_dif_secs(start=start_time,
                                                   stop=Profiling.stop()), 2)
            Log.debug('Took ' + str(ms) + ' ms')

            res_final.update_bool(
                res_bool=ut.UnitTest.assert_true(observed=observed,
                                                 expected=expected,
                                                 test_comment='test lang "' +
                                                 str(expected) + '", text "' +
                                                 str(text) + '"'))

        end_all_time = Profiling.stop()
        avg_per_text_ms = 1000 * Profiling.get_time_dif_secs(
            start=start_all_time, stop=end_all_time) / len(
                LangDetectUnitTest.TEST_TEXT_LANG)
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Average ' +
            str(round(avg_per_text_ms, 2)) + 'ms per text (total ' +
            str(len(LangDetectUnitTest.TEST_TEXT_LANG)) + ' sentences)')

        return res_final
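The Profiling helper is project code and not shown here; a rough standard-library equivalent of its start/stop timing pattern, with a placeholder workload, might look like this:

import time

# Hypothetical stand-in for Profiling.start() / Profiling.stop() / get_time_dif_secs()
start_time = time.perf_counter()
_ = sum(i * i for i in range(100000))  # placeholder for dt.detect(text=text)
ms = round(1000 * (time.perf_counter() - start_time), 2)
print('Took ' + str(ms) + ' ms')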
Example #3
 def convert_format(self, filepath, to_format='wav'):
     file_extension = self.get_audio_filepath_extension(filepath=filepath)
     # Replace the old extension with the requested target format (was hardcoded to '.wav')
     filepath_converted = re.sub(pattern='[.][a-zA-Z0-9]+$',
                                 repl='.' + to_format,
                                 string=filepath)
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) + ': Convert "' +
         str(filepath) + '" with extension "' + str(file_extension) +
         '" New filepath "' + str(filepath_converted) + '"')
     try:
         track = AudioSegment.from_file(file=filepath,
                                        format=file_extension)
         Log.debug(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Converting "' +
             str(filepath) + '" to "' + str(filepath_converted) + '"..')
         file_handle = track.export(filepath_converted, format=to_format)
         file_handle.close()
         Log.info(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Successful Conversion from "' + str(filepath) + '" to "' +
             str(filepath_converted) + '"..')
         return filepath_converted
     except Exception as ex:
         raise Exception(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Exception converting "' + str(filepath) + '" to "' +
             str(filepath_converted) + '": ' + str(ex))
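AudioSegment here comes from pydub, which needs ffmpeg installed. A minimal conversion outside the class, assuming a hypothetical input file 'sample.mp3' exists, would be:

from pydub import AudioSegment  # pip install pydub; requires ffmpeg on PATH

# 'sample.mp3' is a hypothetical input file
track = AudioSegment.from_file('sample.mp3', format='mp3')
file_handle = track.export('sample.wav', format='wav')
file_handle.close()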
Example #4
    def __init__(
            self,
            unique_states,
            unique_observables,
    ):
        assert type(unique_states) in [list, tuple]
        assert type(unique_observables) in [list, tuple]

        # The states and observables must be exactly the integers 0, 1, 2, ...
        assert sorted(unique_states) == list(range(len(unique_states)))
        assert sorted(unique_observables) == list(range(len(unique_observables)))

        self.unique_states = unique_states
        # add one extra state for start
        self.state_none = max(self.unique_states)+1
        self.unique_states = [self.state_none] + self.unique_states
        self.n_h = len(self.unique_states)
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unique states: ' + str(self.unique_states)
        )

        # add one extra observable for start
        self.unique_observables = unique_observables
        self.observable_none = max(self.unique_observables)+1
        self.unique_observables = [self.observable_none] + self.unique_observables
        self.n_o = len(self.unique_observables)
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Unique observables: ' + str(self.unique_observables)
        )
        return
Example #5
 def check_prediction_stats(
     self,
     X,
     Y,
     y_predicted,
 ):
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Checking prediction stats..')
     # print(y_predicted)
     # print(type(y_predicted))
     # print(y_predicted.shape)
     # print(np.sum(y_predicted, axis=1).tolist())
     # Compare some data
     count_correct = 0
     for i in range(X.shape[0]):
         data_i = X[i]
         label_i = Y[i]
         prob_distribution = y_predicted[i]
         top_x = NumpyUtil.get_top_indexes(data=prob_distribution,
                                           ascending=False,
                                           top_x=5)
         if top_x[0] == label_i:
             count_correct += 1
         Log.debug(
             str(i) + '. ' + str(data_i) + ': Label=' + str(label_i) +
             ', predicted=' + str(top_x))
     Log.important('Boosting Accuracy = ' +
                   str(100 * count_correct / X.shape[0]) + '%.')
     return
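NumpyUtil.get_top_indexes is project code and not shown; a plausible sketch of what it computes, using plain numpy, is:

import numpy as np

# Hedged stand-in for NumpyUtil.get_top_indexes(data, ascending=False, top_x=5)
def get_top_indexes(data, top_x=5):
    # Indexes of the top_x largest values, highest probability first
    return np.argsort(-np.asarray(data))[:top_x]

prob_distribution = np.array([0.05, 0.60, 0.10, 0.25])
print(get_top_indexes(prob_distribution, top_x=2))  # [1 3]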
Example #6
 def send(self, user, password, recipients_list, message):
     try:
         if password not in [None, '']:
             self.server.login(user=user, password=password)
             Log.important(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Login for user "' + str(user) + '" successful.')
         else:
             # If no password passed in, no need to do login
             Log.warning(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': Not doing login for user "' + str(user) +
                 '", no password given "' + str(password) + '"')
         self.server.sendmail(from_addr=user,
                              to_addrs=recipients_list,
                              msg=message)
         Log.important(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Message from ' +
             str(user) + ' to ' + str(recipients_list) +
             ' sent successfully. Closing server..')
         self.server.close()
         Log.info(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) + ': Mail server "' +
             str(self.mail_server_url) + '" closed')
     except Exception as ex:
         errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                  + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list)\
                  + '. Got exception ' + str(ex) + '.'
         Log.error(errmsg)
         raise Exception(errmsg)
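self.server is presumably an smtplib connection. The same login/sendmail/close flow written directly against the standard library, with hypothetical server and credentials, looks like:

import smtplib
from email.mime.text import MIMEText

# All addresses and the server URL below are hypothetical
msg = MIMEText('Job finished OK')
msg['Subject'] = 'Status'
msg['From'] = 'bot@example.com'
msg['To'] = 'ops@example.com'

server = smtplib.SMTP('smtp.example.com', 587)
server.starttls()
server.login(user='bot@example.com', password='app-password')
server.sendmail(from_addr='bot@example.com',
                to_addrs=['ops@example.com'],
                msg=msg.as_string())
server.close()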
Example #7
    def process_common_words(self, word_split_token=' '):
        try:
            self.raw_words = StringUtils.trim(self.raw_words)
            self.raw_words = re.sub(pattern='[\xa0\t\n\r]',
                                    repl=word_split_token,
                                    string=self.raw_words)
            self.raw_words = self.raw_words.lower()
        except Exception as ex:
            errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Error processing raw words. Exception: ' + str(ex)
            Log.error(errmsg)
            raise Exception(errmsg)

        try:
            self.common_words = self.raw_words.split(word_split_token)
            # Remove None, '', {}, etc.
            self.common_words = [w for w in self.common_words if w]

            word_stems = self.add_word_stems()
            if word_stems:
                self.common_words = word_stems + self.common_words

            self.common_words = sorted(set(self.common_words))
            Log.info(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                + ': Loaded ' + str(len(self.common_words)) + ' common words of lang "' + str(self.lang) + '".'
            )
        except Exception as ex:
            errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': Error processing common words. Exception: ' + str(ex)
            Log.error(errmsg)
            raise Exception(errmsg)

        return
Example #8
    def normalize(
            df,
            name_columns,
            attribute_columns,
            normalize_method,
    ):
        Log.info(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Start normalizing process by "' + str(normalize_method) + '"...'
        )
        if normalize_method == SuggestDataProfile.NORMALIZE_METHOD_PROB:
            df_attr = df[attribute_columns]
            df_attr = df_attr.apply(lambda x: x / sum(x), axis=1)
        elif normalize_method == SuggestDataProfile.NORMALIZE_METHOD_UNIT:
            df_attr = df[attribute_columns]
            df_attr = df_attr.apply(lambda x: x / sum(x ** 2) ** 0.5, axis=1)
        else:
            return df

        for col in name_columns:
            df_attr[col] = df[col]
        keep_cols = name_columns + attribute_columns
        df = df_attr[keep_cols].reset_index(drop=True)

        return df
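A small worked example of the two row-wise normalizations, on a hypothetical two-product DataFrame:

import pandas as pd

df = pd.DataFrame({'name': ['a', 'b'], 'x1': [1.0, 3.0], 'x2': [3.0, 4.0]})
attrs = ['x1', 'x2']

# 'prob': each row sums to 1; 'unit': each row gets unit L2 norm
df_prob = df[attrs].apply(lambda x: x / sum(x), axis=1)
df_unit = df[attrs].apply(lambda x: x / sum(x ** 2) ** 0.5, axis=1)
print(df_prob.values.tolist())  # [[0.25, 0.75], [3/7, 4/7]]
print(df_unit.values.tolist())  # [[~0.316, ~0.949], [0.6, 0.8]]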
Example #9
    def __recognize_file(self):
        need_convert_format = re.sub(pattern='(.*[.])([a-zA-Z0-9]+$)',
                                     repl='\\2',
                                     string=self.audio_file).lower() != 'wav'
        audio_filepath_wav = self.audio_file

        if need_convert_format:
            Log.important(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': Converting "' +
                str(self.audio_file) + '" to wav format..')
            audio_filepath_wav = AudioUtils().convert_format(
                filepath=self.audio_file)

        # Initialize recognizer class (for recognizing the speech)
        r = sr.Recognizer()

        # Reading Audio file as source
        # listening the audio file and store in audio_text variable

        with sr.AudioFile(audio_filepath_wav) as source:
            audio_text = r.listen(source)

            # recognize_*() methods raise a RequestError if the API is unreachable, hence the exception handling
            try:

                if self.engine == SpeechRecognition.ENGINE_GOOGLE:
                    text = r.recognize_google(audio_text, language=self.lang)
                elif self.engine == SpeechRecognition.ENGINE_GOOGLE_CLOUD:
                    text = r.recognize_google_cloud(
                        audio_text,
                        credentials_json=self.auth_info,
                        language=self.lang)
                elif self.engine == SpeechRecognition.ENGINE_BING:
                    text = r.recognize_bing(audio_text,
                                            key=self.auth_info,
                                            language=self.lang)
                else:
                    raise Exception(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Unsupported engine "' + str(self.engine) + '".')
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Converting audio transcripts into text ...')
                Log.important(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Recognized "' + str(self.lang) + '" text "' +
                    str(text) + '" from audio file "' + str(self.audio_file) +
                    '"')
                return text
            except Exception as ex:
                Log.error(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Exception converting audio transcript from "' +
                    str(self.audio_file) + '": ' + str(ex))
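sr is the speech_recognition package. A minimal recognition call with the free Google engine, assuming a hypothetical 'audio.wav', follows the same pattern:

import speech_recognition as sr  # pip install SpeechRecognition

r = sr.Recognizer()
# 'audio.wav' is a hypothetical file; recognize_google needs internet access
with sr.AudioFile('audio.wav') as source:
    audio_data = r.record(source)  # read the whole file
text = r.recognize_google(audio_data, language='en-US')
print(text)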
Example #10
 def load_model(self, path):
     # Now, load the model for use on a new dataset
     with open(path, 'rb') as f:
         loaded_model = pickle.load(f)
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Loaded model from file "' + str(path) + '", feature names: ' +
         str(loaded_model.feature_names))
     return loaded_model
Example #11
    def __init__(
            self
    ):
        self.lang_features = LangFeatures()

        # Map alphabet name to unicode character set array
        self.alphabet_dict = {}
        for alp in self.TESTS_BY_ORDER:
            self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(
                alphabet = alp
            )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Alphabets used: ' + str(self.alphabet_dict.keys())
        )

        self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
        Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

        # Load common words
        self.common_words = {}
        self.common_words[LangFeatures.LANG_EN] = English()
        self.common_words[LangFeatures.LANG_ES] = Spanish()
        self.common_words[LangFeatures.LANG_FR] = French()
        self.common_words[LangFeatures.LANG_ID] = Indonesian()
        self.common_words[LangFeatures.LANG_VI] = Vietnamese()

        # Load stemmers
        self.word_stemmer = {}
        for lang in self.SUPPORTED_LANGS:
            lang_have_verb_conj = self.lang_features.have_verb_conjugation(
                lang = lang
            )
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
            )
            self.word_stemmer[lang] = None
            if lang_have_verb_conj:
                try:
                    self.word_stemmer[lang] = Lemmatizer(
                        lang = lang
                    )
                    Log.important(
                        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                        + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                    )
                except Exception as ex_stemmer:
                    errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                             + ': Lang "' + str(lang) + ' stemmer/lemmatizer failed to initialize: ' \
                             + str(ex_stemmer) + '.'
                    Log.warning(errmsg)

        self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))

        return
Example #12
    def set_all_field_value_from_answer(self, answer):
        dict_fld_name_values_updated = {}
        for fld in self.form.form_fields:
            value = self.__set_field_value_from_answer(
                answer=answer,
                form_field=fld,
                # For setting not targeted field, make sure it is strict
                strict_var_expressions=True)
            if value is not None:
                Log.info('********* Field "' + str(fld.name) +
                         '" updated value = ' + str(value))
                dict_fld_name_values_updated[fld.name] = value

        return retFieldsUpdated(dict_name_values=dict_fld_name_values_updated)
Example #13
    def train_from_partial_models(
            self,
            write_model_to_storage=True,
            write_training_data_to_storage=False,
            # Log training events
            logs=None):
        #
        # Load EIDF first
        # TODO How to ensure there are no missing words?
        #
        x_name = self.training_data.get_x_name()
        try:
            if type(logs) is list:
                self.logs_training = logs
            else:
                self.logs_training = []

            Log.info(str(self.__class__) + ' ' +
                     str(getframeinfo(currentframe()).lineno) +
                     ': Initializing IDF object.. try to read from file first',
                     log_list=self.logs_training)
            # Try to read from file
            df_eidf_file = eidf.Eidf.read_eidf_from_storage(
                dir_path_model=self.dir_path_model,
                identifier_string=self.identifier_string,
                x_name=x_name,
                log_training=self.logs_training)
            Log.debug(str(self.__class__) + ' ' +
                      str(getframeinfo(currentframe()).lineno) +
                      ': Successfully Read EIDF from file',
                      log_list=self.logs_training)
            self.model_data.idf = np.array(
                df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
        except Exception as ex_eidf:
            errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)\
                     + ': No EIDF from file available. Exception ' + str(ex_eidf)
            Log.critical(errmsg, log_list=self.logs_training)
            raise Exception(errmsg)

        # Standardize to at least 2-dimensional, easier when weighting x
        self.model_data.idf = npUtil.NumpyUtil.convert_dimension(
            arr=self.model_data.idf, to_dim=2)

        #
        # Combines
        #
        self.model_data.load_model_from_partial_trainings_data(
            td_latest=self.training_data, log_training=self.logs_training)
        return self.logs_training
Example #14
    def __send_email(self, text_subject, text_msg, files, ignore_limit):
        email_msg = SendMail.prepare_message(
            from_addr=self.from_addr,
            to_addrs_list=self.alert_recipients,
            subject=text_subject,
            text=text_msg,
            files=files)
        try:
            # Check how many already sent this hour
            if datetime.now().hour != self.current_hour:
                self.current_hour = datetime.now().hour
                self.emails_sent_this_hour = 0

            if not ignore_limit:
                if self.emails_sent_this_hour >= self.limit_per_hour:
                    Log.warning(
                        str(self.__class__) + ' ' +
                        str(getframeinfo(currentframe()).lineno) +
                        ': Send email alert limit ' +
                        str(self.limit_per_hour) +
                        ' per hour hit. Not sending subject: "' +
                        str(text_subject) + '", message: ' + str(text_msg))
                    return
            else:
                Log.info(
                    str(self.__class__) + ' ' +
                    str(getframeinfo(currentframe()).lineno) +
                    ': Ignoring send limit of ' + str(self.limit_per_hour) +
                    ' per hour.')

            if self.fake_send:
                print('Fake send email from "' + str(self.from_addr) +
                      '" to: ' + str(self.alert_recipients) + ' Message:\n\r' +
                      str(email_msg))
            else:
                SendMail(mode=self.mail_mode,
                         mail_server_url=self.mail_server_url,
                         mail_server_port=self.mail_server_port).send(
                             user=self.from_addr,
                             password=self.password,
                             recipients_list=self.alert_recipients,
                             message=email_msg)
            self.emails_sent_this_hour += 1
        except Exception as ex_mail:
            Log.error(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': Error sending email: ' + str(ex_mail) +
                '. Could not send message: ' + str(email_msg))
Example #15
 def run_unit_test(self):
     for wfm in [
             WordFreqDocMatrix.BY_FREQ_NORM,
             WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM,
             # WordFreqDocMatrix.BY_LOG_FREQ_NORM,
     ]:
         Log.info(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Start testing using word freq model "' + str(wfm) + '"')
         self.unit_test_train(word_freq_model=wfm)
         self.unit_test_predict_classes(word_freq_model=wfm,
                                        include_match_details=True,
                                        top=2)
     return self.res_final
Example #16
 def stop_model_thread(self):
     # Kill any background jobs
     try:
         Log.info(
             str(self.__class__) +
             str(getframeinfo(currentframe()).lineno) + ': "' +
             str(self.identifier_string) +
             '" Stopping model background job..')
         self.model.stoprequest.set()
     except Exception as ex:
         Log.error(
             str(self.__class__) +
             str(getframeinfo(currentframe()).lineno) + ': "' +
             str(self.identifier_string) +
             '" Stop model background job exception: ' + str(ex))
Example #17
    def run_unit_test(self):
        res_final = ut.ResultObj(count_ok=0, count_fail=0)

        from nwae.ml.metricspace.ut.UtMetricSpaceModel import UnitTestMetricSpaceModel
        x_text = UnitTestMetricSpaceModel.DATA_TEXTS
        y = UnitTestMetricSpaceModel.DATA_Y

        predict = PredictClass(
            model_name=ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE,
            identifier_string=UnitTestMetricSpaceModel.IDENTIFIER_STRING,
            dir_path_model=self.ut_params.dirpath_model,
            lang=LangFeatures.LANG_KO,
            dir_wordlist=self.ut_params.dirpath_wordlist,
            postfix_wordlist=self.ut_params.postfix_wordlist,
            dir_wordlist_app=self.ut_params.dirpath_app_wordlist,
            postfix_wordlist_app=self.ut_params.postfix_app_wordlist,
            dirpath_synonymlist=self.ut_params.dirpath_synonymlist,
            postfix_synonymlist=self.ut_params.postfix_synonymlist,
            # Slightly inaccurate, because we use the test model from UtMetricSpaceModel.py,
            # whose last test used sigmoid. But even if it did not, this test would still pass
            word_freq_model=FeatureVector.COL_SIGMOID_FREQ,
            do_spelling_correction=False,
            do_profiling=True)

        for i in range(len(x_text)):
            label = y[i]
            text_arr = x_text[i]
            text = ' '.join(text_arr)
            # Return all results in the top 5
            res = predict.predict_class_text_features(
                inputtext=text,
                match_pct_within_top_score=0,
                include_match_details=True,
                top=5,
            )
            res_final.update_bool(res_bool=ut.UnitTest.assert_true(
                observed=res.predict_result.predicted_classes[0],
                expected=label,
                test_comment='Test "' + str(text) + '" label ' + str(label)))
            Log.info(
                str(self.__class__) +
                str(getframeinfo(currentframe()).lineno) + ': Match Details' +
                str(res.predict_result.match_details))

        # Kill any background jobs
        predict.stop_model_thread()

        return res_final
Example #18
    def recommend_products_by_product_names_only(
        self,
        product_names_list,
        df_product_dna,
        # List type, e.g. ['league']
        unique_prdname_cols,
        replace_purchased_product_with_nan=True,
    ):
        assert len(unique_prdname_cols) == 1, 'Multi-column product names not supported yet'

        attributes_list = self.extract_attributes_list(
            df=df_product_dna,
            unique_name_colums_list=unique_prdname_cols,
        )
        np_attributes_list = np.array(attributes_list)
        # # Collapse to 1-dimensional vector
        # np_product_names = df_product_dna[unique_prdname_cols].to_numpy().squeeze()
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) +
            ': Extracted attributes list from product dna: ' +
            str(np_attributes_list)
            # + ', product list: ' + str(np_product_names)
        )

        condition = False
        for prd in product_names_list:
            condition = condition | (df_product_dna[unique_prdname_cols[0]]
                                     == prd)
        df_prd_only = df_product_dna[condition]

        np_probs = df_prd_only[attributes_list].values
        # Sort by highest probability to lowest
        indxs_dist_sort = np.flip(np.argsort(np_probs), axis=1)

        np_recommendations = np_attributes_list[indxs_dist_sort]
        # If the product list was shortened earlier, the removed products will not be replaced
        if replace_purchased_product_with_nan:
            for i in range(len(np_recommendations)):
                condition = np_recommendations[i] == product_names_list[i]
                purchased_before = np_recommendations[i][condition]
                replace_x = [(r in purchased_before)
                             for r in np_recommendations[i]]
                np_recommendations[i][
                    replace_x] = SuggestDataProfile.NAN_PRODUCT

        return np_recommendations.tolist()
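The core ranking step above is the argsort/flip on the probability matrix. A small worked example with hypothetical attributes:

import numpy as np

np_attributes_list = np.array(['league', 'casino', 'poker'])  # hypothetical attributes
# One row of attribute probabilities per customer
np_probs = np.array([[0.2, 0.5, 0.3],
                     [0.7, 0.1, 0.2]])
# argsort is ascending, so flip each row to rank highest probability first
indxs_dist_sort = np.flip(np.argsort(np_probs), axis=1)
print(np_attributes_list[indxs_dist_sort])
# [['casino' 'poker' 'league']
#  ['league' 'poker' 'casino']]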
Пример #19
0
    def __set_field_value_from_answer(self, answer, form_field,
                                      strict_var_expressions):
        res = form_field.set_field_value(
            user_text=answer,
            # Allow to match also single word or number (e.g. "79.5"),
            # without any var expressions (with expressions, "Amount is 79.5")
            strict_var_expressions=strict_var_expressions)
        Log.info('Updated field "' + str(form_field.name) + '" = ' + str(res))
        if res is True:
            # Confirm question we can build elsewhere
            # confirm_question = \
            #     str(form_field.name).lower() + ': "' + str(value) + '"' \
            #     + '? ' + str(self.text_confirm_question)
            return form_field.value

        return None
Пример #20
 def add_wordlist(
         self,
         dirpath,
         postfix,
         array_words = None,
 ):
     if self.use_external_lib:
         Log.info(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Not adding word list for language "' + str(self.lang) + '" using external lib'
         )
         return
     self.lang_wordlist.append_wordlist(
         dirpath     = dirpath,
         postfix     = postfix,
         array_words = array_words,
     )
Пример #21
    def scrape(self):
        self.sentences_scraped = ScrapeUrl(
        ).get_training_data_by_scraping_urls(
            url_list=self.url_list,
            tag_to_find='p',
            min_char_per_sent=0,
            max_char_per_sent=9999,
            write_to_filepath=None,
        )
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Scraped ' +
            str(len(self.sentences_scraped)) + ' from urls ' +
            str(self.url_list))

        self.sentences_processed = self.txt_preprocessor.preprocess_list_all_langs(
            sentences_list=self.sentences_scraped)
Пример #22
    def add_word_stems(self):
        if self.word_stemmer is None:
            return None

        stems = []
        for w in self.common_words:
            w_stem = self.word_stemmer.stem(word=w)
            if w_stem == w:
                continue
            else:
                stems.append(w_stem)

        stems = sorted(set(stems))
        Log.info(
            str(self.__class__) + ' ' +
            str(getframeinfo(currentframe()).lineno) + ': Loaded ' +
            str(len(stems)) + ' unique word stems: ' + str(stems))
        return stems
Пример #23
 def __init__(
         self,
         name,
         value,
         if_required,
         if_masked,
         # MEX expression to extract param from human sentence
         mex_expr,
         # For deserializing old objects so the old state is maintained
         value_just_updated=False,
         completed=False):
     self.name = name
     self.value = value
     self.if_required = if_required
     self.if_masked = if_masked
     # Field MEX
     self.mex_expr = mex_expr
     self.value_just_updated = value_just_updated
     # Already obtained the parameter from user conversation?
     self.completed = completed
     try:
         self.mex_obj = MatchExpression(pattern=self.mex_expr, lang=None)
         self.mex_var_name = self.mex_obj.get_mex_var_names()[0]
         self.mex_obj_no_var_expressions = MatchExpression.create_mex_obj_from_object_vars(
             var_name_str=self.mex_var_name,
             var_type_str=self.mex_obj.get_mex_var_type(
                 var_name=self.mex_var_name),
             var_expressions_str='',
             var_len_range_list2=self.mex_obj.get_mex_var_length_range(
                 var_name=self.mex_var_name),
             var_preferred_dir_str=self.mex_obj.get_mex_var_pref_dir(
                 var_name=self.mex_var_name))
     except Exception as ex_mex:
         raise Exception(
             str(self.__class__) + ' ' +
             str(getframeinfo(currentframe()).lineno) +
             ': Failed to get mex var name for mex expr "' +
             str(self.mex_expr) + '", got exception "' + str(ex_mex) + '".')
     Log.info(
         str(self.__class__) + ' ' +
         str(getframeinfo(currentframe()).lineno) +
         ': Field initialized: ' + str(self.to_json()))
     return
Пример #24
 def __init__(self, lang=LangFeatures.LANG_EN):
     self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
     Ssl.disable_ssl_check()
     try:
         if nltk.download(Corpora.NLTK_COMTRANS):
             Log.info(
                 str(self.__class__) + ' ' +
                 str(getframeinfo(currentframe()).lineno) +
                 ': NLTK download of "' + Corpora.NLTK_COMTRANS + '" OK.')
         else:
             raise Exception('Download "' + str(Corpora.NLTK_COMTRANS) +
                             '" returned False')
     except Exception as ex:
         errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                  + ': NLTK download of "' + str(Corpora.NLTK_COMTRANS) + '" exception: ' \
                  + str(ex) + '.'
         Log.error(errmsg)
         raise Exception(errmsg)
     return
Пример #25
    def fit(
            self,
            # 'entropy' (information concept) or 'gini' (impurity concept)
            criterion = 'gini',
            max_tree_depth = 10,
            min_samples_split = 20,
            min_impurity_decrease = 0.0,
            output_graph_path = None,
            output_code_path = None,
            output_code_newline = '\n'
    ):
        dtree = DecisionTreeClassifier(
            criterion = criterion,
            max_depth = max_tree_depth,
            min_samples_split = min_samples_split,
            min_impurity_decrease = min_impurity_decrease,
        )
        dtree = dtree.fit(self.df_X, self.df_y)
        data = tree.export_graphviz(
            dtree,
            out_file = None,
            feature_names = self.feature_names,
        )
        code = self.tree_to_code(
            tree = dtree,
            feature_names = self.feature_names,
            newline = output_code_newline
        )
        if output_code_path is not None:
            with open(output_code_path, 'w') as f:
                f.write(code)

        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Decision tree built successfully: ' + str(data) + ', tree converted to code:\n\r' + str(code)
        )

        if output_graph_path is not None:
            graph = pydotplus.graph_from_dot_data(data)
            graph.write_png(output_graph_path)
        return dtree
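The same fit/export flow with plain scikit-learn on a bundled dataset (tree_to_code is project-specific and omitted here):

from sklearn import tree
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
dtree = DecisionTreeClassifier(criterion='gini', max_depth=3,
                               min_samples_split=20).fit(X, y)
# Graphviz source text of the fitted tree
dot_data = tree.export_graphviz(dtree, out_file=None)
print(dot_data.splitlines()[0])  # 'digraph Tree {'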
Example #26
    def test_corpora_general(
        self,
        data_from_internet=True,
        write_to_file_path=None,
        sample_fpath=None,
    ):
        if data_from_internet:
            sentences_list = ScrapeUrl().get_training_data_by_scraping_urls(
                url_list=[
                    'https://slowcook.netlify.app/mix/2050-recipe-of-homemade-nakji-bokkeum-korean-spicy-octopus-stirfry/',
                    'https://www.bbc.com/ukrainian/vert-earth-russian-47766544',
                    'https://www.say7.info/cook/recipe/118-Plov.html',
                    'https://ru.wikipedia.org/wiki/IU_(%D0%BF%D0%B5%D0%B2%D0%B8%D1%86%D0%B0)',
                ],
                tag_to_find='p',
                min_char_per_sent=50,
                max_char_per_sent=500,
                write_to_filepath=write_to_file_path,
            )
            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) +
                ': TOTAL SCRAPED = ' + str(len(sentences_list)))
        else:
            assert sample_fpath is not None
            sentences_list = ScrapeUrl().get_training_data_from_file(
                filepath=sample_fpath,
                min_char_per_sent=0,
                max_char_per_sent=np.inf,
            )
            import re
            for i in range(len(sentences_list)):
                s = sentences_list[i]
                sentences_list[i] = re.sub(pattern='&nbsp', repl='', string=s)
            # sentences_list = [s for s in sentences_list if (len(s) >= 10) and (len(s) <= 30)]
            Log.info(
                str(self.__class__) + ' ' +
                str(getframeinfo(currentframe()).lineno) + ': TOTAL READ = ' +
                str(len(sentences_list)))
            # [print(s) for s in sentences_list]

        return sentences_list
Example #27
 def test_japanese(self):
     try:
         import nagisa
     except ImportError:
         Log.info(
             str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
             + ': Not testing japanese, cannot load nagisa'
         )
         return ut.ResultObj(count_ok=0, count_fail=0)
     list_sent_exp = [
         ['本日はチャットサービスをご利用いただき、ありがとうございます。オペレーターと接続中です。',
          ['本日', 'は', 'チャット', 'サービス', 'を', 'ご', '利用', 'いただき', '、', 'ありがとう', 'ござい', 'ます', '。', 'オペレーター', 'と', '接続', '中', 'です', '。']],
         ['江戸時代には江戸前や江戸前海などの呼び名があった。',
          ['江戸', '時代', 'に', 'は', '江戸', '前', 'や', '江戸', '前海', 'など', 'の', '呼び名', 'が', 'あっ', 'た', '。']],
     ]
     retv = self.do_unit_test(
         word_segmenter = self.get_word_segmenter(lang = lf.LangFeatures.LANG_JA),
         list_sent_exp  = list_sent_exp
     )
     return retv
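The nagisa tokenizer behind this test can be tried directly; as far as I recall its API, tagging returns an object whose .words attribute is the token list:

import nagisa  # pip install nagisa

tokens = nagisa.tagging('江戸時代には江戸前や江戸前海などの呼び名があった。').words
print(tokens)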
Example #28
 def map_to_correct_lang_code_iso_639_1_or_3(
         # 2 character language code
         lang_code
 ):
     # Common mistake: 'cn' instead of 'zh'
     if lang_code in (LangFeatures.LANG_CN, LangFeatures.LANG_ZH_CN):
         return LangFeatures.LANG_ZH
     # Common mistake: 'vn' instead of 'vi'
     elif lang_code == LangFeatures.LANG_VN:
         return LangFeatures.LANG_VI
     else:
         if lang_code in LangFeatures.ALL_ISO639_1_SUPPORTED_LANGS:
             return lang_code
         elif lang_code in LangFeatures.ALL_ISO639_3_SUPPORTED_LANGS:
             return lang_code
         else:
             Log.info(
                 str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                 + ': Unsupported language code "' + str(lang_code) + '" return unchanged "' + str(lang_code) + '"'
             )
             return lang_code
Example #29
    def __attach_file_check_validity_and_size(
            files_attachment_list,
            max_total_files_size=MAX_TOTAL_FILES_SIZE_MB_EMAIL_ATTCH):
        if files_attachment_list is None:
            return []

        files_attachment_list_allowed = []

        cum_size_mb = 0.0
        for filepath in files_attachment_list:
            if os.path.isfile(filepath):
                Log.info('File <' + str(__name__) + '> line ' +
                         str(getframeinfo(currentframe()).lineno) +
                         ': Attachment file path "' + str(filepath) + '" OK')
            else:
                Log.error('File <' + str(__name__) + '> line ' +
                          str(getframeinfo(currentframe()).lineno) +
                          ': Invalid attachment file "' + str(filepath) +
                          '", not attaching to email')
                continue

            fsize_bytes = os.path.getsize(filepath)
            fsize_mb = round(fsize_bytes / (1024 * 1024), 2)

            if fsize_mb + cum_size_mb < max_total_files_size:
                files_attachment_list_allowed.append(filepath)
                cum_size_mb += fsize_mb
                Log.info('File <' + str(__name__) + '> line ' +
                         str(getframeinfo(currentframe()).lineno) +
                         ': Appended file "' + str(filepath) +
                         '" as email attachment size ' + str(fsize_mb) +
                         'MB, total cumulative ' + str(cum_size_mb) + 'MB')
            else:
                Log.warning('File <' + str(__name__) + '> line ' +
                            str(getframeinfo(currentframe()).lineno) +
                            ': File "' + str(filepath) + '" too big ' +
                            str(fsize_mb) + 'MB. Cumulative = ' +
                            str(fsize_mb + cum_size_mb) +
                            ' Not attaching to email')
        return files_attachment_list_allowed
Example #30
 def get_training_data_by_scraping(
     self,
     url,
     tag_to_find='p',
     min_char_per_sent=0,
     max_char_per_sent=np.inf,
     rm_html_markup=False,
     unquote_html=False,
 ):
      # Example: data scraped from Wikipedia
     sentences_list_from_wiki_scraping = Scrape().scrape_url(
         url=url, tag_to_find=tag_to_find)
     Log.info(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Scraped ' + str(len(sentences_list_from_wiki_scraping)) +
         ' sentences from url "' + str(url) + '"')
     sentences_list = []
     for s in sentences_list_from_wiki_scraping:
         s = StringUtils.trim(s)
          s = BeautifulSoup(s, 'html.parser').text  # explicit parser avoids bs4's parser warning
         s_clean = s
         if rm_html_markup:
             # Remove all patterns '<...>'
             html_tags_re = re.compile(r'<[^>]+>')
             s_clean = re.sub(html_tags_re, '', string=s)
         if unquote_html:
             # Convert strings like '%3Fmode%3DLSD%26mid%3Dshm%26sid1%3D102%26oid%3D421%26aid%3D0005537039'
             # into '?mode=LSD&mid=shm&sid1=102&oid=421&aid=0005537039'
              s_clean = urllib.parse.unquote(string=s_clean)  # unquote the (possibly markup-stripped) text
         len_s = len(s_clean)
         if (len_s >= min_char_per_sent) and (len_s <= max_char_per_sent):
             sentences_list.append(s_clean)
         Log.debug('From\n\r\t"' + str(s) + '" to\n\r\t"' + str(s_clean) +
                   '"')
     Log.info(
         str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) +
         ': Filtered to ' + str(len(sentences_list)) +
         ' sentences from url "' + str(url) + '"')
     return sentences_list
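Scrape().scrape_url is project code; a rough sketch of the same scrape-and-filter flow using only the standard library plus BeautifulSoup, with a hypothetical URL and length bounds, could be:

import urllib.request
from bs4 import BeautifulSoup

# Hypothetical URL; mirrors tag_to_find='p' with simple length filtering
html = urllib.request.urlopen('https://example.com').read()
soup = BeautifulSoup(html, 'html.parser')
sentences_list = [p.get_text().strip() for p in soup.find_all('p')]
sentences_list = [s for s in sentences_list if 0 < len(s) <= 500]
print(len(sentences_list))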