def profile_time(self, start_time, additional_info=''):
    total_time = Profiling.get_time_dif_secs(start=start_time, stop=Profiling.stop(), decimals=5)
    self.__mutex.acquire()
    try:
        self.profiler_times = np.append(self.profiler_times, [total_time])
        if self.algorithm == self.ALGORITHM_STANDARD:
            l = len(self.profiler_times)
            self.running_median = np.round(np.median(self.profiler_times), 5)
            self.running_average = np.round(np.average(self.profiler_times), 5)
            # Keep the sample window bounded by discarding the oldest measurement
            if l > self.max_list_len:
                self.profiler_times = self.profiler_times[1:l]
        elif self.algorithm == self.ALGORITHM_EMA:
            # Exponential moving average: the first sample initializes the EMA,
            # after which each new measurement is blended in with weight EMA_ALPHA
            if self.ema is None:
                self.ema = total_time
            self.ema = ((1 - self.EMA_ALPHA) * self.ema) + (self.EMA_ALPHA * total_time)
            self.ema = np.round(self.ema, 5)
            l = None
            self.running_median = None
            self.running_average = self.ema
        else:
            raise Exception('Not implemented algorithm "' + str(self.algorithm) + '"')
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Profiling "' + str(self.profiler_name) + ' ' + str(additional_info)
            + '" took ' + str(total_time) + 's, running average ' + str(self.running_average)
            + 's, running median = ' + str(self.running_median) + 's (total len=' + str(l) + ')'
        )
    finally:
        self.__mutex.release()
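# A minimal standalone sketch (not part of ProfilingHelper) contrasting the two
# running statistics above: a bounded window with median/average, versus an
# exponential moving average. The window size 3 and alpha 0.3 are hypothetical.
import numpy as np

def ema_update(ema, sample, alpha=0.3):
    # First sample initializes the EMA, then each sample is blended in with weight alpha
    ema = sample if ema is None else ema
    return (1 - alpha) * ema + alpha * sample

ema, window = None, []
for t in [0.010, 0.012, 0.050, 0.011]:
    ema = ema_update(ema, t)
    window = (window + [t])[-3:]  # like max_list_len: keep only the newest 3 samples
    print(round(ema, 5), np.median(window), round(np.average(window), 5))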
def run_unit_test(self):
    dt = LangDetect()
    res_final = ut.ResultObj(count_ok=0, count_fail=0)
    start_all_time = Profiling.start()
    for text, expected in LangDetectUnitTest.TEST_TEXT_LANG:
        start_time = Profiling.start()
        observed = dt.detect(text=text)
        ms = round(1000 * Profiling.get_time_dif_secs(start=start_time, stop=Profiling.stop()), 2)
        Log.debug('Took ' + str(ms) + ' ms')
        res_final.update_bool(res_bool=ut.UnitTest.assert_true(
            observed=observed,
            expected=expected,
            test_comment='test lang "' + str(expected) + '", text "' + str(text) + '"'))
    end_all_time = Profiling.stop()
    avg_per_text_ms = 1000 * Profiling.get_time_dif_secs(
        start=start_all_time, stop=end_all_time) / len(LangDetectUnitTest.TEST_TEXT_LANG)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Average ' + str(round(avg_per_text_ms, 2)) + 'ms per text (total '
        + str(len(LangDetectUnitTest.TEST_TEXT_LANG)) + ' sentences)')
    return res_final
def convert_format(self, filepath, to_format='wav'):
    file_extension = self.get_audio_filepath_extension(filepath=filepath)
    # Replace the existing extension with the target format
    filepath_converted = re.sub(pattern='[.][a-zA-Z0-9]+$', repl='.' + to_format, string=filepath)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Convert "' + str(filepath) + '" with extension "' + str(file_extension)
        + '" New filepath "' + str(filepath_converted) + '"')
    try:
        track = AudioSegment.from_file(file=filepath, format=file_extension)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting "' + str(filepath) + '" to "' + str(filepath_converted) + '"..')
        file_handle = track.export(filepath_converted, format=to_format)
        file_handle.close()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Successful conversion from "' + str(filepath) + '" to "' + str(filepath_converted) + '"..')
        return filepath_converted
    except Exception as ex:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception converting "' + str(filepath) + '" to "' + str(filepath_converted) + '": ' + str(ex))
def __init__(
        self,
        unique_states,
        unique_observables,
):
    assert type(unique_states) in [list, tuple]
    assert type(unique_observables) in [list, tuple]
    # Must be exactly the integers 0, 1, 2, ...
    assert sorted(unique_states) == list(range(len(unique_states)))
    assert sorted(unique_observables) == list(range(len(unique_observables)))

    self.unique_states = unique_states
    # Add one extra state for start
    self.state_none = max(self.unique_states) + 1
    self.unique_states = [self.state_none] + self.unique_states
    self.n_h = len(self.unique_states)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Unique states: ' + str(self.unique_states)
    )

    # Add one extra observable for start
    self.unique_observables = unique_observables
    self.observable_none = max(self.unique_observables) + 1
    self.unique_observables = [self.observable_none] + self.unique_observables
    self.n_o = len(self.unique_observables)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Unique observables: ' + str(self.unique_observables)
    )
    return
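# Toy standalone illustration of the start-state convention above: original
# states are 0..n-1, and one extra 'none' state n is prepended for the start.
unique_states = [0, 1, 2]
state_none = max(unique_states) + 1   # 3
unique_states = [state_none] + unique_states
print(unique_states)                  # [3, 0, 1, 2], so n_h = 4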
def check_prediction_stats(
        self,
        X,
        Y,
        y_predicted,
):
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Checking prediction stats..')
    # Count the samples whose label equals the top predicted class
    count_correct = 0
    for i in range(X.shape[0]):
        data_i = X[i]
        label_i = Y[i]
        prob_distribution = y_predicted[i]
        top_x = NumpyUtil.get_top_indexes(data=prob_distribution, ascending=False, top_x=5)
        if top_x[0] == label_i:
            count_correct += 1
        Log.debug(str(i) + '. ' + str(data_i) + ': Label=' + str(label_i) + ', predicted=' + str(top_x))
    Log.important('Boosting Accuracy = ' + str(100 * count_correct / X.shape[0]) + '%.')
    return
def send(self, user, password, recipients_list, message):
    try:
        if password not in [None, '']:
            self.server.login(user=user, password=password)
            Log.important(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Login for user "' + str(user) + '" successful.')
        else:
            # If no password was passed in, there is no need to log in
            Log.warning(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Not doing login for user "' + str(user) + '", no password given')
        self.server.sendmail(from_addr=user, to_addrs=recipients_list, msg=message)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Message from ' + str(user) + ' to ' + str(recipients_list)
            + ' sent successfully. Closing server..')
        self.server.close()
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Mail server "' + str(self.mail_server_url) + '" closed')
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Exception sending mail from ' + str(user) + ' to ' + str(recipients_list) \
                 + '. Got exception ' + str(ex) + '.'
        Log.error(errmsg)
        raise Exception(errmsg)
def process_common_words(self, word_split_token=' '):
    try:
        self.raw_words = StringUtils.trim(self.raw_words)
        self.raw_words = re.sub(pattern='[\xa0\t\n\r]', repl=word_split_token, string=self.raw_words)
        self.raw_words = self.raw_words.lower()
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error processing raw words. Exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg)

    try:
        self.common_words = self.raw_words.split(word_split_token)
        # Remove None, '', {}, etc.
        self.common_words = [w for w in self.common_words if w]
        word_stems = self.add_word_stems()
        if word_stems:
            self.common_words = word_stems + self.common_words
        self.common_words = sorted(set(self.common_words))
        Log.info(
            str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Loaded ' + str(len(self.common_words)) + ' common words of lang "' + str(self.lang) + '".'
        )
    except Exception as ex:
        errmsg = str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': Error processing common words. Exception: ' + str(ex)
        Log.error(errmsg)
        raise Exception(errmsg)
    return
def normalize(
        df,
        name_columns,
        attribute_columns,
        normalize_method,
):
    Log.info(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Start normalizing process by "' + str(normalize_method) + '"...'
    )
    if normalize_method == SuggestDataProfile.NORMALIZE_METHOD_PROB:
        # Each row becomes a probability distribution (sums to 1)
        df_attr = df[attribute_columns]
        df_attr = df_attr.apply(lambda x: x / sum(x), axis=1)
    elif normalize_method == SuggestDataProfile.NORMALIZE_METHOD_UNIT:
        # Each row is scaled to unit (Euclidean) length
        df_attr = df[attribute_columns]
        df_attr = df_attr.apply(lambda x: x / sum(x ** 2) ** 0.5, axis=1)
    else:
        return df

    for col in name_columns:
        df_attr[col] = df[col]
    keep_cols = name_columns + attribute_columns
    df = df_attr[keep_cols].reset_index(drop=True)
    return df
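# A standalone sketch of the two row-wise normalizations above on a toy
# DataFrame; the column names are made up.
import pandas as pd

df_toy = pd.DataFrame({'name': ['a', 'b'], 'x': [1.0, 3.0], 'y': [3.0, 4.0]})
attrs = ['x', 'y']
# 'prob' method: each row sums to 1
print(df_toy[attrs].apply(lambda r: r / sum(r), axis=1))
# 'unit' method: each row scaled to Euclidean length 1, e.g. (3, 4) -> (0.6, 0.8)
print(df_toy[attrs].apply(lambda r: r / sum(r ** 2) ** 0.5, axis=1))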
def __recognize_file(self):
    need_convert_format = re.sub(
        pattern='(.*[.])([a-zA-Z0-9]+$)', repl='\\2', string=self.audio_file).lower() != 'wav'
    audio_filepath_wav = self.audio_file
    if need_convert_format:
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting "' + str(self.audio_file) + '" to wav format..')
        audio_filepath_wav = AudioUtils().convert_format(filepath=self.audio_file)

    # Initialize recognizer class (for recognizing the speech)
    r = sr.Recognizer()

    # Read the audio file as source, listen and store it in audio_text
    with sr.AudioFile(audio_filepath_wav) as source:
        audio_text = r.listen(source)

    # The recognize_*() methods will throw a request error if the API is unreachable,
    # hence the exception handling
    try:
        if self.engine == SpeechRecognition.ENGINE_GOOGLE:
            text = r.recognize_google(audio_text, language=self.lang)
        elif self.engine == SpeechRecognition.ENGINE_GOOGLE_CLOUD:
            text = r.recognize_google_cloud(audio_text, credentials_json=self.auth_info, language=self.lang)
        elif self.engine == SpeechRecognition.ENGINE_BING:
            text = r.recognize_bing(audio_text, key=self.auth_info, language=self.lang)
        else:
            raise Exception(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsupported engine "' + str(self.engine) + '".')
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Converting audio transcripts into text ...')
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Recognized "' + str(self.lang) + '" text "' + str(text)
            + '" from audio file "' + str(self.audio_file) + '"')
        return text
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Exception converting audio transcript from "' + str(self.audio_file) + '": ' + str(ex))
def load_model(self, path):
    # Load the previously trained model for use on a new dataset
    with open(path, 'rb') as f:
        loaded_model = pickle.load(f)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Loaded model from file "' + str(path) + '", feature names: '
        + str(loaded_model.feature_names))
    return loaded_model
def __init__(
        self
):
    self.lang_features = LangFeatures()

    # Map alphabet name to unicode character set array
    self.alphabet_dict = {}
    for alp in self.TESTS_BY_ORDER:
        self.alphabet_dict[alp] = LangCharacters.get_alphabet_charset(alphabet = alp)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Alphabets used: ' + str(self.alphabet_dict.keys())
    )

    self.langs_with_no_word_sep = self.lang_features.get_languages_with_no_word_separator()
    Log.debugdebug('Langs with no word sep: ' + str(self.langs_with_no_word_sep))

    # Load common words
    self.common_words = {}
    self.common_words[LangFeatures.LANG_EN] = English()
    self.common_words[LangFeatures.LANG_ES] = Spanish()
    self.common_words[LangFeatures.LANG_FR] = French()
    self.common_words[LangFeatures.LANG_ID] = Indonesian()
    self.common_words[LangFeatures.LANG_VI] = Vietnamese()

    # Load stemmers
    self.word_stemmer = {}
    for lang in self.SUPPORTED_LANGS:
        lang_have_verb_conj = self.lang_features.have_verb_conjugation(lang = lang)
        Log.important(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Lang "' + str(lang) + '" verb conjugation = ' + str(lang_have_verb_conj) + '.'
        )
        self.word_stemmer[lang] = None
        if lang_have_verb_conj:
            try:
                self.word_stemmer[lang] = Lemmatizer(lang = lang)
                Log.important(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Lang "' + str(lang) + '" stemmer/lemmatizer initialized successfully.'
                )
            except Exception as ex_stemmer:
                errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                         + ': Lang "' + str(lang) + '" stemmer/lemmatizer failed to initialize: ' \
                         + str(ex_stemmer) + '.'
                Log.warning(errmsg)

    self.profiler_detect_alp = ProfilingHelper(profiler_name = str(self.__class__))
    return
def set_all_field_value_from_answer(self, answer):
    dict_fld_name_values_updated = {}
    for fld in self.form.form_fields:
        value = self.__set_field_value_from_answer(
            answer=answer,
            form_field=fld,
            # When setting a non-targeted field, make sure matching is strict
            strict_var_expressions=True)
        if value is not None:
            Log.info('Field "' + str(fld.name) + '" updated value = ' + str(value))
            dict_fld_name_values_updated[fld.name] = value
    return retFieldsUpdated(dict_name_values=dict_fld_name_values_updated)
def train_from_partial_models(
        self,
        write_model_to_storage=True,
        write_training_data_to_storage=False,
        # Log training events
        logs=None):
    #
    # Load EIDF first
    # TODO How to ensure there are no missing words?
    #
    x_name = self.training_data.get_x_name()
    try:
        if type(logs) is list:
            self.logs_training = logs
        else:
            self.logs_training = []
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Initializing IDF object.. try to read from file first',
            log_list=self.logs_training)
        # Try to read from file
        df_eidf_file = eidf.Eidf.read_eidf_from_storage(
            dir_path_model=self.dir_path_model,
            identifier_string=self.identifier_string,
            x_name=x_name,
            log_training=self.logs_training)
        Log.debug(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Successfully read EIDF from file',
            log_list=self.logs_training)
        self.model_data.idf = np.array(df_eidf_file[eidf.Eidf.STORAGE_COL_EIDF])
    except Exception as ex_eidf:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': No EIDF from file available. Exception ' + str(ex_eidf)
        Log.critical(errmsg, log_list=self.logs_training)
        raise Exception(errmsg)

    # Standardize to at least 2-dimensional, easier when weighting x
    self.model_data.idf = npUtil.NumpyUtil.convert_dimension(arr=self.model_data.idf, to_dim=2)

    # Combines
    self.model_data.load_model_from_partial_trainings_data(
        td_latest=self.training_data,
        log_training=self.logs_training)
    return self.logs_training
def __send_email(self, text_subject, text_msg, files, ignore_limit):
    email_msg = SendMail.prepare_message(
        from_addr=self.from_addr,
        to_addrs_list=self.alert_recipients,
        subject=text_subject,
        text=text_msg,
        files=files)
    try:
        # Check how many already sent this hour
        if datetime.now().hour != self.current_hour:
            self.current_hour = datetime.now().hour
            self.emails_sent_this_hour = 0

        if not ignore_limit:
            if self.emails_sent_this_hour >= self.limit_per_hour:
                Log.warning(
                    str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                    + ': Send email alert limit ' + str(self.limit_per_hour)
                    + ' per hour hit. Not sending subject: "' + str(text_subject)
                    + '", message: ' + str(text_msg))
                return
        else:
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Ignoring send limit of ' + str(self.limit_per_hour) + ' per hour.')

        if self.fake_send:
            print('Fake send email from "' + str(self.from_addr) + '" to: '
                  + str(self.alert_recipients) + ' Message:\n\r' + str(email_msg))
        else:
            SendMail(
                mode=self.mail_mode,
                mail_server_url=self.mail_server_url,
                mail_server_port=self.mail_server_port
            ).send(
                user=self.from_addr,
                password=self.password,
                recipients_list=self.alert_recipients,
                message=email_msg)
        self.emails_sent_this_hour += 1
    except Exception as ex_mail:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Error sending email: ' + str(ex_mail)
            + '. Could not send message: ' + str(email_msg))
def run_unit_test(self):
    for wfm in [
            WordFreqDocMatrix.BY_FREQ_NORM,
            WordFreqDocMatrix.BY_SIGMOID_FREQ_NORM,
            # WordFreqDocMatrix.BY_LOG_FREQ_NORM,
    ]:
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Start testing using word freq model "' + str(wfm) + '"')
        self.unit_test_train(word_freq_model=wfm)
        self.unit_test_predict_classes(word_freq_model=wfm, include_match_details=True, top=2)
    return self.res_final
def stop_model_thread(self):
    # Kill any background jobs
    try:
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string) + '" Stopping model background job..')
        self.model.stoprequest.set()
    except Exception as ex:
        Log.error(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': "' + str(self.identifier_string) + '" Stop model background job exception: ' + str(ex))
def run_unit_test(self):
    res_final = ut.ResultObj(count_ok=0, count_fail=0)

    from nwae.ml.metricspace.ut.UtMetricSpaceModel import UnitTestMetricSpaceModel
    x_text = UnitTestMetricSpaceModel.DATA_TEXTS
    y = UnitTestMetricSpaceModel.DATA_Y

    predict = PredictClass(
        model_name=ModelHelper.MODEL_NAME_HYPERSPHERE_METRICSPACE,
        identifier_string=UnitTestMetricSpaceModel.IDENTIFIER_STRING,
        dir_path_model=self.ut_params.dirpath_model,
        lang=LangFeatures.LANG_KO,
        dir_wordlist=self.ut_params.dirpath_wordlist,
        postfix_wordlist=self.ut_params.postfix_wordlist,
        dir_wordlist_app=self.ut_params.dirpath_app_wordlist,
        postfix_wordlist_app=self.ut_params.postfix_app_wordlist,
        dirpath_synonymlist=self.ut_params.dirpath_synonymlist,
        postfix_synonymlist=self.ut_params.postfix_synonymlist,
        # Slightly incorrect, because we use the test model from UtMetricSpaceModel.py,
        # whose last test was sigmoid. But even if that were not the case, the test would still pass.
        word_freq_model=FeatureVector.COL_SIGMOID_FREQ,
        do_spelling_correction=False,
        do_profiling=True)

    for i in range(len(x_text)):
        label = y[i]
        text_arr = x_text[i]
        text = ' '.join(text_arr)
        # Return all results in the top 5
        res = predict.predict_class_text_features(
            inputtext=text,
            match_pct_within_top_score=0,
            include_match_details=True,
            top=5,
        )
        res_final.update_bool(res_bool=ut.UnitTest.assert_true(
            observed=res.predict_result.predicted_classes[0],
            expected=label,
            test_comment='Test "' + str(text) + '" label ' + str(label)))
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Match Details' + str(res.predict_result.match_details))

    # Kill any background jobs
    predict.stop_model_thread()
    return res_final
def recommend_products_by_product_names_only(
        self,
        product_names_list,
        df_product_dna,
        # List type, e.g. ['league']
        unique_prdname_cols,
        replace_purchased_product_with_nan=True,
):
    assert len(unique_prdname_cols) == 1, 'Multi-column product names not supported yet'

    attributes_list = self.extract_attributes_list(
        df=df_product_dna,
        unique_name_colums_list=unique_prdname_cols,
    )
    np_attributes_list = np.array(attributes_list)

    # Collapse to 1-dimensional vector
    np_product_names = df_product_dna[unique_prdname_cols].to_numpy().squeeze()
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Extracted attributes list from product dna: ' + str(np_attributes_list)
        # + ', product list: ' + str(np_product_names)
    )

    # Keep only the rows of the requested products
    condition = False
    for prd in product_names_list:
        condition = condition | (df_product_dna[unique_prdname_cols[0]] == prd)
    df_prd_only = df_product_dna[condition]

    np_probs = df_prd_only[attributes_list].values
    # Sort by highest probability to lowest
    indxs_dist_sort = np.flip(np.argsort(np_probs), axis=1)
    np_recommendations = np_attributes_list[indxs_dist_sort]

    # If the product list was shortened earlier, the products that were removed will not be replaced
    if replace_purchased_product_with_nan:
        for i in range(len(np_recommendations)):
            condition = np_recommendations[i] == product_names_list[i]
            purchased_before = np_recommendations[i][condition]
            replace_x = [(r in purchased_before) for r in np_recommendations[i]]
            np_recommendations[i][replace_x] = SuggestDataProfile.NAN_PRODUCT
    return np_recommendations.tolist()
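# A standalone sketch of the ranking trick above: np.argsort sorts ascending,
# so np.flip along axis=1 yields attribute indexes from highest probability to
# lowest. The attribute names and probabilities here are made up.
import numpy as np

attributes = np.array(['league', 'poker', 'chess'])
probs = np.array([[0.1, 0.7, 0.2],
                  [0.5, 0.2, 0.3]])
order = np.flip(np.argsort(probs), axis=1)
print(attributes[order])  # [['poker' 'chess' 'league'] ['league' 'chess' 'poker']]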
def __set_field_value_from_answer(self, answer, form_field, strict_var_expressions):
    res = form_field.set_field_value(
        user_text=answer,
        # Allow to match also a single word or number (e.g. "79.5"),
        # without any var expressions (with expressions, "Amount is 79.5")
        strict_var_expressions=strict_var_expressions)
    Log.info('Updated field "' + str(form_field.name) + '" = ' + str(res))
    if res is True:
        # Confirm question we can build elsewhere
        # confirm_question = \
        #     str(form_field.name).lower() + ': "' + str(value) + '"' \
        #     + '? ' + str(self.text_confirm_question)
        return form_field.value
    return None
def add_wordlist(
        self,
        dirpath,
        postfix,
        array_words = None,
):
    if self.use_external_lib:
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Not adding word list for language "' + str(self.lang) + '" using external lib'
        )
        return
    self.lang_wordlist.append_wordlist(
        dirpath = dirpath,
        postfix = postfix,
        array_words = array_words,
    )
def scrape(self):
    self.sentences_scraped = ScrapeUrl().get_training_data_by_scraping_urls(
        url_list=self.url_list,
        tag_to_find='p',
        min_char_per_sent=0,
        max_char_per_sent=9999,
        write_to_filepath=None,
    )
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Scraped ' + str(len(self.sentences_scraped)) + ' sentences from urls ' + str(self.url_list))
    self.sentences_processed = self.txt_preprocessor.preprocess_list_all_langs(
        sentences_list=self.sentences_scraped)
def add_word_stems(self):
    if self.word_stemmer is None:
        return None
    stems = []
    for w in self.common_words:
        w_stem = self.word_stemmer.stem(word=w)
        if w_stem != w:
            stems.append(w_stem)
    stems = sorted(set(stems))
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Loaded ' + str(len(stems)) + ' unique word stems: ' + str(stems))
    return stems
def __init__(
        self,
        name,
        value,
        if_required,
        if_masked,
        # MEX expression to extract param from human sentence
        mex_expr,
        # For deserializing old objects so the old state is maintained
        value_just_updated=False,
        completed=False):
    self.name = name
    self.value = value
    self.if_required = if_required
    self.if_masked = if_masked
    # Field MEX
    self.mex_expr = mex_expr
    self.value_just_updated = value_just_updated
    # Already obtained the parameter from user conversation?
    self.completed = completed

    try:
        self.mex_obj = MatchExpression(pattern=self.mex_expr, lang=None)
        self.mex_var_name = self.mex_obj.get_mex_var_names()[0]
        self.mex_obj_no_var_expressions = MatchExpression.create_mex_obj_from_object_vars(
            var_name_str=self.mex_var_name,
            var_type_str=self.mex_obj.get_mex_var_type(var_name=self.mex_var_name),
            var_expressions_str='',
            var_len_range_list2=self.mex_obj.get_mex_var_length_range(var_name=self.mex_var_name),
            var_preferred_dir_str=self.mex_obj.get_mex_var_pref_dir(var_name=self.mex_var_name))
    except Exception as ex_mex:
        raise Exception(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Failed to get mex var name for mex expr "' + str(self.mex_expr)
            + '", got exception "' + str(ex_mex) + '".')

    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Field initialized: ' + str(self.to_json()))
    return
def __init__(self, lang=LangFeatures.LANG_EN):
    self.lang = LangFeatures.map_to_lang_code_iso639_1(lang_code=lang)
    Ssl.disable_ssl_check()
    try:
        if nltk.download(Corpora.NLTK_COMTRANS):
            Log.info(
                str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': NLTK download of "' + Corpora.NLTK_COMTRANS + '" OK.')
        else:
            raise Exception('Download "' + str(Corpora.NLTK_COMTRANS) + '" returned False')
    except Exception as ex:
        errmsg = str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno) \
                 + ': NLTK download of "' + str(Corpora.NLTK_COMTRANS) + '" exception: ' + str(ex) + '.'
        Log.error(errmsg)
        raise Exception(errmsg)
    return
def fit(
        self,
        # 'entropy' (information concept) or 'gini' (impurity concept)
        criterion = 'gini',
        max_tree_depth = 10,
        min_samples_split = 20,
        min_impurity_decrease = 0.0,
        output_graph_path = None,
        output_code_path = None,
        output_code_newline = '\n',
):
    dtree = DecisionTreeClassifier(
        criterion = criterion,
        max_depth = max_tree_depth,
        min_samples_split = min_samples_split,
        min_impurity_decrease = min_impurity_decrease,
    )
    dtree = dtree.fit(self.df_X, self.df_y)
    data = tree.export_graphviz(
        dtree,
        out_file = None,
        feature_names = self.feature_names,
    )
    code = self.tree_to_code(
        tree = dtree,
        feature_names = self.feature_names,
        newline = output_code_newline,
    )
    if output_code_path is not None:
        with open(output_code_path, 'w') as f:
            f.write(code)
    Log.info(
        str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Decision tree built successfully: ' + str(data)
        + ', tree converted to code:\n\r' + str(code)
    )
    if output_graph_path is not None:
        graph = pydotplus.graph_from_dot_data(data)
        graph.write_png(output_graph_path)
    return dtree
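# A self-contained sketch of the sklearn calls used above on a toy XOR-like
# dataset; the feature names here are made up.
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

X_toy = [[0, 0], [1, 1], [0, 1], [1, 0]]
y_toy = [0, 1, 1, 0]
clf = DecisionTreeClassifier(criterion='gini', max_depth=3, min_samples_split=2)
clf = clf.fit(X_toy, y_toy)
# export_graphviz with out_file=None returns the DOT source as a string
dot_data = tree.export_graphviz(clf, out_file=None, feature_names=['f0', 'f1'])
print(dot_data.splitlines()[0])  # 'digraph Tree {'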
def test_corpora_general(
        self,
        data_from_internet=True,
        write_to_file_path=None,
        sample_fpath=None,
):
    if data_from_internet:
        sentences_list = ScrapeUrl().get_training_data_by_scraping_urls(
            url_list=[
                'https://slowcook.netlify.app/mix/2050-recipe-of-homemade-nakji-bokkeum-korean-spicy-octopus-stirfry/',
                'https://www.bbc.com/ukrainian/vert-earth-russian-47766544',
                'https://www.say7.info/cook/recipe/118-Plov.html',
                'https://ru.wikipedia.org/wiki/IU_(%D0%BF%D0%B5%D0%B2%D0%B8%D1%86%D0%B0)',
            ],
            tag_to_find='p',
            min_char_per_sent=50,
            max_char_per_sent=500,
            write_to_filepath=write_to_file_path,
        )
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': TOTAL SCRAPED = ' + str(len(sentences_list)))
    else:
        assert sample_fpath is not None
        sentences_list = ScrapeUrl().get_training_data_from_file(
            filepath=sample_fpath,
            min_char_per_sent=0,
            max_char_per_sent=np.inf,
        )
        # Remove spaces from each sentence
        for i in range(len(sentences_list)):
            sentences_list[i] = re.sub(pattern=' ', repl='', string=sentences_list[i])
        # sentences_list = [s for s in sentences_list if (len(s) >= 10) and (len(s) <= 30)]
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': TOTAL READ = ' + str(len(sentences_list)))
    return sentences_list
def test_japanese(self):
    try:
        import nagisa
    except Exception:
        Log.info(
            str(self.__class__) + ' ' + str(getframeinfo(currentframe()).lineno)
            + ': Not testing Japanese, cannot load nagisa'
        )
        return ut.ResultObj(count_ok=0, count_fail=0)

    list_sent_exp = [
        ['本日はチャットサービスをご利用いただき、ありがとうございます。オペレーターと接続中です。',
         ['本日', 'は', 'チャット', 'サービス', 'を', 'ご', '利用', 'いただき', '、', 'ありがとう', 'ござい', 'ます', '。',
          'オペレーター', 'と', '接続', '中', 'です', '。']],
        ['江戸時代には江戸前や江戸前海などの呼び名があった。',
         ['江戸', '時代', 'に', 'は', '江戸', '前', 'や', '江戸', '前海', 'など', 'の', '呼び名', 'が', 'あっ', 'た', '。']],
    ]
    retv = self.do_unit_test(
        word_segmenter = self.get_word_segmenter(lang = lf.LangFeatures.LANG_JA),
        list_sent_exp = list_sent_exp
    )
    return retv
def map_to_correct_lang_code_iso_639_1_or_3(
        # 2-character language code
        lang_code
):
    # Common mistake: 'cn' instead of 'zh'
    if lang_code in (LangFeatures.LANG_CN, LangFeatures.LANG_ZH_CN):
        return LangFeatures.LANG_ZH
    # Common mistake: 'vn' instead of 'vi'
    elif lang_code == LangFeatures.LANG_VN:
        return LangFeatures.LANG_VI
    else:
        if lang_code in LangFeatures.ALL_ISO639_1_SUPPORTED_LANGS:
            return lang_code
        elif lang_code in LangFeatures.ALL_ISO639_3_SUPPORTED_LANGS:
            return lang_code
        else:
            Log.info(
                str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
                + ': Unsupported language code "' + str(lang_code) + '", returning unchanged'
            )
            return lang_code
def __attach_file_check_validity_and_size(
        files_attachment_list,
        max_total_files_size=MAX_TOTAL_FILES_SIZE_MB_EMAIL_ATTCH):
    if files_attachment_list is None:
        return []
    files_attachment_list_allowed = []
    cum_size_mb = 0.0
    for filepath in files_attachment_list:
        if os.path.isfile(filepath):
            Log.info('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                     + ': Attachment file path "' + str(filepath) + '" OK')
        else:
            Log.error('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                      + ': Invalid attachment file "' + str(filepath) + '", not attaching to email')
            continue
        fsize_bytes = os.path.getsize(filepath)
        fsize_mb = round(fsize_bytes / (1024 * 1024), 2)
        if fsize_mb + cum_size_mb < max_total_files_size:
            files_attachment_list_allowed.append(filepath)
            cum_size_mb += fsize_mb
            Log.info('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                     + ': Appended file "' + str(filepath) + '" as email attachment size '
                     + str(fsize_mb) + 'MB, total cumulative ' + str(cum_size_mb) + 'MB')
        else:
            Log.warning('File <' + str(__name__) + '> line ' + str(getframeinfo(currentframe()).lineno)
                        + ': File "' + str(filepath) + '" too big ' + str(fsize_mb)
                        + 'MB. Cumulative = ' + str(fsize_mb + cum_size_mb) + 'MB. Not attaching to email')
    return files_attachment_list_allowed
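# A toy illustration of the greedy size cap above: files are attached in order
# until the cumulative size would exceed the limit. The file names and MB sizes
# are made up.
limit_mb = 10
sizes = {'a.pdf': 4.0, 'b.png': 5.5, 'c.zip': 3.0}
allowed, cum = [], 0.0
for name, mb in sizes.items():
    if cum + mb < limit_mb:
        allowed.append(name)
        cum += mb
print(allowed, cum)  # ['a.pdf', 'b.png'] 9.5 ('c.zip' would push past the cap)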
def get_training_data_by_scraping(
        self,
        url,
        tag_to_find='p',
        min_char_per_sent=0,
        max_char_per_sent=np.inf,
        rm_html_markup=False,
        unquote_html=False,
):
    # e.g. sample data scraped from Wikipedia
    sentences_list_from_wiki_scraping = Scrape().scrape_url(url=url, tag_to_find=tag_to_find)
    Log.info(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Scraped ' + str(len(sentences_list_from_wiki_scraping))
        + ' sentences from url "' + str(url) + '"')

    sentences_list = []
    for s in sentences_list_from_wiki_scraping:
        s = StringUtils.trim(s)
        s = BeautifulSoup(s, 'html.parser').text
        s_clean = s
        if rm_html_markup:
            # Remove all patterns '<...>'
            html_tags_re = re.compile(r'<[^>]+>')
            s_clean = re.sub(html_tags_re, '', string=s)
        if unquote_html:
            # Convert strings like '%3Fmode%3DLSD%26mid%3Dshm%26sid1%3D102%26oid%3D421%26aid%3D0005537039'
            # into '?mode=LSD&mid=shm&sid1=102&oid=421&aid=0005537039'
            s_clean = urllib.parse.unquote(string=s)
        len_s = len(s_clean)
        if (len_s >= min_char_per_sent) and (len_s <= max_char_per_sent):
            sentences_list.append(s_clean)
            Log.debug('From\n\r\t"' + str(s) + '" to\n\r\t"' + str(s_clean) + '"')

    Log.info(
        str(__name__) + ' ' + str(getframeinfo(currentframe()).lineno)
        + ': Filtered to ' + str(len(sentences_list)) + ' sentences from url "' + str(url) + '"')
    return sentences_list
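# Standalone sketch of the two cleanup steps above: stripping '<...>' tags with
# a regex, and decoding a percent-encoded URL query; the sample strings are made up.
import re
import urllib.parse

s = '<p>Plov <b>recipe</b></p>'
print(re.sub(re.compile(r'<[^>]+>'), '', s))               # 'Plov recipe'
print(urllib.parse.unquote('%3Fmode%3DLSD%26sid1%3D102'))  # '?mode=LSD&sid1=102'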