def run_model(df_train, df_test, config, save_imp=True, params=None):
    # Run the model through the full training & predicting process.
    # Warning: drop_list columns should be dropped already.
    mdf = model.ModelFactory(target_col=config['target_col'],
                             index_col=config['index_col'])
    mdl = mdf.build_model(config, params)
    mdh = ModelHolder(config, target_col=config['target_col'],
                      index_col=config['index_col'])
    mdh.fit_predict(mdl, df_train, df_test, stack_prefix=STACK_PREFIX)
    imps = mdh.get_feature_imp()
    if save_imp and imps is not None:
        imps.to_csv(IMP_SAVEFILE, header=False)
    return mdh
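# Hypothetical usage sketch (not from the original source): the column names
# and config keys below are assumptions; the real schema is whatever
# model.ModelFactory expects.
# df_train = pd.read_csv('train.csv')   # drop_list columns already removed
# df_test = pd.read_csv('test.csv')
# config = {'target_col': 'target', 'index_col': 'id'}
# holder = run_model(df_train, df_test, config, save_imp=False)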
def encrypt(data_to_hide: str, sentences: List[str],
            models_holder: ModelHolder) -> Tuple[List[str], List[str], str]:
    binary_data = _make_binary_data(data_to_hide)
    binary_data_len = len(binary_data)
    sentences_count = len(sentences)
    assert sentences_count >= binary_data_len, \
        'Sentences count should not be less than binary data size'

    output_sentences: List[str] = []
    hiders: List[str] = []
    inserted_bits = 0
    sentences_index = 0
    while sentences_index < len(sentences):
        # Next bit to hide, or None once the whole payload is embedded.
        bit = binary_data[inserted_bits] if inserted_bits < binary_data_len else None
        tokenized = models_holder.tokenize_sentence(sentences[sentences_index])
        updated_sentence, is_inserted = _update_sentence(bit, tokenized, models_holder)
        output_sentences.append(updated_sentence)
        if is_inserted:
            inserted_bits += 1
            hiders.append(updated_sentence)
        sentences_index += 1

    assert inserted_bits == binary_data_len, \
        f"Not enough data. Probably some similarities are missed. " \
        f"Try to extend the text data.\n" \
        f"Bits inserted: {inserted_bits}\n" \
        f"Binary data length: {binary_data_len}"
    # Report the payload without its trailing stop-sign byte.
    return output_sentences, hiders, binary_data[:-8]
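# A plausible sketch of the assumed helper `_make_binary_data` (hypothetical,
# not the original implementation): the message's UTF-8 bits followed by an
# 8-bit stop sign, which would explain the `binary_data[:-8]` return above and
# the STOP_SIGN check in decrypt(). The stop-sign value itself is a guess.
BITS_IN_BYTE = 8
STOP_SIGN = '0' * BITS_IN_BYTE  # assumed terminator byte

def _make_binary_data_sketch(data_to_hide: str) -> str:
    bits = ''.join(format(byte, '08b') for byte in data_to_hide.encode('utf-8'))
    return bits + STOP_SIGN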
def decrypt(sentences: List[str],
            models_holder: ModelHolder) -> Tuple[Optional[str], str]:
    def find_replaced(doc: spacy.tokens.Doc) -> Optional[str]:
        # Scan the passed-in doc, not the enclosing loop variable.
        has_subject = any(token.dep_ == SUBJECT_DEP and token.tag_ != SPACE_TAG
                          for token in doc)
        has_predicate = any(token.dep_ == ROOT_DEP and token.tag_ != SPACE_TAG
                            for token in doc)
        if not has_subject and not has_predicate:
            return None
        search_params = get_search_parameters(has_subject)
        replacer = next(
            (token for token in doc
             if token.dep_ == search_params.dep and token.tag_ != SPACE_TAG),
            None)
        if replacer is None:
            return None
        return replacer.text

    encrypted_data = ''
    for sentence in sentences:
        tokenized = models_holder.tokenize_sentence(sentence)
        replacer = find_replaced(tokenized)
        if replacer is None:
            continue
        # Each hidden bit is recovered from the parity of the replacer's length.
        encrypted_data += str(len(replacer) % 2)
        if len(encrypted_data) % BITS_IN_BYTE == 0 and encrypted_data.endswith(STOP_SIGN):
            break
    # assert len(encrypted_data) % BITS_IN_BYTE == 0, \
    #     f'Data is incorrect, not enough bits {len(encrypted_data)}'
    encrypted_data = encrypted_data[:-BITS_IN_BYTE]
    return _convert_binary_to_string(encrypted_data), encrypted_data
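# A minimal sketch of the assumed `_convert_binary_to_string` counterpart
# (hypothetical name and behavior): regroup the bit string into bytes and
# decode, returning None for malformed input.
from typing import Optional

def _convert_binary_to_string_sketch(bits: str) -> Optional[str]:
    if not bits or len(bits) % 8 != 0:
        return None
    data = bytes(int(bits[i:i + 8], 2) for i in range(0, len(bits), 8))
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return None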
formatter = dtFrm.LANLDataFormatter(data_df=data_df, data_type='train',
                                    doTransform=True, doScale=True,
                                    cols_to_keep=50)
data_df = formatter.transform()
most_dependent_columns = formatter.getMostImpCols()
# data_df = data_df.drop(['acc_max', 'acc_min', 'chg_acc_max', 'chg_acc_min'], axis=1)

# Splitting data into test_random_forest and train
# train_set, test_set = train_test_split(data_df, test_size=0.2,
#                                        random_state=np.random.randint(1, 1000))

# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)
y_train = np.around(y_train.values, decimals=2)

# mlpReg = MLPRegressor(verbose=True, tol=0.0001, max_iter=200000,
#                       n_iter_no_change=10000, hidden_layer_sizes=(200,))
mlpReg = MLPRegressor(verbose=True, max_iter=1000)
mlpReg.fit(x_train, y_train)

# Pickle the fitted model together with its most dependent columns.
model_name = 'mlp_regression.model'
mh = ModelHolder(mlpReg, most_dependent_columns)
mh.save(model_name)
mlpReg = None

# Reload from the same file to verify the round trip.
mh_new = load_model(model_name)
mlpReg, most_dependent_columns = mh_new.get()
y_pred = mlpReg.predict(x_train)
# y_pred = pd.Series(y_pred).apply(lambda x: float(x / 10))
print('MAE for Multi Layer Perceptron', mean_absolute_error(y_train, y_pred))
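# A minimal sketch of the save/load pattern used above, assuming ModelHolder
# simply pickles a (model, columns) pair; the class and function here are
# hypothetical stand-ins, not the project's actual implementation.
import pickle

class ModelHolderSketch:
    def __init__(self, model, columns):
        self.model = model
        self.columns = columns

    def save(self, path):
        # Serialize the holder itself so model and columns travel together.
        with open(path, 'wb') as f:
            pickle.dump(self, f)

    def get(self):
        return self.model, self.columns

def load_model_sketch(path):
    with open(path, 'rb') as f:
        return pickle.load(f)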
# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)

svReg = SVR(C=9137.08647605824366, cache_size=200, coef0=0.0, degree=2,
            epsilon=0.001, gamma=0.586414861763494, kernel='rbf',
            max_iter=-1, shrinking=True, tol=0.001, verbose=True)
svReg.fit(x_train, y_train)

# Pickle the fitted model together with its most dependent columns.
mh = ModelHolder(svReg, most_dependent_columns)
mh.save(model_name)
svReg = None

mh_new = load_model(model_name)
svReg, most_dependent_columns = mh_new.get()
y_pred = svReg.predict(x_train)
mae = mean_absolute_error(y_train, y_pred)
print('Mean Absolute Error', mae)
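# The very specific C/gamma values above suggest they came from a tuning run;
# a minimal sketch of how such values could be found (assumed workflow, not
# from the source):
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR

search = RandomizedSearchCV(
    SVR(kernel='rbf'),
    {'C': loguniform(1e0, 1e4), 'gamma': loguniform(1e-3, 1e0)},
    n_iter=20, scoring='neg_mean_absolute_error', random_state=0)
# search.fit(x_train, y_train); svReg = search.best_estimator_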
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)

# y_test = test_set['time_to_failure']
# x_test_seg = test_set['segment_id']
# x_test = test_set.drop(['time_to_failure', 'segment_id'], axis=1)

# Round the continuous target to integers so it can serve as class labels.
y_train = y_train.apply(lambda x: int(round(x)))

logReg = LogisticRegression(solver='lbfgs',
                            random_state=np.random.randint(1, 1000),
                            max_iter=200000, n_jobs=4, multi_class='auto',
                            verbose=2)
logReg.fit(x_train, y_train)

# Pickle the fitted model together with its most dependent columns.
model_name = 'logistic_regression.model'
mh = ModelHolder(logReg, most_dependent_columns)
mh.save(model_name)
logReg = None

mh_new = load_model(model_name)
logReg, most_dependent_columns = mh_new.get()
y_pred = logReg.predict(x_train)
# y_pred = pd.Series(y_pred).apply(lambda x: float(x / 10))
print('MAE for Logistic', mean_absolute_error(y_train, y_pred))
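# The commented-out split above hints at held-out evaluation; the training MAE
# printed here is optimistic. A minimal sketch of the assumed alternative:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

x_tr, x_te, y_tr, y_te = train_test_split(x_train, y_train,
                                          test_size=0.2, random_state=0)
logReg.fit(x_tr, y_tr)
print('Held-out MAE for Logistic',
      mean_absolute_error(y_te, logReg.predict(x_te)))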
        }), 400

    errors = []
    if not model_holder.set_new_model(request.json['A']):
        errors.append("cannot set model A, model with given name does not exist")
    if not model_holder.set_default_model(request.json['B']):
        errors.append("cannot set model B, model with given name does not exist")

    # error response if a requested model is missing
    if len(errors) > 0:
        return jsonify({
            'status': 'missing model(s)',
            'missing': str(errors)
        }), 400

    return jsonify({'status': 'ok', 'active': model_holder.ab}), 200


if __name__ == '__main__':
    dataset = obtain_dataset_table()
    dataset, le_cat, le_subcat, le_city = code_labels(dataset)
    # models = [
    #     {'name': 'tree model', 'model': load("tree_model.pkl"), 'filename': "tree_model.pkl"},
    #     {'name': 'knn model', 'model': load("knn_model.pkl"), 'filename': "knn_model.pkl"},
    #     {'name': 'xgb model', 'model': load("xgb_model.pkl"), 'filename': "xgb_model.pkl"}]
    model_holder = ModelHolder([], le_cat, le_subcat, le_city)
    app.run(debug=False)
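# A minimal sketch of the A/B switching interface the route above relies on;
# this is a hypothetical stand-in (the real ModelHolder also carries the label
# encoders), shown only to illustrate set_new_model/set_default_model/ab:
class ABModelHolderSketch:
    def __init__(self, models):
        # `models` as in the commented-out list above: dicts with name/model.
        self.models = {m['name']: m['model'] for m in models}
        self.ab = {'A': None, 'B': None}

    def set_new_model(self, name):
        if name not in self.models:
            return False
        self.ab['A'] = name
        return True

    def set_default_model(self, name):
        if name not in self.models:
            return False
        self.ab['B'] = name
        return True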