Example #1
def run_model(df_train, df_test, config, save_imp=True, params=None):
    # Run the model: fit on df_train, then predict on df_test
    # Warning: drop_list columns must already be dropped
    mdf = model.ModelFactory(target_col=config['target_col'],
                             index_col=config['index_col'])
    mdl = mdf.build_model(config, params)
    mdh = ModelHolder(config,
                      target_col=config['target_col'],
                      index_col=config['index_col'])
    mdh.fit_predict(mdl, df_train, df_test, stack_prefix=STACK_PREFIX)
    imps = mdh.get_feature_imp()
    if save_imp and imps is not None:
        imps.to_csv(IMP_SAVEFILE, header=False)
    return mdh
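A minimal usage sketch for run_model; the file names, column names, and the extra config key below are hypothetical assumptions, not part of the original example:

import pandas as pd

# Hypothetical inputs (file names and column names are assumptions);
# drop_list columns must already be removed, as the warning above requires.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
config = {
    'target_col': 'target',   # assumed target column name
    'index_col': 'id',        # assumed index column name
    'model_type': 'lgbm',     # assumed extra key consumed by ModelFactory.build_model
}
holder = run_model(df_train, df_test, config, save_imp=True)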
Example #2
def encrypt(data_to_hide: str, sentences: List[str],
            models_holder: ModelHolder) -> Tuple[List[str], List[str], str]:
    binary_data = _make_binary_data(data_to_hide)

    binary_data_len = len(binary_data)
    sentences_count = len(sentences)

    assert sentences_count >= binary_data_len, 'Sentence count must not be less than the binary data length'

    output_sentences: List[str] = []
    hiders: List[str] = []
    inserted_bits = 0
    for sentence in sentences:
        bit = (binary_data[inserted_bits]
               if inserted_bits < binary_data_len else None)
        tokenized = models_holder.tokenize_sentence(sentence)
        updated_sentence, is_inserted = _update_sentence(
            bit, tokenized, models_holder)
        output_sentences.append(updated_sentence)

        if is_inserted:
            inserted_bits += 1
            hiders.append(updated_sentence)

    assert inserted_bits == binary_data_len, f"Not enough data. Probably some similarities are missed. " \
                                             f"Try to extend text data.\n" \
                                             f"Bits inserted:      {inserted_bits}\n" \
                                             f"Binary data length: {binary_data_len}"

    return output_sentences, hiders, binary_data[:-8]
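The binary_data[:-8] slice here, together with the STOP_SIGN check in the decrypt example below, suggests that _make_binary_data appends a terminating byte to the bit string. A minimal sketch of such a helper, assuming an 8-bits-per-byte UTF-8 encoding and an all-zero stop byte (both assumptions):

BITS_IN_BYTE = 8
STOP_SIGN = '0' * BITS_IN_BYTE  # assumed terminator; the real value may differ

def _make_binary_data(data_to_hide: str) -> str:
    # Encode every byte of the message as 8 bits, then append a stop byte
    # so the decoder can detect where the payload ends.
    payload = ''.join(format(byte, '08b') for byte in data_to_hide.encode('utf-8'))
    return payload + STOP_SIGN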
Example #3
def decrypt(sentences: List[str], models_holder: ModelHolder) -> Tuple[Optional[str], str]:
    def find_replaced(doc: spacy.tokens.Doc) -> Optional[str]:
        has_subject = any(token.dep_ == SUBJECT_DEP and token.tag_ != SPACE_TAG
                          for token in doc)
        has_predicate = any(token.dep_ == ROOT_DEP and token.tag_ != SPACE_TAG
                            for token in doc)
        if not has_subject and not has_predicate:
            return None

        search_params = get_search_parameters(has_subject)

        replacer = next(
            (token for token in doc
             if token.dep_ == search_params.dep and token.tag_ != SPACE_TAG),
            None)
        if replacer is None:
            return None

        return replacer.text

    encrypted_data = ''

    for sentence in sentences:
        tokenized = models_holder.tokenize_sentence(sentence)
        replacer = find_replaced(tokenized)

        if replacer is None:
            continue

        encrypted_data += str(len(replacer) % 2)

        if (len(encrypted_data) % BITS_IN_BYTE == 0
                and encrypted_data.endswith(STOP_SIGN)):
            break

    # assert len(encrypted_data) % BITS_IN_BYTE == 0, f'Data is incorrect, not enough bits {len(encrypted_data)}'
    encrypted_data = encrypted_data[:-BITS_IN_BYTE]

    return _convert_binary_to_string(encrypted_data), encrypted_data
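For completeness, a plausible sketch of the _convert_binary_to_string helper used above, assuming the same 8-bit-per-character layout (an assumption, not the original implementation):

from typing import Optional

BITS_IN_BYTE = 8

def _convert_binary_to_string(binary: str) -> Optional[str]:
    # Split the bit string into 8-bit chunks and decode them as UTF-8 bytes.
    if not binary or len(binary) % BITS_IN_BYTE != 0:
        return None
    raw = bytes(int(binary[i:i + BITS_IN_BYTE], 2)
                for i in range(0, len(binary), BITS_IN_BYTE))
    return raw.decode('utf-8', errors='replace')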
Example #4
formatter = dtFrm.LANLDataFormatter(data_df=data_df, data_type='train', doTransform=True, doScale=True, cols_to_keep=50)
data_df = formatter.transform()
most_dependent_columns = formatter.getMostImpCols()

# data_df = data_df.drop(['acc_max','acc_min','chg_acc_max','chg_acc_min'],axis=1)
# Splitting data into train and test sets
# train_set, test_set = train_test_split(data_df, test_size=0.2, random_state=np.random.randint(1, 1000))

# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)

y_train = np.around(y_train.values, decimals=2)

# mlpReg = MLPRegressor(verbose=True, tol=0.0001, max_iter=200000, n_iter_no_change=10000, hidden_layer_sizes=(200,))
mlpReg = MLPRegressor(verbose=True, max_iter=1000)
mlpReg.fit(x_train, y_train)

# Wrap the fitted model in a ModelHolder and pickle it to disk
mh = ModelHolder(mlpReg, most_dependent_columns)
mh.save('mlp_regression.model')
mlpReg = None
mh_new = load_model('mlp_regression.model')
mlpReg, most_dependent_columns = mh_new.get()
y_pred = mlpReg.predict(x_train) 
# y_pred = pd.Series(y_pred).apply(lambda x: float(x / 10))

print('MAE for Multi Layer Perceptron', mean_absolute_error(y_train, y_pred))
# Separate output from inputs
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)

svReg = SVR(C=9137.08647605824366,
            cache_size=200,
            coef0=0.0,
            degree=2,
            epsilon=0.001,
            gamma=0.586414861763494,
            kernel='rbf',
            max_iter=-1,
            shrinking=True,
            tol=0.001,
            verbose=True)

svReg.fit(x_train, y_train)
# Wrap the fitted model in a ModelHolder and pickle it to disk
mh = ModelHolder(svReg, most_dependent_columns)
mh.save(model_name)
svReg = None
mh_new = load_model(model_name)
svReg, most_dependent_columns = mh_new.get()

y_pred = svReg.predict(x_train)

mae = mean_absolute_error(y_train, y_pred)
print('MAE for SVR', mae)
y_train = data_df['time_to_failure']
x_train_seg = data_df['segment_id']
x_train = data_df.drop(['time_to_failure', 'segment_id'], axis=1)

# y_test = test_set['time_to_failure']
# x_test_seg = test_set['segment_id']
# x_test = test_set.drop(['time_to_failure'], axis=1)
# x_test = x_test.drop(['segment_id'], axis=1)

y_train = y_train.apply(lambda x: int(round(x)))

logReg = LogisticRegression(solver='lbfgs',
                            random_state=np.random.randint(1, 1000),
                            max_iter=200000,
                            n_jobs=4,
                            multi_class='auto',
                            verbose=2)
logReg.fit(x_train, y_train)

# Wrap the fitted model in a ModelHolder and pickle it to disk
mh = ModelHolder(logReg, most_dependent_columns)
mh.save('logistic_regression.model')
logReg = None
mh_new = load_model('logistic_regression.model')
logReg, most_dependent_columns = mh_new.get()
y_pred = logReg.predict(x_train)
# y_pred = pd.Series(y_pred).apply(lambda x: float(x / 10))

print('MAE for Logistic', mean_absolute_error(y_train, y_pred))
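The script relies on ModelHolder and load_model pickling helpers that are not shown. A minimal sketch of what they could look like, reconstructed only from how they are used above (the pickle-based implementation is an assumption):

import pickle

class ModelHolder:
    """Bundles a fitted estimator with the column list it expects."""

    def __init__(self, model, most_dependent_columns):
        self.model = model
        self.most_dependent_columns = most_dependent_columns

    def get(self):
        return self.model, self.most_dependent_columns

    def save(self, model_name):
        with open(model_name, 'wb') as f:
            pickle.dump(self, f)


def load_model(model_name):
    with open(model_name, 'rb') as f:
        return pickle.load(f)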
Example #7
        }), 400

    errors = []
    if not model_holder.set_new_model(request.json['A']):
        errors.append(
            "cannot set model A, model with given name does not exist")
    if not model_holder.set_default_model(request.json['B']):
        errors.append(
            "cannot set model B, model with given name does not exist")
    # error response if any requested model does not exist
    if len(errors) > 0:
        return jsonify({
            'status': 'missing model(s)',
            'missing': str(errors)
        }), 400

    return jsonify({'status': 'ok', 'active': model_holder.ab}), 200


if __name__ == '__main__':
    dataset = obtain_dataset_table()
    dataset, le_cat, le_subcat, le_city = code_labels(dataset)

    # models = [
    # {'name': 'tree model', 'model': load("tree_model.pkl"), 'filename': "tree_model.pkl"},
    # {'name': 'knn model', 'model': load("knn_model.pkl"), 'filename': "knn_model.pkl"},
    # {'name': 'xgb model','model': load("xgb_model.pkl"), 'filename': "xgb_model.pkl"}]
    model_holder = ModelHolder([], le_cat, le_subcat, le_city)

    app.run(debug=False)
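A hypothetical client-side call against the A/B-switch endpoint handled above; the '/ab' route path is an assumption, while the JSON keys 'A' and 'B', the example model names, and the response shapes follow from the handler and the commented-out model list:

import requests

# Assumed route path and default Flask port; adjust to the actual deployment.
resp = requests.post('http://localhost:5000/ab',
                     json={'A': 'tree model', 'B': 'knn model'})
print(resp.status_code)  # 200 if both models exist, 400 otherwise
print(resp.json())       # e.g. {'status': 'ok', 'active': ...}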