def test_linear_regression(loga=False, norm=False):
    """Run one linear-regression experiment and write out its results.

    Data is loaded through ``import_data`` using the module-level
    ``FILE_PATH``, ``TRAIN_SPLIT`` and ``FEATURE_SET``; the prediction
    returned by ``linear_regression`` is then handed to
    ``write_price_differences``.

    Args:
        loga: Passed to ``import_data`` as its first argument and recorded
            under the ``'logarithm'`` key of the run settings.
        norm: Passed to ``import_data`` as its second argument and recorded
            under the ``'normalize'`` key of the run settings.
    """
    # Settings dictionary forwarded to the model and to the result writer.
    run_settings = {
        'name': 'name',
        'feature_set': FEATURE_SET,
        'logarithm': loga,
        'normalize': norm,
        'train_split': TRAIN_SPLIT,
    }

    # import_data yields seven values; the last one is not used here.
    (train_x, train_y, test_x, test_y,
     L2_matrix, saved_data, _scaler) = import_data(
         loga, norm, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)

    predicted = linear_regression(
        train_x, train_y, test_x, L2_matrix, run_settings)
    write_price_differences(
        predicted, test_x, test_y, L2_matrix, saved_data, run_settings)
def train_n_test_model(times, parameters):
    """Train and evaluate the models ``times`` times on one data load.

    The data is imported once; every iteration then calls
    ``train_and_evaluate_models`` and passes its prediction to
    ``write_price_differences``.

    Args:
        times: Number of train/evaluate/write rounds to run.
        parameters: Mapping that must provide the keys ``'logarithm'``,
            ``'normalize'``, ``'train_split'`` and ``'feature_set'``; it is
            also forwarded unchanged to the model and writer helpers.
    """
    # import_data yields seven values; the last one is not used here.
    (train_x, train_y, test_x, test_y,
     L2_matrix, saved_data, _scaler) = import_data(
         parameters['logarithm'],
         parameters['normalize'],
         FILE_PATH,
         parameters['train_split'],
         parameters['feature_set'],
     )

    for _ in range(times):
        predicted = train_and_evaluate_models(
            train_x, train_y, test_x, L2_matrix, parameters)
        write_price_differences(
            predicted, test_x, test_y, L2_matrix, saved_data, parameters)
示例#3
0
def calculate_distance(lat, long):
    """Print the label of the reference point closest to the given position.

    Every entry returned by ``preprocessing.import_data()`` is compared
    against the query position with ``distance.haversine``; the first element
    of the entry with the smallest distance is printed. If there are no
    entries, an empty string is printed.

    Args:
        lat: Query latitude; anything ``float()`` accepts.
        long: Query longitude; anything ``float()`` accepts.
    """
    # float('inf') replaces sys.maxint, which no longer exists on Python 3,
    # and is a correct "larger than any distance" sentinel on Python 2 too.
    nearest_distance = float('inf')
    nearest_point = ""
    for point in preprocessing.import_data():
        # NOTE(review): argument order suggests point[1] is a latitude and
        # point[2] a longitude -- confirm against preprocessing.import_data.
        point_distance = distance.haversine(float(long), float(lat),
                                            float(point[2]), float(point[1]))
        if point_distance < nearest_distance:
            nearest_distance = point_distance
            nearest_point = point[0]
    # Function-call form prints identically on Python 2 (single argument)
    # and is the only valid form on Python 3.
    print(nearest_point)
def test_deep_learning_GS(epochs, lr, constant, dropout):
    """Run a grid search for the deep-learning model over a one-point grid.

    Data is imported with ``logarithm=True`` and ``normalize=False``. Each
    hyper-parameter is wrapped in a single-element list and the resulting
    grid is passed to ``do_grid_search`` together with ``create_model``.

    Args:
        epochs: Sole candidate for the ``'epochs'`` grid entry.
        lr: Sole candidate for the ``'learn_rate'`` grid entry.
        constant: Sole candidate for the ``'constant'`` grid entry.
        dropout: Sole candidate for the ``'dropout'`` grid entry.
    """
    # --- data import -------------------------------------------------------
    use_log = True
    use_norm = False

    run_settings = {
        'name': 'with_logarithm',
        'feature_set': FEATURE_SET,
        'logarithm': use_log,
        'normalize': use_norm,
        'train_split': TRAIN_SPLIT,
    }

    # Only the training split is needed by the grid search below.
    (train_x, train_y, _test_x, _test_y,
     _L2_matrix, _saved_data, _scaler) = import_data(
         use_log, use_norm, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)

    # --- search grid -------------------------------------------------------
    # Not referenced anywhere else in this function.
    set_of_features = [0, 1, 2, 3, 4, 5, 6, 7, 8]

    grid = {
        'batch_size': [16],
        'epochs': [epochs],
        'hidden_layers': [4],
        'neurons': [256],
        'learn_rate': [lr],
        'batchnormalize': [False],
        'init_mode': ['he_normal'],
        # Number of columns in the training matrix.
        'input_dimension': [train_x.shape[1]],
        'optimizer': ['adam'],
        'dropout': [dropout],
        'dropout_rate': [0.1],
        'constant': [constant],
    }

    do_grid_search(train_x, train_y, grid, create_model, run_settings)
def test_svr():
    """Run one support-vector experiment and write out its results.

    Data is imported with both ``logarithm`` and ``normalize`` enabled; the
    prediction from ``support_vector_machine`` is passed to
    ``write_price_differences``.
    """
    use_log = True
    use_norm = True

    run_settings = {
        'name': 'name',
        'feature_set': FEATURE_SET,
        'logarithm': use_log,
        'normalize': use_norm,
        'train_split': TRAIN_SPLIT,
    }

    # import_data yields seven values; the last one is not used here.
    (train_x, train_y, test_x, test_y,
     L2_matrix, saved_data, _scaler) = import_data(
         use_log, use_norm, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)

    # NOTE(review): the three numeric arguments are presumably the SVR
    # hyper-parameters (epsilon, gamma, C) -- confirm against
    # support_vector_machine's signature.
    predicted = support_vector_machine(
        train_x, train_y, test_x, L2_matrix, 1e-06, 300, 0.01, run_settings)
    write_price_differences(
        predicted, test_x, test_y, L2_matrix, saved_data, run_settings)
def test_random_forest():
    """Run the random-forest experiment three times on one data load.

    Data is imported with ``logarithm=True`` and ``normalize=False``. Each
    round calls ``random_forrest`` and passes its prediction to
    ``write_price_differences``.
    """
    use_log = True
    use_norm = False

    run_settings = {
        'name': 'logarithm',
        'feature_set': FEATURE_SET,
        'logarithm': use_log,
        'normalize': use_norm,
        'train_split': TRAIN_SPLIT,
    }

    # import_data yields seven values; the last one is not used here.
    (train_x, train_y, test_x, test_y,
     L2_matrix, saved_data, _scaler) = import_data(
         use_log, use_norm, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)

    for _ in range(3):
        predicted = random_forrest(
            train_x, train_y, test_x, L2_matrix, run_settings)
        write_price_differences(
            predicted, test_x, test_y, L2_matrix, saved_data, run_settings)
def test_svr_grid():
    """Run the manual SVR grid search over a one-point grid.

    Data is imported with ``logarithm`` and ``normalize`` both disabled, and
    the candidate lists for epsilon, gamma and C are handed to
    ``manual_gridsearch_svr``.
    """
    use_log = False
    use_norm = False

    run_settings = {
        'name': 'name',
        'feature_set': FEATURE_SET,
        'logarithm': use_log,
        'normalize': use_norm,
        'train_split': TRAIN_SPLIT,
    }

    # import_data yields seven values; the last two are not used here.
    (train_x, train_y, test_x, test_y,
     L2_matrix, _saved_data, _scaler) = import_data(
         use_log, use_norm, FILE_PATH, TRAIN_SPLIT, FEATURE_SET)

    # Candidate values, one per hyper-parameter.
    epsilon_candidates = [1e-5]
    gamma_candidates = [200]
    c_candidates = [0.01]

    manual_gridsearch_svr(
        train_x, train_y, test_x, test_y, L2_matrix,
        epsilon_candidates, gamma_candidates, c_candidates,
        run_settings,
    )
示例#8
0
# -*- coding: utf-8 -*-
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
import pydot
import numpy as np
import preprocessing
from math import sqrt


# Label convention: 1 = bots, 0 = legit.

# Row/column windows shared by the slicing below.
TRAIN_ROWS = slice(None, 12000)
TEST_ROWS = slice(15000, 16000)
FEATURE_COLS = slice(3, None)  # everything from column index 3 onward

# --- training data ---------------------------------------------------------
df = preprocessing.import_data()
train_bots = df[1].values[TRAIN_ROWS, FEATURE_COLS].astype(int)
train_legit = df[2].values[TRAIN_ROWS, FEATURE_COLS].astype(int)

feature_list = df[3][FEATURE_COLS]

# Bots first, then legit rows.
X_train = np.vstack([train_bots, train_legit])

# --- testing data ----------------------------------------------------------
test_bots = df[1].values[TEST_ROWS, FEATURE_COLS].astype(int)
test_legit = df[2].values[TEST_ROWS, FEATURE_COLS].astype(int)

X_test = np.vstack([test_bots, test_legit])

# --- training labels -------------------------------------------------------
# One column vector per class, in the same order as the rows of X_train.
train_bots_label = np.ones((len(train_bots), 1))
train_legit_label = np.zeros((len(train_legit), 1))

# Stack the two label columns and flatten to 1-D in C order.
Y_train = np.vstack([train_bots_label, train_legit_label]).ravel(order='C')
示例#9
0
文件: main.py 项目: mgetech/SubLoc
import pandas as pd
import evaluation as evaal
import preprocessing as prp
import functions as func
from datetime import datetime
from tensorflow import keras
#print(keras.__version__)
from keras.preprocessing.sequence import pad_sequences
"""### Preprocessing"""

#@title Please choose the algorithm that you want to use: { form-width: "250px", display-mode: "both" }
algorithm = "SVM"  #@param ["SVM", "BiLSTM"]

# ---------------------------------------------------------------------------
# Data import
# ---------------------------------------------------------------------------

X, y = prp.import_data(algorithm)

# ---------------------------------------------------------------------------
# Preprocessing: ordinal encoding
# ---------------------------------------------------------------------------

# One single-character code per list element, in this exact order.
# NOTE(review): presumably the position in this list determines the integer
# assigned by prp.create_mapping -- confirm before reordering.
amino_codes = list('0ACEDGFIHKMLNQPSRTWVY')
non_amino_letters = list('BJOUXZ')

amino_mapping = prp.create_mapping(amino_codes)

# Encode the 'seq' column with the mapping and store it as a new column.
X['mapped_seq'] = prp.integer_encoding(X['seq'], amino_mapping)