def main(generate='xgb', scaling=False):
    """ Runs the different processing steps to generate a
    solution in csv format"""

    print("loading data in memory...")
    (df_train, target, df_test) = load_data()

    print("preparing datasets...")
    (X, y, X_test, label_encoder) = prepare_datasets(df_train, df_test, target)

    if generate == 'benchmark1':
        print("Generating submission...")
        return generate_benchmark1_submission(df_train,
                                              target,
                                              df_test['id'])
    elif generate == 'benchmark2':
        print("Generating submission...")
        return generate_benchmark2_submission(df_train,
                                              target,
                                              df_test['id'])
    else:
        if scaling:
            print("Transforming datasets...")
            (X, scaler, transformer) = feature_transformation(X)
            X_test = transformer.transform(scaler.transform(X_test))

        print("Training classifier...")
        clf = train_classifier(X, y)

        print("Generating submission...")
        return generate_submission(
            clf, X_test, label_encoder, df_test['id'], clf_name=generate)
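
# Usage sketch (an assumption; the __main__ guard is not part of the original
# snippet): run the full pipeline with the default XGBoost classifier.
if __name__ == '__main__':
    submission = main(generate='xgb', scaling=True)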
Example #2
def train_model_validation(filename_train_validation_set,
                           filename_labels_train_validation_set,
                           filter_density,
                           dropout,
                           input_shape,
                           output_shape,
                           file_path_model,
                           filename_log,
                           channel=1):
    """
    train model with validation
    """

    filenames_train, Y_train, \
    filenames_validation, Y_validation, \
    filenames_features, Y_train_validation = \
        load_data(filename_labels_train_validation_set)

    model_0 = model_switcher(filter_density, dropout, input_shape,
                             output_shape)

    # print(model_0.summary())

    batch_size = 256
    patience = 15

    # print(model_0.count_params())

    model_train_validation(model_0, batch_size, patience, input_shape,
                           filename_train_validation_set, filenames_train,
                           Y_train, filenames_validation, Y_validation,
                           file_path_model, filename_log, channel)
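
# Usage sketch (hypothetical; every path and hyper-parameter below is a
# placeholder rather than a value from the original project):
train_model_validation('features_train_validation.h5',
                       'labels_train_validation.pickle',
                       filter_density=1,
                       dropout=0.3,
                       input_shape=(1, 80, 80),
                       output_shape=1,
                       file_path_model='./models/model.h5',
                       filename_log='./logs/training.log',
                       channel=1)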
Example #3
def load_model_data():
    '''
    Loads the model datasets from the respective files

    Returns:
    results -- dictionary containing the following
        train_x_nb -- training x for naive bayes data set
        train_y -- training y data set
        test_x_nb -- test x for naive bayes data set
        test_y -- test y data set
        train_x_nn -- training x for neural nets (& Logistic regression)  data set
        test_x_nn -- test x for neural nets (& Logistic regression)  data set
    '''
    _, train_x_nb, _, _, _ = load_data(constants.train_x_nb_file, [], [])
    _, train_y, _, _, _ = load_data(constants.train_y_file, [], [])
    _, test_x_nb, _, _, _ = load_data(constants.test_x_nb_file, [], [])
    _, test_y, _, _, _ = load_data(constants.test_y_file, [], [])
    _, train_x_nn, _, _, _ = load_data(constants.train_x_nn_file, [], [])
    _, test_x_nn, _, _, _ = load_data(constants.test_x_nn_file, [], [])

    results = {}
    results['train_x_nb'] = train_x_nb
    results['train_y'] = train_y
    results['test_x_nb'] = test_x_nb
    results['test_y'] = test_y
    results['train_x_nn'] = train_x_nn
    results['test_x_nn'] = test_x_nn

    return results
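
# Usage sketch (not part of the original): load the prepared matrices and pull
# the naive bayes training pair out of the returned dictionary.
model_data = load_model_data()
train_x_nb, train_y = model_data['train_x_nb'], model_data['train_y']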
Example #4
def datasets(scaling=False):
    """ Prepares the datasets and returns (X_train, X_test, y_train, y_test)"""

    print("loading data in memory...")
    (df_train, target, df_test) = load_data()

    print("preparing datasets...")
    (X, y, _, _) = prepare_datasets(df_train, df_test, target)

    if scaling:
        print("Transforming datasets...")
        (X, _, _) = feature_transformation(X)

    return train_test_split(X, y, test_size=.30, random_state=42)
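
# Usage sketch (not part of the original): the return order follows
# sklearn's train_test_split convention, as the docstring states.
X_train, X_test, y_train, y_test = datasets(scaling=True)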
Example #5
    def test_load_data(self):
        """
        Tests the load data funciton. required TESTING_data.csv file
        Reads from test file TESTING_data.csv, and verifies contents
        """
        header_orig_data, data_orig_data, _ = load_data(
            './datasets/TESTING_data.csv')

        header_array = np.array(['Item1', 'Item2', 'Item3', 'Item4'])
        # load_data is transposing this item
        header_array = header_array
        nptest.assert_array_equal(header_orig_data, header_array)

        data_array = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])
        # load_data is transposing this item
        data_array = data_array
        nptest.assert_array_equal(data_orig_data, data_array)
Example #6
    def test_partition_data(self):
        """
        Tests the partition data funciton. required TESTING_partition_data.csv file
        """
        _, data_orig_data, _ = load_data(
            './datasets/TESTING_partition_data.csv')

        train_set, test_set, dev_set = partition_data(data_orig_data)
        self.assertEqual(train_set.shape[0], 6)
        self.assertEqual(test_set.shape[0], 2)
        self.assertEqual(dev_set.shape[0], 2)

        # Test a different partition configuration
        train_set, test_set, dev_set = partition_data(data_orig_data, 0.8, 0.1)
        self.assertEqual(train_set.shape[0], 8)
        self.assertEqual(test_set.shape[0], 1)
        self.assertEqual(dev_set.shape[0], 1)

        # Test a configuration with no dev
        train_set, test_set, dev_set = partition_data(data_orig_data, 0.9, 0.1)
        self.assertEqual(train_set.shape[0], 9)
        self.assertEqual(test_set.shape[0], 1)
        self.assertEqual(dev_set.shape[0], 0)
Example #7
def data_preparation(filename):
    """
    Executed only once to create train, test and dev datasets.

    Arguments:
    filename -- the file of the original data

    Returns:
    This function creates three files
    hr_train.csv -- the training dataset
    hr_test.csv -- the testing dataset
    hr_dev.csv -- the development dataset
    """

    # load original dataset
    hr_header_orig, hr_data_orig, _, _, _ = load_data(filename, [], [])

    # partition data 60/20/20
    hr_train, hr_test, hr_dev = partition_data(hr_data_orig)

    # save results in files
    save_data('./datasets/hr_train.csv', hr_header_orig, hr_train)
    save_data('./datasets/hr_test.csv', hr_header_orig, hr_test)
    save_data('./datasets/hr_dev.csv', hr_header_orig, hr_dev)
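
# Usage sketch (an assumption): point the function at the HR attrition CSV used
# elsewhere in these examples to produce hr_train.csv, hr_test.csv and hr_dev.csv.
data_preparation('./datasets/WA_Fn-UseC_-HR-Employee-Attrition.csv')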
Example #8
from layers import Softmax, Linear, RNN  # , LSTM
from loss import CrossEntropyLoss
from optimizer import SGDOptimizer
from network import Network
from data_preparation import load_data
from solve_rnn import solve_rnn

import theano.tensor as T

X_train, y_train, X_test, y_test = load_data()

HIDDEN_DIM = 32
INPUT_DIM = 20
OUTPUT_DIM = 10

model = Network()
model.add(RNN('rnn1', HIDDEN_DIM, INPUT_DIM, 0.1))      # output shape: 4 x HIDDEN_DIM
model.add(Linear('fc', HIDDEN_DIM, OUTPUT_DIM, 0.1))    # output shape: 4 x OUTPUT_DIM
model.add(Softmax('softmax'))

loss = CrossEntropyLoss('xent')

optim = SGDOptimizer(0.01, 0.0001, 0.9)
input_placeholder = T.fmatrix('input')
label_placeholder = T.fmatrix('label')

model.compile(input_placeholder, label_placeholder, loss, optim)

MAX_EPOCH = 6
DISP_FREQ = 1000
TEST_FREQ = 10000
Example #9
from __future__ import print_function
from data_preparation import load_data
from set_session import set_session
import resnet
import keras
import os

batch_size = 64
epochs = 150
im_dir = "../dataset/images/"
train_im_list = "../dataset/SUNAttributeDB/train.txt"
test_im_list = "../dataset/SUNAttributeDB/test.txt"
set_session(0)
# The data, shuffled and split between train and test sets:
(x_train, y_train), (x_test, y_test) = load_data(im_dir, train_im_list,
                                                 test_im_list)

model = resnet.ResnetBuilder.build_resnet_34((3, 224, 224), 102)

# initiate SGD optimizer
opt = keras.optimizers.SGD(lr=0.05, decay=1e-6)

# Let's train the model using SGD
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])

print('Not using data augmentation.')
checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='./checkpoint/checkpoint-{epoch:02d}-{val_loss:.2f}.h5')
model.fit(x_train,
          y_train,
          batch_size=batch_size,
Example #10
import datetime

currentDT = str(datetime.datetime.now()).replace(':', '_').replace('.', '_').replace(' ', '_')
print(currentDT)


# ======================================================= DATA Preparation =============================================

# We need the data as: an annotations dataframe, a documents dataframe, and,
# if training, a split of the documents into train_docs and test_docs.
# The load_data function can output these when given an annotations dataframe and
# the path to the files (see the required format in data_preparation).



all_annotations, documents, train_docs, test_docs = load_mt_samples()  # MT samples data
#cris_annotations, cris_documents, cris_train_docs, cris_test_docs = load_data(...) # CRIS data

i2b2_annotations, i2b2_documents = load_data('../TimeDatasets/i2b2 Data/i2b2_timexe_annotations.xlsx')
test_i2b2_docs = i2b2_documents[i2b2_documents.test == True]
train_i2b2_docs = i2b2_documents[i2b2_documents.test == False]
print(i2b2_documents)
# ============================================= MODEL TRAINING =================================================

# model parameters
spacy_type = False   #choosing which typing to use. True for DATE and TIME types.
other_annotations = False
nb_iter = 2


if not spacy_type:
    output_dir = 'models/all_types_model_' + currentDT
else:
    output_dir = 'models/spacy_types_model_' + currentDT
Example #11
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.grid_search import RandomizedSearchCV # , GridSearchCV
from sklearn.cross_validation import train_test_split
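# note: in scikit-learn 0.20+ the two modules above were removed;
# RandomizedSearchCV and train_test_split now live in sklearn.model_selection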

from data_preparation import load_data, prepare_datasets
from data_transformation import feature_transformation
from metrics import ndcg_scorer

from evaluation import score_classifier, display_learning_curves


print("loading data in memory...")
(df_train, target, df_test) = load_data()

print("preparing datasets...")
(X, y, X_test, label_encoder) = prepare_datasets(df_train, df_test, target)

print("Transforming datasets...")
(X_transformed, scaler, transformer) = feature_transformation(X)
X_test_scaled = scaler.transform(X_test)
X_test_transformed = transformer.transform(X_test_scaled)

print("splitting data ")
(X_train, X_te, y_train, y_te) = train_test_split(
    X_transformed, y, test_size=.25, random_state=41)
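
# Sketch (not part of the original snippet): one plausible way to wire up the
# imported RandomizedSearchCV with the ndcg scorer; the parameter grid below
# is hypothetical.
param_distributions = {
    'max_depth': [3, 4, 5, 6],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'subsample': [0.5, 0.8, 1.0],
}
search = RandomizedSearchCV(XGBClassifier(), param_distributions,
                            n_iter=10, scoring=ndcg_scorer, cv=3)
search.fit(X_train, y_train)
print(search.best_params_)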


def main():
Example #12
from analyse import plot_comparison_curve
from data_preparation import load_data

column_drop_list = [
    constants.EMPLYEENO_R, constants.EMPLOYEECOUNT_R, constants.ISOVER18_R,
    constants.STDHOURS_R
]
encode_list = [
    constants.GENDER_T, constants.STATUS_T, constants.DEPARTMENT_T,
    constants.ROLE_T, constants.OVERTIME_T, constants.TRAVEL_T,
    constants.ISRESIGNED_T, constants.EDUCATION_T
]

file_path = '.\\datasets\\WA_Fn-UseC_-HR-Employee-Attrition.csv'

header, data, m_header, m_data, analytics = load_data(file_path, encode_list,
                                                      column_drop_list)

#analyse_data_init(header, analytics)

features_to_analyse = [
    constants.STOCKOPTIONS, constants.TRAINING, constants.SATISFACTION,
    constants.TEAMCLICK, constants.ROLE_T, constants.LEVEL,
    constants.DEPARTMENT_T, constants.EDUCATION_T, constants.GENDER_T,
    constants.COMPANIES, constants.STATUS_T, constants.RATING,
    constants.LIFEBALANCE, constants.INVOLVEMENT
]

features_to_analyse = [
    constants.RATING,
]
Example #13
def prepare_model_data(rewrite_files=False):
    '''
    Prepares the data set files used by the models.

    Returns:
    results -- dictionary containing the following
        train_x_nb -- training x for naive bayes data set
        train_y -- training y data set
        test_x_nb -- test x for naive bayes data set
        test_y -- test y data set
        train_x_nn -- training x for neural nets (& Logistic regression)  data set
        test_x_nn -- test x for neural nets (& Logistic regression)  data set
    '''

    # step 1:
    column_drop_list = [
        constants.EMPLYEENO_R, constants.EMPLOYEECOUNT_R, constants.ISOVER18_R,
        constants.STDHOURS_R
    ]
    encode_list = [
        constants.GENDER_T, constants.STATUS_T, constants.DEPARTMENT_T,
        constants.ROLE_T, constants.OVERTIME_T, constants.TRAVEL_T,
        constants.ISRESIGNED_T, constants.EDUCATION_T
    ]

    _, _, m_header, m_data, _ = load_data(constants.orig_file, encode_list,
                                          column_drop_list)

    save_data(constants.processed_file,
              m_header,
              m_data,
              override=rewrite_files)

    # step 2:
    train_data, test_data, _ = partition_data(m_data, 0.8, 0.2)

    output_idx = np.argwhere(m_header == constants.ISRESIGNED_T).squeeze()

    train_y = train_data[:, output_idx]
    train_x = np.delete(train_data, output_idx, 1)

    test_y = test_data[:, output_idx]
    test_x = np.delete(test_data, output_idx, 1)

    traintest_header = np.delete(m_header, output_idx, 0)

    save_data(constants.train_x_file,
              traintest_header,
              train_x,
              override=rewrite_files)
    save_data(constants.train_y_file,
              np.array([constants.ISRESIGNED_T]),
              train_y,
              override=rewrite_files)
    save_data(constants.test_x_file,
              traintest_header,
              test_x,
              override=rewrite_files)
    save_data(constants.test_y_file,
              np.array([constants.ISRESIGNED_T]),
              test_y,
              override=rewrite_files)

    # to execute naive bayes we will discretise continuous data
    column_bins_definition = {
        constants.AGE: 10,
        constants.DAILYRATE: 10,
        constants.HOMEDISTANCE: 10,
        constants.SALARY: 10,
        constants.HOURLYRATE: 10,
        constants.MONTHLYRATE: 10,
        constants.YEARSEMPLOYED: 5,
        constants.YEARSCOMPANY: 5,
        constants.YEARSROLE: 5,
        constants.YEARSLASTPROMO: 5,
        constants.YEARSMANAGER: 5,
        constants.LASTINCREMENTPERCENT: 16
    }

    train_x_nb = digitize_columns(traintest_header, train_x,
                                  column_bins_definition)
    test_x_nb = digitize_columns(traintest_header, test_x,
                                 column_bins_definition)

    save_data(constants.train_x_nb_file,
              traintest_header,
              train_x_nb,
              override=rewrite_files)
    save_data(constants.test_x_nb_file,
              traintest_header,
              test_x_nb,
              override=rewrite_files)

    columns_norm = [
        constants.AGE, constants.DAILYRATE, constants.HOMEDISTANCE,
        constants.SALARY, constants.HOURLYRATE, constants.MONTHLYRATE,
        constants.YEARSEMPLOYED, constants.YEARSCOMPANY, constants.YEARSROLE,
        constants.YEARSLASTPROMO, constants.YEARSMANAGER,
        constants.LASTINCREMENTPERCENT
    ]

    train_x_lr = feature_scale_columns(traintest_header, train_x, columns_norm)
    test_x_lr = feature_scale_columns(traintest_header, test_x, columns_norm)

    save_data(constants.train_x_lr_file,
              traintest_header,
              train_x_lr,
              override=rewrite_files)
    save_data(constants.test_x_lr_file,
              traintest_header,
              test_x_lr,
              override=rewrite_files)

    encode_list_nn = [
        constants.GENDER_T, constants.STATUS_T, constants.DEPARTMENT_T,
        constants.ROLE_T, constants.OVERTIME_T, constants.TRAVEL_T,
        constants.EDUCATION_T, constants.ENVIRONMENT, constants.INVOLVEMENT,
        constants.LEVEL, constants.SATISFACTION, constants.COMPANIES,
        constants.RATING, constants.TEAMCLICK, constants.STOCKOPTIONS,
        constants.TRAINING, constants.LIFEBALANCE
    ]

    train_test_header_nn, train_x_nn = encode_columns_nn(
        traintest_header, train_x_lr, encode_list_nn)
    _, test_x_nn = encode_columns_nn(traintest_header, test_x_lr,
                                     encode_list_nn)

    save_data(constants.train_x_nn_file,
              train_test_header_nn,
              train_x_nn,
              override=rewrite_files)
    save_data(constants.test_x_nn_file,
              train_test_header_nn,
              test_x_nn,
              override=rewrite_files)

    results = {}

    results['train_x_nb'] = train_x_nb
    results['train_y'] = train_y
    results['test_x_nb'] = test_x_nb
    results['test_y'] = test_y
    results['train_x_nn'] = train_x_nn
    results['test_x_nn'] = test_x_nn
    return results
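
# Usage sketch (not part of the original): rebuild the data set files on disk
# and reuse the returned dictionary directly.
model_data = prepare_model_data(rewrite_files=True)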
Example #14
from predict import digitize_columns
from predict import standardize_columns
from predict import encode_columns_nn

from data_preparation import load_data
from data_preparation import save_data

orig_file_path = '.\\datasets\\TESTING_ml.csv'
processed_file_path = '.\\datasets\\TESTING_ml_proc.csv'
nb_file_path = '.\\datasets\\TESTING_ml_nb.csv'
lr_file_path = '.\\datasets\\TESTING_ml_lr.csv'
nn_file_path = '.\\datasets\\TESTING_ml_nn.csv'

column_drop_list = ['Delete']
encode_list = ['Gender', 'Sport']
_, _, header, data, _ = load_data(orig_file_path, encode_list,
                                  column_drop_list)
save_data(processed_file_path, header, data)

column_bins_definition = {'Age': 5, 'Salary': 5}
data_nb = digitize_columns(header, data, column_bins_definition)
save_data(nb_file_path, header, data_nb)

columns_norm = ['Age', 'Salary']
data_lr = standardize_columns(header, data, columns_norm)
save_data(lr_file_path, header, data_lr)

print(header)
print(data_lr)
encode_list_nn = ['Gender', 'Sport']
header_nn, data_nn = encode_columns_nn(header, data_lr, encode_list_nn)
save_data(nn_file_path, header_nn, data_nn)