Пример #1
0
def main(opt):
    """Train, ensemble, or evaluate the word-prediction model.

    Modes (``opt.mode``):
      * ``"train"``    -- fit a fresh model and save it to ``opt.saved_model``.
      * ``"ensemble"`` -- load two saved models and score their combined
        predictions on ``opt.input``.
      * anything else  -- load ``opt.saved_model`` and predict/score.

    Relies on module-level helpers (load_data, build_model, TestCallback,
    predict_final_word*, make_submission, scoring) defined elsewhere.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)

        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)

        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)
        print("Training Model...")  # fixed typo ("Traning")
        model.fit(
            x_train,
            y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)

    elif opt.mode == "ensemble":
        # Suffix every layer name so the two loaded graphs can coexist
        # without layer-name collisions downstream.
        models = []
        for idx, path in enumerate((opt.saved_model1, opt.saved_model2), 1):
            member = load_model(path)
            # NOTE(review): assigning ``.name`` only works on older Keras
            # versions -- confirm against the pinned dependency.
            member.name = 'model%d' % idx
            for layer in member.layers:
                layer.name = layer.name + "_%d" % idx
            models.append(member)

        # Use a context manager so the vocab file handle is closed.
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word_models(models, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")

    else:
        model = load_model(opt.saved_model)
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
Пример #2
0
def main(opt):
    """Train the model, score a saved model on the validation set, or
    score a fixed ten-model ensemble, depending on ``opt.mode``.

    Relies on module-level helpers (load_data, build_model, TestCallback,
    predict_final_word, make_submission, scoring) defined elsewhere.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)

        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)

        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop1,
                            opt.drop2, sequence_length, vocabulary_size)
        print("Training Model...")
        model.fit(x_train,
                  y_train,
                  batch_size=opt.batch_size,
                  epochs=opt.epochs,
                  verbose=2,
                  callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "score_valid":  # flattened the nested ``else: if``
        model = load_model(opt.saved_model)
        # Context manager so the vocab file handle is closed.
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word([model], vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
    else:
        # Load the ten ensemble members in a loop instead of ten
        # copy-pasted load_model lines.
        model_list = [load_model('models/model%d.h5' % i) for i in range(10)]
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word(model_list, vocabulary,
                                          opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        scoring(sub_file, os.path.join("data"), type="valid")
Пример #3
0
def main(opt):
    """Train the model (mode ``"train"``) or load ``opt.saved_model`` and
    predict/score the final-word task (any other mode).

    Relies on module-level helpers (load_data, build_model, TestCallback,
    predict_final_word, make_submission, scoring) defined elsewhere.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)

        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)

        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size, opt.optimizer)
        print("Training Model...")  # fixed typo ("Traning")
        model.fit(
            x_train,
            y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)

    else:
        model = load_model(opt.saved_model)
        # Context manager so the vocab file handle is closed.
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
Пример #4
0
def main():
    """Full training pipeline: load data, build CV folds, drop rows,
    train the NN, write the submission, and optionally submit to Kaggle.

    Relies on module-level state (``cfg``, ``logger_path``, ``run_name``,
    ``comment``, ``const``, ``dh``, ``factory``) defined elsewhere in
    the file.
    """
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    # Persist the resolved config next to the logs for reproducibility.
    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = pd.read_csv(const.TRAIN_PATH)
        test_df = pd.read_csv(const.TEST_PATH)

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df)
        if cfg.validation.single:
            # Single-fold validation: keep fold_0 only; presumably the
            # division normalises the fold column into a 0/1 mask --
            # TODO confirm against factory.get_fold's output.
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        # Optionally drop configured "bad" rows from both the data and
        # the fold assignment, keeping them aligned.
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_df = train_df.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        trainer = NNTrainer(run_name, fold_df, cfg)
        cv = trainer.train(train_df=train_df,
                           target_df=train_df[const.TARGET_COL])
        preds = trainer.predict(test_df)
        trainer.save()

        # Embed the CV score in the log directory name, then silence
        # further logging output.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        make_submission(run_name=run_name_cv,
                        y_pred=preds,
                        target_name='Label',
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)
Пример #5
0
def main():
    """Run a 5-fold NN cross-validation experiment end to end and write
    the gzip'd test-set submission.

    Relies on module-level helpers/state (``now``, ``copy_script``,
    ``Logger``, ``load_fs_tosh``, ``conf``, ``NN_cv``,
    ``make_submission``) defined elsewhere in the file.
    """
    experiment_name = now()
    cv_path = Path(f"result/{experiment_name}")
    cv_path.mkdir(parents=True)

    # Snapshot the experiment script alongside its results.
    copy_script(cv_path)
    log = Logger(experiment_name, cv_path / "exp.log")

    log.info("load data")
    with log.interval_timer("load data"):
        train_X = load_fs_tosh('all_snap', conf)
        train_y = feather.read_dataframe("features/HasDetections.ftr")
        train_y = train_y.HasDetections
        test = load_fs_tosh('all_snap', conf, test=True)

    log.info(pformat(list(train_X.columns)))
    # NOTE(review): random_state has no effect on StratifiedKFold unless
    # shuffle=True is also passed (newer sklearn raises an error for this
    # combination) -- confirm the intended fold behaviour.
    cv = StratifiedKFold(n_splits=5, random_state=conf.seed)
    cv = cv.split(train_X, train_y)

    log.info("learning start")
    log.double_kiritori()
    # Embedding configuration produced by the feature-generation step.
    with open('features/NN/conf_tosh_all_snap.pkl', 'rb') as p:
        embedd_conf = pickle.load(p)
    log.info(pformat(embedd_conf))
    score, pred, meta = NN_cv(train_X,
                              train_y,
                              cv,
                              log,
                              cv_path,
                              X_test=test,
                              split_conf=embedd_conf)
    log.info(score)
    log.double_kiritori()
    log.info("done")

    # Release the large training frames before saving predictions.
    del train_X, train_y

    np.save(cv_path / "test_preds.npy", pred)  # test-set predictions
    np.save(cv_path / "oof_preds.npy", meta)   # out-of-fold predictions

    make_submission(pred, f"submissions/{experiment_name}.csv.gz")
Пример #6
0
def main(opt):
    """Train the model (mode ``"train"``) or evaluate a saved one that
    contains the custom ``LayerNormalization`` layer (any other mode).

    Relies on module-level helpers (load_data, build_model, TestCallback,
    predict_final_word, make_submission, scoring) defined elsewhere.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)

        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)

        print('Vocab Size', vocabulary_size)

        model = build_model(opt.embedding_dim, opt.hidden_size, opt.drop,
                            sequence_length, vocabulary_size)
        print("Training Model...")  # fixed typo ("Traning")
        model.fit(
            x_train,
            y_train,
            batch_size=opt.batch_size,
            epochs=opt.epochs,
            verbose=1,
            callbacks=[TestCallback((x_valid, y_valid), model=model)])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)

    else:
        # The saved model uses a custom layer, so load_model must be told
        # about it explicitly.
        model = load_model(
            opt.saved_model,
            custom_objects={'LayerNormalization': LayerNormalization})
        # Context manager so the vocab file handle is closed.
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
Пример #7
0
    X_fold = np.hstack((X_fold, X_pred))

    all_X.append(X_fold)
    all_y.append(y_fold)
    all_w.append(w_fold)

X = np.vstack(all_X)
y = np.concatenate(all_y)
w = np.concatenate(all_w)

clf = Classifier(**params)
w = rescale(w)
w = rebalance(y, w)

try:
    clf.fit(X, y, sample_weight=w)
except:
    clf.fit(X, y)


# And make a submussion
print "Making submission..."
X_test, _, _, _ = load_test()
X_pred = load_predictions("stack/*-test.npy")
X_test = np.hstack((X_test, X_pred))

make_submission(clf, threshold, "output-stacking.csv", X_test=X_test)

import IPython; IPython.embed()

Пример #8
0
    "bootstrap": False,
    "max_features": 27
}


# Train on the whole training set
def train(Classifier, params, X, y, w, verbose=1):
    """Fit a fresh ``Classifier(**params)`` on the full training set.

    The sample weights ``w`` are rescaled and rebalanced against the
    labels ``y`` before fitting.  Returns the fitted classifier.
    ``verbose > 0`` prints start/end markers.
    """
    if verbose > 0:
        print "[Start]"

    # Normalise the weights, then rebalance them across the classes in y.
    w = rescale(w)
    w = rebalance(y, w)

    clf = Classifier(**params)
    clf.fit(X, y, sample_weight=w)

    if verbose > 0:
        print "[End]"

    return clf


# Fit the final classifier on the full training set.
clf = train(Classifier, params, X, y, w)

# Make submission
# NOTE(review): hard-coded decision threshold -- presumably tuned on a
# prior validation run; confirm before reuse.
threshold = -2.74420523643
make_submission(clf, threshold, "output-rs.csv")

# Drop into an interactive shell for post-hoc inspection.
import IPython
IPython.embed()
Пример #9
0
def main(args):
    """Train (and evaluate) a GRU-based model for classifying toxic content in
    wikipedia comments. Takes a preprocessed (cleaned, tokenized, and padded)
    comments as input and outputs the probability of six different types of toxicity
    being contained in the comment. Execution is modified by a number of call
    arguments, described below.

    Parameters
    ----------
    --train (-t) : (Re)train the model. Leave this out if only doing inference or
        only evaluating on test set.
    --auxilliary_input (-a) : Use auxilliary input to the model for training and
        testing. Auxilliary input consists of class probabilities calculated using
        ridge regression. Requires that said auxilliary input is already generate
        for a given input sentence.
    --combine_data (-c) : Combine training and test data with additional figshare
        comments when fitting tokenizer to data.
    --submit (-s) : Turn test predictions into a submission for Kaggle.
    --visualise (-v) : Visualise attention activations for a sentence.
    --fasttext (-f) : Use word embeddings trained using fasttext instead of
        pre-trained GloVe embeddings.
    """

    # Execution flags taken straight from the CLI arguments.
    TRAIN = args.train
    USE_AUXILLIARY_INPUT = args.auxilliary_input
    COMBINE_DATA = args.combine_data
    MAKE_SUBMISSION = args.submit
    VISUALISE_FULL_ATTENTION = args.visualise
    USE_FASTTEXT = args.fasttext

    # Tokenizer / sequence limits (None = no vocabulary-size cap).
    MAX_NUM_WORDS = None
    MAX_LENGTH = 150
    EMBEDDING_DIM = 300
    SKIPGRAM = True

    # Training schedule; SENTENCE_NUM selects the sample sentence used
    # for the attention visualisation further down.
    MAX_EPOCHS = 50
    BATCH_SIZE = 512
    VAL_SPLIT = 0.2
    SENTENCE_NUM = 51

    # Per-class probability above which a class counts as "present".
    TOXICITY_THRESHOLD = 0.6

    AVERAGE_ATTENTION = False

    # Cyclical learning-rate settings and run-specific output paths
    # (timestamped so repeated runs do not overwrite each other).
    BASE_LR = 0.0001
    MAX_LR = 0.005
    STEP_SIZE = 30000
    CLR_MODE = 'triangular'
    now = datetime.datetime.now()
    now = now.strftime('%Y%m%d%H%M')
    LOG_PATH = './logs/' + now
    WEIGHT_SAVE_PATH = 'weights_base.best.hdf5'
    SUBMISSION_SAVE_PATH = './submissions/submission_' + now + '.csv'
    ES_PATIENCE = 6
    TB_HIST_FREQ = 0
    TB_WRITE_GRAPH = True

    # Parameter bundles for the four callbacks built below.
    clr_params = {
        'base_lr': BASE_LR,
        'max_lr': MAX_LR,
        'step_size': STEP_SIZE,
        'mode': CLR_MODE
    }
    ckpt_params = {
        'filepath': WEIGHT_SAVE_PATH,
        'verbose': 1,
        'save_best_only': True,
        'save_weights_only': True
    }
    es_params = {'patience': ES_PATIENCE}
    tb_params = {
        'log_dir': LOG_PATH,
        'histogram_freq': TB_HIST_FREQ,
        'write_graph': TB_WRITE_GRAPH,
        'batch_size': BATCH_SIZE,
        # embeddings_freq > MAX_EPOCHS effectively disables embedding dumps.
        'embeddings_freq': MAX_EPOCHS + 1
    }

    callbacks = get_callbacks(clr_params, ckpt_params, es_params, tb_params)

    CLASS_LIST = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]

    # Load/tokenize the data; the auxiliary variant additionally returns
    # the ridge-regression class probabilities for train and test.
    txt_prep = TextPreprocessor(max_nb_words=MAX_NUM_WORDS,
                                max_padding_length=MAX_LENGTH,
                                combine_data=COMBINE_DATA,
                                use_auxilliary_features=USE_AUXILLIARY_INPUT)
    if USE_AUXILLIARY_INPUT:
        X_train, X_aux, y_train, X_test, test_aux, word_index, sample_text, sample_target = \
            txt_prep.load_and_tokenize(class_list=CLASS_LIST,
                                       sample_index=SENTENCE_NUM)
    else:
        X_train, y_train, X_test, word_index, sample_text, sample_target = \
            txt_prep.load_and_tokenize(class_list=CLASS_LIST,
                                       sample_index=SENTENCE_NUM)

    tc = ToxicClassifier(embedding_dim=EMBEDDING_DIM,
                         num_timesteps=MAX_LENGTH,
                         word_index=word_index,
                         weight_path=WEIGHT_SAVE_PATH,
                         use_aux_input=USE_AUXILLIARY_INPUT,
                         average_attention=AVERAGE_ATTENTION,
                         use_ft=USE_FASTTEXT,
                         visualize=VISUALISE_FULL_ATTENTION)

    # Register training data and the single sample sentence used for the
    # attention demo with the classifier wrapper.
    if USE_AUXILLIARY_INPUT:
        tc.set_input_and_labels(X_train, y_train, X_aux)
        tc.set_sample_sentence(sample_text, X_train[SENTENCE_NUM],
                               y_train[SENTENCE_NUM], X_aux[SENTENCE_NUM])
    else:
        tc.set_input_and_labels(X_train, y_train)
        tc.set_sample_sentence(sample_text, X_train[SENTENCE_NUM],
                               y_train[SENTENCE_NUM])

    tc.build_model(word_index=word_index, use_skipgram=SKIPGRAM)
    tc.model.summary()

    if TRAIN:
        tc.train(max_epochs=MAX_EPOCHS,
                 batch_size=BATCH_SIZE,
                 val_split=VAL_SPLIT,
                 callbacks=callbacks)

        # Predict on the sample sentence and report which toxicity
        # classes exceed the threshold.
        sample_pred = tc.predict_sample_output()
        print('Original sentence: ', sample_text)
        print('Actual label: ', sample_target)
        print('Model prediction :', sample_pred[0, :])
        present_toxicity = get_toxicity_classes(sample_pred[0, :],
                                                TOXICITY_THRESHOLD, CLASS_LIST)
        print_toxicity_report(sample_pred[0, :], TOXICITY_THRESHOLD,
                              CLASS_LIST)

        if VISUALISE_FULL_ATTENTION:
            visualise_attention(tc.attention_history, sample_text)
        else:
            attention = tc.get_attention_output()
            attention /= sum(attention)  # Normalise to percentage
            label = tc.get_sample_labels()
            visualise_attention_with_text(attention, sample_text,
                                          sample_pred[0, :], present_toxicity,
                                          sample_target, label)

    if MAKE_SUBMISSION:
        print('Loading best weights and predicting on test data\n')
        if USE_AUXILLIARY_INPUT:
            make_aux_submission(tc.model,
                                X_test,
                                test_aux,
                                CLASS_LIST,
                                WEIGHT_SAVE_PATH,
                                SUBMISSION_SAVE_PATH,
                                post_process=True)
        else:
            make_submission(tc.model, X_test, CLASS_LIST, WEIGHT_SAVE_PATH,
                            SUBMISSION_SAVE_PATH)
Пример #10
0
    shape = (None, 3, cfg.WIDTH, cfg.HEIGHT)
    predict_fn = models.get_predict_function(m_param, model_weights, file_fmt, shape);

    load_and_process = ld.LoadAndProcess(
            size = (cfg.WIDTH, cfg.HEIGHT),
            augmentation_params = None,
            crop = None,
            color_noise = 0,
            fill_size = cfg.pretrained);

    batch_size = cfg.batch_size;
    test_imgs,test_labels = ld.list_imgs_labels(cfg.data_dir,data='test');
    test_data = ld.ImgStream(test_imgs, test_labels, batch_size,
            cycle=False, file_dir_fmt=cfg.data_dir+'/test/{}',
            load_and_process = load_and_process, preload=None);

    print("num of test cases: {}".format(len(test_data)));

    res = [];
    c = 0;
    for imgs,labels in test_data:
        res.append(predict_fn(imgs));
        c += 1;
        if c%50 == 0:
            print("{} processed ".format(c*batch_size));

    res = np.concatenate(res);
    filename = cfg.output_dir + "/submit_{}.csv".format(fname);
    print(res[-1])
    utils.make_submission(filename, test_imgs, res, 0.5e-3);
Пример #11
0
        convert_type=config['data']['convert_type'])

    logging.disable(logging.FATAL)

    if OOF_PARAMS['save_oof']:
        np.save(f'../logs/{RUN_NAME}/oof.npy', oof)
        save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True)

with t.timer('save features importances'):
    save_importances(RUN_NAME, models, FEATURES)

with t.timer('make submission'):
    output_path = LOGGER_PATH / f'{METER_TYPE}.csv'
    # Fold-averaged predictions (axis=1) written in submission format.
    make_submission(y_pred=np.mean(preds, axis=1),
                    target_name=TARGET_NAME,
                    sample_path=SAMPLE_SUB_PATH,
                    output_path=str(output_path),
                    comp=True)

# Embed the mean CV score into the log directory name.
LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')

process_minutes = t.get_processing_time()

with t.timer('notify'):
    # Push the run summary to LINE (and Notion, continued below).
    message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]'''

    send_line(NOTIFY_PARAMS['line']['token'], message)

    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
                name=RUN_NAME,
Пример #12
0
    model.add(Dropout(0.5))
    
    model.add(Dense(612, 612, init='glorot_uniform'))
    model.add(PReLU((612,)))
    model.add(BatchNormalization((612,)))
    model.add(Dropout(0.5))
    
    model.add(Dense(612, nb_classes, init='glorot_uniform'))
    model.add(Activation('softmax'))   
    model.compile(loss='categorical_crossentropy', optimizer="adam")
    #model.compile(loss='categorical_crossentropy', optimizer="sgd")
    

print("Training model...")
# Hyper-parameters: epochs, batch size, validation split.
ne = 17
bs = 32
vs = 0.15
model.fit(X, y, nb_epoch=ne, batch_size=bs, validation_split= vs)

print ("Saving model (will overwrite existing one)")
# NOTE(review): %d truncates vs=0.15 to 0, so this yields
# "keras-nn-17-32-0" -- confirm the naming is intended.
filename = "keras-nn-%d-%d-%d"%(ne,bs,vs)
ut.save(model, filename, verbose=True)

print("Generating submission...")
proba = model.predict_proba(X_test)
ut.make_submission(proba, ids, encoder, fname='keras-otto-proba-93.csv')

#print(type(proba))
#print(proba[0:10,])

Пример #13
0
    l=0
    i=0
    while l<len(set_X_test):
    if(len(set_X_test[l])>0):
        set_X_test[l]['CSPL_RECEIVED_CALLS'] =   listPred[i]
        i=i+1
    l=l+1
    """


    #on réassemble les valeurs de prédiction
    resultPred= pd.concat(set_X_test)
    resultPred=resultPred.sort_index()
    incremental_prediction.append(resultPred)
print "score global = ",score_global.mean()

# Reassemble the per-chunk predictions into one frame, ordered by
# date and assignment code.
print("Merging incremental learning...")
resultPred_final=pd.concat(incremental_prediction)
resultPred_final=resultPred_final.sort_values(by=['DATE', 'cod_ASS_ASSIGNMENT'])

print("Make every prediction positif, ceil it ...")
# Clamp negative predictions to zero, then round up to whole calls.
resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: x*(x>0))
#resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: 2.5*x)
resultPred_final['CSPL_RECEIVED_CALLS']=resultPred_final['CSPL_RECEIVED_CALLS'].apply(lambda x: math.ceil(x))

print "Write the submission ..."
make_submission(dataTest,resultPred_final)
print "End."

Пример #14
0
# Convert the assembled feature/label lists to numpy arrays.
Y_train=np.array(Y_train)
X_train=np.array(X_train)
X_test=np.array(X_test)


#### Creation of regressor 
reg=Regressor()


#### Cross validation
print "Cross validation ..."
#loo = cross_validation.LeaveOneOut(len(y_df))
# NOTE(review): cv=10 runs 10-fold CV; the "loo" name is a leftover from
# the leave-one-out experiment commented out above.
loo=10
scores = cross_validation.cross_val_score(reg, X_train, Y_train, scoring='mean_squared_error', cv=loo,)
print "The score mean of cross validation : "
print scores.mean()

#### fit 
print "Fit ..."
reg.fit(X_train, Y_train)


#### Prediction
print "Prediction ..."
Y_pred = reg.predict(X_test)

#### write the submission
print "Write the submission ..."
make_submission(dataTest,Y_pred)

print "End."
Пример #15
0
#             pred.iloc[idx] = 0

with t.timer('replace with leak'):
    # Overwrite model predictions with known ("leaked") ground-truth
    # meter readings wherever a test row matches the leak data.
    leak = pd.read_feather(DATA_PATH / 'input/leak.feather')
    leak['timestamp'] = leak['timestamp'].astype(str)
    leak.rename(columns={'meter_reading': 'leak_meter_reading'}, inplace=True)

    test_and_leak = pd.merge(test, leak, on=['building_id', 'meter', 'timestamp'], how='left')
    leak_idx = test_and_leak['leak_meter_reading'].dropna().index
    pred.iloc[leak_idx] = test_and_leak.loc[leak_idx, 'leak_meter_reading']

with t.timer('make submission'):
    # CV score is embedded in the output filename.
    output_path = str(DATA_PATH / f'output/sub_{RUN_NAME}_{cv}.csv')
    make_submission(y_pred=pred, 
                    target_name=TARGET_NAME, 
                    sample_path=SAMPLE_SUB_PATH,
                    output_path=output_path,
                    comp=True)


# LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')

process_minutes = t.get_processing_time()

with t.timer('notify'):
    # Push the run summary to LINE (and Notion, continued below).
    message = f'''{MODEL_NAME}\ncv: {cv:.3f}\nscores: \ntime: {process_minutes:.2f}[min]'''

    send_line(NOTIFY_PARAMS['line']['token'], message)

    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
Пример #16
0
from __future__ import print_function

import numpy as np
import pandas as pd
import utils as ut
import os
import xgboost as xgb
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

np.random.seed(1337) # for reproducibility

## check if raw data exist
print("Loading data...")
X, labels = ut.load_data('data/train.csv', train=True)
data, ids = ut.load_data('data/test.csv', train=False)

print("Preprocessing labels")
# Only the fitted encoder (and the test ids above) are used by the
# ensembling step; X and y themselves are not needed further.
y, encoder = ut.preprocess_labels(labels)

# Weighted average of two earlier submissions (xgboost 0.4, keras 0.6).
prediction_files = ["xgb-otto-proba-round-430-eta-0.csv", 
                    "keras-otto-proba-93.csv"]
ensemble = ut.ensemble(prediction_files, weights=[0.4, 0.6]) 
ut.make_submission(ensemble, ids, encoder, fname='ensemble-otto-selected-93.csv')
Пример #17
0
    logging.disable(logging.FATAL)

    if 'nn' in MODEL_NAME:
        save_learning_curve(RUN_NAME, models)

    if SETTINGS_PARAMS['oof']['save']:
        np.save(f'../logs/{RUN_NAME}/oof.npy', oof)
        save_oof_plot(RUN_NAME, train_y, oof, type_='reg', dia=True)

with t.timer('save features importances'):
    save_importances(RUN_NAME, models, FEATURES)

with t.timer('make submission'):
    # Fold-averaged predictions; mean CV score embedded in the filename.
    output_path = f'../data/output/{RUN_NAME}_{np.mean(scores):.3f}.csv'
    make_submission(y_pred=np.mean(preds, axis=1), target_name=COMPE_PARAMS['target_name'],
                    sample_path=PATH_PARAMS['sample'], output_path=str(output_path), comp=False)

# Embed the mean CV score into the log directory name.
LOGGER_PATH.rename(f'../logs/{RUN_NAME}_{np.mean(scores):.3f}')

process_minutes = t.get_processing_time()

with t.timer('notify'):
    # Push the run summary to LINE (and Notion, continued below).
    message = f'''{MODEL_NAME}\ncv: {np.mean(scores):.3f}\nscores: {scores}\ntime: {process_minutes:.2f}[min]'''

    send_line(NOTIFY_PARAMS['line']['token'], message)

    send_notion(token_v2=NOTIFY_PARAMS['notion']['token_v2'],
                url=NOTIFY_PARAMS['notion']['url'],
                name=RUN_NAME,
                created=NOW,
                model=MODEL_NAME.split('_')[0],
Пример #18
0
    X_pred = load_predictions("stack/*-fold%d.npy" % i)
    X_fold = np.hstack((X_fold, X_pred))

    all_X.append(X_fold)
    all_y.append(y_fold)
    all_w.append(w_fold)

X = np.vstack(all_X)
y = np.concatenate(all_y)
w = np.concatenate(all_w)

clf = Classifier(**params)
w = rescale(w)
w = rebalance(y, w)

try:
    clf.fit(X, y, sample_weight=w)
except:
    clf.fit(X, y)

# And make a submussion
print "Making submission..."
X_test, _, _, _ = load_test()
X_pred = load_predictions("stack/*-test.npy")
X_test = np.hstack((X_test, X_pred))

make_submission(clf, threshold, "output-stacking.csv", X_test=X_test)

import IPython
IPython.embed()
Пример #19
0
model.summary()
# %%
X, y = get_data(as_gray=False)

batch_size = 128
# Callbacks: ROC-AUC tracking, early stopping on val_auc, and a
# best-checkpoint writer keyed on val_auc.
ra = ROCAUC(batch_size)
es = EarlyStopping(monitor='val_auc', patience=2, mode='max')
mc = ModelCheckpoint(f'data/models/model.h5',
                     monitor='val_auc',
                     save_best_only=True,
                     mode='max',
                     verbose=1)
model.fit(X,
          y,
          batch_size=batch_size,
          epochs=50,
          validation_split=.2,
          callbacks=[ra, es, mc])

# %%
# NOTE(review): this second fit (full data, no validation, 5 epochs)
# continues training past the early-stopped weights -- confirm intended.
model.fit(X, y, batch_size=batch_size, epochs=5)

# %%
X_test, test_ids = get_data(test=True, as_gray=False)

# Flatten the (n, 1) probability output into a 1-D vector for submission.
test_predictions = model.predict(X_test, batch_size=batch_size)
test_predictions = test_predictions.flatten()
make_submission(test_ids, test_predictions,
                'submissions/first_transfer_cnn.csv')
Пример #20
0
 def on_epoch_end(self, epoch, logs={}):
     """Keras callback hook: after each epoch, predict the final word on
     the held-out file and score the resulting submission.

     NOTE(review): ``logs={}`` is a mutable default argument, and ``opt``
     is read from module scope rather than passed in -- confirm both are
     intentional.
     """
     x, y = self.test_data
     predict_dict = predict_final_word(self.model, self.vocabulary, self.filename)
     sub_file = make_submission(predict_dict, opt.student_id, opt.input)
     scoring(sub_file, os.path.join("data"), type="valid")
Пример #21
0
def main(opt):
    """Train a model, build/save an ensemble from saved member models, or
    evaluate a saved model, depending on ``opt.mode``.

    Relies on module-level helpers (load_data, build_model, TestCallback,
    build_save_ensemble_model, predict_final_word, make_submission,
    scoring) defined elsewhere in the file.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = opt.gpu
    np.random.seed(opt.seed)  # set a seed for reproducibility
    if opt.mode == "train":
        st = time.time()
        print('Loading data')
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)

        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)

        print('Vocab Size', vocabulary_size)

        model = build_model(opt.model, opt.embedding_dim, opt.hidden_size,
                            opt.drop, opt.filter, sequence_length,
                            vocabulary_size)
        adam = Adam()
        model.compile(loss='sparse_categorical_crossentropy', optimizer=adam)
        print("Training Model...")  # fixed typo ("Traning")
        # Checkpoint the best validation-loss weights; stop early after 5
        # stagnant epochs.
        checkpoint = ModelCheckpoint(opt.saved_model,
                                     monitor='val_loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        early = EarlyStopping(monitor="val_loss", mode="min", patience=5)
        model.fit(x_train,
                  y_train,
                  batch_size=opt.batch_size,
                  epochs=100,
                  verbose=1,
                  validation_data=(x_valid, y_valid),
                  callbacks=[
                      TestCallback((x_valid, y_valid), model=model),
                      checkpoint, early
                  ])
        model.save(opt.saved_model)
        print("Training cost time: ", time.time() - st)
    elif opt.mode == "ensemble":
        x_train, y_train, x_valid, y_valid, vocabulary_size = load_data(
            "data", opt.debug)

        num_training_data = x_train.shape[0]
        sequence_length = x_train.shape[1]
        print(num_training_data)

        print('Vocab Size', vocabulary_size)

        # Collect the filenames in the top level of the ensemble dir
        # (break after the first os.walk yield = no recursion).
        ENSEMBLE_DIR = "models/ensemble/"
        model_files = []
        for dirpath, dirnames, filenames in os.walk(ENSEMBLE_DIR):
            model_files.extend(filenames)
            break
        models = []
        for count, filename in enumerate(model_files):
            member = load_model(ENSEMBLE_DIR + filename)
            # NOTE(review): assigning ``.name`` only works on older Keras
            # versions -- confirm against the pinned dependency.
            member.name = "model" + str(count)
            models.append(member)

        build_save_ensemble_model(opt.saved_model, models, sequence_length)
    else:
        model = load_model(opt.saved_model)
        # Context manager so the vocab file handle is closed.
        with open(os.path.join("data", "vocab.json")) as f:
            vocabulary = json.load(f)
        predict_dict = predict_final_word(model, vocabulary, opt.input)
        sub_file = make_submission(predict_dict, opt.student_id, opt.input)
        if opt.score:
            scoring(sub_file, os.path.join("data"), type="valid")
Пример #22
0
    X_TS = utils.create_fingerprints(TS["SMILES"].values)

    if METHOD == "DT":
        depths, scores = doDecisionTree(X_LS, Y_LS)
        print(scores)

    elif METHOD == "KNN":
        depths, scores = doKNN(X_LS, Y_LS)
        print(scores)

        classifier_knn = KNeighborsClassifier(n_neighbors=50)
        classifier_knn.fit(X_LS, Y_LS)
        pred = classifier_knn.predict_proba(X_TS)
        auc_predicted = 0.7
        fname = utils.make_submission(pred[:, 1], auc_predicted, 'knn_50')
        print('Submission file "{}" successfully written'.format(fname))

    elif METHOD == "RF":
        #ts, depths, scores = doRandomForest(X_LS, Y_LS)
        #print(scores)

        classifier_rf = RandomForestClassifier(n_estimators=800, max_depth=700)
        classifier_rf.fit(X_LS, Y_LS)
        pred = classifier_rf.predict_proba(X_TS)
        auc_predicted = 0.78
        fname = utils.make_submission(pred[:, 1], auc_predicted, 'final')
        print('Submission file "{}" successfully written'.format(fname))

    elif METHOD == "MLP":
        layers, neurones, scores = doMLP(X_LS, Y_LS)
Пример #23
0
def main():
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_x = dh.load('../data/input/train_concated.csv')
        train_org_x = dh.load('../data/input/train.csv')
        train_2019_x = dh.load('../data/input/train_2019.csv')
        test_x = dh.load('../data/input/test.csv')

    with t.timer('make folds'):
        fold_org_df = factory.get_fold(cfg.validation.val1, train_org_x, train_org_x[[cfg.common.target]])
        fold2019_df = factory.get_fold(cfg.validation.val2, train_2019_x, train_2019_x[[cfg.common.target]])
        fold_df = pd.concat([fold_org_df, fold2019_df], axis=0, sort=False, ignore_index=True)
        if cfg.validation.val1.single:
            fold_df = fold_df[['fold_0']]
            fold_df /= fold_df['fold_0'].max()

    with t.timer('load features'):
        features = dh.load('../configs/feature/all.yml')['features']
        for f in features:
            train_x[f] = dh.load(f'../features/{f}_train.feather')[f].fillna(-1)
            test_x[f] = dh.load(f'../features/{f}_test.feather')[f].fillna(-1)

    with t.timer('drop several rows'):
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('train model'):
        result = train_model(run_name, train_x, fold_df, cfg)
    
    logging.disable(logging.FATAL)
    run_name_cv = f'{run_name}_{result["cv"]:.3f}'
    logger_path.rename(f'../logs/{run_name_cv}')

    with t.timer('predict'):
        preds = predict_test(run_name_cv, test_x, fold_df, cfg)

    with t.timer('post process'):
        duplicates = {
            'ISIC_5224960': 1,
            'ISIC_9207777': 1,
            'ISIC_6457527': 1,
            'ISIC_8347588': 0,
            'ISIC_8372206': 1,
            'ISIC_9353360': 1,
            'ISIC_3689290': 0,
            'ISIC_3584949': 0,  
        }
        for image_name, target in duplicates.items():
            idx = test_x[test_x['image_name'] == image_name].index[0]
            preds[idx] = target

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)

    with t.timer('kaggle api'):
        kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
        if cfg.common.kaggle.submit:
            kaggle.submit(comment)

    with t.timer('notify'):
        process_minutes = t.get_processing_time()
        message = f'''{model_name}\ncv: {result["cv"]:.3f}\ntime: {process_minutes:.2f}[h]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': cfg.model.name,
            'local_cv': round(result['cv'], 4),
            'time': process_minutes,
            'comment': comment
        })
Пример #24
0
def main():
    """Feature-based training pipeline: load features, build folds,
    train with cross-validation, predict the test set, write a
    submission, and send notifications.

    Relies on module-level globals defined elsewhere in the file:
    cfg, features, features_params, run_name, logger_path, dh, factory,
    notify_params, options, comment, now.
    """
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)
    logging.basicConfig(filename=logger_path / 'train.log',
                        level=logging.DEBUG)

    # Snapshot the config and feature list next to the run's logs.
    dh.save(logger_path / 'config.yml', cfg)
    dh.save(logger_path / 'features.yml', features_params)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train.csv')
        train2019_df = dh.load('../data/input/train_concated.csv')
        train_x = factory.get_features(features, cfg.data.loader.train)
        test_x = factory.get_features(features, cfg.data.loader.test)
        train_y = factory.get_target(cfg.data.target)

    with t.timer('add oof'):
        # Optionally stack a previous model's out-of-fold predictions
        # as an extra feature column.
        if cfg.data.features.oof.name is not None:
            oof, preds = factory.get_oof(cfg.data)
            train_x['oof'] = oof
            test_x['oof'] = preds
            features.append('oof')

    with t.timer('make folds'):
        fold_df = factory.get_fold(cfg.validation, train_df,
                                   train_df[['target']])
        # Pad with zero rows so the fold frame matches the length of the
        # concatenated (original + 2019) training data; presumably the
        # padded rows never enter validation — TODO confirm.
        fold_df = pd.concat([
            fold_df,
            pd.DataFrame(np.zeros((len(train2019_df), len(fold_df.columns))),
                         columns=fold_df.columns)
        ],
                            axis=0,
                            sort=False,
                            ignore_index=True)
        if cfg.validation.single:
            fold_df = fold_df[['fold_0']]
            # Presumably rescales fold labels into [0, 1] weights for
            # single-fold validation — TODO confirm.
            fold_df /= fold_df['fold_0'].max()

    with t.timer('drop index'):
        # Drop configured rows from data, target and folds in lockstep.
        if cfg.common.drop is not None:
            drop_idx = factory.get_drop_idx(cfg.common.drop)
            train_x = train_x.drop(drop_idx, axis=0).reset_index(drop=True)
            train_y = train_y.drop(drop_idx, axis=0).reset_index(drop=True)
            fold_df = fold_df.drop(drop_idx, axis=0).reset_index(drop=True)

    with t.timer('prepare for ad'):
        # Adversarial validation: replace the target with a train/test
        # discriminator label when enabled.
        if cfg.data.adversarial_validation:
            train_x, train_y = factory.get_ad(cfg, train_x, test_x)

    with t.timer('train and predict'):
        trainer = Trainer(cfg)
        cv = trainer.train(train_df=train_x,
                           target_df=train_y,
                           fold_df=fold_df)
        preds = trainer.predict(test_x)
        trainer.save(run_name)

        # Rename the log directory to include the CV score.
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')
        logging.disable(logging.FATAL)

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.csv'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=preds,
                        target_name=cfg.data.target.name,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        # Push the run summary to LINE and record it in Notion.
        process_minutes = t.get_processing_time()
        message = f'''{cfg.model.name}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
Пример #25
0
def main():
    """Blend out-of-fold predictions from several trained models.

    Loads each component model's OOF and raw test predictions, searches
    for convex blend weights with Optuna, evaluates the blended CV
    score, writes the ensemble predictions, and optionally submits to
    Kaggle and sends notifications.

    Relies on module-level globals defined elsewhere in the file:
    cfg, run_name, logger_path, dh, factory, notify_params, options,
    comment, now.
    """
    t = Timer()
    seed_everything(cfg.common.seed)

    logger_path.mkdir(exist_ok=True)

    dh.save(logger_path / 'config.yml', cfg)

    with t.timer('load data'):
        train_df = dh.load('../data/input/train_data.csv')
        test_df = dh.load('../data/input/test_data.csv')

        # One column of OOF / test predictions per component model.
        oof = np.zeros((len(train_df), len(cfg.models)))
        preds = np.zeros((len(test_df), len(cfg.models)))

        for i, m in enumerate(cfg.models):
            name = getattr(cfg.models, m).name

            log_dir = Path(f'../logs/{name}')
            model_oof = dh.load(log_dir / 'oof.npy')
            model_cfg = dh.load(log_dir / 'config.yml')
            if model_cfg.common.drop:
                # Re-insert rows this model dropped during training so
                # its OOF array lines up with train_df.
                drop_idxs = np.array([])
                for drop_name in model_cfg.common.drop:
                    drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                    drop_idxs = np.append(drop_idxs, drop_idx)
                # BUG FIX: pass the accumulated drop_idxs; the original
                # passed only the last drop_idx loaded in the loop.
                model_oof = factory.fill_dropped(model_oof, drop_idxs)

            model_preds = dh.load(f'../logs/{name}/raw_preds.npy')

            oof[:, i] = model_oof[:len(train_df)]
            preds[:, i] = model_preds

    with t.timer('drop index'):
        if cfg.common.drop is not None:
            drop_idxs = np.array([])
            # BUG FIX: iterate this run's cfg.common.drop; the original
            # iterated model_cfg.common.drop, a leftover loop variable
            # from the last model loaded above.
            for drop_name in cfg.common.drop:
                drop_idx = dh.load(f'../pickle/{drop_name}.npy')
                drop_idxs = np.append(drop_idxs, drop_idx)
            train_df = train_df.drop(drop_idxs, axis=0).reset_index(drop=True)

    with t.timer('optimize model weight'):
        metric = factory.get_metrics(cfg.common.metrics.name)
        y_true = train_df[cfg.common.target]

        def objective(trial):
            # Sample a convex weight vector: each weight is drawn from
            # what remains and the last weight takes the remainder.
            p_list = [0 for i in range(len(cfg.models))]
            for i in range(len(cfg.models) - 1):
                p_list[i] = trial.suggest_discrete_uniform(f'p{i}', 0.0, 1.0 - sum(p_list), 0.01)
            p_list[-1] = round(1 - sum(p_list[:-1]), 2)

            y_pred = np.zeros(len(train_df))
            for i in range(oof.shape[1]):
                y_pred += oof[:, i] * p_list[i]

            return metric(y_true, y_pred)

        study = optuna.create_study(direction='minimize')
        study.optimize(objective, timeout=10)
        best_params = list(study.best_params.values())
        # The final weight is implied by the others (weights sum to 1).
        best_weight = best_params + [round(1 - sum(best_params), 2)]

    with t.timer('ensemble'):
        # Apply the best weights to both OOF and test predictions.
        ensemble_oof = np.zeros(len(train_df))
        ensemble_preds = np.zeros(len(test_df))
        for i in range(len(best_weight)):
            ensemble_oof += oof[:, i] * best_weight[i]
            ensemble_preds += preds[:, i] * best_weight[i]

        dh.save(f'../logs/{run_name}/oof.npy', ensemble_oof)
        dh.save(f'../logs/{run_name}/raw_preds.npy', ensemble_preds)

        cv = metric(y_true, ensemble_oof)
        run_name_cv = f'{run_name}_{cv:.3f}'
        logger_path.rename(f'../logs/{run_name_cv}')

        print('\n\n===================================\n')
        print(f'CV: {cv:.4f}')
        print(f'BEST WEIGHT: {best_weight}')
        print('\n===================================\n\n')

    with t.timer('make submission'):
        sample_path = f'../data/input/sample_submission.feather'
        output_path = f'../data/output/{run_name_cv}.csv'
        make_submission(y_pred=ensemble_preds,
                        target_name=cfg.common.target,
                        sample_path=sample_path,
                        output_path=output_path,
                        comp=False)
        if cfg.common.kaggle.submit:
            kaggle = Kaggle(cfg.compe.compe_name, run_name_cv)
            kaggle.submit(comment)

    with t.timer('notify'):
        # Push the run summary to LINE and record it in Notion.
        process_minutes = t.get_processing_time()
        message = f'''{options.model}\ncv: {cv:.3f}\ntime: {process_minutes}[min]'''
        send_line(notify_params.line.token, message)

        notion = Notion(token=notify_params.notion.token_v2)
        notion.set_url(url=notify_params.notion.url)
        notion.insert_rows({
            'name': run_name_cv,
            'created': now,
            'model': options.model,
            'local_cv': round(cv, 4),
            'time': process_minutes,
            'comment': comment
        })
Пример #26
0
# check correlation between base learner predictions
np.corrcoef(X_train_level_2.T)
sns.jointplot(X_train_level_2[:,0], X_train_level_2[:,1])
plt.show()

# Simple convex combination of the two base learners: grid-search the
# mixing weight alpha that minimises validation RMSE.
alphas_to_try = np.linspace(0, 1, 1001)
# FIX: np.Inf was removed in NumPy 2.0; np.inf is the supported name.
rmse_best = np.inf
for alpha in alphas_to_try:
    mix = alpha * X_train_level_2[:,0] + (1-alpha) * X_train_level_2[:,1]
    rmse_new = np.sqrt(mean_squared_error(Y_train_level_2, mix))
    if rmse_new < rmse_best:
        alpha_best = alpha
        rmse_best = rmse_new

score = round(rmse_best, 6)
# Blend the level-2 test predictions with the best weight found above.
pred_test = alpha_best * X_test_level_2[:,0] + (1-alpha_best) * X_test_level_2[:,1]
ids = np.array(df.loc[df['date_block_num'] == 34, 'ID'])
submission = make_submission(ids, np.array(pred_test).flatten())

# export: one timestamped folder per submission, score in the name
today = datetime.datetime.now()
sub_id = today.strftime('%y%m%d') + '_' + today.strftime("%H%M") + \
		'_score_' + str(score)
folder = OUT_FOLDER + '/' + sub_id
os.mkdir(folder)
print('\n---- ' + sub_id + ' ----')
submission.to_csv(os.path.join(folder, 'submission.csv'), index=False)

Пример #27
0
# 5 fold cross validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# One out-of-fold probability per sample; dtype='object' so unfilled
# slots are None rather than uninitialised floats.
prediction_scores = np.empty(y.shape[0], dtype='object')

for train_idx, val_idx in tqdm(skf.split(X, y)):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train = y[train_idx]

    # Refit the (module-level) classifier on this fold's training split.
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict_proba(X_val)[:, 1]

    # Save the predictions for this fold
    prediction_scores[val_idx] = y_pred

# Plot and save the out-of-fold ROC curve and prediction samples.
plt.title('SVM 5-fold cross validation ROC AUC')
plot_roc(y, prediction_scores)
plt.savefig('report/figures/svm_roc.png', dpi=300)

plot_prediction_samples(imgs, y, prediction_scores, 'SVM Prediction Samples')
plt.savefig('report/figures/svm_confmat.png', dpi=300)
# %%

# load and preprocess test data then create submission
X_test, test_ids = get_data(test=True)
X_test = np.stack([get_HOG(img, **hog_params) for img in X_test])

# Retrain on the full training set before predicting the test set.
clf = clf.fit(X, y)
test_predictions = clf.predict_proba(X_test)[:, 1]
make_submission(test_ids,
                test_predictions,
                fname='submissions/svc_10_hog_16_4_fulltrain.csv')
# Load the pickled test inputs and training labels (X_train is loaded
# earlier, outside this excerpt).
X_test = read_pickle('../audio_data/X_test4d.pkl')
Y_train = read_pickle('../audio_data/Y_train1d.pkl')
print("The shape of X_train/X_test/Y_train: ", X_train.shape, X_test.shape, Y_train.shape)

# Instantiate the model
bigan = BIGAN(X_train.shape[1], X_train.shape[2], X_train.shape[3])

if is_trainable:
    # Training the BiGAN
    bigan.train_by_batch(X_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE)
    #bilstm.train_all(X_train_, Y_train_, BATCH_SIZE, NUM_EPOCHS)
else:
    # Restore the checkpoint
    # NOTE(review): tf.train.Checkpoint() is constructed with no tracked
    # objects, so restore() has nothing to map into and expect_partial()
    # suppresses the resulting warnings — confirm the model weights are
    # actually restored here.
    checkpoint_dir = './runs/checkpoint_bigan'
    checkpoint = tf.train.Checkpoint()
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir)).expect_partial()
    print("Checkpoint restored for Anomaly Detection!")

    # Anomaly Detection
    AS = bigan.compute_anomaly_score(X_train, Y_train, X_test)
    # Prediction: threshold chosen so the expected fraction of outliers
    # matches NUM_OUTLIERS.
    ts = NUM_OUTLIERS/len(X_test) # Find out the best threshold
    Y_pred_AS = bigan.predict_outlier(AS, ts)
    #print("Y_pred_AS: ", Counter(Y_pred_AS))

    # Generate final Y_pred and make submission
    Y_pred = np.load('Y_pred.npy')
    Y_pred_new = gen_Y_pred(Y_pred, Y_pred_AS)
    print("Y_pred_new.shape: ", Y_pred_new.shape)
    make_submission(Y_pred_new, "submission")
Пример #29
0
# Per-fold CNN training; keeps only folds whose validation AUC is
# acceptable (see the early break below).
models = []
for train_idx, val_idx in skf.split(X, y):
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]
    model = get_model()
    y_train = to_categorical(y_train)
    # Train from the augmenting generator for a fixed number of epochs.
    model.fit_generator(imagen.flow(X_train, y_train, batch_size=BATCH_SIZE),
                        steps_per_epoch=batch_per_epoch,
                        epochs=EPOCHS,
                        verbose=0)
    # Store class-1 probabilities as out-of-fold predictions.
    prediction_scores[val_idx] = model.predict(X_val, batch_size=BATCH_SIZE)[:,
                                                                             1]
    cur_auc = roc_auc_score(y_val, prediction_scores[val_idx])
    print(cur_auc)
    # NOTE(review): aborting on a weak fold leaves the remaining folds'
    # prediction_scores unfilled, which distorts the overall AUC printed
    # below — confirm this early exit is intentional.
    if cur_auc < 0.8:
        break
    models.append(model)

print(roc_auc_score(y, prediction_scores))

# %%
# Load test images and scale pixels to [0, 1].
X_test, test_ids = get_data(test=True, as_gray=False)
X_test = X_test / 255.

# Average the fold models' class-1 probabilities over the test set.
test_predictions = np.mean(
    [m.predict(X_test, batch_size=BATCH_SIZE)[:, 1] for m in models], axis=0)

make_submission(test_ids, test_predictions, 'submissions/homebrew_cnn_CV.csv')
# %%
# Persist each fold model (comprehension used for its side effect only).
[m.save(f'data/models/model_fold_{i}.h5') for i, m in enumerate(models)]
Пример #30
0
    # Fingerprint features for the learning-set molecules (SMILES).
    X_LS = fingerprints.transform(LS['SMILES'].values, FINGERPRINT)
    y_LS = LS['ACTIVE'].values

    # Variance threshold (feature selection): drop constant features.
    selector = VarianceThreshold()
    selector.fit(X_LS)
    X_LS = selector.transform(X_LS)

    # Cross validation score
    cv = ShuffleSplit(n_splits=5, test_size=0.25, random_state=0)
    scores = cross_val_score(MODEL, X_LS, y_LS, cv=cv, scoring='roc_auc')

    # Estimated AUC (mean over the CV splits; written into the submission)
    AUC = scores.mean()

    # Train model on the full learning set
    MODEL.fit(X_LS, y_LS)

    # Create fingerprint features of test set and apply the selector
    # fitted on the learning set.
    X_TS = fingerprints.transform(TS['SMILES'].values, FINGERPRINT)
    X_TS = selector.transform(X_TS)

    # Predict probability of the positive class
    prob = MODEL.predict_proba(X_TS)[:, -1]

    # Writing the submission file
    os.makedirs(DESTINATION, exist_ok=True)
    fname = utils.make_submission(prob, AUC, DESTINATION + 'submission')

    print('Submission file "{}" successfully written'.format(fname))