Пример #1
0
def model_airports_individually(features_for_modeling, target_variable):
    """Fit and score one 3-nearest-neighbour classifier per airline carrier.

    If ``weather_modeling_scores_test.csv`` exists in the working directory,
    its contents are returned and no modeling is performed.

    Otherwise the merged flight/weather data is built once, two aggregate
    weather-delay features are added, and for every carrier code the rows of
    that carrier are split into train/validate/test, min-max scaled, fitted
    with ``run_knn`` and evaluated on the test split.

    Args:
        features_for_modeling: Iterable of feature column names. It is NOT
            modified; the "observation" column and the target are appended
            to a private copy.
        target_variable: Name of the target column.

    Returns:
        pandas.DataFrame: On the modeling path, one column per carrier with
        the rows "accuracy" and "recall" (recall of the "True" class).
        On the cached path, whatever the CSV file contains.
    """
    cache_path = "weather_modeling_scores_test.csv"
    if os.path.exists(cache_path):
        return pd.read_csv(cache_path)

    airline_carriers = [
        'WN', 'AA', 'AS', 'DL', 'F9', 'NK', 'OO', 'B6', 'UA', '9E', 'EV',
        'YX', 'YV', 'OH', 'MQ', 'VX', 'G4', 'HA'
    ]

    # Work on a copy: the original used ``+=`` on the argument, which grew
    # the caller's list on every call.
    model_columns = list(features_for_modeling) + [
        "observation", target_variable
    ]

    # The merged data set and the engineered features do not depend on the
    # carrier, so they are built once instead of once per loop iteration.
    full_df = wrangle.merge_flight_weather_data()
    full_df = preprocessing.to_date_time(full_df)
    full_df = preprocessing.create_new_features(full_df)
    full_df = preprocessing.create_target_variable(full_df)

    # add weather features (computed over all carriers, as before)
    full_df["avg_weather_delay"] = full_df.groupby(
        "Type").arr_delay.transform("mean")
    full_df["type_severity"] = full_df.Type + "_" + full_df.Severity
    full_df["avg_type_severity"] = full_df.groupby(
        "type_severity").arr_delay.transform("mean")

    score = pd.DataFrame()
    for airline in airline_carriers:
        carrier_df = full_df[full_df.op_carrier == airline]
        carrier_df = carrier_df[model_columns].set_index("observation")

        train, validate, test = preprocessing.split_data(carrier_df)

        X_train = train.drop(columns=target_variable)
        y_train = train[target_variable]
        X_validate = validate.drop(columns=target_variable)
        X_test = test.drop(columns=target_variable)
        y_test = test[target_variable]

        # Only the scaled train and test sets are used below; the validate
        # split is still passed because the scaler helper expects it.
        _, train_scaled, _, test_scaled = preprocessing.min_max_scaler(
            X_train, X_validate, X_test)

        # The in-sample predictions returned by run_knn are not needed.
        knn, _ = run_knn(train_scaled, y_train, 3)
        y_pred = knn.predict(test_scaled)

        report = pd.DataFrame.from_dict(
            classification_report(y_test, y_pred, output_dict=True))
        actual_score = pd.DataFrame(
            {
                airline:
                [report.accuracy.values[0], report["True"].loc["recall"]]
            },
            index=["accuracy", "recall"])

        score = pd.concat([score, actual_score], axis=1)

    return score
Пример #2
0
def k_fold(manager, k, data, weights=0):
    """
    Run a k-fold validation, retraining the manager's model on every fold.

    manager: object providing ``params`` and ``get_pred``; its ``scaler``
             and ``model`` attributes are overwritten on each fold
    k: Number of folds
    data: dataset handed to ``prp.split_data``; the sample count is read
          from ``data.count()[0]``
    weights: accepted but not used by this function

    Returns:
        Tuple ``(cms, histories)``: one confusion matrix and one training
        history per fold
    """
    fold_fraction = 1 / k
    sample_count = data.count()[0]
    confusion_matrices = []
    training_histories = []

    for fold in range(k):
        print(f"Fold {fold + 1}/{k}")

        # Each fold's test window starts at a different offset.
        fold_start = sample_count * fold // k
        parts = prp.split_data(data,
                               test_size=fold_fraction,
                               start_index=fold_start,
                               ordered=True)

        # Run a complete training on this fold's training part.
        train_dataset, manager.scaler = prp.scale_and_format(
            parts[0], parts[1], parts[2], parts[3])
        manager.model, fold_history = mlp.run_training(
            train_dataset,
            layers_sizes=manager.params["layers_sizes"],
            layers_activations=manager.params["layers_activations"],
            epochs_nb=manager.params["epochs_nb"],
            batch_size=manager.params["batch_size"])

        # Score the fold: predictions on parts[1] against labels in parts[3].
        predictions = manager.get_pred(parts[1])
        confusion_matrices.append(
            confusion_matrix(parts[3]["label"], predictions["label"]))
        training_histories.append(fold_history)

    return confusion_matrices, training_histories
Пример #3
0
    def test_split_data(self):
        """
        Check that split_data returns train and validation sets of the
        expected shapes for the mock vectors and labels.
        """
        expected_sample_shape = (2, 7)
        expected_label_shape = (2, )

        splits = split_data(self.mock_token_vectors, self.mock_labels, 2, 2)
        X_train, y_train, X_val, y_val = splits

        # Samples first, then labels, in the same order as before.
        self.assertEqual(expected_sample_shape, X_train.shape,
                         "Incorrect shape of training samples after split")
        self.assertEqual(expected_sample_shape, X_val.shape,
                         "Incorrect shape of validation samples after split")
        self.assertEqual(expected_label_shape, y_train.shape,
                         "Incorrect shape of training labels after split")
        self.assertEqual(expected_label_shape, y_val.shape,
                         "Incorrect shpae of validation labels after split")
Пример #4
0
    def run_training(self, dataset=None, weight=0, loss_function = mlp.jaccard_distance, verbose=1):
        """
        Fit a fresh neural network using the hyper-parameters in self.params.

        dataset: DataFrame whose columns are features and whose rows are
                 training samples; passed to ``prp.split_data``
        weight: when truthy, weights are computed with
                ``prp.compute_weights`` from element 2 of the split and
                forwarded to the training routine; otherwise None is passed
        loss_function: loss forwarded to ``mlp.run_training``
        verbose: verbosity forwarded to ``mlp.run_training``

        Side effects: stores the fitted scaler in ``self.scaler`` and the
        trained model in ``self.model``.

        Returns: Training history
        """
        # Prepare data
        parts = prp.split_data(dataset, test_size=self.params["test_size"])
        train_dataset, self.scaler = prp.scale_and_format(
            parts[0], parts[1], parts[2], parts[3])
        weight_list = prp.compute_weights(parts[2]) if weight else None

        # Train
        training_result = mlp.run_training(
            train_dataset,
            layers_sizes=self.params["layers_sizes"],
            layers_activations=self.params["layers_activations"],
            epochs_nb=self.params["epochs_nb"],
            batch_size=self.params["batch_size"],
            weight_list=weight_list,
            loss_function=loss_function,
            verbose=verbose)
        self.model, history = training_result
        return history
Пример #5
0
    def test_train_model(self):
        """
        Test that train_model returns both a history and a trained model.

        Builds the inputs from the mock IMDB data and the pickled mock
        embedding matrix, trains, and checks that both elements of the
        returned pair are not None.
        """
        texts, labels = preprocess_labels(data_dir_path="data/mock_aclImdb",
                                          dataset="train")
        vectorized_texts, _ = tokenize_data(texts)
        mock_X_train, mock_y_train, mock_X_val, mock_y_val = split_data(
            vectorized_texts, labels)

        # Use a context manager so the fixture file is closed even if
        # unpickling fails; the bare open() leaked the handle.
        with open("models/mock_glove.6B/mock_embedding_matrix.p",
                  "rb") as matrix_file:
            mock_embedding_matrix = pickle.load(matrix_file)
        mock_model = build_model(mock_embedding_matrix)

        mock_trained_model = train_model(mock_model,
                                         (mock_X_train, mock_y_train),
                                         (mock_X_val, mock_y_val))

        self.assertIsNotNone(mock_trained_model[1], "no model trained")
        self.assertIsNotNone(mock_trained_model[0],
                             "history dict doesn't exist")
from model import TempNN, train_model
from preprocessing import (split_data, moving_average, scaler,
                           create_sequences, train_test, generated)

# Script entry point: load the half-hourly consumption column, clean it,
# cut it into batches, smooth and scale every batch, then iterate over the
# batches with a single TempNN model.
if __name__ == "__main__":

    values_dt = pd.read_csv(
        '../temp_ds/Power-Networks-LCL-June2015(withAcornGps)v2_2.csv',
        delimiter=',')
    # Keep only the consumption column, without missing entries.
    values_dt = np.asarray(values_dt['KWH/hh (per half hour) '].dropna(
        how='any', axis=0))
    # Entries equal to the string 'Null' are replaced by -1 so that the
    # cast to float32 below works.
    values_dt[np.where(values_dt == 'Null')] = -1
    values_dt = values_dt.astype(np.float32)

    splited = split_data(values_dt, 50)  # Cut into 50 batches
    avg_splited = [
        moving_average(splited[i], 20) for i in range(len(splited))
    ]  # Average
    scalers_data = np.asarray([
        scaler(avg_splited[i]) for i in range(len(avg_splited))
    ])  # Normalize
    # NOTE(review): presumably scaler() returns a (scaler, data) pair per
    # batch, which is why column 1 is data and column 0 the scaler - confirm.
    datas = scalers_data[:, 1]  # Data (batches)
    scalers = scalers_data[:, 0]  # Scalers

    model = TempNN(n_features=1, n_hidden=64, seq_len=30, n_layers=1)
    for i, data in enumerate(datas):
        print("Batch №%d" % i)
        X_train, y_train, X_test, y_test = train_test(data)
        # Reshape the targets to a single column.
        y_train = torch.reshape(y_train, (-1, 1))
        y_test = torch.reshape(y_test, (-1, 1))
Пример #7
0
 def __init__(self, csv_data_loc, images_dir, test__split_percentage):
     """Load the CSV records, convert the images and split the data.

     csv_data_loc: location passed to ``get_data_from_csv``
     images_dir: directory passed to ``convert_images_to_numpy_array``
     test__split_percentage: split size passed to ``split_data``

     A RuntimeError raised by any of the three steps is reported with
     ``print`` and then re-raised unchanged.
     """
     try:
         raw_records = get_data_from_csv(csv_data_loc)
         self._raw_data = raw_records

         image_arrays = convert_images_to_numpy_array(raw_records, images_dir)
         self._converted_data = image_arrays

         split_parts = split_data(image_arrays, raw_records,
                                  test__split_percentage)
         (self._training_x, self._training_y, self._training_val_x,
          self._training_val_y) = split_parts
     except RuntimeError:
         print(failed_to_load_data_for_class)
         raise
import numpy as np
import preprocessing as pre
# --- Unbalanced LSTM taking the mean over 10 intervals ---

data_num = 500
num_int = 2560
# Read all of the experiment's data
X, y = pre.load_3dim('dataset/', data_num, num_int)
# Take the mean over 10 intervals for each component
X = pre.med_intervalo_3dim(X, 10)
# Reshape y so that it is accepted by the dummy-variable encoding
y = np.reshape(y, (y.shape[0], -1))
# Convert y to dummy variables
y_dummy = pre.dummy_variables(y)
# Split into training and test sets (picked at random, which also shuffles the dependent variables)
X_train, X_test, y_train, y_test = pre.split_data(X, y_dummy, 0.2, None)
# Standardize the data
X_train, X_test = pre.standardize_data(X_train, X_test)
# Implement the LSTM
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import Dense
# Size of the hidden layer
hidden_size = 32
# Create the network object
sl_model = Sequential()
# Add an LSTM layer, which takes the number of outputs, the activation function (usually the hyperbolic tangent),
# how much of this layer's input data to disregard and how much of the previous recurrent state to disregard
sl_model.add(
    LSTM(units=hidden_size,
        if inject_anom:
            if (np.random.binomial(1, 0.01))>0.5:
                y += np.random.uniform(-0.75,0.75)
        yield t, y

# Build the training signal: a generator without injected anomalies.
''' Createe training data '''
signal_generator = generate_signal(inject_anom=False)
# Draw 1000 samples from the generator, keeping only the signal value
# (the time value t is discarded).
''' Generate training data with ength of 1000'''
data = []
for i in range(1000):
    t, sig = next(signal_generator)
    data.append(sig)



train, test = preprocessing.split_data(data)

# Number of look-back steps per training sequence.
timesteps = 5
# NOTE(review): the sequences are built from the full ``data`` list, not
# from ``train``; the split above is unused in the visible code - confirm
# whether this is intentional.
X_train, Y_train = preprocessing.arange_data_for_sequence_model(data, timesteps, lookforward=1)


def define_model(n_features, timesteps, batch_size=None, stateful=False, forward_pred=1):
    """
Defines and builds a model. This function can build both stateful and none stateful model. 
    -------------------------------------------------------------------------------------------
    args:
        n_features (int) - Number of features in the data
        timesteps (int) - number of look back timesteps the model uses to generate predictions
        batch_size (int) - model training batch size
        stateful (bool) - 
        forward_pred (int) - number of forward steps to forecast
Пример #10
0
def _str_to_bool(value):
    """Convert a command-line token into a bool for use as argparse ``type``.

    ``type=bool`` treats every non-empty string (including "False") as True,
    so such a flag can never be switched off. The empty string still maps to
    False, as it did with ``bool``.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(
        "expected a boolean value, got %r" % (value,))


def main():
    """Train and evaluate the sample classifier from the command line.

    Steps:
      1. Parse the command-line arguments.
      2. Create a time-stamped output directory and a log file in it.
      3. Load the sample matrix and the label file and check that they
         describe the same number of unique samples.
      4. Build the network, optimizer, scheduler and loss (BCE-with-logits
         for two labels, cross-entropy otherwise).
      5. Fill missing values with the global minimum, plot density and
         t-SNE, split into train/test and wrap both in DataLoaders.
      6. Run ``train``/``test`` for ``--max_epoch`` epochs.
      7. Optionally plot accuracy and the confusion matrix and log the
         weighted F1 score, then log the final accuracy.

    Exits through ``sys.exit`` when ``--continuous_discrete`` is neither
    "continuous" nor "discrete".
    """
    parser = argparse.ArgumentParser(description='classifier')
    parser.add_argument('--sample_file', type=str, default='lung.emx.txt', help="the name of the GEM organized by samples (columns) by genes (rows)")
    parser.add_argument('--label_file', type=str, default='sample_condition.txt', help="name of the label file: two columns that maps the sample to the label")
    parser.add_argument('--output_name', type=str, default='tissue-run-1', help="name of the output directory to store the output files")
    parser.add_argument('--batch_size', type=int, default=16, help="size of batches to split data")
    parser.add_argument('--max_epoch', type=int, default=100, help="number of passes through a dataset")
    parser.add_argument('--learning_rate', type=float, default=0.001, help="controls the rate at which the weights of the model update")
    # argparse %-formats help strings, so a literal percent sign must be
    # written as '%%'; the unescaped one made --help raise.
    parser.add_argument('--test_split', type=float, default=0.3, help="percentage of test data, the train data will be the remaining data. 30%% -> 0.3")
    parser.add_argument('--continuous_discrete', type=str, default='continuous', help="type of data in the sample file, typically RNA will be continous and DNA will be discrete")
    # _str_to_bool instead of bool: bool("False") is True.
    parser.add_argument('--plot_results', type=_str_to_bool, default=True, help="plots the sample distribution, training/test accuracy/loss, and confusion matrix")
    parser.add_argument('--use_gpu', type=_str_to_bool, default=False, help="true to use a gpu, false to use the cpu - if the node does not have a gpu then it will use the cpu")
    args = parser.parse_args()

    # Initialize file paths and create output folder
    LABEL_FILE = os.path.join(INPUT_DIR, args.label_file)
    SAMPLE_FILE = os.path.join(INPUT_DIR, args.sample_file)
    OUTPUT_DIR_FINAL = os.path.join(OUTPUT_DIR, args.output_name + "-" + str(datetime.today().strftime('%Y-%m-%d-%H:%M')))
    if not os.path.exists(OUTPUT_DIR_FINAL):
        os.makedirs(OUTPUT_DIR_FINAL)

    # Create log file to keep track of model parameters
    logging.basicConfig(filename=os.path.join(OUTPUT_DIR_FINAL, 'classifier.log'),
                        filemode='w',
                        format='%(message)s',
                        level=logging.INFO)
    logger = logging.getLogger(__name__)
    logger.info('Classifer log file for ' + args.sample_file + ' - Started on ' + str(datetime.today().strftime('%Y-%m-%d-%H:%M')) + '\n')
    logger.info('Batch size: %d', args.batch_size)
    logger.info('Number of epochs: %d', args.max_epoch)
    logger.info('Learning Rate: %f', args.learning_rate)
    logger.info('Sample filename: ' + args.sample_file)
    logger.info('Output directory: ' + args.output_name)

    if args.continuous_discrete not in ('continuous', 'discrete'):
        logger.error("ERROR: check that the continuous_discrete argument is spelled correctly.")
        logger.error("       only continuous or discrete data can be processed.")
        sys.exit("\nCommand line argument error. Please check the log file.\n")

    # Initialize gpu usage if desired
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda and args.use_gpu else "cpu")
    # Honour --batch_size; the loaders used a hard-coded 16 while train()
    # and test() were told args.batch_size.
    train_kwargs = {'batch_size': args.batch_size}
    test_kwargs = {'batch_size': args.batch_size}
    if use_cuda:
        # NOTE(review): these options (including shuffling) are applied only
        # when CUDA is available, and to the test loader too - confirm intent.
        cuda_kwargs = {'num_workers': 1,
                       'pin_memory': True,
                       'shuffle': True}
        train_kwargs.update(cuda_kwargs)
        test_kwargs.update(cuda_kwargs)

    # Load matrix and labels
    column_names = ("sample", "label")
    matrix_df = pd.read_csv(SAMPLE_FILE, sep='\t', index_col=[0])
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True.
    labels_df = pd.read_csv(LABEL_FILE, names=column_names, sep=r'\s+', header=None)

    # Error checking for same number of samples in both files and samples are unique
    samples_unique = set(labels_df.iloc[:, 0])
    assert len(labels_df) == len(matrix_df.columns)
    assert len(labels_df) == len(samples_unique)

    labels, class_weights = preprocessing.labels_and_weights(labels_df)
    args.output_num_classes = len(labels)
    is_binary = False
    if len(labels) == 2:
        is_binary = True
        # Was misspelled 'output_num_classess', so the count silently stayed
        # at 2. NOTE(review): the model and split_data below now receive 1
        # in the binary case - confirm they expect a single output.
        args.output_num_classes = 1

    # Define model parameters
    batch_size = args.batch_size
    learning_rate = args.learning_rate
    num_features = len(matrix_df.index)

    # Setup model
    model = utils.Net(input_seq_length=num_features,
                      output_num_classes=args.output_num_classes).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.1)

    if is_binary:
        loss_fn = torch.nn.BCEWithLogitsLoss()
    else:
        # class_weights is computed above but currently not applied.
        loss_fn = torch.nn.CrossEntropyLoss()

    logger.info('Number of samples: %d\n', len(labels_df))
    logger.info('Labels: ')
    for i in range(len(labels)):
        logger.info('       %d - %s', i, labels[i])

    # Replace missing data with the global minimum of the dataset
    val_min = np.nanmin(matrix_df)
    matrix_df.fillna(val_min, inplace=True)

    # Transposing matrix to align with label file
    matrix_transposed_df = matrix_df.T

    # Create density and tsne plot
    graphs = Plotter(OUTPUT_DIR_FINAL)
    graphs.density(matrix_df)
    graphs.tsne(matrix_transposed_df, labels_df, labels, title=args.sample_file)

    train_data, test_data = preprocessing.split_data(matrix_transposed_df, labels_df, args.test_split, args.output_num_classes)

    # Convert tuple of df's to tuple of np's
    # Allows the dataset class to access w/ data[][] instead of data[].iloc[]
    train_data_np = (train_data[0].values, train_data[1].values)
    test_data_np = (test_data[0].values, test_data[1].values)

    train_dataset = dataset.Dataset(train_data_np)
    test_dataset = dataset.Dataset(test_data_np)
    # drop_last=True would drop the last batch if the sample size is not divisible by the batch size
    train_generator = data.DataLoader(train_dataset, **train_kwargs, drop_last=False)
    test_generator = data.DataLoader(test_dataset, **test_kwargs, drop_last=False)

    logger.info('\nTraining size: %d \nTesting size: %d\n', len(train_dataset), len(test_dataset))

    # Create variables to store accuracy and loss
    loss_meter = utils.AverageMeter()
    loss_meter.reset()
    train_stats = pd.DataFrame([], columns=['accuracy', 'loss'])
    test_stats = pd.DataFrame([], columns=['accuracy', 'loss'])

    # Train and test the model
    for epoch in range(args.max_epoch):
        train_stats = train(model, device, is_binary, train_generator, optimizer, loss_fn, batch_size, loss_meter, train_stats)
        test_stats = test(model, device, is_binary, test_generator, loss_fn, epoch, batch_size, loss_meter, test_stats, train_stats, logger)
        scheduler.step()

    # Training finished - Below is used for testing the network, plots and saving results
    if args.plot_results:
        y_predict_list = []
        y_target_list = []
        y_predict_list, y_target_list = forward(model, device, is_binary, test_generator, y_predict_list, y_target_list)

        graphs.accuracy(train_stats, test_stats, graphs_title=args.sample_file)
        graphs.confusion(y_predict_list, y_target_list, labels, cm_title=args.sample_file)
        logger.info("\n\nf1 score: %0.2f", f1_score(y_target_list, y_predict_list, average="weighted"))

    # With --max_epoch 0 the loop never runs and there is no final row to
    # report (the loop variable would be undefined).
    if args.max_epoch > 0:
        logger.info('\nFinal Accuracy: %2.3f', test_stats.iloc[args.max_epoch - 1]['accuracy'])
    logger.info('\nFinished at  ' + str(datetime.today().strftime('%Y-%m-%d-%H:%M')))
Пример #11
0
def train_GAN(params):

    # -------------------
    #  Parameters
    # -------------------

    log(str(params), name=params['log_name'])

    # Clear remaining model
    if params['ratio_L'] < 1.0 or params['ratio_U'] < 1.0:
        network.clear(params['name'] + '_R' + str(params['start_run']))
    plt.close('all')

    # -------------------
    #  CUDA
    # -------------------

    cuda = True if torch.cuda.is_available() else False
    G_Loss = torch.nn.BCELoss()
    D_Loss = torch.nn.BCELoss()
    C_Loss = torch.nn.BCELoss()

    if cuda:
        G_Loss.cuda()
        D_Loss.cuda()
        C_Loss.cuda()
        floatTensor = torch.cuda.FloatTensor
        log("CUDA Training.", name=params['log_name'])
        network.clear_cache()
    else:
        floatTensor = torch.FloatTensor
        log("CPU Training.", name=params['log_name'])

    # -------------------
    #  Data scaling
    # -------------------
    '''
    XTL ... Original labelled data
    XTU ... Original unlabelled data
    XTV ... Original validation data
    
    XL  ... Labelled data
    XU  ... Unlabelled data
    XV  ... Validation data
    '''

    dset_L = params['dset_L']
    dset_U = params['dset_U']
    dset_V = params['dset_V']

    if dset_L == dset_U:
        X, Y = pp.get_data(params, dset_L)
        XTL, XTU, YTL, YTU = pp.split_data(X, Y)
    else:
        XTL, YTL = pp.get_data(params, dset_L)
        XTU, YTU = pp.get_data(params, dset_U)

    if dset_V is None:
        XTV, YTV = XTU, YTU
    else:
        XTV, YTV = pp.get_data(params, dset_V)

    XTL = pp.scale_minmax(XTL)
    XTU = pp.scale_minmax(XTU)

    XTV = pp.scale_minmax(XTV)
    if params['ratio_V'] < 1.0:
        XTV, YTV = pp.select_random(XTV, YTV, params['ratio_L'])
        log("Selected %s of validation samples." %
            (format(params['ratio_V'], '0.2f')),
            name=params['log_name'])

    DL_V = pp.get_dataloader(params, XTV, YTV, batch_size=1024)

    # -------------------
    #  Load accuracy
    # -------------------

    mat_accuracy_G, mat_accuracy_D, mat_accuracy_C = network.load_Acc(params)

    if (params['R_active']):
        mat_accuracy_R = network.load_R_Acc(params)

    # -------------------
    #  Final prediction
    # -------------------

    if (params['prediction']):
        Y_pred = torch.zeros(XTU.shape[0], 8)

    # -------------------
    #  Start Training
    # -------------------

    YF = None
    PF = None
    RF = None

    for run in range(params['runs']):

        # -------------------
        #  Labelled Data
        # -------------------

        XL, YL = XTL, YTL

        if params['ratio_L'] < 1.0:
            XL, YL = pp.select_random(XL, YL, params['ratio_L'])
            log("Selected %s of labelled samples." %
                (format(params['ratio_L'], '0.2f')),
                name=params['log_name'])

        count_L = YL.shape[0]
        log("Number of labelled samples = %d." % (count_L),
            name=params['log_name'])

        DL_L = pp.get_dataloader(params, XL, YL)

        # -------------------
        #  Unlabelled Data
        # -------------------

        XU, YU = XTU, YTU

        if params['ratio_U'] < 1.0:
            XU, YU = pp.select_random(XU, YU, params['ratio_U'])
            log("Selected %s of unlabelled samples." %
                (format(params['ratio_U'], '0.2f')),
                name=params['log_name'])

        log("Number of unlabelled samples = %d." % (XU.shape[0]),
            name=params['log_name'])

        DL_U_iter = pp.get_perm_dataloader(params, XU, YU)

        # -------------------
        #  Networks
        # -------------------

        G, D, C = network.load_GAN(run, params)

        if (params['R_active']):
            R = network.load_Ref(run, params)

        # -------------------
        #  Optimizers
        # -------------------

        optimizer_G = torch.optim.Adam(G.parameters(),
                                       lr=params['GLR'],
                                       betas=(params['GB1'], params['GB2']))
        optimizer_D = torch.optim.Adam(D.parameters(),
                                       lr=params['DLR'],
                                       betas=(params['DB1'], params['DB2']))
        optimizer_C = torch.optim.Adam(C.parameters(),
                                       lr=params['CLR'],
                                       betas=(params['CB1'], params['CB2']))

        if (params['R_active']):
            optimizer_R = torch.optim.Adam(R.parameters(),
                                           lr=params['CLR'],
                                           betas=(params['CB1'],
                                                  params['CB2']))

        # -------------------
        #  Training
        # -------------------

        if run >= params['start_run']:

            if params['oversampling']:
                XL, YL = pp.over_sampling(params, XL, YL)
                log("Oversampling: created %d new labelled samples." %
                    (XL.shape[0] - count_L),
                    name=params['log_name'])

            for epoch in range(params['epochs']):

                # Jump to start epoch
                if run == params['start_run']:
                    if epoch < params['start_epoch']:
                        continue

                running_loss_G = 0.0
                running_loss_D = 0.0
                running_loss_C = 0.0
                """
                      X1, P1      - Labelled Data,      predicted Labels (C)                             | Regular training of classifier
                W1 = (X1, Y1), A1 - Labelled Data,      actual Labels,        predicted Authenticity (D) | Real samples
                W2 = (X2, Y2), A2 - Unlabelled Data,    predicted Labels (C), predicted Authenticity (D) | Real data with fake labels
                W3 = (X3, Y3), A3 - Synthetic Data (G), actual Labels,        predicted Authenticity (D) | Fake data with real labels
                W4 = (X4, Y4), A4 - Unlabbeled Data,    predicted Labels (C), predicted Authenticity (D) | Fake positive to prevent overfitting
                      XV, YV,  PV - Validation Data,    actual Labels,        predicted Labels (C)       | Validation samples
                  R1, F2, F3,  R4 - Real/Fake Labels
                """
                for i, data in enumerate(DL_L, 1):

                    loss_G = []
                    loss_D = []
                    loss_C = []

                    # -------------------
                    #  Train the classifier on real samples
                    # -------------------
                    X1, Y1 = data
                    W1 = torch.cat((X1, Y1), dim=1)
                    R1 = floatTensor(W1.shape[0], 1).fill_(1.0)

                    if params['C_basic_train']:
                        optimizer_C.zero_grad()
                        P1 = C(X1)
                        loss = C_Loss(P1, Y1)
                        loss_C.append(loss)
                        loss.backward()
                        optimizer_C.step()

                    if params['R_active']:
                        optimizer_R.zero_grad()
                        PR = R(X1)
                        loss = C_Loss(PR, Y1)
                        loss.backward()
                        optimizer_R.step()

                    # -------------------
                    #  Train the discriminator to label real samples
                    # -------------------
                    optimizer_D.zero_grad()
                    A1 = D(W1)
                    loss = D_Loss(A1, R1)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Classify unlabelled data
                    # -------------------
                    optimizer_C.zero_grad()
                    X2 = DL_U_iter.get_next()[0]
                    Y2 = C(X2)
                    W2 = torch.cat((X2, Y2), dim=1)

                    # -------------------
                    #  Train the classifier to label unlabelled samples
                    # -------------------
                    A2 = D(W2)
                    R2 = floatTensor(W2.shape[0], 1).fill_(1.0)
                    loss = C_Loss(A2, R2)
                    loss_C.append(loss)
                    loss.backward()
                    optimizer_C.step()

                    # -------------------
                    #  Train the discriminator to label predicted samples
                    # -------------------
                    optimizer_D.zero_grad()
                    A2 = D(W2.detach())
                    F2 = floatTensor(W2.shape[0], 1).fill_(0.0)
                    loss = D_Loss(A2, F2)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Train the discriminator to label fake positive samples
                    # -------------------
                    X4 = DL_U_iter.get_next()[0]
                    Y4 = C(X4)
                    W4 = torch.cat((X4, Y4), dim=1)

                    optimizer_D.zero_grad()
                    A4 = D(W4)
                    R4 = floatTensor(W4.shape[0], 1).fill_(1.0)
                    loss = D_Loss(A4, R4)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Create Synthetic Data
                    # -------------------
                    optimizer_G.zero_grad()
                    if params['G_label_sample']:
                        # Selected Labels from a uniform distribution of available labels
                        Y3 = floatTensor(
                            pp.get_one_hot_labels(params=params,
                                                  num=Y1.shape[0] *
                                                  params['G_label_factor']))
                    else:
                        # Select labels from current training batch
                        Y3 = torch.cat(
                            ([Y1 for _ in range(params['G_label_factor'])]),
                            dim=0)

                    Z = floatTensor(
                        np.random.normal(0, 1,
                                         (Y3.shape[0], params['noise_shape'])))
                    I3 = torch.cat((Z, Y3), dim=1)
                    X3 = G(I3)
                    W3 = torch.cat((X3, Y3), dim=1)

                    # -------------------
                    #  Train the generator to fool the discriminator
                    # -------------------
                    A3 = D(W3)
                    R3 = floatTensor(W3.shape[0], 1).fill_(1.0)
                    loss = G_Loss(A3, R3)
                    loss_G.append(loss)
                    loss.backward()
                    optimizer_G.step()

                    # -------------------
                    #  Train the discriminator to label synthetic samples
                    # -------------------
                    optimizer_D.zero_grad()
                    A3 = D(W3.detach())
                    F3 = floatTensor(W3.shape[0], 1).fill_(0.0)
                    loss = D_Loss(A3, F3)
                    loss_D.append(loss)
                    loss.backward()
                    optimizer_D.step()

                    # -------------------
                    #  Calculate overall loss
                    # -------------------
                    running_loss_G += np.mean([loss.item() for loss in loss_G])
                    running_loss_D += np.mean([loss.item() for loss in loss_D])
                    running_loss_C += np.mean([loss.item() for loss in loss_C])

                # -------------------
                #  Post Epoch
                # -------------------

                logString = "[Run %d/%d] [Epoch %d/%d] [G loss: %f] [D loss: %f] [C loss: %f]" % (
                    run + 1, params['runs'], epoch + 1, params['epochs'],
                    running_loss_G / (i), running_loss_D /
                    (i), running_loss_C / (i))
                log(logString, save=False, name=params['log_name'])

                if (epoch + 1) % params['save_step'] == 0:
                    idx = run, int(epoch / params['save_step']) + 1

                    acc_D_real = []
                    acc_D_vs_C = []
                    acc_D_vs_G = []
                    acc_C_real = []

                    for data in DL_V:

                        XV, YV = data

                        # Predict labels
                        PV = C(XV)

                        if params['R_active']:
                            PR = R(XV)
                            mat_accuracy_R[idx] = get_accuracy(PR, YV)
                            network.save_Ref(params['name'], run, R)
                            network.save_R_Acc(params, mat_accuracy_R)

                        # Generate Synthetic Data
                        Z = floatTensor(
                            np.random.normal(
                                0, 1, (YV.shape[0], params['noise_shape'])))
                        IV = torch.cat((Z, YV), dim=1)
                        XG = G(IV)

                        # Estimate Discriminator Accuracy
                        WV1 = torch.cat((XV, YV), dim=1)
                        WV2 = torch.cat((XV, PV), dim=1)
                        WV3 = torch.cat((XG, YV), dim=1)
                        RV1 = floatTensor(WV1.shape[0], 1).fill_(1.0)
                        FV2 = floatTensor(WV2.shape[0], 1).fill_(0.0)
                        FV3 = floatTensor(WV3.shape[0], 1).fill_(0.0)

                        AV1 = D(WV1)
                        AV2 = D(WV2)
                        AV3 = D(WV3)

                        acc_D_real.append(get_accuracy_binary(AV1, RV1))
                        acc_D_vs_C.append(get_accuracy_binary(AV2, FV2))
                        acc_D_vs_G.append(get_accuracy_binary(AV3, FV3))

                        acc_C_real.append(get_accuracy(PV, YV))

                    acc_D_real = np.mean(acc_D_real)
                    acc_D_vs_C = np.mean(acc_D_vs_C)
                    acc_D_vs_G = np.mean(acc_D_vs_G)
                    acc_D = .5 * acc_D_real + .25 * acc_D_vs_G + .25 * acc_D_vs_C
                    mat_accuracy_D[idx] = acc_D

                    acc_C_real = np.mean(acc_C_real)
                    acc_C_vs_D = 1.0 - acc_D_vs_C
                    acc_C = .5 * acc_C_real + .5 * acc_C_vs_D
                    mat_accuracy_C[idx] = acc_C_real

                    acc_G = 1.0 - acc_D_vs_G
                    mat_accuracy_G[idx] = acc_G

                    logString = "[Run %d/%d] [Epoch %d/%d] [G acc: %f] [D acc: %f | vs Real: %f | vs G: %f | vs C: %f] [C acc: %f | vs Real: %f | vs D: %f]" % (
                        run + 1, params['runs'], epoch + 1, params['epochs'],
                        acc_G, acc_D, acc_D_real, acc_D_vs_G, acc_D_vs_C,
                        acc_C, acc_C_real, acc_C_vs_D)
                    log(logString, save=True, name=params['log_name'])

                    network.save_GAN(params['name'], run, G, D, C)
                    params['start_epoch'] = epoch + 1
                    network.save_Parameter(params)
                    network.save_Acc(params, mat_accuracy_G, mat_accuracy_D,
                                     mat_accuracy_C)

            # End of Training Run
            params['start_run'] = run + 1
            params['start_epoch'] = 0
            network.save_Parameter(params)

        # -------------------
        #  Post Run
        # -------------------

        acc_C_real = []

        for data in DL_V:

            XV, YV = data

            # # Generate Synthetic Data
            # Z = floatTensor(np.random.normal(0, 1, (YV.shape[0], params['noise_shape'])))
            # IV = torch.cat((Z,YV),dim=1)
            # XG = G(IV)

            # Classify Validation data
            PC = C(XV)
            acc_C_real.append(get_accuracy(PC, YV))

            if params['R_active']:
                if RF == None:
                    RF = R(XV)
                else:
                    RF = torch.cat((RF, R(XV).detach()), 0)

            if YF == None:
                YF = YV
                PF = PC
            else:
                YF = torch.cat((YF, YV), 0)
                PF = torch.cat((PF, PC), 0)

        mat_accuracy_C[run] = np.mean(acc_C_real)

        # -------------------
        #  Final prediction
        # -------------------

        if (params['prediction']):
            C.hard = False
            XP = pp.get_tensor(XTU, None)[0]
            YP = C(XP)
            Y_pred += YP.cpu().detach()
            C.hard = True

    # -------------------
    #  Post Training
    # -------------------

    timeline = np.arange(0, params['epochs'] + 1, params['save_step'])

    # -------------------
    #  Plot Accuracy
    # -------------------

    acc_G = np.mean(mat_accuracy_G, axis=0)
    std_G = np.std(mat_accuracy_G, axis=0)
    acc_D = np.mean(mat_accuracy_D, axis=0)
    std_D = np.std(mat_accuracy_D, axis=0)
    acc_C = np.mean(mat_accuracy_C, axis=0)
    std_C = np.std(mat_accuracy_C, axis=0)
    if params['R_active']:
        acc_R = np.mean(mat_accuracy_R, axis=0)

    fig, ax = plt.subplots()

    legend = []
    cmap = plt.get_cmap('gnuplot')
    indices = np.linspace(0, cmap.N, 7)
    colors = [cmap(int(i)) for i in indices]

    ax.plot(timeline, acc_C, c=colors[0], linestyle='solid')
    ax.fill_between(timeline,
                    acc_C - std_C,
                    acc_C + std_C,
                    alpha=0.3,
                    facecolor=colors[0])
    legend.append("Accuracy $A_C$")

    ax.plot(timeline, acc_D, c=colors[1], linestyle='dashed')
    ax.fill_between(timeline,
                    acc_D - std_D,
                    acc_D + std_D,
                    alpha=0.3,
                    facecolor=colors[1])
    legend.append("Accuracy $A_D$")

    ax.plot(timeline, acc_G, c=colors[2], linestyle='dotted')
    ax.fill_between(timeline,
                    acc_G - std_G,
                    acc_G + std_G,
                    alpha=0.3,
                    facecolor=colors[2])
    legend.append("Accuracy $A_G$")

    Y_max = 1.15
    if params['R_active']:
        ax.plot(timeline, acc_R, c=colors[3], linestyle='dashdot')
        legend.append("Accuracy $A_R$")

        perf = np.zeros_like(acc_C)
        perf[0] = 0.0
        perf[1:] = (acc_C[1:] - acc_R[1:]) / acc_R[1:]

        ax.plot(timeline, perf + 1, c=colors[4], linestyle='solid')
        legend.append("Performance $P_C$")

    ax.set_xlim(0.0, params['epochs'])
    ax.set_ylim(0.0, Y_max)

    ax.legend(legend, fontsize=20)
    ax.set_xlabel('Epoch', fontsize=20)
    ax.set_ylabel('Accuracy', fontsize=20)

    ax.grid()
    save_fig(params, 'eval', fig)

    # -------------------
    #  Compare Classifier to Baseline
    # -------------------

    if params['R_active']:
        maxC = np.argmax(acc_C, axis=0)
        bestC = acc_C[maxC]
        maxR = np.argmax(acc_R, axis=0)
        bestR = acc_R[maxR]
        log(' - Peak Accuracy: C: %s after %d epochs | R: %s after %d epochs | Inc: %s'
            % (format((bestC), '0.4f'), timeline[maxC], format(
                (bestR), '0.4f'), timeline[maxR],
               format((bestC - bestR) / bestR, '0.4f')),
            name='results')

        Y_max = max(Y_max, max(perf + 1) + 0.025)

        maxP = np.argmax(perf, axis=0)
        log(' - Hightest $P_C$: %s after %d epochs.' % (format(
            (perf[maxP]), '0.4f'), timeline[maxP]),
            name='results')

        adva = np.zeros_like(acc_C)
        for i, v1 in enumerate(acc_C):
            for j, v2 in enumerate(acc_R):
                if v2 >= v1:
                    adva[i] = j - i
                    break

        maxA = np.argmax(adva, axis=0)
        log(' - Biggest Advantage: %d epochs after %d epochs.' %
            (adva[maxA] * params['save_step'], timeline[maxA]),
            name='results')

    # -------------------
    #  Log Results
    # -------------------

    if params['evaluate']:
        log(" - %s ( %s | %s ):  [C acc: %f ( ± %f )]" %
            (params['name'], params['dset_V'], params['location'], acc_C[-1],
             std_C[-1]),
            name='results')
    else:
        log(" - " + params['name'] +
            ": [C acc: %f ( ± %f )] [D acc: %f ( ± %f )] [G acc: %f ( ± %f )]"
            %
            (acc_C[-1], std_C[-1], acc_D[-1], std_D[-1], acc_G[-1], std_G[-1]),
            name='results')

    # -------------------
    #  Generate Confusion Matrix
    # -------------------

    YF = pp.one_hot_to_labels(params, YF)
    PF = pp.one_hot_to_labels(params, PF)

    con_mat = confusion_matrix(YF,
                               PF,
                               labels=None,
                               sample_weight=None,
                               normalize='true')
    if params['evaluate']:
        plot_confusion_matrix(con_mat,
                              params,
                              name='%s_%s' %
                              (params['dset_V'], params['location']),
                              title='Confusion matrix')
    else:
        plot_confusion_matrix(con_mat,
                              params,
                              name='C',
                              title='Confusion matrix')

    if params['R_active']:
        RF = pp.one_hot_to_labels(params, RF)
        con_mat = confusion_matrix(YF,
                                   RF,
                                   labels=None,
                                   sample_weight=None,
                                   normalize='true')
        plot_confusion_matrix(con_mat,
                              params,
                              name='R',
                              title='Confusion matrix')

    # -------------------
    #  Final prediction
    # -------------------

    if (params['prediction']):
        network.make_dir_pre()
        pred = torch.argmax(Y_pred, axis=1)
        f = open(network.S_PATH + params['name'] + '_predictions.txt', "w")
        for y in pred:
            f.write(' '.join(['%.6f' % (float(y.item() + 1))] * 500) + '\n')
        f.close()
import numpy as np
import preprocessing as pre
# --- Balanced SVM using the mean over intervals of 10 ---
# Flat script: loads the experiment data, reduces and balances it, trains an
# RBF-kernel SVM and prints the confusion matrix on the held-out split.

data_num = 500
num_int = 2560
# Load all the data from the experiment
X, y = pre.load('dataset/', data_num, num_int)
# Take the mean over 10 intervals for each component
X = pre.med_intervalo(X, 10)
# Balance the data
X, y = pre.proc_balanceado(X, y, data_num)
# Split into training and test sets (picked at random, shuffling the
# dependent variables as well).
# NOTE(review): 0.2 is presumably the test fraction and None the random seed
# (i.e. a non-reproducible split) -- confirm against pre.split_data.
X_train, X_test, y_train, y_test = pre.split_data(X, y, 0.2, None)
# Standardize the data
X_train, X_test = pre.standardize_data(X_train, X_test)

# Build the SVM
from sklearn.svm import SVC
# probability=True is needed for the predict_proba call further down.
classifier = SVC(kernel='rbf', probability=True, gamma='auto')
# Train the SVM (ravel() hands the labels to fit() as a 1-D array)
classifier.fit(X_train, y_train.ravel())

# Predict the test-set results
y_pred = classifier.predict(X_test)
# Per-class probability estimates for the same test samples
svm_predict = classifier.predict_proba(X_test)

# Build the confusion matrix for the SVM above
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print('\n\nConfusion Matrix: \n', cm)
def twod_array(array, split_index=2560):
    """Place the two parts of ``array`` side by side as a two-column array.

    The input is cut at ``split_index``; everything before the cut becomes
    column 0 and everything from the cut onwards becomes column 1.

    :param array: NumPy array to split along its first axis.
    :param split_index: Position of the cut. Defaults to 2560, the value that
        used to be hard-coded here.
    :return: Array of shape ``(n, 2)``.
    :raises ValueError: Raised by ``np.concatenate`` when the two parts do not
        hold the same number of elements.
    """
    # reshape(-1, 1) turns each part into a single column before joining.
    first_column = array[:split_index].reshape(-1, 1)
    second_column = array[split_index:].reshape(-1, 1)
    return np.concatenate([first_column, second_column], axis=1)
# MLP - original dataset: ######################################################################
# Flat script: loads the raw data, one-hot encodes the labels, splits and
# standardizes the features, then builds and compiles a one-hidden-layer MLP.

data_num = 500
num_int = 2560
random_seed = 31

X, y = pre.load('dataset/',data_num,num_int)

# OneHotEncoder expects a 2-D input, hence the reshape to a single column.
y_dummy = y.reshape(-1, 1)
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
# fit_transform returns a sparse matrix; toarray() makes it dense.
y_dummy = onehotencoder.fit_transform(y_dummy).toarray()

# NOTE(review): 0.2 is presumably the test fraction and random_seed the split
# seed -- confirm against pre.split_data.
X_train, X_test, y_train, y_test = pre.split_data(X,y_dummy,0.2,random_seed)
X_train, X_test = pre.standardize_data(X_train,X_test)

from keras.models import Sequential
from keras.layers import Dense, Dropout


mlp_cls = Sequential()
# Hidden layer: 128 sigmoid units fed by every input feature.
mlp_cls.add(Dense(units=128, kernel_initializer='uniform', activation='sigmoid', input_dim=X_train.shape[1]))
mlp_cls.add(Dropout(0.5))
# Output layer: one softmax unit per one-hot label column.
mlp_cls.add(Dense(units=y_train.shape[1], kernel_initializer='uniform', activation='softmax'))
mlp_cls.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Snapshot of the weights taken right after compile, before any fit() call in
# this script (presumably kept so the model can be reset later -- confirm).
weigths = mlp_cls.get_weights()

Пример #14
0
def train_evaluate_model(city,
                         data,
                         predict_n,
                         look_back,
                         hidden,
                         epochs,
                         ratio=0.7,
                         cluster=True,
                         load=False,
                         uncertainty=True):
    """
    Train a model for one city, evaluate it and persist model, metrics and
    predictions under ``../saved_models/LSTM/<STATE>/``.
    :param city: Name of the city
    :param data: Dataset
    :param predict_n: Number of steps ahead to be predicted
    :param look_back: number of history steps to include in training window
    :param hidden: Number of Hidden layer
    :param epochs: number of training epochs
    :param ratio: ratio of the full dataset to use in training
    :param cluster: whether to train on features from the city's cluster; when
        true the target column is ``casos_est_<city>``, otherwise ``casos_est``
    :param load: Whether to load a previously saved model
    :param uncertainty: forwarded to ``evaluate``; when true the median
        (50th percentile along axis 2) of the out-of-sample predictions is
        used for the metrics
    :return: tuple ``(predicted, X_test, Y_test, Y_train, factor)`` where
        ``predicted`` is the in-sample predictions followed by the
        out-of-sample ones and ``factor`` is the normalization maximum of the
        target column
    """
    # Locate the target column by name.
    target_name = "casos_est_{}".format(city) if cluster else "casos_est"
    target_col = list(data.columns).index(target_name)

    scaled, feature_maxima = normalize_data(data)
    factor = feature_maxima[target_col]

    # Train / test windows
    X_train, Y_train, X_test, Y_test = split_data(
        scaled,
        look_back=look_back,
        ratio=ratio,
        predict_n=predict_n,
        Y_column=target_col,
    )
    print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

    # Build, optionally warm-start, train and save the network
    n_features = X_train.shape[2]
    model = build_model(hidden,
                        n_features,
                        predict_n=predict_n,
                        look_back=look_back)
    if load:
        model.load_weights("trained_{}_model.h5".format(city))
    train(model,
          X_train,
          Y_train,
          batch_size=1,
          epochs=epochs,
          geocode=city)
    model.save('../saved_models/LSTM/{}/lstm_{}_epochs_{}.h5'.format(
        STATE, city, epochs))

    # Out-of-sample first, then in-sample (same call order as before).
    predicted_out, _ = evaluate(city,
                                model,
                                X_test,
                                Y_test,
                                label="out_of_sample_{}".format(city),
                                uncertainty=uncertainty)
    predicted_in, _ = evaluate(city,
                               model,
                               X_train,
                               Y_train,
                               label="in_sample_{}".format(city),
                               uncertainty=uncertainty)

    # Metrics are computed on a point forecast of the out-of-sample data.
    point_forecast = (np.percentile(predicted_out, 50, axis=2)
                      if uncertainty else predicted_out)
    metrics = calculate_metrics(point_forecast, Y_test, factor)
    metrics.to_pickle("../saved_models/LSTM/{}/metrics_lstm_{}_8pw.pkl".format(
        STATE, city))

    predicted = np.concatenate((predicted_in, predicted_out), axis=0)
    predictions_path = "../saved_models/LSTM/{}/predicted_lstm_{}_8pw.pkl".format(
        STATE, city)
    with open(predictions_path, "wb") as f:
        pickle.dump(predicted, f)

    return predicted, X_test, Y_test, Y_train, factor
Пример #15
0
import numpy as np
import preprocessing as pre
# --- Unbalanced MLP using the mean over 10 intervals ---

data_num = 500
num_int = 2560
# Load all the data from the experiment
X, y = pre.load('dataset/', data_num, num_int)
# Take the mean over 10 intervals for each component
X = pre.med_intervalo(X, 10)
# Reshape y to 2-D so that the dummy-variable helper accepts it
y = np.reshape(y, (y.shape[0], -1))
# Convert y to dummy variables
y_dummy = pre.dummy_variables(y)
# Split into training and test sets (picked at random, shuffling the
# dependent variables as well).
# NOTE(review): 0.2 is presumably the test fraction and None the random seed
# -- confirm against pre.split_data.
X_train, X_test, y_train, y_test = pre.split_data(X, y_dummy, 0.2, None)
# Standardize the data
X_train, X_test = pre.standardize_data(X_train, X_test)

# Build the MLP
from keras.models import Sequential
# (import above) module responsible for initializing the network
from keras.layers import Dense, Dropout
# (import above) module responsible for creating the network layers
mlp_cls = Sequential()

# Outputs of the first layer: (183 + 1) / 2 = 92
mlp_cls.add(
    Dense(units=92,
          kernel_initializer='uniform',
          activation='sigmoid',
Пример #16
0
def run(
    filename: str = '^GSPC.csv',
    frac_train: float = .8,
    save_plots: bool = False,
    show_plots: bool = False,
):
    """Fit and compare three volume models on two transforms of the data.

    The data returned by ``get_data.get_data`` is passed through
    ``preprocessing.adjust_to_seasonality`` with ``['scale']`` and the result
    again with ``['first_diff']``. For each of the two resulting frames a
    reference model, a SARIMAX model and a VARMAX model are fitted on the
    train split and their ``results(...)`` are collected and pretty-printed.

    :param filename: file name forwarded to ``get_data.get_data``.
    :param frac_train: fraction forwarded to ``preprocessing.split_data``.
    :param save_plots: forwarded to each model's ``results``.
    :param show_plots: forwarded to each model's ``results``.
    :return: None; the collected results are printed with ``pprint``.
    """
    # load data
    raw = get_data.get_data(filename=filename)

    # preprocess
    scaled = raw.apply(preprocessing.adjust_to_seasonality,
                       args=(['scale'], ))
    differenced = scaled.apply(preprocessing.adjust_to_seasonality,
                               args=(['first_diff'], ))

    def collect(fitted, transform):
        # Every model is reported the same way: title is "<model>_<transform>".
        return fitted.results(show_plots=show_plots,
                              save_plots=save_plots,
                              plot_args={
                                  'title': str(fitted) + '_' + transform,
                                  'ylabel': transform
                              })

    res = {}
    for transform, df in (('scaled', scaled), ('first_diff', differenced)):
        train, test = preprocessing.split_data(df, frac_train=frac_train)

        # reference model
        reference = model.reference.Reference(train['volume'], test['volume'])
        res['reference; ' + transform] = collect(reference, transform)

        # univariate model
        sarimax = model.statespace_models.Model(
            train['volume'],
            test['volume'],
            model=model.statespace_models.SARIMAX,
            trend='ct',
            order=(4, 1, 4),
            enforce_invertibility=False)
        res['sarimax; ' + transform] = collect(sarimax, transform)

        # multivariate model
        varmax = model.statespace_models.Model(
            train[['open', 'close', 'volume']],
            test[['open', 'close', 'volume']],
            column='volume',
            model=model.statespace_models.VARMAX,
            trend='c',
            order=(4, 1))
        res['varmax; ' + transform] = collect(varmax, transform)

    pprint(res)
Пример #17
0
def main(*kargs, **kwargs):
    """Run the comment-classification pipeline end to end.

    Stages, in order: load data and embeddings, clean text and build
    sequences (modes ``'preprocess'`` and ``'all'``) or parse previously
    stored sequences (any other mode), train the models listed in the config,
    write ``metrics.json``, train a CatBoost meta-model on the validation
    predictions, and write the final predictions to ``kwargs['output']``.

    Keyword arguments read (all required after ``get_kwargs`` has processed
    them): ``train``, ``test``, ``output``, ``word_embeds``, ``char_embeds``,
    ``logger``, ``mode``, ``max_words``, ``use_only_exists_words``,
    ``swear_words``, ``wrong_words``, ``format_embeds``, ``config``,
    ``output_dir``, ``norm_prob``, ``norm_prob_koef``, ``gpus``.

    :param kargs: ignored positional arguments.
    :param kwargs: pipeline options, see above.
    :return: ``True`` when the run stops early (``mode == 'preprocess'`` or
        ``mode == 'validate'``); ``None`` after a full run.
    :raises ValueError: if the config names a model label this function does
        not know how to train.
    """
    # NOTE(review): presumably fills in / validates defaults in place --
    # confirm against get_kwargs.
    get_kwargs(kwargs)
    train_fname = kwargs['train']
    test_fname = kwargs['test']
    result_fname = kwargs['output']
    word_embeds_fname = kwargs['word_embeds']
    char_embeds_fname = kwargs['char_embeds']
    logger_fname = kwargs['logger']
    mode = kwargs['mode']
    max_words = kwargs['max_words']
    use_only_exists_words = kwargs['use_only_exists_words']
    swear_words_fname = kwargs['swear_words']
    wrong_words_fname = kwargs['wrong_words']
    embeds_format = kwargs['format_embeds']
    config = kwargs['config']
    output_dir = kwargs['output_dir']
    norm_prob = kwargs['norm_prob']
    norm_prob_koef = kwargs['norm_prob_koef']
    gpus = kwargs['gpus']

    seq_col_name_words = 'comment_seq_lw_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))
    seq_col_name_ll3 = 'comment_seq_ll3_use_exist{}_{}k'.format(
        int(use_only_exists_words), int(max_words / 1000))

    # NOTE(review): this mapping is not referenced anywhere else in this
    # function; models are saved under model_params['common']['model_file'].
    model_file = {
        'dense': os.path.join(output_dir, 'dense.h5'),
        'cnn': os.path.join(output_dir, 'cnn.h5'),
        'lstm': os.path.join(output_dir, 'lstm.h5'),
        'lr': os.path.join(output_dir, '{}_logreg.bin'),
        'catboost': os.path.join(output_dir, '{}_catboost.bin')
    }

    # ====Create logger====
    logger = Logger(logging.getLogger(), logger_fname)

    # ====Detect GPUs====
    logger.debug(device_lib.list_local_devices())

    # ====Load data====
    logger.info('Loading data...')
    train_df = load_data(train_fname)
    test_df = load_data(test_fname)

    target_labels = [
        'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'
    ]
    num_classes = len(target_labels)

    # ====Load additional data====
    logger.info('Loading additional data...')
    # Loaded for parity with the original flow; not used further below.
    swear_words = load_data(swear_words_fname,
                            func=lambda x: set(x.T[0]),
                            header=None)
    wrong_words_dict = load_data(wrong_words_fname,
                                 func=lambda x: {val[0]: val[1]
                                                 for val in x})

    # ====Load word vectors====
    logger.info('Loading embeddings...')
    embeds_word = Embeds().load(word_embeds_fname, embeds_format)
    embeds_ll3 = Embeds().load(char_embeds_fname, embeds_format)

    # ====Clean texts====
    if mode in ('preprocess', 'all'):
        logger.info('Cleaning text...')
        train_df['comment_text_clear'] = clean_text(train_df['comment_text'],
                                                    wrong_words_dict,
                                                    autocorrect=True)
        test_df['comment_text_clear'] = clean_text(test_df['comment_text'],
                                                   wrong_words_dict,
                                                   autocorrect=True)
        train_df.to_csv(os.path.join(output_dir, 'train_clear.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear.csv'), index=False)

    # ====Calculate maximum seq length====
    logger.info('Calc text length...')
    train_df.fillna('__NA__', inplace=True)
    test_df.fillna('__NA__', inplace=True)
    train_df['text_len'] = train_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    test_df['text_len'] = test_df['comment_text_clear'].apply(
        lambda words: len(words.split()))
    # Mean length plus three standard deviations of the training texts.
    max_seq_len = np.round(train_df['text_len'].mean() +
                           3 * train_df['text_len'].std()).astype(int)
    max_char_seq_len = 2000  # empirical
    logger.debug('Max seq length = {}'.format(max_seq_len))

    # ====Prepare data to NN====
    logger.info('Converting texts to sequences...')

    if mode in ('preprocess', 'all'):
        train_df[seq_col_name_words], test_df[
            seq_col_name_words], word_index, train_df[
                seq_col_name_ll3], test_df[
                    seq_col_name_ll3], ll3_index = convert_text2seq(
                        train_df['comment_text_clear'].tolist(),
                        test_df['comment_text_clear'].tolist(),
                        max_words,
                        max_seq_len,
                        max_char_seq_len,
                        embeds_word,
                        lower=True,
                        oov_token='__NA__',
                        uniq=False,
                        use_only_exists_words=use_only_exists_words)
        logger.debug('Dictionary size use_exist{} = {}'.format(
            int(use_only_exists_words), len(word_index)))
        logger.debug('Char dict size use_exist{} = {}'.format(
            int(use_only_exists_words), len(ll3_index)))

        logger.info('Preparing embedding matrix...')
        words_not_found = embeds_word.set_matrix(max_words, word_index)
        # Character-level embeddings are randomly initialised with the same
        # width as the word embeddings.
        embeds_ll3.matrix = np.random.normal(size=(len(ll3_index),
                                                   embeds_word.shape[1]))
        embeds_ll3.word_index = ll3_index
        embeds_ll3.word_index_reverse = {
            val: key
            for key, val in ll3_index.items()
        }
        embeds_ll3.shape = np.shape(embeds_ll3.matrix)
        embeds_word.save(
            os.path.join(output_dir,
                         'wiki.embeds_lw.{}k'.format(int(max_words / 1000))))
        embeds_ll3.save(
            os.path.join(output_dir,
                         'wiki.embeds_ll3.{}k'.format(int(max_words / 1000))))

        # ====Get text vector====
        pooling = {
            'max': {
                'func': np.max
            },
            'avg': {
                'func': np.sum,
                'normalize': True
            },
            'sum': {
                'func': np.sum,
                'normalize': False
            }
        }
        for p in ['max', 'avg', 'sum']:
            # The lambdas are consumed immediately by apply(), so closing
            # over the loop variable p is safe here.
            train_df['comment_vec_{}'.format(
                p)] = train_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
            test_df['comment_vec_{}'.format(
                p)] = test_df[seq_col_name_words].apply(
                    lambda x: embed_aggregate(x, embeds_word, **pooling[p]))
        train_df.to_csv(os.path.join(output_dir, 'train_clear1.csv'),
                        index=False)
        test_df.to_csv(os.path.join(output_dir, 'test_clear1.csv'),
                       index=False)
    else:
        # Sequences/vectors come from a previous preprocess run; parse them
        # back from their stored form.
        for col in train_df.columns:
            if col.startswith('comment_seq'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, int))
                test_df[col] = test_df[col].apply(lambda x: parse_seq(x, int))
            elif col.startswith('comment_vec'):
                train_df[col] = train_df[col].apply(
                    lambda x: parse_seq(x, float))
                test_df[col] = test_df[col].apply(
                    lambda x: parse_seq(x, float))

    logger.debug('Embedding matrix shape = {}'.format(embeds_word.shape))
    logger.debug('Number of null word embeddings = {}'.format(
        np.sum(np.sum(embeds_word.matrix, axis=1) == 0)))

    # ====END OF `PREPROCESS`====
    if mode == 'preprocess':
        return True

    # ====Train/test split data====
    x = np.array(train_df[seq_col_name_words].values.tolist())
    y = np.array(train_df[target_labels].values.tolist())
    x_train_nn, x_val_nn, y_train, y_val, train_idxs, val_idxs = split_data(
        x, y, test_size=0.2, shuffle=True, random_state=42)
    x_test_nn = np.array(test_df[seq_col_name_words].values.tolist())

    x_char = np.array(train_df[seq_col_name_ll3].values.tolist())
    x_char_train_nn = x_char[train_idxs]
    x_char_val_nn = x_char[val_idxs]
    x_char_test_nn = np.array(test_df[seq_col_name_ll3].values.tolist())

    x_train_tfidf = train_df['comment_text_clear'].values[train_idxs]
    x_val_tfidf = train_df['comment_text_clear'].values[val_idxs]
    x_test_tfidf = test_df['comment_text_clear'].values

    catboost_cols = catboost_features(train_df, test_df)
    x_train_cb = train_df[catboost_cols].values[train_idxs].T
    x_val_cb = train_df[catboost_cols].values[val_idxs].T
    x_test_cb = test_df[catboost_cols].values.T

    # ====Train models====
    nn_models = {'cnn': cnn, 'dense': dense, 'rnn': rnn}

    params = Params(config)

    metrics = {}
    predictions = {}
    # enumerate() supplies the index used in the default alias; the original
    # code referenced an undefined name there.
    for model_idx, param in enumerate(params['models']):
        for model_label, model_params in param.items():
            common = model_params.get('common', {})

            # Resolve the alias before branching so that every model type
            # (not only the neural networks) stores its results under its
            # own key instead of an undefined or stale one.
            model_alias = common.get('alias', None)
            if not model_alias:
                model_alias = '{}_{}'.format(model_label, model_idx)

            if model_label in nn_models:
                warm_start_file = common.get('model_file', '')
                if common.get('warm_start',
                              False) and os.path.exists(warm_start_file):
                    # A warm-started network replaces the freshly built one
                    # and then goes through the normal train/predict path, so
                    # it actually contributes predictions and metrics.
                    logger.info('{} warm starting...'.format(model_label))
                    model = load_model(warm_start_file)
                else:
                    model = nn_models[model_label](embeds_word.matrix,
                                                   embeds_ll3.matrix,
                                                   num_classes,
                                                   max_seq_len,
                                                   max_char_seq_len,
                                                   gpus=gpus,
                                                   **model_params['init'])
                logger.info("training {} ...".format(model_label))
                if model_label == 'dense':
                    # The dense model takes word and character inputs.
                    x_tr = [x_train_nn, x_char_train_nn]
                    x_val = [x_val_nn, x_char_val_nn]
                    x_test = [x_test_nn, x_char_test_nn]
                else:
                    x_tr = x_train_nn
                    x_val = x_val_nn
                    x_test = x_test_nn
                train(x_tr,
                      y_train,
                      model,
                      logger=logger,
                      **model_params['train'])
                predictions[model_alias] = model.predict(x_val)
                save_predictions(test_df, model.predict(x_test), target_labels,
                                 model_alias)
            elif model_label == 'tfidf':
                model = TFIDF(target_labels, **model_params['init'])
                model.fit(x_train_tfidf, y_train, **model_params['train'])
                predictions[model_alias] = model.predict(x_val_tfidf)
                save_predictions(test_df, model.predict(x_test_tfidf),
                                 target_labels, model_alias)
            elif model_label == 'catboost':
                model = CatBoost(target_labels, **model_params['init'])
                model.fit(x_train_cb,
                          y_train,
                          eval_set=(x_val_cb, y_val),
                          use_best_model=True)
                predictions[model_alias] = model.predict_proba(x_val_cb)
                save_predictions(test_df, model.predict_proba(x_test_cb),
                                 target_labels, model_alias)
            else:
                # Fail fast instead of silently re-scoring and re-saving the
                # previous model under this entry's name.
                raise ValueError(
                    'Unknown model label in config: {}'.format(model_label))

            metrics[model_alias] = get_metrics(y_val, predictions[model_alias],
                                               target_labels)
            logger.debug('{} params:\n{}'.format(model_alias, model_params))
            logger.debug('{} metrics:\n{}'.format(
                model_alias, print_metrics(metrics[model_alias])))
            model.save(
                os.path.join(output_dir, model_params['common']['model_file']))

    logger.info('Saving metrics...')
    with open(os.path.join(output_dir, 'metrics.json'), 'w') as f:
        f.write(json.dumps(metrics))

    # ====END OF `VALIDATE`====
    if mode == 'validate':
        return True

    # Meta catboost
    logger.info('training catboost as metamodel...')

    # Validation predictions of every base model, in sorted alias order (the
    # same order used for the test columns below).
    x_meta = [predictions[alias] for alias in sorted(predictions.keys())]
    # The original read `x_train_meta` here before it was assigned.
    # NOTE(review): the transpose is kept as written; confirm the resulting
    # layout is what train_test_split / CatBoost expect for stacked
    # per-model prediction arrays.
    x_meta = np.array(x_meta).T

    x_train_meta, x_val_meta, y_train_meta, y_val_meta = train_test_split(
        x_meta, y_val, test_size=0.20, random_state=42)
    meta_model = CatBoost(target_labels,
                          loss_function='Logloss',
                          iterations=1000,
                          depth=6,
                          learning_rate=0.03,
                          rsm=1)
    meta_model.fit(x_train_meta,
                   y_train_meta,
                   eval_set=(x_val_meta, y_val_meta),
                   use_best_model=True)
    y_hat_meta = meta_model.predict_proba(x_val_meta)
    metrics_meta = get_metrics(y_val_meta, y_hat_meta, target_labels)
    # TODO: the meta-model is not persisted yet.
    logger.debug('{} metrics:\n{}'.format('META', print_metrics(metrics_meta)))

    # ====Predict====
    logger.info('Applying models...')
    # Column names follow the '<alias>_<label>' pattern.
    # NOTE(review): these columns are presumably added to test_df by
    # save_predictions -- confirm.
    test_cols = []
    for model_alias in sorted(predictions.keys()):
        for label in target_labels:
            test_cols.append('{}_{}'.format(model_alias, label))
    x_test = test_df[test_cols].values

    preds = meta_model.predict_proba(x_test)
    for i, label in enumerate(target_labels):
        test_df[label] = preds[:, i]

    # ====Normalize probabilities====
    if norm_prob:
        for label in target_labels:
            test_df[label] = norm_prob_koef * test_df[label]

    # ====Save results====
    logger.info('Saving results...')
    test_df[[
        'id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
        'identity_hate'
    ]].to_csv(result_fname, index=False, header=True)
    test_df.to_csv('{}_tmp'.format(result_fname), index=False, header=True)