Example #1
File: main.py Project: Guilherme26/mono1
def main():
    parser = argparse.ArgumentParser(description="Available Parameters:")
    parser.add_argument("--n_hidden_units", default=64, type=int)
    parser.add_argument("--n_hidden_layers", default=1, type=int)
    parser.add_argument("--train_epochs", default=100, type=int)
    parser.add_argument("--write_output", default=True, type=bool)
    args = parser.parse_args()

    torch.manual_seed(0)
    np.random.seed(0)

    profiles = pd.read_csv("../data/new_profiles_200t.csv")
    comments = pd.read_csv("../data/new_comments_200t.csv")

    comments = comments.drop_duplicates()
    profiles = preprocessing.categorical_to_numerical(profiles, col="category_1")
    all_users = set(profiles.profile_username.values)

    data = preprocessing.scale(profiles.drop(columns=["category_1", "profile_username"]).values)
    name_to_record = {name: record for name, record in zip(profiles.profile_username.values, data)}  # keep DataFrame row order so each username maps to its own scaled record

    input_dim, output_dim = data.shape[1], len(profiles.category_1.unique()) + 1
    user_to_label = {user: category for user, category in profiles[["profile_username", "category_1"]].values}

    K = 5
    skf = StratifiedKFold(n_splits=K)
    models_metrics, models_histories = defaultdict(dict), defaultdict(list)
    for kth_fold, (train_idx, test_idx) in enumerate(skf.split(profiles.profile_username.values, profiles.category_1.values), start=1):
        print("Starting {}th Fold".format(kth_fold))

        authors = profiles.profile_username.values
        username_to_index = utils.get_users_indices(authors)
        interactions = utils.get_interactions(comments, username_to_index)
        edge_index = utils.get_edge_index(interactions)
        
        x = utils.get_x(authors, name_to_record, input_dim=input_dim)
        y = utils.get_y(user_to_label, authors)

        train_mask = [i in train_idx for i in range(len(x))]
        test_mask = [i in test_idx for i in range(len(x))]
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        data = Data(x=x, y=y, edge_index=edge_index, train_mask=train_mask, test_mask=test_mask).to(device)

        assert len(x)==len(y), "Train Input and Output tensor do not have the same dimensions"

        models = utils.get_models(data.num_nodes, input_dim, output_dim, args.n_hidden_units, args.n_hidden_layers, device=device, lr=0.005)
        histories = utils.train(data, models, epochs=args.train_epochs)
        models_histories = utils.update_histories(models_histories, histories)

        current_metrics = utils.test(data, models)
        utils.update_metrics_dict(models_metrics, current_metrics)

        print('\n')
        
    models_histories = {model: list(history/K) for model, history in models_histories.items()} # Get mean traces
    models_metrics = utils.calculate_statistics(models_metrics)

    if args.write_output:
        utils.write_json("../data/results/models_metrics_{}e_{}l_{}u.json".format(args.train_epochs, args.n_hidden_layers, args.n_hidden_units), models_metrics)
        utils.write_json("../data/results/models_histories_{}e_{}l_{}u.json".format(args.train_epochs, args.n_hidden_layers, args.n_hidden_units), models_histories)
Example #2
def quantify_mood_text(mood_list):
    # quantify the textual description of the mood using AlchemyAPI sentiment analysis technique
    # Parameter:
    # mood_list : the list that contains the words describing the mood of the music
    # Returns
    # sentiment_score : the aggregated score of the sentiment extracted from the word

    # get the list of the words from the string
    get_words = []
    for words in mood_list:
        extra_words = words.split(" / ")
        for word in extra_words:
            get_words.append(word)

    sentiment_score = 0.0
    for word in set(get_words):
        response = alchemy_obj.sentiment("text", word)
        if 'docSentiment' in response:
            if 'score' in response['docSentiment']:
                a_score = response['docSentiment']['score']
                sentiment_score = sentiment_score + float(a_score)

    sentiment_score = sentiment_score / float(len(mood_list))
    # scale it into a range between 0 and 100

    sentiment_score = preprocessing.scale(sentiment_score,-1, 1, 0, 100)

    return sentiment_score
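Note: the preprocessing.scale used here (and again in Example #6) is not sklearn's; it is called as scale(value, old_min, old_max, new_min, new_max). A minimal sketch of such a linear rescaling helper, assuming that signature (the project's actual implementation is not shown):

def scale(value, old_min, old_max, new_min, new_max):
    # Linearly map value from [old_min, old_max] onto [new_min, new_max].
    if old_max == old_min:
        return new_min
    ratio = (value - old_min) / float(old_max - old_min)
    return new_min + ratio * (new_max - new_min)

# e.g. scale(0.5, -1, 1, 0, 100) -> 75.0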
Example #3
def run_experiments1(algor=None):
    X1, y1, X2, y2 = get_data()
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1,
                                                            y1,
                                                            test_size=0.2,
                                                            shuffle=True)
    dataset_phishing_websites = "Phishing Websites"

    if not algor:
        run_decision_tree_exp(X1_train, y1_train, X1_test, y1_test,
                              dataset_phishing_websites)
        run_boosting_experiment(X1_train, y1_train, X1_test, y1_test,
                                dataset_phishing_websites)
        run_nn_experiment(X1_train, y1_train, X1_test, y1_test,
                          dataset_phishing_websites)
        run_knn_experiment(X1_train, y1_train, X1_test, y1_test,
                           dataset_phishing_websites)
        X1_train_scaled = preprocessing.scale(X1_train)
        X1_train_scaled = pd.DataFrame(X1_train_scaled)
        X1_test_scaled = preprocessing.scale(X1_test)
        X1_test_scaled = pd.DataFrame(X1_test_scaled)

        run_svm_exp(X1_train_scaled, y1_train, X1_test_scaled, y1_test,
                    dataset_phishing_websites)
    else:
        if algor == 'dt':
            run_decision_tree_exp(X1_train, y1_train, X1_test, y1_test,
                                  dataset_phishing_websites)
        elif algor == 'boosting':
            run_boosting_experiment(X1_train, y1_train, X1_test, y1_test,
                                    dataset_phishing_websites)
        elif algor == 'nn':
            run_nn_experiment(X1_train, y1_train, X1_test, y1_test,
                              dataset_phishing_websites)
        elif algor == 'knn':
            run_knn_experiment(X1_train, y1_train, X1_test, y1_test,
                               dataset_phishing_websites)
        elif algor == 'svm':
            X1_train_scaled = preprocessing.scale(X1_train)
            X1_train_scaled = pd.DataFrame(X1_train_scaled)
            X1_test_scaled = preprocessing.scale(X1_test)
            X1_test_scaled = pd.DataFrame(X1_test_scaled)

            run_svm_exp(X1_train_scaled, y1_train, X1_test_scaled, y1_test,
                        dataset_phishing_websites)
        else:
            raise ValueError(algor + ' does not exist')
Example #4
def run_experiments2(algor=None):
    X1, y1, X2, y2 = get_data()
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2,
                                                            y2,
                                                            test_size=0.2,
                                                            shuffle=True)
    print(X2_train.shape)
    dataset_madelon = "Madelon"
    if not algor:
        run_decision_tree_exp(X2_train, y2_train, X2_test, y2_test,
                              dataset_madelon)
        run_boosting_experiment(X2_train, y2_train, X2_test, y2_test,
                                dataset_madelon)
        run_nn_experiment(X2_train, y2_train, X2_test, y2_test,
                          dataset_madelon)
        run_knn_experiment(X2_train, y2_train, X2_test, y2_test,
                           dataset_madelon)
        X2_train_scaled = preprocessing.scale(X2_train)
        X2_train_scaled = pd.DataFrame(X2_train_scaled)
        X2_test_scaled = preprocessing.scale(X2_test)
        X2_test_scaled = pd.DataFrame(X2_test_scaled)
        run_svm_exp(X2_train_scaled, y2_train, X2_test_scaled, y2_test,
                    dataset_madelon)
    else:
        if algor == 'dt':
            run_decision_tree_exp(X2_train, y2_train, X2_test, y2_test,
                                  dataset_madelon)
        elif algor == 'boosting':
            run_boosting_experiment(X2_train, y2_train, X2_test, y2_test,
                                    dataset_madelon)
        elif algor == 'nn':
            run_nn_experiment(X2_train, y2_train, X2_test, y2_test,
                              dataset_madelon)
        elif algor == 'knn':
            run_knn_experiment(X2_train, y2_train, X2_test, y2_test,
                               dataset_madelon)
        elif algor == 'svm':
            X2_train_scaled = preprocessing.scale(X2_train)
            X2_train_scaled = pd.DataFrame(X2_train_scaled)
            X2_test_scaled = preprocessing.scale(X2_test)
            X2_test_scaled = pd.DataFrame(X2_test_scaled)

            run_svm_exp(X2_train_scaled, y2_train, X2_test_scaled, y2_test,
                        dataset_madelon)
        else:
            raise ValueError(algor + ' does not exist')
Example #5
    def pre_process(self):
        if K.image_data_format() == 'channels_first':
            x_train = self.x_train.reshape(self.x_train.shape[0], 1,
                                           self.img_rows, self.img_cols)
            x_val = self.x_val.reshape(self.x_val.shape[0], 1, self.img_rows,
                                       self.img_cols)
            x_test = self.x_test.reshape(self.x_test.shape[0], 1,
                                         self.img_rows, self.img_cols)
            input_shape = (1, self.img_rows, self.img_cols)
        else:
            x_train = self.x_train.reshape(self.x_train.shape[0],
                                           self.img_rows, self.img_cols, 1)
            x_val = self.x_val.reshape(self.x_val.shape[0], self.img_rows,
                                       self.img_cols, 1)
            x_test = self.x_test.reshape(self.x_test.shape[0], self.img_rows,
                                         self.img_cols, 1)
            input_shape = (self.img_rows, self.img_cols, 1)
        x_train = x_train.astype('float32')
        x_val = x_val.astype('float32')
        x_test = x_test.astype('float32')
        x_train, X_min, X_max = scale(x_train, 0, 255)
        x_val, _, _ = scale(x_val, 0, 255, X_min=X_min, X_max=X_max)
        x_test, _, _ = scale(x_test, 0, 255, X_min=X_min, X_max=X_max)
        x_train /= 255
        x_val /= 255
        x_test /= 255
        # convert class vectors to binary class matrices
        f = False
        if f:
            i = 0
            for row in x_train:
                x_train[i, :] = preprocess_input(row)
                i = i + 1
            i = 0
            for row in x_val:
                x_val[i, :] = preprocess_input(row)
                i = i + 1
            i = 0
            for row in x_test:
                x_test[i, :] = preprocess_input(row)
                i = i + 1
        self.y_train = keras.utils.to_categorical(self.y_train,
                                                  self.num_classes)
        self.y_val = keras.utils.to_categorical(self.y_val, self.num_classes)
        self.y_test = keras.utils.to_categorical(self.y_test, self.num_classes)

        return  #x_train, y_train, x_val, y_val, x_test, y_test, input_shape
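Note: the scale helper used above returns the scaled array together with the X_min/X_max it computed, so the training-set bounds can be reused on the validation and test splits. A minimal sketch consistent with that call pattern (an assumption, not the project's code; X is expected to be a NumPy array):

def scale(X, new_min, new_max, X_min=None, X_max=None):
    # Min-max scale X into [new_min, new_max]; reuse X_min/X_max when provided.
    if X_min is None:
        X_min, X_max = X.min(), X.max()
    X_scaled = (X - X_min) / (X_max - X_min) * (new_max - new_min) + new_min
    return X_scaled, X_min, X_max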
Example #6
def predict_emotion_using_AV_model(data_df):
    # using Russell's Arousal-Valence model, estimate the angular quantity that represents
    # the predicted sentiment
    # In Russell's model, caloriesBurned is the horizontal axis (pleasure-displeasure)
    # while beats per minute (bpm) represents the vertical axis (sleepiness-arousal)

    # Parameter:
    # data_df : the data frame to be tested upon; it must have exactly two columns.
    # Returns
    # predicted_emotions : the textual description that displays the predicted emotion

    # scale x and y to the range of -1 and 1 so that they can be applied to the AV model coordinates
    min_value_0 = min(data_df.iloc[:,0])
    max_value_0 = max(data_df.iloc[:,0])
    data_df.iloc[:,0] = data_df.iloc[:,0].apply(lambda x: preprocessing.scale(x,min_value_0, max_value_0, -1.0,1.0))
    min_value_1 = min(data_df.iloc[:,1])
    max_value_1 = max(data_df.iloc[:,1])
    data_df.iloc[:,1] = data_df.iloc[:,1].apply(lambda x: preprocessing.scale(x,min_value_1, max_value_1, -1.0,1.0))

    return data_df.apply(lambda x: calculate_angle(x), axis=1)
Example #7
    def preprocess_data(self, x, y=None):
        '''Prepare the data for the neural network.

            - Remove 0's from the time channels
            - Center the data on 0
            - Scale it to have a standard deviation of 1'''
        std = 1
        preprocessing.fix_time_zeros(x)
        means = preprocessing.center(x)
        stds = preprocessing.scale(x, std, mode='standardize')
        def repeat_transformation(other):
            if len(other) == 0:
                return
            else:
                preprocessing.fix_time_zeros(other)
                other -= means
                other /= stds/std
        return repeat_transformation
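Note: from the calls above, preprocessing.center appears to subtract the per-column means in place and return them, while preprocessing.scale(x, std, mode='standardize') divides by the per-column standard deviations (relative to the target std) and returns them, so the same transformation can be replayed on other splits. A minimal sketch under those assumptions, covering only the 'standardize' mode used here (this is not the project's actual module):

def center(x):
    # x is assumed to be a float NumPy array; subtract column means in place.
    means = x.mean(axis=0)
    x -= means
    return means

def scale(x, std=1, mode='standardize'):
    # Divide x in place so each column has standard deviation `std`; return the raw stds.
    stds = x.std(axis=0)
    x /= stds / std
    return stds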
Example #8
    def preprocess_data(self, x, y=None):
        '''Prepare the data for the neural network.

            - Remove 0's from the time channels
            - Center the data on 0
            - Scale it to have a standard deviation of 1'''
        std = 1
        preprocessing.fix_time_zeros(x)
        means = preprocessing.center(x)
        stds = preprocessing.scale(x, std, mode='standardize')

        def repeat_transformation(other):
            if len(other) == 0:
                return
            else:
                preprocessing.fix_time_zeros(other)
                other -= means
                other /= stds / std

        return repeat_transformation
Example #9
def main(model_type, t_spread_min, t_spread_max, ell_spread_min, ell_spread_max, n, n_test, n_epochs, data_dir):
    # Generate data
    feat, y, _, _ = pendulum(n=n, t_spread=[t_spread_min, t_spread_max], ell_spread=[ell_spread_min, ell_spread_max])

    # Set up data
    x_train, x_val, y_train, y_val = train_test_split(feat, y, test_size=val_proportion, random_state=42)
    x_scaler, x_train, x_val = scale(x_train, x_val)
    y_scaler, y_train, y_val = scale(y_train, y_val)

    t_range_str = f'trange{int(100*t_spread_min)}to{int(100*t_spread_max)}'
    model_name = f'{model_type}_{t_range_str}_{n_epochs}ep'

    os.makedirs(data_dir, exist_ok=True)

    if not os.path.isfile(f'{data_dir}x_scaler_{t_range_str}.pkl'):
        with open(f'{data_dir}x_scaler_{t_range_str}.pkl', 'wb') as file_pi:
            pickle.dump(x_scaler, file_pi)
        with open(f'{data_dir}y_scaler_{t_range_str}.pkl', 'wb') as file_pi:
            pickle.dump(y_scaler, file_pi)

    # train and save models
    model_number = 1
    while os.path.isfile(f'{data_dir}model_{model_name}_{str(model_number).zfill(3)}.h5'):
        model_number += 1

    if model_type == 'de':
        models = [mlp(loss='nll') for _ in range(n_models)]
    elif model_type == 'cd':
        n_features = x_train.shape[1]
        n_outputs = y_train.shape[1]
        dropout_reg = 2. / n
        models = [make_model(n_features, n_outputs, n_neurons, dropout_reg)]
    elif model_type == 'bnn':
        models = [mlp_flipout()]
    else:
        raise ValueError(f'Model type {model_type} not recognized!')

    for j, mod in enumerate(models):
        print(f'Model {j+1}')
        history = mod.fit(x_train, y_train, epochs=n_epochs, validation_data=(x_val, y_val))
        mod.save_weights(f'{data_dir}model_{model_name}_{str(model_number+j).zfill(3)}.h5')
        with open(f'{data_dir}history_{model_name}_{str(model_number+j).zfill(3)}.pkl', 'wb') as file_pi:
            pickle.dump(history.history, file_pi)

    # Generate test set
    feat_test, _, _, _ = pendulum(n=n_test, t_spread=[t_spread_min, t_spread_max],
                                  ell_spread=[ell_spread_min, ell_spread_max], seed=666)
    feat_test = x_scaler.transform(feat_test)

    # make predictions
    if model_type == 'de':
        y_pred = []
        for model in models:
            y_pred.append(model(feat_test.astype('float32')))
    elif model_type == 'cd':
        y_pred = np.array([models[0].predict(feat_test) for _ in range(n_models)])
    elif model_type == 'bnn':
        y_pred = [models[0](feat_test.astype('float32')) for _ in range(n_models)]

    if model_type == 'de' or model_type == 'bnn':
        y_pred_val = [pred.loc.numpy() for pred in y_pred]
        y_pred_unc = [pred.scale.numpy() for pred in y_pred]
    elif model_type == 'cd':
        y_pred_val = y_pred[:, :, :1]
        y_pred_unc = np.sqrt(np.exp(y_pred[:, :, 1:]))

    y_pred_val_resc = [y_scaler.inverse_transform(y) for y in y_pred_val]
    y_pred_unc_resc = [y / y_scaler.scale_[0] for y in y_pred_unc]

    y_pred_val_resc = np.array(y_pred_val_resc).reshape((n_models, n_test))
    y_pred_unc_resc = np.array(y_pred_unc_resc).reshape((n_models, n_test))

    y_pred_mean = np.mean(y_pred_val_resc, axis=0)
    y_pred_ep_unc = np.std(y_pred_val_resc, axis=0)
    y_pred_al_unc = np.sqrt(np.mean(y_pred_unc_resc * y_pred_unc_resc, axis=0))
    y_pred_unc = np.sqrt(y_pred_al_unc ** 2 + y_pred_ep_unc ** 2)

    np.save(f'{data_dir}y_pred_test_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_mean)
    np.save(f'{data_dir}y_pred_test_alunc_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_al_unc)
    np.save(f'{data_dir}y_pred_test_epunc_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_ep_unc)
    np.save(f'{data_dir}y_pred_test_prunc_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_unc)
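Note: scale(a, b) here returns a fitted scaler plus both transformed arrays; the scaler is later pickled, applied to the test features, and its inverse_transform and scale_ attribute are used on predictions, which matches sklearn's StandardScaler interface. A minimal sketch consistent with that usage (an assumption about the helper, not its actual source):

from sklearn.preprocessing import StandardScaler

def scale(train, val):
    # Fit on the training split only, transform both splits, and return the
    # fitted scaler so it can be pickled and reused on the test set.
    scaler = StandardScaler()
    return scaler, scaler.fit_transform(train), scaler.transform(val)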
Example #10
def Pipeline(X_train, y_train, X_test, n_dims=44):
    id_train = np.array(X_train["id"])
    X_train = X_train.drop(columns=["id"])
    id_test = np.array(X_test["id"])
    X_test = X_test.drop(columns=["id"])

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)

    ind_numeric = []
    for i in range(len(X_train[0])):
        if len(np.unique(X_train[:, i])) > 2:
            ind_numeric.append(i)

    print("Hay " + str(len(ind_numeric)) + " variables numericas")
    '''
    ind_delete = np.where(y_train=="functional needs repair")[0]
    y_train = np.delete(y_train, ind_delete, axis=0)
    X_train = np.delete(X_train, ind_delete, axis=0)
    '''

    #plotData(X_train, y_train, "raw")

    print("Scaling data...")
    X_train = preprocessing.scale(X_train)
    X_test = preprocessing.scale(X_test)

    #plotData(X_train, y_train, "scaled")

    print("PCA con " + str(n_dims) + " componentes...")
    X_train_binary = np.delete(X_train, ind_numeric, axis=1)
    X_test_binary = np.delete(X_test, ind_numeric, axis=1)
    X_train_numeric = X_train[:, ind_numeric]
    X_test_numeric = X_test[:, ind_numeric]
    pca = PCA(n_components=n_dims)
    #pca = KernelPCA(n_components=n_dims, kernel="linear", n_jobs=-1)
    X1 = pca.fit_transform(X_train_binary)
    X2 = pca.transform(X_test_binary)
    X_train = np.hstack((X_train_numeric, X1))
    X_test = np.hstack((X_test_numeric, X2))
    print("Numero de features: " + str(len(X_train[0])))

    #plotData(X_train, y_train, "PCA")
    '''
    print("Reduccion de dimensionalidad con AutoEncoder...")
    hid = [50,60,50]
    X_train, X_test = autoencoder.fitTransform(X_train, X_test, 50, hid, bsize=32)
    print("Numero de features: " + str(len(X_train[0])))
    '''
    '''
    print("Reduccion de dimensionalidad con AutoEncoder...")
    hid = [250,200,150,100,50]
    X_train_binary = np.delete(X_train, ind_numeric, axis=1)
    X_test_binary = np.delete(X_test, ind_numeric, axis=1)
    X_train_numeric = X_train[:,ind_numeric]
    X_test_numeric = X_test[:,ind_numeric]
    X1, X2 = autoencoder.fitTransform(X_train_binary, X_test_binary, 30, hid, bsize=32)
    X_train = np.hstack((X_train_numeric, X1))
    X_test = np.hstack((X_test_numeric, X2))
    print("Numero de features: " + str(len(X_train[0])))
    '''

    print("IPF...")
    X_train, y_train = IPF(X_train, y_train)
    print("Numero de instancias: " + str(len(X_train)))
    print("Instancias por clase:")
    print(np.unique(y_train, return_counts=True))

    #plotData(X_train, y_train, "IPF")
    '''
    print("Denoising autoencoder...")
    hid = [32,16,32]
    X_train, X_test = autoencoder_denoising.fitTransform(X_train, X_test, 250, hid, bsize=32, kreg=None, areg=None)
    '''
    '''
    print("AllKNN...")
    X_train, y_train = AllKNN(n_neighbors=7, n_jobs=8).fit_resample(X_train, y_train)
    print("Numero de instancias: " + str(len(X_train)))
    print("Instancias por clase:")
    print(np.unique(y_train,return_counts=True))
    '''
    '''
    print("Feature selection...")
    feature_selector = SelectKBest(f_classif, k="all").fit(X_train, y_train)
    X_train = feature_selector.transform(X_train)
    X_test = feature_selector.transform(X_test)
    print("Numero de features: " + str(len(X_train[0])))
    '''

    print("SMOTE...")
    X_train, y_train = SMOTE(sampling_strategy={
        "functional needs repair": 7500,
        "non functional": 22000
    },
                             random_state=123456789,
                             n_jobs=20,
                             k_neighbors=7).fit_resample(X_train, y_train)
    print("Numero de instancias: " + str(len(X_train)))
    print("Instancias por clase:")
    print(np.unique(y_train, return_counts=True))

    #plotData(X_train, y_train, "SMOTE")
    '''
    print("ADASYN...")
    X_train,y_train = ADASYN(sampling_strategy = {"functional needs repair": 5000, "non functional": 22500}, random_state=123456789, n_jobs=8, n_neighbors=7).fit_resample(X_train,y_train)
    print("Numero de instancias: " + str(len(X_train)))
    print("Instancias por clase:")
    print(np.unique(y_train,return_counts=True))
    '''

    print("Cleaning anomalies...")
    ind_functional = np.where(y_train == "functional")[0]
    ind_non_functional = np.where(y_train == "non functional")[0]
    ind_functional_repair = np.where(y_train == "functional needs repair")[0]
    X1, y1 = cleanAnomalies(X_train[ind_functional], y_train[ind_functional])
    X2, y2 = cleanAnomalies(X_train[ind_non_functional],
                            y_train[ind_non_functional])
    X3, y3 = cleanAnomalies(X_train[ind_functional_repair],
                            y_train[ind_functional_repair])
    X_train = np.concatenate((X1, X2), axis=0)
    X_train = np.concatenate((X_train, X3), axis=0)
    y_train = np.concatenate((y1, y2), axis=0)
    y_train = np.concatenate((y_train, y3), axis=0)
    print("Instancias por clase:")
    print(np.unique(y_train, return_counts=True))

    #plotData(X_train, y_train, "anomalias_knn")
    '''
    print("EditedNearestNeighbours...")
    X_train, y_train = EditedNearestNeighbours(sampling_strategy="not minority", n_neighbors=15, n_jobs=20, kind_sel="mode").fit_resample(X_train, y_train)
    print("Numero de instancias: " + str(len(X_train)))
    print("Instancias por clase:")
    print(np.unique(y_train,return_counts=True))
    '''
    '''
    print("SSMA...")
    selector = SSMA(n_neighbors=1, alpha=0.95, max_loop=10, initial_density=0.9).fit(X_train,y_train)
    X_train = selector.X_
    y_train = selector.y_
    print("Numero de instancias: " + str(len(X_train)))
    print("Instancias por clase:")
    print(np.unique(y_train,return_counts=True))
    '''
    '''
    print("Generando la métrica con DML...")
    train_set, _, train_labels, _ = train_test_split(X_train, y_train, train_size=0.5, random_state=123456789)
    print("Tamaño del conjunto original: " + str(len(X_train)) + ", tamaño del train: " + str(len(train_set)))
    dml = KLMNN().fit(train_set, train_labels)
    X_train = dml.transform(X_train)
    X_test = dml.transform(X_test)
    '''

    return X_train, y_train, id_train, X_test, id_test
Example #11
def main(index_exp, index_split):
    
    faulthandler.enable()
    torch.cuda.empty_cache()
    
    best_error = 100
    lr_step = [40, 70, 120]
    multiframe = ['convlstm', 'convfc']
    dirName = '%s_data%d_%s_%s_%s'%(args.model_name, args.data_cate, args.augmentation, args.loss_type, args.file_name)
    fileName = '%s_split%d_exp%d'%(dirName, index_split, index_exp)
    
    # Create folder for results of this model
    if not os.path.exists('./results/%s'%(dirName)):
        os.makedirs('./results/%s'%(dirName))
    
    # ------------- Wrap up dataloader -----------------
    if args.input_type == 'signal':
        X, Y_reg, C = raw_dataloader.read_data([1,2,3], list(range(11)), channel_limit=21, rm_baseline=True)
        num_channel = X.shape[1]
        num_feature = X.shape[2]     # Number of time sample
        
        # Remove trials
        X, Y_reg = preprocessing.remove_trials(X, Y_reg, threshold=60)
        
        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(X, Y_reg, test_size=0.1, random_state=23)
            # Random state 15: training error becomes lower, testing error becomes higher
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                if i == index_exp:
                    train_data, train_target = X[train_index, :], Y_reg[train_index]
                    test_data, test_target = X[test_index, :], Y_reg[test_index]
                    
        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(train_data)):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], train_target[split_index]
                '''
        # Normalize the data
        if args.normalize:
            train_data, test_data = preprocessing.normalize(train_data, test_data)
        
                    
        # Data augmentation
        if args.augmentation == 'overlapping':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation,
                                                             (256, 64, 128))
            test_data, test_target = data_augmentation.aug(test_data, test_target, args.augmentation,
                                                             (256, 64, 128))
        elif args.augmentation == 'add_noise':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation,
                                                             (30, 1))
        elif args.augmentation == 'add_noise_minority':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation,
                                                             (30, 1))
        elif args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation)
            
        # scale data
        if args.scale_data:
            train_data, test_data = train_data.reshape((train_data.shape[0],-1)), test_data.reshape((test_data.shape[0],-1))
            train_data, test_data = preprocessing.scale(train_data, test_data)
            train_data = train_data.reshape((train_data.shape[0],num_channel, -1))
            test_data = test_data.reshape((test_data.shape[0],num_channel, -1))
            
        if args.model_name in ['eegnet', 'eegnet_trans_signal']:
            # (sample, channel, time) -> (sample, channel_NN, channel_EEG, time)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_feature)) \
                                       for X in [train_data, test_data]]
        
        
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
                torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset,test_dataset] = map(\
                Data.TensorDataset, [train_dataTS.float(),test_dataTS.float()], [train_targetTS.float(),test_targetTS.float()])

        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        
        model_param = [train_data.shape]
        
    elif args.input_type == 'power':
        if args.data_cate == 1:
            ERSP_all, tmp_all, freqs = dataloader.load_data()
        elif args.data_cate == 2:
            data_file = './raw_data/ERSP_from_raw_%d_channel21.data'%(args.index_sub)
            with open(data_file, 'rb') as fp:
                dict_ERSP = pickle.load(fp)
            ERSP_all, tmp_all = dict_ERSP['ERSP'], dict_ERSP['SLs']
        num_channel = ERSP_all.shape[1]
        num_freq = ERSP_all.shape[2]
            
        # Remove trials
        ERSP_all, tmp_all = preprocessing.remove_trials(ERSP_all, tmp_all, threshold=60)
        
        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(ERSP_all, tmp_all[:,2], test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(ERSP_all)):
                if i == index_exp:
                    train_data, test_data = ERSP_all[train_index, :], ERSP_all[test_index, :]
                    if args.data_cate == 2:
                        train_target, test_target = tmp_all[train_index], tmp_all[test_index]
                    else:
                        train_target, test_target = tmp_all[train_index, 2], tmp_all[test_index, 2]
                        
                    if args.add_CE:
                        assert args.data_cate == 2
                        with open('./raw_data/CE_sub%d'%(args.index_sub), 'rb') as fp:
                            CE = pickle.load(fp)
                        CE_train, CE_test = CE[train_index,:], CE[test_index,:]
                        # PCA for CE
                        pca = PCA(n_components=10)
                        pca.fit(CE_train)
                        CE_train, CE_test = pca.transform(CE_train), pca.transform(CE_test)
                        
                    
        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(np.arange(len(train_data)))):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], train_target[split_index]
                '''
                    
        # Concatenate train and test for standardizing
        data = np.concatenate((train_data, test_data), axis=0)
        target = np.concatenate((train_target, test_target))
                    
        # Standardize data
        num_train = len(train_data)
        data, target = preprocessing.standardize(data, target, train_indices = np.arange(num_train), threshold=0.0)
        data = data.reshape((data.shape[0], -1))
        
        # Scale target between 0 and 1
        if args.post_scale:
            print('Scale the target between 0-1')
            target = target/60
        
        # Split data
        train_data, test_data = data[:num_train, :], data[num_train:, :]
        train_target, test_target = target[:num_train], target[num_train:]
        
        # Data augmentation
        if args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(train_data, train_target, args.augmentation)
        
        # center data
        if args.center_flag:
            train_data, test_data = preprocessing.center(train_data, test_data)
            
        # scale data
        if args.scale_data:
            train_data, test_data = preprocessing.scale(train_data, test_data)
            
        # Add conditional entropy
        if args.add_CE:
            train_data = np.concatenate((train_data, CE_train), axis=1)
            test_data = np.concatenate((test_data, CE_test), axis=1)
            
        if args.model_name == 'eegnet_trans_power':
            # (sample, channel, freq) -> (sample, channel_NN, channel_EEG, freq)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_freq)) \
                                       for X in [train_data, test_data]]
        
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
                torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset,test_dataset] = map(\
                Data.TensorDataset, [train_dataTS.float(),test_dataTS.float()], [train_targetTS.float(),test_targetTS.float()])

        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        
        model_param = [train_data.shape]
        
    elif args.input_type == 'image':
        
        if args.ensemble:
            input_model_name = args.pre_model_name
        else:
            input_model_name = args.model_name
        
        assert (input_model_name in multiframe) == (args.num_time>1)
        
        # Let input size be 224x224 if the model is vgg16
        if input_model_name in ['vgg16', 'resnet50']:
            input_size = 224
        else:
            input_size = 64
            
        # Load Data
        data_transforms = {
                'train': transforms.Compose([
                        ndl.Rescale(input_size, args.num_time),
                        ndl.ToTensor(args.num_time)]), 
                'test': transforms.Compose([
                        ndl.Rescale(input_size, args.num_time),
                        ndl.ToTensor(args.num_time)])
                }

        print("Initializing Datasets and Dataloaders...")

        # Create training and testing datasets
        # image_datasets = {x: ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
        #                 scale=args.scale_image, index_exp=index_exp, index_split=index_split) for x in ['train', 'test']}
        [train_dataset,test_dataset] = [ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
                        scale=args.scale_image, index_exp=index_exp, index_split=index_split) for x in ['train', 'test']]

        # Create training and testing dataloaders
        # if not args.str_sampling:
        #     train_loader = Data.DataLoader(image_datasets['train'], batch_size=args.batch_size, shuffle=True, num_workers=4)
        # test_loader = Data.DataLoader(image_datasets['test'], batch_size=args.batch_size, shuffle=False, num_workers=4)
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size, shuffle=False, num_workers=4)
        model_param = [input_size]
        
    elif args.input_type == 'EEGLearn_img':
        
        # Load data
        with open('./EEGLearn_imgs/data1.data', 'rb') as fp:
            dict_data = pickle.load(fp)
        data, target = dict_data['data'], dict_data['target']
        input_size = data.shape[2]
        
        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(data, target, test_size=0.1, random_state=23)
            # Random state 15: training error becomes lower, testing error becomes higher
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(data)):
                if i == index_exp:
                    train_data, train_target = data[train_index, :], target[train_index]
                    test_data, test_target = data[test_index, :], target[test_index]
        
        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
                torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset,test_dataset] = map(\
                Data.TensorDataset, [train_dataTS.float(),test_dataTS.float()], [train_targetTS.float(),test_targetTS.float()])

        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
        test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        
        
    # ------------ Create model ---------------
    if args.input_type in ['image','EEGLearn_img']:
        model_param = [input_size]
    else:
        model_param = [train_data.shape]
    
    if not args.ensemble:
        model = read_model(args.model_name, model_param)
    else:
        pre_models = []
        for i in range(args.num_split):
            pre_model = read_model(args.pre_model_name, model_param)
            pre_model.load_state_dict( torch.load('%s/last_model_exp%d_split%d.pt'%(args.ensemble, index_exp, i)) )
            set_parameter_requires_grad(pre_model, True)
            pre_models.append(pre_model)
            
        model = models.__dict__[args.model_name](pre_models)
        
    print('Use model %s'%(args.model_name))
        
    # Run on GPU
    model = model.to(device=device)
    
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # define loss function (criterion) and optimizer
    if args.loss_type == 'L2':
        criterion = nn.MSELoss().to(device=device)
    elif args.loss_type == 'L1':
        criterion = nn.L1Loss().to(device=device)
    elif args.loss_type == 'L4':
        criterion = L4Loss
    elif args.loss_type == 'MyLoss':
        criterion = MyLoss
    print('Use %s loss'%(args.loss_type))
    
    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_rate,momentum=0.9)
    #optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_rate)
    
    # Record loss and accuracy of each epoch
    dict_error = {'train_std': list(range(args.num_epoch)), 'test_std': list(range(args.num_epoch)),
                  'train_mape': list(range(args.num_epoch)), 'test_mape': list(range(args.num_epoch))}
    
    # optionally evaluate the trained model
    if args.evaluate:
        if args.resume:
            if os.path.isfile(args.resume):
                model.load_state_dict(torch.load(args.resume))
        
        _, target, pred, _, _ = validate(test_loader, model, criterion)
        plot_scatter(target, pred, dirName, fileName)
        return 0
    
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_error = checkpoint['best_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            dict_error['train_std'][:args.start_epoch] = checkpoint['dict_error']['train_std']
            dict_error['test_std'][:args.start_epoch] = checkpoint['dict_error']['test_std']
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))
    
    # ------------- Train model ------------------

    for epoch in range(args.start_epoch, args.num_epoch):
        # Create dataloader if using stratified sampler
        if args.str_sampling:
            sampler = SubsetRandomSampler(get_indices_RSS(train_target, int(0.5*len(train_target))))
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size, \
                                           sampler=sampler, num_workers=4)
            
        # Learning rate decay
        if epoch in lr_step:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1
        
        # train for one epoch
        _, dict_error['train_std'][epoch], dict_error['train_mape'][epoch] = \
            train(train_loader, model, criterion, optimizer, epoch)

        # evaluate on validation set
        _, _, _, std_error, dict_error['test_mape'][epoch] = validate(test_loader, model, criterion)
        dict_error['test_std'][epoch] = std_error

        # remember best standard error and save checkpoint
        is_best = std_error < best_error
        best_error = min(std_error, best_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_error': best_error,
            'optimizer': optimizer.state_dict(),
            'dict_error': dict_error
        }, is_best)
        
        # Save best model
        if is_best:
            torch.save(model.state_dict(), './results/%s/best_model_exp%d_split%d.pt'%(dirName, index_exp, index_split))
        if epoch == args.num_epoch-1:
            torch.save(model.state_dict(), './results/%s/last_model_exp%d_split%d.pt'%(dirName, index_exp, index_split))
    # Plot error curve
    plot_error(dict_error, dirName, fileName)
    
    # Plot scatter plots
    _, target, pred, _, _ = validate(test_loader, model, criterion)
    plot_scatter(target, pred, dirName, fileName)
    dict_error['target'], dict_error['pred'] = target, pred
    
    # Plot histogram
    import matplotlib.pyplot as plt
    plt.hist(target, label = 'True')
    plt.hist(pred, label = 'Pred')
    plt.legend(loc='upper right')
    plt.savefig('./results/hist.png')
    
    # Save error over epochs
    with open('./results/%s/%s.data'%(dirName, fileName), 'wb') as fp:
        pickle.dump(dict_error, fp)
Example #12
    def get(self):
        # global var
        # global test
        # var += 1
        # test='/test'+str(var)
        # args = request.args
        # print(var)

        company = request.args.get('company')
        compare = request.args.get('compare')
        start = request.args.get('start')
        end = request.args.get('end')

        df = yf.download(company, start, end)

        close_px = df['Adj Close']
        mavg = close_px.rolling(window=100).mean()

        print(mavg)

        print(df.head())

        print(df.tail())

        import matplotlib.pyplot as plt
        from matplotlib import style

        # Adjusting the size of matplotlib
        import matplotlib as mpl
        mpl.rc('figure', figsize=(8, 7))
        mpl.__version__

        # Adjusting the style of matplotlib
        style.use('ggplot')

        close_px.plot(label=company)
        mavg.plot(label='mavg')
        plt.legend()

        plt.savefig('mavg.png', bbox_inches='tight')

        plt.clf()
        # plt.show()

        rets = close_px / close_px.shift(1) - 1
        rets.plot(label='return')
        plt.savefig('return.png', bbox_inches='tight')
        plt.clf()

        # plt.show()

        dfcomp = yf.download(['AAPL', 'GE', 'GOOG', 'IBM', 'MSFT'], start,
                             end)['Adj Close']

        print(dfcomp.tail())

        retscomp = dfcomp.pct_change()

        corr = retscomp.corr()
        print("hi")

        # cols = [col for col in retscomp.columns if compare in col]
        # print(retscomp[cols])
        print(corr)

        plt.scatter(retscomp[company], retscomp[compare])
        plt.xlabel('Returns-' + company)
        plt.ylabel('Returns-' + compare)

        plt.savefig('compare.png', bbox_inches='tight')
        plt.clf()

        # plt.show()

        #   Error
        #pd.scatter_matrix(retscomp, diagonal='kde', figsize=(10, 10));

        plt.imshow(corr, cmap='hot', interpolation='none')
        plt.colorbar()
        plt.xticks(range(len(corr)), corr.columns)
        plt.yticks(range(len(corr)), corr.columns)

        # plt.show()
        plt.savefig('correlation.png', bbox_inches='tight')
        plt.clf()

        plt.scatter(retscomp.mean(), retscomp.std())
        plt.xlabel('Expected returns')
        plt.ylabel('Risk')
        for label, x, y in zip(retscomp.columns, retscomp.mean(),
                               retscomp.std()):
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(20, -20),
                         textcoords='offset points',
                         ha='right',
                         va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5',
                                   fc='yellow',
                                   alpha=0.5),
                         arrowprops=dict(arrowstyle='->',
                                         connectionstyle='arc3,rad=0'))

        # plt.show()
        plt.savefig('risk-ret-rate.png', bbox_inches='tight')
        plt.clf()

        dfreg = df.loc[:, ['Adj Close', 'Volume']]

        a = df['High'] - df['Close']
        print(a)

        dfreg['HL_PCT'] = a / df['Close'] * 100.0
        print("yo --- yo ")

        print(dfreg['HL_PCT'])

        print(df['Close'])
        print(df['Open'])

        b = df['Close'] - df['Open']

        dfreg['PCT_change'] = b / df['Open'] * 100.0

        import math
        import numpy as np
        from sklearn import preprocessing, svm
        from sklearn.model_selection import train_test_split

        # Drop missing value
        dfreg.fillna(value=-99999, inplace=True)

        print(dfreg.shape)
        # We want to separate 1 percent of the data to forecast
        forecast_out = int(math.ceil(0.01 * len(dfreg)))

        # Separating the label here, we want to predict the AdjClose
        forecast_col = 'Adj Close'
        dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
        X = np.array(dfreg.drop(columns=['label']))

        # Scale the X so that everyone can have the same distribution for linear regression
        X = preprocessing.scale(X)

        # Finally We want to find Data Series of late X and early X (train) for model generation and evaluation
        X_lately = X[-forecast_out:]
        X = X[:-forecast_out]

        # Separate label and identify it as y
        y = np.array(dfreg['label'])
        y = y[:-forecast_out]

        print('Dimension of X', X.shape)
        print('Dimension of y', y.shape)

        # Separation of training and testing of model by cross validation train test split
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)

        from sklearn.linear_model import LinearRegression
        from sklearn.neighbors import KNeighborsRegressor

        from sklearn.linear_model import Ridge
        from sklearn.preprocessing import PolynomialFeatures
        from sklearn.pipeline import make_pipeline

        # Linear regression
        clfreg = LinearRegression(n_jobs=-1)
        clfreg.fit(X_train, y_train)

        # Quadratic Regression 2
        clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
        clfpoly2.fit(X_train, y_train)

        # Quadratic Regression 3
        clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
        clfpoly3.fit(X_train, y_train)

        # KNN Regression
        clfknn = KNeighborsRegressor(n_neighbors=2)
        clfknn.fit(X_train, y_train)

        confidencereg = clfreg.score(X_test, y_test)
        confidencepoly2 = clfpoly2.score(X_test, y_test)
        confidencepoly3 = clfpoly3.score(X_test, y_test)
        confidenceknn = clfknn.score(X_test, y_test)

        print("The linear regression confidence is ", confidencereg)
        print("The quadratic regression 2 confidence is ", confidencepoly2)
        print("The quadratic regression 3 confidence is ", confidencepoly3)
        print("The knn regression confidence is ", confidenceknn)

        # Printing the forecast
        forecast_set = clfreg.predict(X_lately)
        dfreg['Forecast'] = np.nan
        print(forecast_set, confidencereg, forecast_out)

        last_date = dfreg.iloc[-1].name
        last_unix = last_date
        next_unix = last_unix + datetime.timedelta(days=1)

        for i in forecast_set:
            next_date = next_unix
            next_unix += datetime.timedelta(days=1)
            dfreg.loc[next_date] = [
                np.nan for _ in range(len(dfreg.columns) - 1)
            ] + [i]

        dfreg['Adj Close'].tail(500).plot()
        dfreg['Forecast'].tail(500).plot()
        plt.legend(loc=4)
        plt.xlabel('Date')
        plt.ylabel('Price')
        # plt.show()
        plt.savefig('forecast.png', bbox_inches='tight')
        plt.clf()

        from scipy.stats import norm

        # data = yf.download("AAPL", start = '2012-01-01', end='2017-01-01')['Adj Close']

        result = []
        #Define Variables
        S = yf.download(company, start, end)['Adj Close'][
            -1]  #apple['Adj Close'][-1] #starting stock price (i.e. last available real stock price)
        T = 50  #Number of trading days
        days = (df.index[-1] - df.index[0]).days
        cagr = (((
            (df['Adj Close'][-1]) / df['Adj Close'][1]))**(365.0 / days)) - 1
        mu = cagr  # 0.2309 #Return

        df['Returns'] = df['Adj Close'].pct_change()
        vol = df['Returns'].std() * math.sqrt(252)
        # vol = #0.4259 #Volatility

        #choose number of runs to simulate - 100 in this example
        for i in range(100):
            #create list of daily returns using random normal distribution
            daily_returns = np.random.normal(mu / T, vol / math.sqrt(T), T) + 1

            #set starting price and create price series generated by above random daily returns
            price_list = [S]

            for x in daily_returns:
                price_list.append(price_list[-1] * x)

            #plot data from each individual run which we will plot at the end
            plt.plot(price_list)

            #append the ending value of each simulated run to the empty list we created at the beginning
            result.append(price_list[-1])

        #show the plot of multiple price series created above
        # plt.show()
        plt.savefig('monte.png', bbox_inches='tight')
        plt.clf()

        #create histogram of ending stock values for our multiple simulations
        plt.hist(result, bins=50)
        # plt.show()
        plt.savefig('histo.png', bbox_inches='tight')
        plt.clf()

        #use numpy mean function to calculate the mean of the result
        print(round(np.mean(result), 2))
Example #13
def CV(X, Y, S, D, classical):
    n_splits = 10

    # Cross validation (mixed subjects), 10 splits
    dict_error = {x:[AverageMeter() for i in range(n_splits)] for x in ['train_std', 'val_std', 'test_std', \
                                                                  'train_mape', 'val_mape', 'test_mape']}
    log_all = []
    start_time = time.time()
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=23)
    for i_exp, (train_index, test_index) in enumerate(kf.split(X)):
        print('----- [%.1f] Exp %d -----' % (time.time() - start_time, i_exp))

        # Wrap up training and testing data
        train_data, test_data = X[train_index, :], X[test_index, :]
        train_target, test_target = Y[train_index], Y[test_index]
        train_sub, test_sub = S[train_index], S[test_index]
        train_diff, test_diff = D[train_index], D[test_index]

        # Split training data into training and validation data
        train_data, val_data, train_sub, val_sub, train_diff, val_diff, train_target, val_target = \
            train_test_split(train_data, train_sub, train_diff, train_target, test_size=1/9, random_state=32)
        print('Number of (train, val, test): (%d,%d,%d)' %
              (len(train_data), len(val_data), len(test_data)))

        # Flatten the data
        if classical:
            [train_data, val_data, test_data] = [
                x.reshape((x.shape[0], -1))
                for x in [train_data, val_data, test_data]
            ]

        # Select ERSP correlated with SLs
        if args.SCF:
            train_data, test_data, select_indices = preprocessing.select_correlated_features(train_data, \
                                                                          train_target, test_data, num_features=args.SCF)
            val_data = val_data[:, select_indices == 1]

        # Data augmentation
        if args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, method=args.augmentation)

        # PCA
        if args.PCA:

            pca = PCA(n_components=200)
            pca.fit(train_data)
            train_data = pca.transform(train_data)
            val_data = pca.transform(val_data)
            test_data = pca.transform(test_data)

            #train_data, test_data = preprocessing.PCA_corr(train_data, train_target, test_data, num_features=10)

        # Add subject ID and difficulty level as features
        if args.add_sub_diff:
            # Onehot encode subject ID and difficulty level
            train_sub = onehot_encode(train_sub, 11)
            val_sub = onehot_encode(val_sub, 11)
            test_sub = onehot_encode(test_sub, 11)
            train_diff = onehot_encode(train_diff, 3)
            val_diff = onehot_encode(val_diff, 3)
            test_diff = onehot_encode(test_diff, 3)

            # Standardize data
            _, test_data = preprocessing.scale(train_data,
                                               test_data,
                                               mode='minmax')
            train_data, val_data = preprocessing.scale(train_data,
                                                       val_data,
                                                       mode='minmax')

            # Concatenate subject and difficulty
            train_data = np.concatenate((train_data, train_sub, train_diff),
                                        axis=1)
            val_data = np.concatenate((val_data, val_sub, val_diff), axis=1)
            test_data = np.concatenate((test_data, test_sub, test_diff),
                                       axis=1)

        # Regression
        if classical:

            train_pred, val_pred, test_pred = classical_regression(
                train_data, val_data, test_data, train_target)

            # Record error and prediction
            train_std = mean_squared_error(train_target, train_pred)**0.5
            val_std = mean_squared_error(val_target, val_pred)**0.5
            test_std = mean_squared_error(test_target, test_pred)**0.5
            train_mape = mean_absolute_percentage_error(
                train_target, train_pred)
            val_mape = mean_absolute_percentage_error(val_target, val_pred)

            test_mape = mean_absolute_percentage_error(test_target, test_pred)
            print('Split %d    Std: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)' %
                  (i_exp, train_std, val_std, test_std, train_mape, val_mape,
                   test_mape))

            # test_pred_all[curr_test_index:curr_test_index+len(test_index)] = test_pred
            # test_target_all[curr_test_index:curr_test_index+len(test_index)] = test_target
            # test_pred_all = np.concatenate((test_pred_all, test_pred))
            # test_target_all = np.concatenate((test_target_all, test_target))
        else:
            train_std, val_std, test_std, train_mape, val_mape, test_mape = \
                deep_regression(train_data, val_data, test_data, train_target, val_target, test_target, train_sub,
                                val_sub, test_sub, -1, i_exp)

        dict_error['train_std'][i_exp].update(train_std, len(train_data))
        dict_error['val_std'][i_exp].update(val_std, len(val_data))
        dict_error['test_std'][i_exp].update(test_std, len(test_data))
        dict_error['train_mape'][i_exp].update(train_mape, len(train_data))
        dict_error['val_mape'][i_exp].update(val_mape, len(val_data))
        dict_error['test_mape'][i_exp].update(test_mape, len(test_data))

        log_sub = 'Exp%d\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
            i_exp, dict_error['train_std'][i_exp].avg,
            dict_error['val_std'][i_exp].avg,
            dict_error['test_std'][i_exp].avg,
            dict_error['train_mape'][i_exp].avg,
            dict_error['val_mape'][i_exp].avg,
            dict_error['test_mape'][i_exp].avg)
        print(log_sub)
        log_all.append(log_sub)

        if classical:
            evaluate_result.plot_scatter(train_target,
                                         train_pred,
                                         dirName=args.dirName,
                                         fileName='%s_sub%d_train' %
                                         (args.dirName, i_exp))

    log_total = 'Total\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
        avg_list(dict_error['train_std']), avg_list(dict_error['val_std']),
        avg_list(dict_error['test_std']), avg_list(dict_error['train_mape']),
        avg_list(dict_error['val_mape']), avg_list(dict_error['test_mape']))
    print(log_total)
    log_all.append(log_total)

    return log_all, dict_error
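
The error bookkeeping above relies on an AverageMeter class and an avg_list helper that are not part of this listing. A minimal sketch consistent with how they are called (a count-weighted running average per split, then a mean over splits) might look like the following; the attribute names are assumptions, not taken from the project:

class AverageMeter:
    """Count-weighted running average (assumed implementation, not from the project)."""

    def __init__(self):
        self.sum = 0.0
        self.count = 0
        self.avg = 0.0

    def update(self, value, n=1):
        # Accumulate `value` weighted by the number of samples `n`
        self.sum += value * n
        self.count += n
        self.avg = self.sum / self.count if self.count else 0.0


def avg_list(meters):
    # Mean of the per-split averages, skipping meters that were never updated
    used = [m.avg for m in meters if m.count > 0]
    return sum(used) / len(used) if used else 0.0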
Example #14
def LOSO(X, Y, S, D, classical):
    # Leave one subject out
    dict_error = {x: [AverageMeter() for i in range(11)]
                  for x in ['train_std', 'val_std', 'test_std',
                            'train_mape', 'val_mape', 'test_mape']}
    log_all = []
    start_time = time.time()
    for i_base in range(11):
        print('----- [%.1f] Subject %d -----' %
              (time.time() - start_time, i_base))

        lst_model = LSTransform.LST(11, i_base)
        indices_base = np.where(S == i_base)[0]
        indices_other = np.where(S != i_base)[0]
        base_data, base_target = X[indices_base, :], Y[indices_base]
        base_sub, base_diff = S[indices_base], D[indices_base]
        other_data, other_target = X[indices_other, :], Y[indices_other]
        other_sub, other_diff = S[indices_other], D[indices_other]
        test_pred_all, test_target_all = np.array([]), np.array([])

        # K-fold cross validation (all test data are in one subject)
        kf = KFold(n_splits=5, shuffle=True, random_state=23)
        for i_split, (more_index, few_index) in enumerate(kf.split(base_data)):
            print('--- [%.1f] Split %d ---' %
                  (time.time() - start_time, i_split))
            # Wrap up training and testing data
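            # Note: the smaller K-fold split of the held-out subject joins the training set
            # (together with all other subjects' data); the larger split is kept for testing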
            train_index, test_index = few_index, more_index
            train_data = np.concatenate((base_data[train_index, :], other_data), axis=0)
            test_data = base_data[test_index, :]
            train_target = np.concatenate((base_target[train_index], other_target), axis=0)
            test_target = base_target[test_index]
            train_sub = np.concatenate((base_sub[train_index], other_sub), axis=0)
            test_sub = base_sub[test_index]
            train_diff = np.concatenate((base_diff[train_index], other_diff), axis=0)
            test_diff = base_diff[test_index]

            # Split training data into training and validation data
            train_data, val_data, train_sub, val_sub, train_diff, val_diff, train_target, val_target = \
                train_test_split(train_data, train_sub, train_diff, train_target, test_size=1/9, random_state=32)

            print('Number of (train, val, test): (%d,%d,%d)' %
                  (len(train_data), len(val_data), len(test_data)))

            if args.LST:
                # LST for training data
                lst_model.fit_(train_data, train_target, train_sub)
                train_data = lst_model.transform_(train_data, train_target,
                                                  train_sub, args.num_closest,
                                                  args.dist_type)
                val_data = lst_model.transform_(val_data, val_target, val_sub,
                                                args.num_closest,
                                                args.dist_type)

            if args.SS:  # Source separation
                print('Apply source separation for time signal...')
                SS_model = source_separation.SourceSeparation(
                    train_data.shape[1], 11)
                SS_model.fit(train_data, train_sub)
                train_data = SS_model.transform(train_data, train_sub)
                val_data = SS_model.transform(val_data, val_sub)
                test_data = SS_model.transform(test_data, test_sub)

            # Flatten the data
            if classical:
                [train_data, val_data, test_data] = [
                    x.reshape((x.shape[0], -1))
                    for x in [train_data, val_data, test_data]
                ]

            # Select ERSP correlated with SLs
            if args.SCF:
                train_data, test_data, select_indices = preprocessing.select_correlated_features(
                    train_data, train_target, test_data, num_features=args.SCF)
                val_data = val_data[:, select_indices == 1]

            # Data augmentation
            if args.augmentation == 'SMOTER':
                train_data, train_target = data_augmentation.aug(
                    train_data, train_target, method=args.augmentation)

            # PCA
            if args.PCA:

                pca = PCA(n_components=200)
                pca.fit(train_data)
                train_data = pca.transform(train_data)
                val_data = pca.transform(val_data)
                test_data = pca.transform(test_data)

                #train_data, test_data = preprocessing.PCA_corr(train_data, train_target, test_data, num_features=10)

            # Add subject ID and difficulty level as features
            if args.add_sub_diff:
                # Onehot encode subject ID and difficulty level
                train_sub = onehot_encode(train_sub, 11)
                val_sub = onehot_encode(val_sub, 11)
                test_sub = onehot_encode(test_sub, 11)
                train_diff = onehot_encode(train_diff, 3)
                val_diff = onehot_encode(val_diff, 3)
                test_diff = onehot_encode(test_diff, 3)

                # Standardize data
                _, test_data = preprocessing.scale(train_data,
                                                   test_data,
                                                   mode='minmax')
                train_data, val_data = preprocessing.scale(train_data,
                                                           val_data,
                                                           mode='minmax')

                # Concatenate subject and difficulty
                train_data = np.concatenate(
                    (train_data, train_sub, train_diff), axis=1)
                val_data = np.concatenate((val_data, val_sub, val_diff),
                                          axis=1)
                test_data = np.concatenate((test_data, test_sub, test_diff),
                                           axis=1)

            # Regression
            if classical:

                train_pred, val_pred, test_pred = classical_regression(
                    train_data, val_data, test_data, train_target)

                # Record error and prediction
                train_std = mean_squared_error(train_target, train_pred)**0.5
                val_std = mean_squared_error(val_target, val_pred)**0.5
                test_std = mean_squared_error(test_target, test_pred)**0.5
                train_mape = mean_absolute_percentage_error(train_target, train_pred)
                val_mape = mean_absolute_percentage_error(val_target, val_pred)
                test_mape = mean_absolute_percentage_error(test_target, test_pred)
                print(
                    'Split %d    Std: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)'
                    % (i_split, train_std, val_std, test_std, train_mape,
                       val_mape, test_mape))

                # test_pred_all[curr_test_index:curr_test_index+len(test_index)] = test_pred
                # test_target_all[curr_test_index:curr_test_index+len(test_index)] = test_target
                test_pred_all = np.concatenate((test_pred_all, test_pred))
                test_target_all = np.concatenate(
                    (test_target_all, test_target))
            else:
                train_std, val_std, test_std, train_mape, val_mape, test_mape = \
                    deep_regression(train_data, val_data, test_data, train_target, val_target, test_target, train_sub,
                                    val_sub, test_sub, i_base, i_split)

            dict_error['train_std'][i_base].update(train_std, len(train_data))
            dict_error['val_std'][i_base].update(val_std, len(val_data))
            dict_error['test_std'][i_base].update(test_std, len(test_data))
            dict_error['train_mape'][i_base].update(train_mape,
                                                    len(train_data))
            dict_error['val_mape'][i_base].update(val_mape, len(val_data))
            dict_error['test_mape'][i_base].update(test_mape, len(test_data))

        log_sub = 'Sub%2d\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
            i_base, dict_error['train_std'][i_base].avg,
            dict_error['val_std'][i_base].avg,
            dict_error['test_std'][i_base].avg,
            dict_error['train_mape'][i_base].avg,
            dict_error['val_mape'][i_base].avg,
            dict_error['test_mape'][i_base].avg)
        print(log_sub)
        log_all.append(log_sub)

        if classical:
            evaluate_result.plot_scatter(train_target,
                                         train_pred,
                                         dirName=args.dirName,
                                         fileName='%s_sub%d_train' %
                                         (args.dirName, i_base))
            #evaluate_result.plot_scatter(test_target_all, test_pred_all, dirName=args.dirName, fileName='%s_sub%d'%(args.dirName,i_base))

    log_total = 'Total\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
        avg_list(dict_error['train_std']), avg_list(dict_error['val_std']),
        avg_list(dict_error['test_std']), avg_list(dict_error['train_mape']),
        avg_list(dict_error['val_mape']), avg_list(dict_error['test_mape']))
    print(log_total)
    log_all.append(log_total)

    return log_all, dict_error
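
LOSO also calls two helpers that are not reproduced in this listing, onehot_encode and mean_absolute_percentage_error. Minimal sketches consistent with how they are used here (integer class labels encoded into a fixed number of columns; MAPE reported in percent) could be:

import numpy as np


def onehot_encode(labels, num_classes):
    """One-hot encode integer labels into an (N, num_classes) array (assumed helper)."""
    labels = np.asarray(labels, dtype=int)
    encoded = np.zeros((len(labels), num_classes))
    encoded[np.arange(len(labels)), labels] = 1
    return encoded


def mean_absolute_percentage_error(y_true, y_pred):
    # MAPE in percent; assumes the regression targets are non-zero
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100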
Example #15
File: app.py Project: geetika0101/ProjectX
    def get(self):

        company = request.args.get('company')
        # compare = request.args.get('compare')
        startDate = request.args.get('start')
        endDate = request.args.get('end')
        df = yf.download(company, startDate, endDate)
        close_px = df['Adj Close']
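        # 100-day moving average of the adjusted close price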
        mavg = close_px.rolling(window=100).mean()
        print(mavg)
        print(df.head())
        print(df.tail())
        mpl.rc('figure', figsize=(8, 7))

        # Adjusting the style of matplotlib
        style.use('ggplot')

        close_px.plot(label=company)
        mavg.plot(label='mavg')
        plt.legend()

        plt.savefig('mavg.png', bbox_inches='tight')

        plt.clf()
        # plt.show()

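        # Daily simple returns: today's adjusted close over yesterday's, minus one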
        rets = close_px / close_px.shift(1) - 1
        rets.plot(label='return')
        plt.savefig('return.png', bbox_inches='tight')
        plt.clf()
        # plt.show()

        # close_px = df['Adj Close']
        dfreg = df.loc[:, ['Adj Close', 'Volume']]

        a = df['High'] - df['Close']
        print(a)

        dfreg['HL_PCT'] = a / df['Close'] * 100.0
        print(dfreg['HL_PCT'])

        print(df['Close'])
        print(df['Open'])

        b = df['Close'] - df['Open']

        dfreg['PCT_change'] = b / df['Open'] * 100.0

        # Replace missing values with a sentinel value instead of dropping them
        dfreg.fillna(value=-99999, inplace=True)

        print(dfreg.shape)
        # We want to separate 1 percent of the data to forecast
        forecast_out = int(math.ceil(0.01 * len(dfreg)))

        # Separating the label here, we want to predict the AdjClose
        forecast_col = 'Adj Close'
        dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
        X = np.array(dfreg.drop(columns=['label']))

        # Scale X so that all features share the same distribution for the linear regression
        X = preprocessing.scale(X)

        # Separate the most recent X values (X_lately, to forecast) from the earlier X used for model fitting and evaluation
        X_lately = X[-forecast_out:]
        X = X[:-forecast_out]

        # Separate label and identify it as y
        y = np.array(dfreg['label'])
        y = y[:-forecast_out]

        print('Dimension of X', X.shape)
        print('Dimension of y', y.shape)

        # Split the data into training and test sets
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.2)

        # Linear regression
        clfreg = LinearRegression(n_jobs=-1)
        clfreg.fit(X_train, y_train)

        # Quadratic Regression 2
        clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
        clfpoly2.fit(X_train, y_train)

        # Quadratic Regression 3
        clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
        clfpoly3.fit(X_train, y_train)

        # KNN Regression
        clfknn = KNeighborsRegressor(n_neighbors=2)
        clfknn.fit(X_train, y_train)

        confidencereg = clfreg.score(X_test, y_test)
        confidencepoly2 = clfpoly2.score(X_test, y_test)
        confidencepoly3 = clfpoly3.score(X_test, y_test)
        confidenceknn = clfknn.score(X_test, y_test)

        print("The linear regression confidence is ", confidencereg)
        print("The quadratic regression 2 confidence is ", confidencepoly2)
        print("The quadratic regression 3 confidence is ", confidencepoly3)
        print("The knn regression confidence is ", confidenceknn)

        # Printing the forecast
        forecast_set = clfknn.predict(X_lately)
        dfreg['Forecast'] = np.nan
        print(forecast_set, confidencereg, forecast_out)

        last_date = dfreg.iloc[-1].name
        last_unix = last_date
        next_unix = last_unix + datetime.timedelta(days=1)

        for i in forecast_set:
            next_date = next_unix
            next_unix += datetime.timedelta(days=1)
            dfreg.loc[next_date] = [
                np.nan for _ in range(len(dfreg.columns) - 1)
            ] + [i]

        dfreg['Adj Close'].tail(500).plot()
        dfreg['Forecast'].tail(500).plot()
        plt.legend(loc=4)
        plt.xlabel('Date')
        plt.ylabel('Price')
        # plt.show()
        plt.savefig('forecast.png', bbox_inches='tight')
        plt.clf()
        # Cast to a built-in float so the response is JSON serializable
        val = {'lastValue': float(forecast_set[forecast_out - 1])}
        return val
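
The get handler above reads company, start, and end from the query string, which suggests it is mounted as a Flask-RESTful resource. A hypothetical, minimal wiring (the class name Forecast and the /forecast route are assumptions, not from this listing) would be:

from flask import Flask, request
from flask_restful import Api, Resource


class Forecast(Resource):
    # Stub standing in for the class that defines the get() method shown above
    def get(self):
        company = request.args.get('company')
        return {'company': company}


app = Flask(__name__)
api = Api(app)
api.add_resource(Forecast, '/forecast')

# Example request once the real handler is plugged in:
#   GET /forecast?company=AAPL&start=2020-01-01&end=2020-12-31  ->  {"lastValue": ...}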
Example #16
def main():
    #14:name
    names_dataset = pd.read_csv('dataset.csv',
                                encoding="latin1",
                                usecols=(14, ))
    #5:gender, 6:gender_confidence, 8:confidence in profile, 10:description, 11:no of favourited tweets,
    #13:link color, 17:retweet count, 18:sidebar color, 19:tweet text, 21:tweet count
    dataset = pd.read_csv('dataset.csv',
                          encoding="latin1",
                          usecols=(5, 6, 8, 10, 11, 13, 17, 18, 19, 21))

    #words = pd.read_csv('manually_filtered_stats.csv', encoding = "latin1", usecols = (0,))
    #divide into dependent and independent variables
    #6:gender_confidence, 8:confidence in profile, 10:description, 11:no of favourited tweets,
    #13:link color, 17:retweet count, 18:sidebar color, 19:tweet text, 21:tweet count
    X = dataset.iloc[:, 1:]
    #5:gender
    y = dataset.iloc[:, 0]

    #10:description, 19:tweet text
    description_and_tweet = pd.read_csv('dataset.csv',
                                        encoding="latin1",
                                        usecols=(10, 19))
    description_and_tweet = description_and_tweet.fillna('')
    x1 = description_and_tweet.iloc[:, 0].values
    x2 = description_and_tweet.iloc[:, 1].values
    description_and_tweet_combined = x1 + ' ' + x2

    #swap # of favorite tweets and link_color column
    #link_color_col = numpy.copy(X[:, 1])
    #X[:, 1] = X[:, 0]
    #X[:, 0] = link_color_col

    #swap # of favorite tweets and sidebar_color column
    #sidebar_color_col = numpy.copy(X[:, 3])
    #X[:, 3] = X[:, 1]
    #X[:, 1] = sidebar_color_col

    #Might need to be updated/reviewed because of change of columns
    stats.stats(X, y)

    X, y = preprocessing.preprocessData(X.values, y.values,
                                        names_dataset.values,
                                        description_and_tweet_combined)
    """
    #84 columns
    X_feature_names = [
        16 dummy features extracted from link color     0-15
        16 dummy features extracted from sidebar color  16-31
        '# of favorite tweets',                         32      
        '# of retweets',                                33
        '# of tweets',                                  34
        5 features extracted from description,          35-39
        5 features extracted from tweet text,           40-44
        3 features extracted from name (dummy),         45-47
        36 features extracted from tweet text           48-83
    ]
    """

    #Might need to be updated/reviewed because of change of columns
    #It does not seem to be affected by the order of columns, but with dummy variables it might generate too many plots

    feature_names = [
        'lk_red', 'lk_red-orange', 'lk_orange-brown', 'lk_orange-yellow',
        'lk_yellow', 'lk_yellow-green', 'lk_green', 'lk_green-cyan', 'lk_cyan',
        'lk_cyan-blue', 'lk_blue', 'lk_blue-magenta', 'lk_magenta',
        'lk_magenta-pink', 'lk_pink', 'lk_pink-red', 'sb_red', 'sb_red-orange',
        'sb_orange-brown', 'sb_orange-yellow', 'sb_yellow', 'sb_yellow-green',
        'sb_green', 'sb_green-cyan', 'sb_cyan', 'sb_cyan-blue', 'sb_blue',
        'sb_blue-magenta', 'sb_magenta', 'sb_magenta-pink', 'sb_pink',
        'sb_pink-red', '# of favorite tweets', '# of retweets', '# of tweets',
        '# of hashtags in description', 'URLs present in description',
        '# of emoticons used in description', 'length of description',
        '# of mentions in description', '# of hashtags in tweet text',
        'URLs present in tweet text', '# of emoticons used in tweet text',
        'length of tweet text', '# of mentions in tweet text',
        'feature 1 from name', 'feature 2 from name', 'feature 3 from name',
        'women word_freq', 'bitch word_freq', 'nation word_freq',
        'tec  word_freq', 'season word_freq', 'hair word_freq',
        'dad word_freq', 'player word_freq', 'cat word_freq',
        'polit word_freq', 'blogger word_freq', 'radio word_freq',
        'pushawardslizquen word_freq', 'boy word_freq', 'author word_freq',
        'footbal word_freq', 'kid word_freq', 'travel word_freq',
        'social word_freq', 'heart word_freq', 'vote word_freq',
        'food word_freq', 'guy word_freq', 'beauti word_freq',
        'lover word_freq', 'via word_freq', 'writer word_freq',
        'artist word_freq', 'man word_freq', 'sport word_freq',
        'f**k word_freq', 'girl word_freq', 'fan word_freq', 'game word_freq',
        'love word_freq', 'weather word_freq'
    ]

    #[ 0  8 11 12 14 15 16 22 24 25 32 34 35 38 44 46 47 48 53 60 63 69 71 72 76 77 79 80 81 82]
    index_temp = [
        0, 8, 11, 12, 14, 15, 16, 22, 24, 25, 32, 34, 35, 38, 44, 46, 47, 48,
        53, 60, 63, 69, 71, 72, 76, 77, 79, 80, 81, 82
    ]
    print("first line: ", X[0, :])
    plotting.plot(X, y, feature_names, index_temp)

    #Might need to be updated/reviewed because of change of columns
    #This is happening over the entire dataset and should only happen on the continuous variables
    X = preprocessing.scale(X)

    #select top features using Recursive Feature Elimination
    #not affected by order of columns
    top_features = tuning.postModelStats(X, y)

    #print("top_features:", top_features)
    #[ 0  8 11 12 14 15 16 22 24 25 32 34 35 38 44 46 47 48 53 60 63 69 71 72 76 77 79 80 81 82]

    X = X[:, top_features]

    #after preprocessing: split data into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    #Each run will take approximately 10 to 15 minutes given the number of features used in the model
    process('Support Vector Classifier - RBF Kernel', 'svc_rbf', X, y, X_train,
            y_train, X_test, y_test)
    process('Logistic Regression', 'lr', X, y, X_train, y_train, X_test,
            y_test)
    process('Support Vector Classifier - Linear Kernel', 'svc_linear', X, y,
            X_train, y_train, X_test, y_test)
    process('K nearest Classifier', 'knc', X, y, X_train, y_train, X_test,
            y_test)
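
The process helper that trains and scores each classifier is defined elsewhere in the project. A rough, hypothetical stand-in that matches the call signature used above (a display title, a model key, the full data, and the train/test split) might be:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


def process(title, key, X, y, X_train, y_train, X_test, y_test):
    # Map the short model keys used in main() to estimators (assumed mapping)
    models = {
        'svc_rbf': SVC(kernel='rbf'),
        'svc_linear': SVC(kernel='linear'),
        'lr': LogisticRegression(max_iter=1000),
        'knc': KNeighborsClassifier(),
    }
    clf = models[key].fit(X_train, y_train)
    test_acc = clf.score(X_test, y_test)
    cv_acc = cross_val_score(clf, X, y, cv=5).mean()
    print('%s: test accuracy %.3f, 5-fold CV accuracy %.3f' % (title, test_acc, cv_acc))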
Example #17
def main(index_exp=0):

    dirName = '%s_%s_data%d_%s' % (args.ext_model, args.rgr_model,
                                   args.data_cate, args.append_name)
    fileName = '%s_exp%d' % (dirName, index_exp)

    # Create folder for results of this model
    if not os.path.exists('./results/%s' % (dirName)):
        os.makedirs('./results/%s' % (dirName))

    print('Extraction model: %s' % (args.ext_model))
    print('Regression model: %s' % (args.rgr_model))

    if args.ext_model == 'vgg16':
        net = tv_models.vgg16(pretrained=True).to(device=device)
        set_parameter_requires_grad(net, True)
        net.classifier[6] = Identity()
    elif args.ext_model == 'resnet50':
        net = tv_models.resnet50(pretrained=True).to(device=device)
        set_parameter_requires_grad(net, True)
        net.fc = Identity()

    # Get dataset
    batchSize = 64
    input_size = 224
    # Load Data
    data_transforms = {
        'train': transforms.Compose([ndl.Rescale(input_size),
                                     ndl.ToTensor()]),
        'test': transforms.Compose([ndl.Rescale(input_size),
                                    ndl.ToTensor()])
    }

    print("Initializing Datasets and Dataloaders...")

    # Create training and testing datasets
    image_datasets = {
        x: ndl.TopoplotLoader(args.image_folder,
                              x,
                              transform=data_transforms[x],
                              index_exp=index_exp)
        for x in ['train', 'test']
    }

    # Create training and testing dataloaders
    dataloaders_dict = {
        'train':
        Data.DataLoader(image_datasets['train'],
                        batch_size=batchSize,
                        shuffle=False,
                        num_workers=4),
        'test':
        Data.DataLoader(image_datasets['test'],
                        batch_size=batchSize,
                        shuffle=False,
                        num_workers=4)
    }

    # Extract features by VGG16
    net.eval()  # Disable batchnorm, dropout
    X_train, Y_train = extract_layer(dataloaders_dict['train'], net)
    X_test, Y_test = extract_layer(dataloaders_dict['test'], net)

    # Standardize data before PCA
    if args.scale:
        X_train, X_test = preprocessing.scale(X_train, X_test, mode=args.scale)

    # Apply PCA to reduce dimension
    if args.n_components > 1:
        args.n_components = int(args.n_components)
    pca = PCA(n_components=args.n_components, svd_solver='full')
    pca.fit(X_train)

    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    print('(X) Number of features after PCA: %d' % (X_train.shape[1]))
    print('(X) Explained variance ratio: %.3f' %
          (np.sum(pca.explained_variance_ratio_)))

    # Add conditional entropy
    if args.add_CE and args.data_cate == 2:
        print('Add conditional entropy as additional features...')

        with open('./raw_data/CE_sub%d_channel21_exp%d_train.data' %
                  (args.subject_ID, index_exp), 'rb') as fp:
            CE_train = pickle.load(fp)
        with open('./raw_data/CE_sub%d_channel21_exp%d_test.data' %
                  (args.subject_ID, index_exp), 'rb') as fp:
            CE_test = pickle.load(fp)

        # Scale CE
        CE_train, CE_test = preprocessing.scale(CE_train, CE_test)

        # Apply PCA
        pca = PCA(n_components=30, svd_solver='full')
        pca.fit(CE_train)
        CE_train = pca.transform(CE_train)
        CE_test = pca.transform(CE_test)

        print('(CE) Number of features after PCA: %d' % (CE_train.shape[1]))
        print('(CE) Explained variance ratio: %.3f' %
              (np.sum(pca.explained_variance_ratio_)))

        # Concatenate with X
        X_train = np.concatenate((X_train, CE_train), axis=1)
        X_test = np.concatenate((X_test, CE_test), axis=1)

    # Regression to predict solution latency
    X_train_Reg = X_train
    X_test_Reg = X_test
    if args.rgr_model == 'LR':
        rgr = linear_model.LinearRegression()
    elif args.rgr_model == 'Ridge':
        rgr = linear_model.Ridge(alpha=1)
    elif args.rgr_model == 'GPR':
        kernel = RBF(10, (1e-2, 1e2)) + ConstantKernel(10, (1e-2, 1e2))
        rgr = GaussianProcessRegressor(kernel=kernel, random_state=0)
    elif args.rgr_model == 'ELMK':
        rgr = elm.ELMKernel()
    elif args.rgr_model == 'ELMR':
        params = ["sigmoid", 1, 500, False]
        rgr = elm.ELMRandom(params)

    if args.rgr_model not in ['ELMK', 'ELMR']:
        rgr.fit(X_train_Reg, Y_train)
        pred_train = rgr.predict(X_train_Reg)
        pred_test = rgr.predict(X_test_Reg)
    else:
        # Scale target into -1~1
        if args.scale_target == 2:

            scaler = TargetScaler(num_step=10)
            scaler.fit(Y_train)
            Y_train, Y_test = scaler.transform(Y_train), scaler.transform(
                Y_test)
        elif args.scale_target == 1:

            Y_train, Y_test = (Y_train - 30) / 30, (Y_test - 30) / 30

        # Concatenate data for extreme learning machine
        train_data = np.concatenate((Y_train[:, np.newaxis], X_train), axis=1)
        test_data = np.concatenate((Y_test[:, np.newaxis], X_test), axis=1)

        rgr.search_param(train_data, cv="kfold", of="rmse", eval=10)

        pred_train = rgr.train(train_data).predicted_targets
        pred_test = rgr.test(test_data).predicted_targets

        # Scale target back to 0~60
        if args.scale_target == 2:

            [Y_train, Y_test, pred_train, pred_test] = [scaler.transform(x, mode='inverse') for x in \
                                           [Y_train, Y_test, pred_train, pred_test]]
        elif args.scale_target == 1:

            [Y_train, Y_test, pred_train, pred_test] = [x*30+30 for x in \
                                           [Y_train, Y_test, pred_train, pred_test]]

    evaluate_result.plot_scatter(Y_test, pred_test, dirName, fileName)

    print('Train std: %.3f' % (mean_squared_error(Y_train, pred_train)**0.5))
    print('Test std: %.3f' % (mean_squared_error(Y_test, pred_test)**0.5))

    # Save targets and predictions
    dict_target = {}
    dict_target['target'], dict_target['pred'] = Y_test, pred_test
    with open('./results/%s/%s.data' % (dirName, fileName), 'wb') as fp:
        pickle.dump(dict_target, fp)

    return
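
Identity and set_parameter_requires_grad are used above to turn the pretrained backbone into a frozen feature extractor, following the standard PyTorch transfer-learning recipe. Assuming that is all they do, minimal sketches would be:

import torch.nn as nn


class Identity(nn.Module):
    """Pass-through module used to replace the final classification layer."""

    def forward(self, x):
        return x


def set_parameter_requires_grad(model, feature_extracting):
    # Freeze every parameter when the network is used purely as a feature extractor
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False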