def main():
    parser = argparse.ArgumentParser(description="Available Parameters:")
    parser.add_argument("--n_hidden_units", default=64, type=int)
    parser.add_argument("--n_hidden_layers", default=1, type=int)
    parser.add_argument("--train_epochs", default=100, type=int)
    # NOTE: argparse's type=bool treats any non-empty string as True,
    # so "--write_output False" would still evaluate to True.
    parser.add_argument("--write_output", default=True, type=bool)
    args = parser.parse_args()

    torch.manual_seed(0)
    np.random.seed(0)

    profiles = pd.read_csv("../data/new_profiles_200t.csv")
    comments = pd.read_csv("../data/new_comments_200t.csv")
    comments = comments.drop_duplicates()
    profiles = preprocessing.categorical_to_numerical(profiles, col="category_1")

    all_users = set(profiles.profile_username.values)
    data = preprocessing.scale(profiles.drop(columns=["category_1", "profile_username"]).values)
    # Zip usernames in row order with their scaled records; iterating the set here
    # (as the original did) would misalign usernames and rows.
    name_to_record = {name: record for name, record in zip(profiles.profile_username.values, data)}
    input_dim, output_dim = data.shape[1], len(profiles.category_1.unique()) + 1
    user_to_label = {user: category
                     for user, category in profiles[["profile_username", "category_1"]].values}

    K = 5
    skf = StratifiedKFold(n_splits=K)
    models_metrics, models_histories = defaultdict(dict), defaultdict(list)
    for kth_fold, (train_idx, test_idx) in enumerate(
            skf.split(profiles.profile_username.values, profiles.category_1.values), start=1):
        print("Starting fold {} of {}".format(kth_fold, K))
        authors = profiles.profile_username.values
        username_to_index = utils.get_users_indices(authors)
        interactions = utils.get_interactions(comments, username_to_index)
        edge_index = utils.get_edge_index(interactions)
        x = utils.get_x(authors, name_to_record, input_dim=input_dim)
        y = utils.get_y(user_to_label, authors)
        train_mask = [i in train_idx for i in range(len(x))]
        test_mask = [i in test_idx for i in range(len(x))]
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        data = Data(x=x, y=y, edge_index=edge_index,
                    train_mask=train_mask, test_mask=test_mask).to(device)
        assert len(x) == len(y), "Input and output tensors do not have the same length"
        models = utils.get_models(data.num_nodes, input_dim, output_dim,
                                  args.n_hidden_units, args.n_hidden_layers,
                                  device=device, lr=0.005)
        histories = utils.train(data, models, epochs=args.train_epochs)
        models_histories = utils.update_histories(models_histories, histories)
        current_metrics = utils.test(data, models)
        utils.update_metrics_dict(models_metrics, current_metrics)
        print('\n')

    models_histories = {model: list(history / K)
                        for model, history in models_histories.items()}  # Get mean traces
    models_metrics = utils.calculate_statistics(models_metrics)
    if args.write_output:
        utils.write_json("../data/results/models_metrics_{}e_{}l_{}u.json".format(
            args.train_epochs, args.n_hidden_layers, args.n_hidden_units), models_metrics)
        utils.write_json("../data/results/models_histories_{}e_{}l_{}u.json".format(
            args.train_epochs, args.n_hidden_layers, args.n_hidden_units), models_histories)
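# utils.get_edge_index above is not shown; it presumably turns (source, target) interaction
# index pairs into the [2, num_edges] LongTensor that PyTorch Geometric's Data expects.
# A minimal sketch of that assumed helper (the real one may differ):
import torch

def get_edge_index(interactions):
    """interactions: iterable of (src_index, dst_index) pairs -> [2, num_edges] tensor."""
    src, dst = zip(*interactions)
    return torch.tensor([src, dst], dtype=torch.long)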
def quantify_mood_text(mood_list):
    # Quantify the textual description of the mood using the AlchemyAPI sentiment
    # analysis service.
    # Parameter:
    #   mood_list : the list that contains the words describing the mood of the music
    # Returns:
    #   sentiment_score : the aggregated score of the sentiment extracted from the words

    # Get the individual words from each string.
    get_words = []
    for words in mood_list:
        extra_words = words.split(" / ")
        for word in extra_words:
            get_words.append(word)

    sentiment_score = 0.0
    for word in set(get_words):
        response = alchemy_obj.sentiment("text", word)
        # dict.has_key() was removed in Python 3; use the `in` operator instead.
        if 'docSentiment' in response:
            if 'score' in response['docSentiment']:
                a_score = response['docSentiment']['score']
                sentiment_score = sentiment_score + float(a_score)
    sentiment_score = sentiment_score / float(len(mood_list))

    # Scale it into the range between 0 and 100.
    sentiment_score = preprocessing.scale(sentiment_score, -1, 1, 0, 100)
    return sentiment_score
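# The snippet above assumes a five-argument preprocessing.scale(value, old_min, old_max,
# new_min, new_max) that linearly remaps a scalar between ranges. A minimal sketch of such
# a helper (the name and signature are inferred from the call sites, not confirmed):
def scale(value, old_min, old_max, new_min, new_max):
    """Linearly remap `value` from [old_min, old_max] to [new_min, new_max]."""
    return new_min + (value - old_min) * (new_max - new_min) / (old_max - old_min)

# e.g. scale(0.0, -1, 1, 0, 100) -> 50.0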
def run_experiments1(algor=None):
    X1, y1, X2, y2 = get_data()
    X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2, shuffle=True)
    dataset_phishing_websites = "Phishing Websites"
    if not algor:
        run_decision_tree_exp(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
        run_boosting_experiment(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
        run_nn_experiment(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
        run_knn_experiment(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
        # NOTE: preprocessing.scale standardizes train and test independently here;
        # fitting one StandardScaler on the training set would keep both on the same scale.
        X1_train_scaled = pd.DataFrame(preprocessing.scale(X1_train))
        X1_test_scaled = pd.DataFrame(preprocessing.scale(X1_test))
        run_svm_exp(X1_train_scaled, y1_train, X1_test_scaled, y1_test, dataset_phishing_websites)
    elif algor == 'dt':
        run_decision_tree_exp(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
    elif algor == 'boosting':
        run_boosting_experiment(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
    elif algor == 'nn':
        run_nn_experiment(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
    elif algor == 'knn':
        run_knn_experiment(X1_train, y1_train, X1_test, y1_test, dataset_phishing_websites)
    elif algor == 'svm':
        X1_train_scaled = pd.DataFrame(preprocessing.scale(X1_train))
        X1_test_scaled = pd.DataFrame(preprocessing.scale(X1_test))
        run_svm_exp(X1_train_scaled, y1_train, X1_test_scaled, y1_test, dataset_phishing_websites)
    else:
        raise ValueError(algor + ' does not exist')
def run_experiments2(algor=None):
    X1, y1, X2, y2 = get_data()
    X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.2, shuffle=True)
    print(X2_train.shape)
    dataset_madelon = "Madelon"
    if not algor:
        run_decision_tree_exp(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
        run_boosting_experiment(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
        run_nn_experiment(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
        run_knn_experiment(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
        # NOTE: as in run_experiments1, train and test are standardized independently here.
        X2_train_scaled = pd.DataFrame(preprocessing.scale(X2_train))
        X2_test_scaled = pd.DataFrame(preprocessing.scale(X2_test))
        run_svm_exp(X2_train_scaled, y2_train, X2_test_scaled, y2_test, dataset_madelon)
    elif algor == 'dt':
        run_decision_tree_exp(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
    elif algor == 'boosting':
        run_boosting_experiment(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
    elif algor == 'nn':
        run_nn_experiment(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
    elif algor == 'knn':
        run_knn_experiment(X2_train, y2_train, X2_test, y2_test, dataset_madelon)
    elif algor == 'svm':
        X2_train_scaled = pd.DataFrame(preprocessing.scale(X2_train))
        X2_test_scaled = pd.DataFrame(preprocessing.scale(X2_test))
        run_svm_exp(X2_train_scaled, y2_train, X2_test_scaled, y2_test, dataset_madelon)
    else:
        raise ValueError(algor + ' does not exist')
def pre_process(self):
    if K.image_data_format() == 'channels_first':
        x_train = self.x_train.reshape(self.x_train.shape[0], 1, self.img_rows, self.img_cols)
        x_val = self.x_val.reshape(self.x_val.shape[0], 1, self.img_rows, self.img_cols)
        x_test = self.x_test.reshape(self.x_test.shape[0], 1, self.img_rows, self.img_cols)
        input_shape = (1, self.img_rows, self.img_cols)
    else:
        x_train = self.x_train.reshape(self.x_train.shape[0], self.img_rows, self.img_cols, 1)
        x_val = self.x_val.reshape(self.x_val.shape[0], self.img_rows, self.img_cols, 1)
        x_test = self.x_test.reshape(self.x_test.shape[0], self.img_rows, self.img_cols, 1)
        input_shape = (self.img_rows, self.img_cols, 1)

    x_train = x_train.astype('float32')
    x_val = x_val.astype('float32')
    x_test = x_test.astype('float32')

    # Fit the scaling range on the training set and reuse it for validation and test.
    x_train, X_min, X_max = scale(x_train, 0, 255)
    x_val, _, _ = scale(x_val, 0, 255, X_min=X_min, X_max=X_max)
    x_test, _, _ = scale(x_test, 0, 255, X_min=X_min, X_max=X_max)
    x_train /= 255
    x_val /= 255
    x_test /= 255

    # Optional per-image preprocessing (currently disabled via the flag below).
    f = False
    if f:
        i = 0
        for row in x_train:
            x_train[i, :] = preprocess_input(row)
            i = i + 1
        i = 0
        for row in x_val:
            x_val[i, :] = preprocess_input(row)
            i = i + 1
        i = 0  # Reset the index before the test loop (missing in the original code).
        for row in x_test:
            x_test[i, :] = preprocess_input(row)
            i = i + 1

    # Convert class vectors to binary class matrices.
    self.y_train = keras.utils.to_categorical(self.y_train, self.num_classes)
    self.y_val = keras.utils.to_categorical(self.y_val, self.num_classes)
    self.y_test = keras.utils.to_categorical(self.y_test, self.num_classes)
    return  # x_train, y_train, x_val, y_val, x_test, y_test, input_shape
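# pre_process above assumes a scale(x, new_min, new_max, X_min=None, X_max=None) helper
# that min-max rescales an array into [new_min, new_max] and returns the fitted extrema
# so the same transform can be reused on validation/test data. A minimal NumPy sketch of
# that assumed contract (names and semantics inferred from the call sites):
import numpy as np

def scale(x, new_min, new_max, X_min=None, X_max=None):
    """Min-max rescale `x` into [new_min, new_max]; reuse X_min/X_max if given."""
    if X_min is None:
        X_min, X_max = x.min(), x.max()
    x_scaled = (x - X_min) / (X_max - X_min) * (new_max - new_min) + new_min
    return x_scaled, X_min, X_max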
def predict_emotion_using_AV_model(data_df):
    # Using Russell's arousal-valence (AV) model, estimate the angular quantity that
    # represents the predicted sentiment. In this mapping, calories burned is the
    # horizontal axis (pleasure-displeasure) while beats per minute (bpm) is the
    # vertical axis (sleepiness-arousal).
    # Parameter:
    #   data_df : the data frame to be tested; it must have exactly two columns.
    # Returns:
    #   predicted_emotions : the textual description of the predicted emotion

    # Scale x and y to the range [-1, 1] so they can be placed on the AV coordinate plane.
    min_value_0 = min(data_df.iloc[:, 0])
    max_value_0 = max(data_df.iloc[:, 0])
    data_df.iloc[:, 0] = data_df.iloc[:, 0].apply(
        lambda x: preprocessing.scale(x, min_value_0, max_value_0, -1.0, 1.0))
    min_value_1 = min(data_df.iloc[:, 1])
    max_value_1 = max(data_df.iloc[:, 1])
    data_df.iloc[:, 1] = data_df.iloc[:, 1].apply(
        lambda x: preprocessing.scale(x, min_value_1, max_value_1, -1.0, 1.0))
    return data_df.apply(lambda x: calculate_angle(x), axis=1)
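# calculate_angle is not shown in this snippet. A plausible minimal sketch, assuming it
# maps a (valence, arousal) pair in [-1, 1]^2 to an angle in degrees on the AV plane
# (the real helper may differ, e.g. by returning a labeled emotion instead of an angle):
import math

def calculate_angle(row):
    """Angle (degrees, 0-360) of the point (valence, arousal) on the AV plane."""
    angle = math.degrees(math.atan2(row.iloc[1], row.iloc[0]))
    return angle % 360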
def preprocess_data(self, x, y=None):
    '''Prepare the data for the neural network.
    - Remove 0's from the time channels
    - Center the data on 0
    - Scale it to have a standard deviation of 1'''
    std = 1
    preprocessing.fix_time_zeros(x)
    means = preprocessing.center(x)
    stds = preprocessing.scale(x, std, mode='standardize')

    def repeat_transformation(other):
        if len(other) == 0:
            return
        preprocessing.fix_time_zeros(other)
        other -= means
        other /= stds / std

    return repeat_transformation
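# The project-specific preprocessing.center / preprocessing.scale used above appear to
# mutate the array in place and return the statistics needed to repeat the transform
# (repeat_transformation divides by stds / std, so scale must return the raw stds).
# A minimal sketch consistent with those call sites (the real module may differ):
import numpy as np

def center(x):
    """Subtract the per-feature mean in place; return the means."""
    means = x.mean(axis=0)
    x -= means
    return means

def scale(x, std=1, mode='standardize'):
    """Rescale x in place to standard deviation `std`; return the raw per-feature stds."""
    assert mode == 'standardize', "only the 'standardize' mode is sketched here"
    stds = x.std(axis=0)
    x /= stds / std
    return stds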
def main(model_type, t_spread_min, t_spread_max, ell_spread_min, ell_spread_max,
         n, n_test, n_epochs, data_dir):
    # Generate data
    feat, y, _, _ = pendulum(n=n, t_spread=[t_spread_min, t_spread_max],
                             ell_spread=[ell_spread_min, ell_spread_max])

    # Set up data
    x_train, x_val, y_train, y_val = train_test_split(feat, y, test_size=val_proportion,
                                                      random_state=42)
    x_scaler, x_train, x_val = scale(x_train, x_val)
    y_scaler, y_train, y_val = scale(y_train, y_val)

    t_range_str = f'trange{int(100 * t_spread_min)}to{int(100 * t_spread_max)}'
    model_name = f'{model_type}_{t_range_str}_{n_epochs}ep'
    os.makedirs(data_dir, exist_ok=True)
    if not os.path.isfile(f'{data_dir}x_scaler_{t_range_str}.pkl'):
        with open(f'{data_dir}x_scaler_{t_range_str}.pkl', 'wb') as file_pi:
            pickle.dump(x_scaler, file_pi)
        with open(f'{data_dir}y_scaler_{t_range_str}.pkl', 'wb') as file_pi:
            pickle.dump(y_scaler, file_pi)

    # Train and save models
    model_number = 1
    while os.path.isfile(f'{data_dir}model_{model_name}_{str(model_number).zfill(3)}.h5'):
        model_number += 1
    if model_type == 'de':
        models = [mlp(loss='nll') for _ in range(n_models)]
    elif model_type == 'cd':
        n_features = x_train.shape[1]
        n_outputs = y_train.shape[1]
        dropout_reg = 2. / n
        models = [make_model(n_features, n_outputs, n_neurons, dropout_reg)]
    elif model_type == 'bnn':
        models = [mlp_flipout()]
    else:
        raise ValueError(f'Model type {model_type} not recognized!')

    for j, mod in enumerate(models):
        print(f'Model {j + 1}')
        history = mod.fit(x_train, y_train, epochs=n_epochs, validation_data=(x_val, y_val))
        mod.save_weights(f'{data_dir}model_{model_name}_{str(model_number + j).zfill(3)}.h5')
        with open(f'{data_dir}history_{model_name}_{str(model_number + j).zfill(3)}.pkl',
                  'wb') as file_pi:
            pickle.dump(history.history, file_pi)

    # Generate test set
    feat_test, _, _, _ = pendulum(n=n_test, t_spread=[t_spread_min, t_spread_max],
                                  ell_spread=[ell_spread_min, ell_spread_max], seed=666)
    feat_test = x_scaler.transform(feat_test)

    # Make predictions
    if model_type == 'de':
        y_pred = [model(feat_test.astype('float32')) for model in models]
    elif model_type == 'cd':
        y_pred = np.array([models[0].predict(feat_test) for _ in range(n_models)])
    elif model_type == 'bnn':
        y_pred = [models[0](feat_test.astype('float32')) for _ in range(n_models)]

    if model_type in ('de', 'bnn'):
        y_pred_val = [pred.loc.numpy() for pred in y_pred]
        y_pred_unc = [pred.scale.numpy() for pred in y_pred]
    elif model_type == 'cd':
        y_pred_val = y_pred[:, :, :1]
        y_pred_unc = np.sqrt(np.exp(y_pred[:, :, 1:]))

    y_pred_val_resc = [y_scaler.inverse_transform(y) for y in y_pred_val]
    y_pred_unc_resc = [y / y_scaler.scale_[0] for y in y_pred_unc]
    y_pred_val_resc = np.array(y_pred_val_resc).reshape((n_models, n_test))
    y_pred_unc_resc = np.array(y_pred_unc_resc).reshape((n_models, n_test))

    # Combine per-model predictions: epistemic uncertainty from the spread across models,
    # aleatoric from the mean predicted variance, and the total in quadrature.
    y_pred_mean = np.mean(y_pred_val_resc, axis=0)
    y_pred_ep_unc = np.std(y_pred_val_resc, axis=0)
    y_pred_al_unc = np.sqrt(np.mean(y_pred_unc_resc * y_pred_unc_resc, axis=0))
    y_pred_unc = np.sqrt(y_pred_al_unc ** 2 + y_pred_ep_unc ** 2)

    np.save(f'{data_dir}y_pred_test_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_mean)
    np.save(f'{data_dir}y_pred_test_alunc_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_al_unc)
    np.save(f'{data_dir}y_pred_test_epunc_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_ep_unc)
    np.save(f'{data_dir}y_pred_test_prunc_{model_name}_{str(model_number).zfill(3)}.npy', y_pred_unc)
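# The scale(x_train, x_val) calls above appear to fit a scaler on the first (training)
# array, apply it to both, and return the fitted scaler first; the later uses of
# x_scaler.transform, y_scaler.inverse_transform, and y_scaler.scale_ match sklearn's
# StandardScaler. A minimal sketch of that assumed helper (2-D arrays expected):
from sklearn.preprocessing import StandardScaler

def scale(train, val):
    """Fit a StandardScaler on `train`; return (scaler, train_scaled, val_scaled)."""
    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train)
    val_scaled = scaler.transform(val)
    return scaler, train_scaled, val_scaled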
def Pipeline(X_train, y_train, X_test, n_dims=44):
    id_train = np.array(X_train["id"])
    X_train = X_train.drop(columns=["id"])
    id_test = np.array(X_test["id"])
    X_test = X_test.drop(columns=["id"])

    X_train = np.array(X_train)
    y_train = np.array(y_train)
    X_test = np.array(X_test)

    # Columns with more than two distinct values are treated as numeric.
    ind_numeric = []
    for i in range(len(X_train[0])):
        if len(np.unique(X_train[:, i])) > 2:
            ind_numeric.append(i)
    print("There are " + str(len(ind_numeric)) + " numeric variables")

    '''
    ind_delete = np.where(y_train == "functional needs repair")[0]
    y_train = np.delete(y_train, ind_delete, axis=0)
    X_train = np.delete(X_train, ind_delete, axis=0)
    '''
    # plotData(X_train, y_train, "raw")

    print("Scaling data...")
    # NOTE: train and test are standardized independently here; fitting the scaler on
    # the training set only would keep both splits on the same scale.
    X_train = preprocessing.scale(X_train)
    X_test = preprocessing.scale(X_test)
    # plotData(X_train, y_train, "scaled")

    print("PCA with " + str(n_dims) + " components...")
    X_train_binary = np.delete(X_train, ind_numeric, axis=1)
    X_test_binary = np.delete(X_test, ind_numeric, axis=1)
    X_train_numeric = X_train[:, ind_numeric]
    X_test_numeric = X_test[:, ind_numeric]
    pca = PCA(n_components=n_dims)
    # pca = KernelPCA(n_components=n_dims, kernel="linear", n_jobs=-1)
    X1 = pca.fit_transform(X_train_binary)
    X2 = pca.transform(X_test_binary)
    X_train = np.hstack((X_train_numeric, X1))
    X_test = np.hstack((X_test_numeric, X2))
    print("Number of features: " + str(len(X_train[0])))
    # plotData(X_train, y_train, "PCA")

    '''
    print("Dimensionality reduction with an autoencoder...")
    hid = [50, 60, 50]
    X_train, X_test = autoencoder.fitTransform(X_train, X_test, 50, hid, bsize=32)
    print("Number of features: " + str(len(X_train[0])))
    '''
    '''
    print("Dimensionality reduction with an autoencoder...")
    hid = [250, 200, 150, 100, 50]
    X_train_binary = np.delete(X_train, ind_numeric, axis=1)
    X_test_binary = np.delete(X_test, ind_numeric, axis=1)
    X_train_numeric = X_train[:, ind_numeric]
    X_test_numeric = X_test[:, ind_numeric]
    X1, X2 = autoencoder.fitTransform(X_train_binary, X_test_binary, 30, hid, bsize=32)
    X_train = np.hstack((X_train_numeric, X1))
    X_test = np.hstack((X_test_numeric, X2))
    print("Number of features: " + str(len(X_train[0])))
    '''

    print("IPF...")
    X_train, y_train = IPF(X_train, y_train)
    print("Number of instances: " + str(len(X_train)))
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    # plotData(X_train, y_train, "IPF")

    '''
    print("Denoising autoencoder...")
    hid = [32, 16, 32]
    X_train, X_test = autoencoder_denoising.fitTransform(X_train, X_test, 250, hid,
                                                         bsize=32, kreg=None, areg=None)
    '''
    '''
    print("AllKNN...")
    X_train, y_train = AllKNN(n_neighbors=7, n_jobs=8).fit_resample(X_train, y_train)
    print("Number of instances: " + str(len(X_train)))
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    '''
    '''
    print("Feature selection...")
    feature_selector = SelectKBest(f_classif, k="all").fit(X_train, y_train)
    X_train = feature_selector.transform(X_train)
    X_test = feature_selector.transform(X_test)
    print("Number of features: " + str(len(X_train[0])))
    '''

    print("SMOTE...")
    X_train, y_train = SMOTE(sampling_strategy={
        "functional needs repair": 7500,
        "non functional": 22000
    }, random_state=123456789, n_jobs=20, k_neighbors=7).fit_resample(X_train, y_train)
    print("Number of instances: " + str(len(X_train)))
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    # plotData(X_train, y_train, "SMOTE")

    '''
    print("ADASYN...")
    X_train, y_train = ADASYN(sampling_strategy={"functional needs repair": 5000,
                                                 "non functional": 22500},
                              random_state=123456789, n_jobs=8,
                              n_neighbors=7).fit_resample(X_train, y_train)
    print("Number of instances: " + str(len(X_train)))
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    '''

    print("Cleaning anomalies...")
    ind_functional = np.where(y_train == "functional")[0]
    ind_non_functional = np.where(y_train == "non functional")[0]
    ind_functional_repair = np.where(y_train == "functional needs repair")[0]
    X1, y1 = cleanAnomalies(X_train[ind_functional], y_train[ind_functional])
    X2, y2 = cleanAnomalies(X_train[ind_non_functional], y_train[ind_non_functional])
    X3, y3 = cleanAnomalies(X_train[ind_functional_repair], y_train[ind_functional_repair])
    X_train = np.concatenate((X1, X2), axis=0)
    X_train = np.concatenate((X_train, X3), axis=0)
    y_train = np.concatenate((y1, y2), axis=0)
    y_train = np.concatenate((y_train, y3), axis=0)
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    # plotData(X_train, y_train, "anomalias_knn")

    '''
    print("EditedNearestNeighbours...")
    X_train, y_train = EditedNearestNeighbours(sampling_strategy="not minority",
                                               n_neighbors=15, n_jobs=20,
                                               kind_sel="mode").fit_resample(X_train, y_train)
    print("Number of instances: " + str(len(X_train)))
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    '''
    '''
    print("SSMA...")
    selector = SSMA(n_neighbors=1, alpha=0.95, max_loop=10,
                    initial_density=0.9).fit(X_train, y_train)
    X_train = selector.X_
    y_train = selector.y_
    print("Number of instances: " + str(len(X_train)))
    print("Instances per class:")
    print(np.unique(y_train, return_counts=True))
    '''
    '''
    print("Generating the metric with DML...")
    train_set, _, train_labels, _ = train_test_split(X_train, y_train, train_size=0.5,
                                                     random_state=123456789)
    print("Size of the original set: " + str(len(X_train)) +
          ", size of the train subset: " + str(len(train_set)))
    dml = KLMNN().fit(train_set, train_labels)
    X_train = dml.transform(X_train)
    X_test = dml.transform(X_test)
    '''

    return X_train, y_train, id_train, X_test, id_test
def main(index_exp, index_split):
    faulthandler.enable()
    torch.cuda.empty_cache()

    best_error = 100
    lr_step = [40, 70, 120]
    multiframe = ['convlstm', 'convfc']

    dirName = '%s_data%d_%s_%s_%s' % (args.model_name, args.data_cate, args.augmentation,
                                      args.loss_type, args.file_name)
    fileName = '%s_split%d_exp%d' % (dirName, index_split, index_exp)

    # Create folder for results of this model
    if not os.path.exists('./results/%s' % (dirName)):
        os.makedirs('./results/%s' % (dirName))

    # ------------- Wrap up dataloader -----------------
    if args.input_type == 'signal':
        X, Y_reg, C = raw_dataloader.read_data([1, 2, 3], list(range(11)),
                                               channel_limit=21, rm_baseline=True)
        num_channel = X.shape[1]
        num_feature = X.shape[2]  # Number of time samples

        # Remove trials
        X, Y_reg = preprocessing.remove_trials(X, Y_reg, threshold=60)

        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(
                X, Y_reg, test_size=0.1, random_state=23)
            # Random state 15: training error becomes lower, testing error becomes higher
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(X)):
                if i == index_exp:
                    train_data, train_target = X[train_index, :], Y_reg[train_index]
                    test_data, test_target = X[test_index, :], Y_reg[test_index]

        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(
                    train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(kf.split(train_data)):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], \
                                                   train_target[split_index]
                '''

        # Normalize the data
        if args.normalize:
            train_data, test_data = preprocessing.normalize(train_data, test_data)

        # Data augmentation
        if args.augmentation == 'overlapping':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation, (256, 64, 128))
            test_data, test_target = data_augmentation.aug(
                test_data, test_target, args.augmentation, (256, 64, 128))
        elif args.augmentation in ('add_noise', 'add_noise_minority'):
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation, (30, 1))
        elif args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation)

        # Scale data
        if args.scale_data:
            train_data = train_data.reshape((train_data.shape[0], -1))
            test_data = test_data.reshape((test_data.shape[0], -1))
            train_data, test_data = preprocessing.scale(train_data, test_data)
            train_data = train_data.reshape((train_data.shape[0], num_channel, -1))
            test_data = test_data.reshape((test_data.shape[0], num_channel, -1))

        if args.model_name in ['eegnet', 'eegnet_trans_signal']:
            # (sample, channel, time) -> (sample, channel_NN, channel_EEG, time)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_feature))
                                       for X in [train_data, test_data]]

        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
            torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset, test_dataset] = map(
            Data.TensorDataset,
            [train_dataTS.float(), test_dataTS.float()],
            [train_targetTS.float(), test_targetTS.float()])
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           shuffle=True)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        model_param = [train_data.shape]

    elif args.input_type == 'power':
        if args.data_cate == 1:
            ERSP_all, tmp_all, freqs = dataloader.load_data()
        elif args.data_cate == 2:
            data_file = './raw_data/ERSP_from_raw_%d_channel21.data' % (args.index_sub)
            with open(data_file, 'rb') as fp:
                dict_ERSP = pickle.load(fp)
            ERSP_all, tmp_all = dict_ERSP['ERSP'], dict_ERSP['SLs']
        num_channel = ERSP_all.shape[1]
        num_freq = ERSP_all.shape[2]

        # Remove trials
        ERSP_all, tmp_all = preprocessing.remove_trials(ERSP_all, tmp_all, threshold=60)

        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(
                ERSP_all, tmp_all[:, 2], test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(ERSP_all)):
                if i == index_exp:
                    train_data, test_data = ERSP_all[train_index, :], ERSP_all[test_index, :]
                    if args.data_cate == 2:
                        train_target, test_target = tmp_all[train_index], tmp_all[test_index]
                    else:
                        train_target, test_target = tmp_all[train_index, 2], tmp_all[test_index, 2]
                    if args.add_CE:
                        assert args.data_cate == 2
                        with open('./raw_data/CE_sub%d' % (args.index_sub), 'rb') as fp:
                            CE = pickle.load(fp)
                        CE_train, CE_test = CE[train_index, :], CE[test_index, :]
                        # PCA for CE
                        pca = PCA(n_components=10)
                        pca.fit(CE_train)
                        CE_train, CE_test = pca.transform(CE_train), pca.transform(CE_test)

        # Split data for ensemble methods
        if not args.ensemble:
            if args.num_split > 1:
                data_list, target_list = preprocessing.stratified_split(
                    train_data, train_target, n_split=args.num_split, mode=args.split_mode)
                train_data, train_target = data_list[index_split], target_list[index_split]
                '''
                kf = KFold(n_splits=args.num_split, shuffle=True, random_state=32)
                for i, (other_index, split_index) in enumerate(
                        kf.split(np.arange(len(train_data)))):
                    if i == index_split:
                        train_data, train_target = train_data[split_index, :], \
                                                   train_target[split_index]
                '''

        # Concatenate train and test for standardizing
        data = np.concatenate((train_data, test_data), axis=0)
        target = np.concatenate((train_target, test_target))

        # Standardize data
        num_train = len(train_data)
        data, target = preprocessing.standardize(data, target,
                                                 train_indices=np.arange(num_train),
                                                 threshold=0.0)
        data = data.reshape((data.shape[0], -1))

        # Scale the target between 0 and 1
        if args.post_scale:
            print('Scale the target between 0-1')
            target = target / 60

        # Split data
        train_data, test_data = data[:num_train, :], data[num_train:, :]
        train_target, test_target = target[:num_train], target[num_train:]

        # Data augmentation
        if args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, args.augmentation)

        # Center data
        if args.center_flag:
            train_data, test_data = preprocessing.center(train_data, test_data)

        # Scale data
        if args.scale_data:
            train_data, test_data = preprocessing.scale(train_data, test_data)

        # Add conditional entropy
        if args.add_CE:
            train_data = np.concatenate((train_data, CE_train), axis=1)
            # BUG in the original: the test set was concatenated with CE_train;
            # it must use the test-set entropies.
            test_data = np.concatenate((test_data, CE_test), axis=1)

        if args.model_name == 'eegnet_trans_power':
            # (sample, channel, freq) -> (sample, channel_NN, channel_EEG, freq)
            [train_data, test_data] = [X.reshape((X.shape[0], 1, num_channel, num_freq))
                                       for X in [train_data, test_data]]

        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
            torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset, test_dataset] = map(
            Data.TensorDataset,
            [train_dataTS.float(), test_dataTS.float()],
            [train_targetTS.float(), test_targetTS.float()])
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           shuffle=True)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)
        model_param = [train_data.shape]

    elif args.input_type == 'image':
        if args.ensemble:
            input_model_name = args.pre_model_name
        else:
            input_model_name = args.model_name
        assert (input_model_name in multiframe) == (args.num_time > 1)

        # Let input size be 224x224 if the model is vgg16 or resnet50
        if input_model_name in ['vgg16', 'resnet50']:
            input_size = 224
        else:
            input_size = 64

        # Load Data
        data_transforms = {
            'train': transforms.Compose([ndl.Rescale(input_size, args.num_time),
                                         ndl.ToTensor(args.num_time)]),
            'test': transforms.Compose([ndl.Rescale(input_size, args.num_time),
                                        ndl.ToTensor(args.num_time)])
        }

        print("Initializing Datasets and Dataloaders...")

        # Create training and testing datasets
        # image_datasets = {x: ndl.TopoplotLoader(args.image_folder, x, args.num_time,
        #                                         data_transforms[x], scale=args.scale_image,
        #                                         index_exp=index_exp, index_split=index_split)
        #                   for x in ['train', 'test']}
        [train_dataset, test_dataset] = [
            ndl.TopoplotLoader(args.image_folder, x, args.num_time, data_transforms[x],
                               scale=args.scale_image, index_exp=index_exp,
                               index_split=index_split)
            for x in ['train', 'test']]

        # Create training and testing dataloaders
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           shuffle=True, num_workers=4)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size,
                                          shuffle=False, num_workers=4)
        model_param = [input_size]

    elif args.input_type == 'EEGLearn_img':
        # Load data
        with open('./EEGLearn_imgs/data1.data', 'rb') as fp:
            dict_data = pickle.load(fp)
        data, target = dict_data['data'], dict_data['target']
        input_size = data.shape[2]

        # Split data for cross validation
        if args.num_fold == 1:
            train_data, test_data, train_target, test_target = train_test_split(
                data, target, test_size=0.1, random_state=23)
        else:
            kf = KFold(n_splits=args.num_fold, shuffle=True, random_state=23)
            for i, (train_index, test_index) in enumerate(kf.split(data)):
                if i == index_exp:
                    train_data, train_target = data[train_index, :], target[train_index]
                    test_data, test_target = data[test_index, :], target[test_index]

        (train_dataTS, train_targetTS, test_dataTS, test_targetTS) = map(
            torch.from_numpy, (train_data, train_target, test_data, test_target))
        [train_dataset, test_dataset] = map(
            Data.TensorDataset,
            [train_dataTS.float(), test_dataTS.float()],
            [train_targetTS.float(), test_targetTS.float()])
        if not args.str_sampling:
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           shuffle=True)
            test_loader = Data.DataLoader(test_dataset, batch_size=args.batch_size)

    # ------------ Create model ---------------
    if args.input_type in ['image', 'EEGLearn_img']:
        model_param = [input_size]
    else:
        model_param = [train_data.shape]

    if not args.ensemble:
        model = read_model(args.model_name, model_param)
    else:
        pre_models = []
        for i in range(args.num_split):
            pre_model = read_model(args.pre_model_name, model_param)
            pre_model.load_state_dict(
                torch.load('%s/last_model_exp%d_split%d.pt' % (args.ensemble, index_exp, i)))
            set_parameter_requires_grad(pre_model, True)
            pre_models.append(pre_model)
        model = models.__dict__[args.model_name](pre_models)
    print('Use model %s' % (args.model_name))

    # Run on GPU
    model = model.to(device=device)
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)

    # Define loss function (criterion) and optimizer
    if args.loss_type == 'L2':
        criterion = nn.MSELoss().to(device=device)
    elif args.loss_type == 'L1':
        criterion = nn.L1Loss().to(device=device)
    elif args.loss_type == 'L4':
        criterion = L4Loss
    elif args.loss_type == 'MyLoss':
        criterion = MyLoss
    print('Use %s loss' % (args.loss_type))

    optimizer = torch.optim.SGD(model.parameters(), lr=args.lr_rate, momentum=0.9)
    # optimizer = torch.optim.Adam(model.parameters(), lr=args.lr_rate)

    # Record loss and accuracy of each epoch
    dict_error = {'train_std': list(range(args.num_epoch)),
                  'test_std': list(range(args.num_epoch)),
                  'train_mape': list(range(args.num_epoch)),
                  'test_mape': list(range(args.num_epoch))}

    # Optionally evaluate the trained model
    if args.evaluate:
        if args.resume:
            if os.path.isfile(args.resume):
                model.load_state_dict(torch.load(args.resume))
        _, target, pred, _, _ = validate(test_loader, model, criterion)
        plot_scatter(target, pred, dirName, fileName)
        return 0

    # Optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_error = checkpoint['best_error']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            dict_error['train_std'][:args.start_epoch] = checkpoint['dict_error']['train_std']
            dict_error['test_std'][:args.start_epoch] = checkpoint['dict_error']['test_std']
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume,
                                                                checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ------------- Train model ------------------
    for epoch in range(args.start_epoch, args.num_epoch):
        # Create dataloader if using a stratified sampler
        if args.str_sampling:
            sampler = SubsetRandomSampler(
                get_indices_RSS(train_target, int(0.5 * len(train_target))))
            train_loader = Data.DataLoader(train_dataset, batch_size=args.batch_size,
                                           sampler=sampler, num_workers=4)

        # Learning rate decay
        if epoch in lr_step:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= 0.1

        # Train for one epoch
        _, dict_error['train_std'][epoch], dict_error['train_mape'][epoch] = \
            train(train_loader, model, criterion, optimizer, epoch)

        # Evaluate on the validation set
        _, _, _, std_error, dict_error['test_mape'][epoch] = validate(test_loader, model,
                                                                      criterion)
        dict_error['test_std'][epoch] = std_error

        # Remember the best standard error and save a checkpoint
        is_best = std_error < best_error
        best_error = min(std_error, best_error)
        save_checkpoint({
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'best_error': best_error,
            'optimizer': optimizer.state_dict(),
            'dict_error': dict_error
        }, is_best)

        # Save the best and last models
        if is_best:
            torch.save(model.state_dict(),
                       './results/%s/best_model_exp%d_split%d.pt'
                       % (dirName, index_exp, index_split))
        if epoch == args.num_epoch - 1:
            torch.save(model.state_dict(),
                       './results/%s/last_model_exp%d_split%d.pt'
                       % (dirName, index_exp, index_split))

    # Plot error curve
    plot_error(dict_error, dirName, fileName)

    # Plot scatter plots
    _, target, pred, _, _ = validate(test_loader, model, criterion)
    plot_scatter(target, pred, dirName, fileName)
    dict_error['target'], dict_error['pred'] = target, pred

    # Plot histogram
    import matplotlib.pyplot as plt
    plt.hist(target, label='True')
    plt.hist(pred, label='Pred')
    plt.legend(loc='upper right')
    plt.savefig('./results/hist.png')

    # Save error over epochs
    with open('./results/%s/%s.data' % (dirName, fileName), 'wb') as fp:
        pickle.dump(dict_error, fp)
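# Several snippets in this file call a project-specific two-array
# preprocessing.scale(train_data, test_data) that returns both splits scaled with
# statistics fitted on the training split only. A minimal sketch of that assumed
# contract using sklearn (the project module may differ):
from sklearn.preprocessing import StandardScaler

def scale(train_data, test_data):
    """Standardize both splits using mean/std fitted on the training split."""
    scaler = StandardScaler().fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)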
def get(self):
    company = request.args.get('company')
    compare = request.args.get('compare')
    start = request.args.get('start')
    end = request.args.get('end')

    df = yf.download(company, start, end)
    close_px = df['Adj Close']
    mavg = close_px.rolling(window=100).mean()
    print(mavg)
    print(df.head())
    print(df.tail())

    import matplotlib.pyplot as plt
    from matplotlib import style

    # Adjusting the size of matplotlib
    import matplotlib as mpl
    mpl.rc('figure', figsize=(8, 7))

    # Adjusting the style of matplotlib
    style.use('ggplot')

    close_px.plot(label=company)
    mavg.plot(label='mavg')
    plt.legend()
    plt.savefig('mavg.png', bbox_inches='tight')
    plt.clf()

    rets = close_px / close_px.shift(1) - 1
    rets.plot(label='return')
    plt.savefig('return.png', bbox_inches='tight')
    plt.clf()

    dfcomp = yf.download(['AAPL', 'GE', 'GOOG', 'IBM', 'MSFT'], start, end)['Adj Close']
    print(dfcomp.tail())
    retscomp = dfcomp.pct_change()
    corr = retscomp.corr()
    # cols = [col for col in retscomp.columns if compare in col]
    # print(retscomp[cols])
    print(corr)

    plt.scatter(retscomp[company], retscomp[compare])
    plt.xlabel('Returns-' + company)
    plt.ylabel('Returns-' + compare)
    plt.savefig('compare.png', bbox_inches='tight')
    plt.clf()

    # pd.scatter_matrix was removed from pandas; use pandas.plotting.scatter_matrix instead:
    # pd.plotting.scatter_matrix(retscomp, diagonal='kde', figsize=(10, 10))

    plt.imshow(corr, cmap='hot', interpolation='none')
    plt.colorbar()
    plt.xticks(range(len(corr)), corr.columns)
    plt.yticks(range(len(corr)), corr.columns)
    plt.savefig('correlation.png', bbox_inches='tight')
    plt.clf()

    plt.scatter(retscomp.mean(), retscomp.std())
    plt.xlabel('Expected returns')
    plt.ylabel('Risk')
    for label, x, y in zip(retscomp.columns, retscomp.mean(), retscomp.std()):
        plt.annotate(label, xy=(x, y), xytext=(20, -20),
                     textcoords='offset points', ha='right', va='bottom',
                     bbox=dict(boxstyle='round,pad=0.5', fc='yellow', alpha=0.5),
                     arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0'))
    plt.savefig('risk-ret-rate.png', bbox_inches='tight')
    plt.clf()

    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    dfreg['HL_PCT'] = (df['High'] - df['Close']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    import math
    import numpy as np
    from sklearn import preprocessing, svm
    from sklearn.model_selection import train_test_split

    # Drop missing values
    dfreg.fillna(value=-99999, inplace=True)
    print(dfreg.shape)

    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))

    # Separating the label here; we want to predict the Adj Close
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(columns=['label']))

    # Scale X so that all features share the same distribution for linear regression
    X = preprocessing.scale(X)

    # Find the data series of late X (forecast window) and early X (train)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]
    print('Dimension of X', X.shape)
    print('Dimension of y', y.shape)

    # Separate training and testing sets with train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    from sklearn.linear_model import LinearRegression, Ridge
    from sklearn.neighbors import KNeighborsRegressor
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import make_pipeline

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)

    # Quadratic regression (degree 2)
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic regression (degree 3)
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    print("The linear regression confidence is ", confidencereg)
    print("The quadratic regression 2 confidence is ", confidencepoly2)
    print("The quadratic regression 3 confidence is ", confidencepoly3)
    print("The knn regression confidence is ", confidenceknn)

    # Printing the forecast
    forecast_set = clfreg.predict(X_lately)
    dfreg['Forecast'] = np.nan
    print(forecast_set, confidencereg, forecast_out)

    last_date = dfreg.iloc[-1].name
    next_unix = last_date + datetime.timedelta(days=1)
    for i in forecast_set:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns) - 1)] + [i]

    dfreg['Adj Close'].tail(500).plot()
    dfreg['Forecast'].tail(500).plot()
    plt.legend(loc=4)
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.savefig('forecast.png', bbox_inches='tight')
    plt.clf()

    from scipy.stats import norm

    # Monte Carlo simulation of future prices
    result = []

    # Define variables
    S = yf.download(company, start, end)['Adj Close'][-1]  # last available real stock price
    T = 50  # number of trading days
    days = (df.index[-1] - df.index[0]).days
    cagr = ((df['Adj Close'][-1] / df['Adj Close'][1]) ** (365.0 / days)) - 1
    mu = cagr  # expected return
    df['Returns'] = df['Adj Close'].pct_change()
    vol = df['Returns'].std() * math.sqrt(252)  # annualized volatility

    # Choose the number of runs to simulate (100 here)
    for i in range(100):
        # Create a list of daily returns using a random normal distribution
        daily_returns = np.random.normal(mu / T, vol / math.sqrt(T), T) + 1

        # Set the starting price and build the price series from the random daily returns
        price_list = [S]
        for x in daily_returns:
            price_list.append(price_list[-1] * x)

        # Plot data from each individual run
        plt.plot(price_list)

        # Append the ending value of each simulated run
        result.append(price_list[-1])

    # Save the plot of the simulated price series
    plt.savefig('monte.png', bbox_inches='tight')
    plt.clf()

    # Create a histogram of the ending stock values for the multiple simulations
    plt.hist(result, bins=50)
    plt.savefig('histo.png', bbox_inches='tight')
    plt.clf()

    # Mean of the simulated ending values
    print(round(np.mean(result), 2))
def CV(X, Y, S, D, classical):
    n_splits = 10  # Cross validation (mixed subjects), 10 splits
    dict_error = {x: [AverageMeter() for i in range(n_splits)]
                  for x in ['train_std', 'val_std', 'test_std',
                            'train_mape', 'val_mape', 'test_mape']}
    log_all = []
    start_time = time.time()

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=23)
    for i_exp, (train_index, test_index) in enumerate(kf.split(X)):
        print('----- [%.1f] Exp %d -----' % (time.time() - start_time, i_exp))

        # Wrap up training and testing data
        train_data, test_data = X[train_index, :], X[test_index, :]
        train_target, test_target = Y[train_index], Y[test_index]
        train_sub, test_sub = S[train_index], S[test_index]
        train_diff, test_diff = D[train_index], D[test_index]

        # Split training data into training and validation data
        train_data, val_data, train_sub, val_sub, train_diff, val_diff, \
            train_target, val_target = train_test_split(
                train_data, train_sub, train_diff, train_target,
                test_size=1 / 9, random_state=32)
        print('Number of (train, val, test): (%d,%d,%d)'
              % (len(train_data), len(val_data), len(test_data)))

        # Flatten the data
        if classical:
            [train_data, val_data, test_data] = [
                x.reshape((x.shape[0], -1)) for x in [train_data, val_data, test_data]
            ]

        # Select ERSP features correlated with SLs
        if args.SCF:
            train_data, test_data, select_indices = preprocessing.select_correlated_features(
                train_data, train_target, test_data, num_features=args.SCF)
            val_data = val_data[:, select_indices == 1]

        # Data augmentation
        if args.augmentation == 'SMOTER':
            train_data, train_target = data_augmentation.aug(
                train_data, train_target, method=args.augmentation)

        # PCA
        if args.PCA:
            pca = PCA(n_components=200)
            pca.fit(train_data)
            train_data = pca.transform(train_data)
            val_data = pca.transform(val_data)
            test_data = pca.transform(test_data)
            # train_data, test_data = preprocessing.PCA_corr(train_data, train_target,
            #                                                test_data, num_features=10)

        # Add subject ID and difficulty level as features
        if args.add_sub_diff:
            # One-hot encode subject ID and difficulty level
            train_sub = onehot_encode(train_sub, 11)
            val_sub = onehot_encode(val_sub, 11)
            test_sub = onehot_encode(test_sub, 11)
            train_diff = onehot_encode(train_diff, 3)
            val_diff = onehot_encode(val_diff, 3)
            test_diff = onehot_encode(test_diff, 3)

            # Standardize data (min-max), fitting on the training split
            _, test_data = preprocessing.scale(train_data, test_data, mode='minmax')
            train_data, val_data = preprocessing.scale(train_data, val_data, mode='minmax')

            # Concatenate subject and difficulty
            train_data = np.concatenate((train_data, train_sub, train_diff), axis=1)
            val_data = np.concatenate((val_data, val_sub, val_diff), axis=1)
            test_data = np.concatenate((test_data, test_sub, test_diff), axis=1)

        # Regression
        if classical:
            train_pred, val_pred, test_pred = classical_regression(
                train_data, val_data, test_data, train_target)

            # Record error and prediction
            train_std = mean_squared_error(train_target, train_pred) ** 0.5
            val_std = mean_squared_error(val_target, val_pred) ** 0.5
            test_std = mean_squared_error(test_target, test_pred) ** 0.5
            train_mape = mean_absolute_percentage_error(train_target, train_pred)
            val_mape = mean_absolute_percentage_error(val_target, val_pred)
            test_mape = mean_absolute_percentage_error(test_target, test_pred)
            print('Split %d Std: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)'
                  % (i_exp, train_std, val_std, test_std,
                     train_mape, val_mape, test_mape))
            # test_pred_all[curr_test_index:curr_test_index+len(test_index)] = test_pred
            # test_target_all[curr_test_index:curr_test_index+len(test_index)] = test_target
            # test_pred_all = np.concatenate((test_pred_all, test_pred))
            # test_target_all = np.concatenate((test_target_all, test_target))
        else:
            train_std, val_std, test_std, train_mape, val_mape, test_mape = \
                deep_regression(train_data, val_data, test_data,
                                train_target, val_target, test_target,
                                train_sub, val_sub, test_sub, -1, i_exp)

        dict_error['train_std'][i_exp].update(train_std, len(train_data))
        dict_error['val_std'][i_exp].update(val_std, len(val_data))
        dict_error['test_std'][i_exp].update(test_std, len(test_data))
        dict_error['train_mape'][i_exp].update(train_mape, len(train_data))
        dict_error['val_mape'][i_exp].update(val_mape, len(val_data))
        dict_error['test_mape'][i_exp].update(test_mape, len(test_data))

        log_sub = 'Exp%d\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
            i_exp,
            dict_error['train_std'][i_exp].avg, dict_error['val_std'][i_exp].avg,
            dict_error['test_std'][i_exp].avg, dict_error['train_mape'][i_exp].avg,
            dict_error['val_mape'][i_exp].avg, dict_error['test_mape'][i_exp].avg)
        print(log_sub)
        log_all.append(log_sub)

        if classical:
            evaluate_result.plot_scatter(train_target, train_pred, dirName=args.dirName,
                                         fileName='%s_sub%d_train' % (args.dirName, i_exp))

    log_total = 'Total\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
        avg_list(dict_error['train_std']), avg_list(dict_error['val_std']),
        avg_list(dict_error['test_std']), avg_list(dict_error['train_mape']),
        avg_list(dict_error['val_mape']), avg_list(dict_error['test_mape']))
    print(log_total)
    log_all.append(log_total)

    return log_all, dict_error
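# onehot_encode used by CV (and LOSO below) is not shown. A minimal sketch consistent
# with its call sites, assuming integer class labels in [0, n_classes):
import numpy as np

def onehot_encode(labels, n_classes):
    """Integer labels -> one-hot matrix with n_classes columns."""
    out = np.zeros((len(labels), n_classes))
    out[np.arange(len(labels)), labels.astype(int)] = 1
    return out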
def LOSO(X, Y, S, D, classical):
    # Leave one subject out
    dict_error = {x: [AverageMeter() for i in range(11)]
                  for x in ['train_std', 'val_std', 'test_std',
                            'train_mape', 'val_mape', 'test_mape']}
    log_all = []
    start_time = time.time()

    for i_base in range(11):
        print('----- [%.1f] Subject %d -----' % (time.time() - start_time, i_base))
        lst_model = LSTransform.LST(11, i_base)

        indices_base, indices_other = np.where(S == i_base)[0], np.where(S != i_base)[0]
        base_data, base_target, base_sub, base_diff = \
            X[indices_base, :], Y[indices_base], S[indices_base], D[indices_base]
        other_data, other_target, other_sub, other_diff = \
            X[indices_other, :], Y[indices_other], S[indices_other], D[indices_other]
        test_pred_all, test_target_all = np.array([]), np.array([])

        # K-fold cross validation (all test data are in one subject)
        kf = KFold(n_splits=5, shuffle=True, random_state=23)
        for i_split, (more_index, few_index) in enumerate(kf.split(base_data)):
            print('--- [%.1f] Split %d ---' % (time.time() - start_time, i_split))

            # Wrap up training and testing data
            train_index, test_index = few_index, more_index
            train_data = np.concatenate((base_data[train_index, :], other_data), axis=0)
            test_data = base_data[test_index, :]
            train_target = np.concatenate((base_target[train_index], other_target), axis=0)
            test_target = base_target[test_index]
            train_sub = np.concatenate((base_sub[train_index], other_sub), axis=0)
            test_sub = base_sub[test_index]
            train_diff = np.concatenate((base_diff[train_index], other_diff), axis=0)
            test_diff = base_diff[test_index]

            # Split training data into training and validation data
            train_data, val_data, train_sub, val_sub, train_diff, val_diff, \
                train_target, val_target = train_test_split(
                    train_data, train_sub, train_diff, train_target,
                    test_size=1 / 9, random_state=32)
            print('Number of (train, val, test): (%d,%d,%d)'
                  % (len(train_data), len(val_data), len(test_data)))

            if args.LST:
                # LST for training data
                lst_model.fit_(train_data, train_target, train_sub)
                train_data = lst_model.transform_(train_data, train_target, train_sub,
                                                  args.num_closest, args.dist_type)
                val_data = lst_model.transform_(val_data, val_target, val_sub,
                                                args.num_closest, args.dist_type)

            if args.SS:
                # Source separation
                print('Apply source separation for time signal...')
                SS_model = source_separation.SourceSeparation(train_data.shape[1], 11)
                SS_model.fit(train_data, train_sub)
                train_data = SS_model.transform(train_data, train_sub)
                val_data = SS_model.transform(val_data, val_sub)
                test_data = SS_model.transform(test_data, test_sub)

            # Flatten the data
            if classical:
                [train_data, val_data, test_data] = [
                    x.reshape((x.shape[0], -1)) for x in [train_data, val_data, test_data]
                ]

            # Select ERSP features correlated with SLs
            if args.SCF:
                train_data, test_data, select_indices = \
                    preprocessing.select_correlated_features(
                        train_data, train_target, test_data, num_features=args.SCF)
                val_data = val_data[:, select_indices == 1]

            # Data augmentation
            if args.augmentation == 'SMOTER':
                train_data, train_target = data_augmentation.aug(
                    train_data, train_target, method=args.augmentation)

            # PCA
            if args.PCA:
                pca = PCA(n_components=200)
                pca.fit(train_data)
                train_data = pca.transform(train_data)
                val_data = pca.transform(val_data)
                test_data = pca.transform(test_data)
                # train_data, test_data = preprocessing.PCA_corr(train_data, train_target,
                #                                                test_data, num_features=10)

            # Add subject ID and difficulty level as features
            if args.add_sub_diff:
                # One-hot encode subject ID and difficulty level
                train_sub = onehot_encode(train_sub, 11)
                val_sub = onehot_encode(val_sub, 11)
                test_sub = onehot_encode(test_sub, 11)
                train_diff = onehot_encode(train_diff, 3)
                val_diff = onehot_encode(val_diff, 3)
                test_diff = onehot_encode(test_diff, 3)

                # Standardize data (min-max), fitting on the training split
                _, test_data = preprocessing.scale(train_data, test_data, mode='minmax')
                train_data, val_data = preprocessing.scale(train_data, val_data,
                                                           mode='minmax')

                # Concatenate subject and difficulty
                train_data = np.concatenate((train_data, train_sub, train_diff), axis=1)
                val_data = np.concatenate((val_data, val_sub, val_diff), axis=1)
                test_data = np.concatenate((test_data, test_sub, test_diff), axis=1)

            # Regression
            if classical:
                train_pred, val_pred, test_pred = classical_regression(
                    train_data, val_data, test_data, train_target)

                # Record error and prediction
                train_std = mean_squared_error(train_target, train_pred) ** 0.5
                val_std = mean_squared_error(val_target, val_pred) ** 0.5
                test_std = mean_squared_error(test_target, test_pred) ** 0.5
                train_mape = mean_absolute_percentage_error(train_target, train_pred)
                val_mape = mean_absolute_percentage_error(val_target, val_pred)
                test_mape = mean_absolute_percentage_error(test_target, test_pred)
                print('Split %d Std: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)'
                      % (i_split, train_std, val_std, test_std,
                         train_mape, val_mape, test_mape))
                # test_pred_all[curr_test_index:curr_test_index+len(test_index)] = test_pred
                # test_target_all[curr_test_index:curr_test_index+len(test_index)] = test_target
                test_pred_all = np.concatenate((test_pred_all, test_pred))
                test_target_all = np.concatenate((test_target_all, test_target))
            else:
                train_std, val_std, test_std, train_mape, val_mape, test_mape = \
                    deep_regression(train_data, val_data, test_data,
                                    train_target, val_target, test_target,
                                    train_sub, val_sub, test_sub, i_base, i_split)

            dict_error['train_std'][i_base].update(train_std, len(train_data))
            dict_error['val_std'][i_base].update(val_std, len(val_data))
            dict_error['test_std'][i_base].update(test_std, len(test_data))
            dict_error['train_mape'][i_base].update(train_mape, len(train_data))
            dict_error['val_mape'][i_base].update(val_mape, len(val_data))
            dict_error['test_mape'][i_base].update(test_mape, len(test_data))

        log_sub = 'Sub%2d\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
            i_base,
            dict_error['train_std'][i_base].avg, dict_error['val_std'][i_base].avg,
            dict_error['test_std'][i_base].avg, dict_error['train_mape'][i_base].avg,
            dict_error['val_mape'][i_base].avg, dict_error['test_mape'][i_base].avg)
        print(log_sub)
        log_all.append(log_sub)

        if classical:
            evaluate_result.plot_scatter(train_target, train_pred, dirName=args.dirName,
                                         fileName='%s_sub%d_train' % (args.dirName, i_base))
            # evaluate_result.plot_scatter(test_target_all, test_pred_all,
            #                              dirName=args.dirName,
            #                              fileName='%s_sub%d' % (args.dirName, i_base))

    log_total = 'Total\t\tStd: (%.1f,%.1f,%.1f), MAPE: (%.1f,%.1f,%.1f)\n' % (
        avg_list(dict_error['train_std']), avg_list(dict_error['val_std']),
        avg_list(dict_error['test_std']), avg_list(dict_error['train_mape']),
        avg_list(dict_error['val_mape']), avg_list(dict_error['test_mape']))
    print(log_total)
    log_all.append(log_total)

    return log_all, dict_error
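# CV and LOSO call preprocessing.scale with a mode='minmax' keyword, extending the
# two-array contract sketched earlier. A minimal sketch of that assumed variant
# (the project module may differ):
from sklearn.preprocessing import MinMaxScaler, StandardScaler

def scale(train_data, other_data, mode='standardize'):
    """Fit a scaler on train_data and apply it to both arrays."""
    scaler = MinMaxScaler() if mode == 'minmax' else StandardScaler()
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(other_data)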
def get(self):
    company = request.args.get('company')
    # compare = request.args.get('compare')
    startDate = request.args.get('start')
    endDate = request.args.get('end')

    df = yf.download(company, startDate, endDate)
    close_px = df['Adj Close']
    mavg = close_px.rolling(window=100).mean()
    print(mavg)
    print(df.head())
    print(df.tail())

    # Adjusting the size and style of matplotlib
    mpl.rc('figure', figsize=(8, 7))
    style.use('ggplot')

    close_px.plot(label=company)
    mavg.plot(label='mavg')
    plt.legend()
    plt.savefig('mavg.png', bbox_inches='tight')
    plt.clf()

    rets = close_px / close_px.shift(1) - 1
    rets.plot(label='return')
    plt.savefig('return.png', bbox_inches='tight')
    plt.clf()

    dfreg = df.loc[:, ['Adj Close', 'Volume']]
    dfreg['HL_PCT'] = (df['High'] - df['Close']) / df['Close'] * 100.0
    dfreg['PCT_change'] = (df['Close'] - df['Open']) / df['Open'] * 100.0

    # Drop missing values
    dfreg.fillna(value=-99999, inplace=True)
    print(dfreg.shape)

    # We want to separate 1 percent of the data to forecast
    forecast_out = int(math.ceil(0.01 * len(dfreg)))

    # Separating the label here; we want to predict the Adj Close
    forecast_col = 'Adj Close'
    dfreg['label'] = dfreg[forecast_col].shift(-forecast_out)
    X = np.array(dfreg.drop(columns=['label']))

    # Scale X so that all features share the same distribution for linear regression
    X = preprocessing.scale(X)

    # Find the data series of late X (forecast window) and early X (train)
    X_lately = X[-forecast_out:]
    X = X[:-forecast_out]

    # Separate the label and identify it as y
    y = np.array(dfreg['label'])
    y = y[:-forecast_out]
    print('Dimension of X', X.shape)
    print('Dimension of y', y.shape)

    # Separate training and testing sets with train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # Linear regression
    clfreg = LinearRegression(n_jobs=-1)
    clfreg.fit(X_train, y_train)

    # Quadratic regression (degree 2)
    clfpoly2 = make_pipeline(PolynomialFeatures(2), Ridge())
    clfpoly2.fit(X_train, y_train)

    # Quadratic regression (degree 3)
    clfpoly3 = make_pipeline(PolynomialFeatures(3), Ridge())
    clfpoly3.fit(X_train, y_train)

    # KNN regression
    clfknn = KNeighborsRegressor(n_neighbors=2)
    clfknn.fit(X_train, y_train)

    confidencereg = clfreg.score(X_test, y_test)
    confidencepoly2 = clfpoly2.score(X_test, y_test)
    confidencepoly3 = clfpoly3.score(X_test, y_test)
    confidenceknn = clfknn.score(X_test, y_test)
    print("The linear regression confidence is ", confidencereg)
    print("The quadratic regression 2 confidence is ", confidencepoly2)
    print("The quadratic regression 3 confidence is ", confidencepoly3)
    print("The knn regression confidence is ", confidenceknn)

    # Printing the forecast (note: the forecast comes from the KNN model here,
    # while the printed confidence belongs to the linear regression)
    forecast_set = clfknn.predict(X_lately)
    dfreg['Forecast'] = np.nan
    print(forecast_set, confidencereg, forecast_out)

    last_date = dfreg.iloc[-1].name
    next_unix = last_date + datetime.timedelta(days=1)
    for i in forecast_set:
        next_date = next_unix
        next_unix += datetime.timedelta(days=1)
        dfreg.loc[next_date] = [np.nan for _ in range(len(dfreg.columns) - 1)] + [i]

    dfreg['Adj Close'].tail(500).plot()
    dfreg['Forecast'].tail(500).plot()
    plt.legend(loc=4)
    plt.xlabel('Date')
    plt.ylabel('Price')
    plt.savefig('forecast.png', bbox_inches='tight')
    plt.clf()

    val = {'lastValue': forecast_set[forecast_out - 1]}
    fore = json.dumps(val)
    foreval = json.loads(fore)
    return foreval
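# Note: in both stock snippets above, sklearn's preprocessing.scale standardizes the full
# feature matrix before the train/test split, so test-set statistics leak into training.
# A leak-free sketch fits the scaler on the training split only (illustrative data below,
# not the snippets' dfreg matrix):
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_demo = np.random.rand(100, 4)  # stand-in for the dfreg feature matrix
y_demo = np.random.rand(100)
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=0.2)
scaler = StandardScaler().fit(X_tr)  # statistics from the training split only
X_tr, X_te = scaler.transform(X_tr), scaler.transform(X_te)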
def main(): #14:name names_dataset = dataset = pd.read_csv('dataset.csv', encoding="latin1", usecols=(14, )) #5:gender, 6:gender_confidence, 8:confidence in profile, 10:description, 11:no of favourited tweets, #13:link color, 17:retweet count, 18:sidebar color, 19:tweet text, 21:tweet count dataset = pd.read_csv('dataset.csv', encoding="latin1", usecols=(5, 6, 8, 10, 11, 13, 17, 18, 19, 21)) #words = pd.read_csv('manually_filtered_stats.csv', encoding = "latin1", usecols = (0,)) #divide into dependent and independent variables #6:gender_confidence, 8:confidence in profile, 10:description, 11:no of favourited tweets, #13:link color, 17:retweet count, 18:sidebar color, 19:tweet text, 21:tweet count X = dataset.iloc[:, 1:] #5:gender y = dataset.iloc[:, 0] #10:description, 19:tweet text description_and_tweet = pd.read_csv('dataset.csv', encoding="latin1", usecols=(10, 19)) description_and_tweet = description_and_tweet.replace(np.nan, '', regex=True) x1 = description_and_tweet.iloc[:, 0].values x2 = description_and_tweet.iloc[:, 1].values description_and_tweet_combined = x1 + ' ' + x2 #swap # of favorite tweets and link_color column #link_color_col = numpy.copy(X[:, 1]) #X[:, 1] = X[:, 0] #X[:, 0] = link_color_col #swap # of favorite tweets and sidebar_color column #sidebar_color_col = numpy.copy(X[:, 3]) #X[:, 3] = X[:, 1] #X[:, 1] = sidebar_color_col #Might need to be updated/reviewed because of change of columns stats.stats(X, y) X, y = preprocessing.preprocessData(X.values, y.values, names_dataset.values, description_and_tweet_combined) """ #84 columns X_feature_names = [ 16 dummy features extracted from link color 0-15 16 dummy features extracted from sidebar color 16-31 '# of favorite tweets', 32 '# of retweets', 33 '# of tweets', 34 5 features extracted from description, 35-39 5 features extracted from tweet text, 40-44 3 features extracted from name (dummy), 45-47 36 features extracted from tweet text 48-83 ] """ #Might need to be updated/reviewed because of change of columns #It doesn't seem it is affected by order of columns, but with dummy variables, it might generate too many plots feature_names = [ 'lk_red', 'lk_red-orange', 'lk_orange-brown', 'lk_orange-yellow', 'lk_yellow', 'lk_yellow-green', 'lk_green', 'lk_green-cyan', 'lk_cyan', 'lk_cyan-blue', 'lk_blue', 'lk_blue-magenta', 'lk_magenta', 'lk_magenta-pink', 'lk_pink', 'lk_pink-red', 'sb_red', 'sb_red-orange', 'sb_orange-brown', 'sb_orange-yellow', 'sb_yellow', 'sb_yellow-green', 'sb_green', 'sb_green-cyan', 'sb_cyan', 'sb_cyan-blue', 'sb_blue', 'sb_blue-magenta', 'sb_magenta', 'sb_magenta-pink', 'sb_pink', 'sb_pink-red', '# of favorite tweets', '# of retweets', '# of tweets', '# of hashtags in description', 'URLs present in description', '# of emoticons used in description', 'length of description', '# of mentions in description', '# of hashtags in tweet text', 'URLs present in tweet text', '# of emoticons used in tweet text', 'length of tweet text', '# of mentions in tweet text', 'feature 1 from name', 'feature 2 from name', 'feature 3 from name', 'women word_freq', 'bitch word_freq', 'nation word_freq', 'tec word_freq', 'season word_freq', 'hair word_freq', 'dad word_freq', 'player word_freq', 'cat word_freq', 'polit word_freq', 'blogger word_freq', 'radio word_freq', 'pushawardslizquen word_freq', 'boy word_freq', 'author word_freq', 'footbal word_freq', 'kid word_freq', 'travel word_freq', 'social word_freq', 'heart word_freq', 'vote word_freq', 'food word_freq', 'guy word_freq', 'beauti word_freq', 'lover word_freq', 'via 
    # Might need to be updated/reviewed because of the change of columns.
    # The plots do not seem to be affected by the order of columns, but with
    # dummy variables this may generate too many plots.
    feature_names = [
        'lk_red', 'lk_red-orange', 'lk_orange-brown', 'lk_orange-yellow',
        'lk_yellow', 'lk_yellow-green', 'lk_green', 'lk_green-cyan',
        'lk_cyan', 'lk_cyan-blue', 'lk_blue', 'lk_blue-magenta',
        'lk_magenta', 'lk_magenta-pink', 'lk_pink', 'lk_pink-red',
        'sb_red', 'sb_red-orange', 'sb_orange-brown', 'sb_orange-yellow',
        'sb_yellow', 'sb_yellow-green', 'sb_green', 'sb_green-cyan',
        'sb_cyan', 'sb_cyan-blue', 'sb_blue', 'sb_blue-magenta',
        'sb_magenta', 'sb_magenta-pink', 'sb_pink', 'sb_pink-red',
        '# of favorite tweets', '# of retweets', '# of tweets',
        '# of hashtags in description', 'URLs present in description',
        '# of emoticons used in description', 'length of description',
        '# of mentions in description', '# of hashtags in tweet text',
        'URLs present in tweet text', '# of emoticons used in tweet text',
        'length of tweet text', '# of mentions in tweet text',
        'feature 1 from name', 'feature 2 from name', 'feature 3 from name',
        'women word_freq', 'bitch word_freq', 'nation word_freq', 'tec word_freq',
        'season word_freq', 'hair word_freq', 'dad word_freq', 'player word_freq',
        'cat word_freq', 'polit word_freq', 'blogger word_freq', 'radio word_freq',
        'pushawardslizquen word_freq', 'boy word_freq', 'author word_freq',
        'footbal word_freq', 'kid word_freq', 'travel word_freq', 'social word_freq',
        'heart word_freq', 'vote word_freq', 'food word_freq', 'guy word_freq',
        'beauti word_freq', 'lover word_freq', 'via word_freq', 'writer word_freq',
        'artist word_freq', 'man word_freq', 'sport word_freq', 'f**k word_freq',
        'girl word_freq', 'fan word_freq', 'game word_freq', 'love word_freq',
        'weather word_freq'
    ]

    # Indices of the top features selected by RFE on a previous run
    index_temp = [0, 8, 11, 12, 14, 15, 16, 22, 24, 25, 32, 34, 35, 38, 44,
                  46, 47, 48, 53, 60, 63, 69, 71, 72, 76, 77, 79, 80, 81, 82]

    print("first line: ", X[0, :])
    plotting.plot(X, y, feature_names, index_temp)

    # Might need to be updated/reviewed because of the change of columns.
    # This scales the entire dataset; it should only be applied to the
    # continuous variables.
    X = preprocessing.scale(X)

    # Select the top features using Recursive Feature Elimination
    # (not affected by the order of columns)
    top_features = tuning.postModelStats(X, y)
    #print("top_features:", top_features)  # matched index_temp on a previous run
    X = X[:, top_features]

    # After preprocessing: split the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

    # Each run takes roughly 10 to 15 minutes, given the number of features
    # used in the model
    process('Support Vector Classifier - RBF Kernel', 'svc_rbf',
            X, y, X_train, y_train, X_test, y_test)
    process('Logistic Regression', 'lr',
            X, y, X_train, y_train, X_test, y_test)
    process('Support Vector Classifier - Linear Kernel', 'svc_linear',
            X, y, X_train, y_train, X_test, y_test)
    process('K Nearest Neighbors Classifier', 'knc',
            X, y, X_train, y_train, X_test, y_test)
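# Hedged sketch: the process() helper used above is defined elsewhere in the
# repo. The function below only illustrates what it is assumed to do; the
# signature is inferred from the calls above, the body and the name
# process_sketch are guesses (kept distinct so as not to shadow the real helper).
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

def process_sketch(title, model_key, X, y, X_train, y_train, X_test, y_test):
    # Map the short model keys used in main() to estimators
    models = {
        'svc_rbf': SVC(kernel='rbf'),
        'svc_linear': SVC(kernel='linear'),
        'lr': LogisticRegression(max_iter=1000),
        'knc': KNeighborsClassifier(),
    }
    clf = models[model_key].fit(X_train, y_train)
    print('%s - test accuracy: %.3f' % (title, clf.score(X_test, y_test)))
    print('10-fold CV accuracy: %.3f' % cross_val_score(clf, X, y, cv=10).mean())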
def main(index_exp=0):
    dirName = '%s_%s_data%d_%s' % (args.ext_model, args.rgr_model,
                                   args.data_cate, args.append_name)
    fileName = '%s_exp%d' % (dirName, index_exp)

    # Create a folder for the results of this model
    if not os.path.exists('./results/%s' % (dirName)):
        os.makedirs('./results/%s' % (dirName))

    print('Extraction model: %s' % (args.ext_model))
    print('Regression model: %s' % (args.rgr_model))

    # Load a pretrained backbone and replace its classification head with an
    # identity so that it outputs raw features
    if args.ext_model == 'vgg16':
        net = tv_models.vgg16(pretrained=True).to(device=device)
        set_parameter_requires_grad(net, True)
        net.classifier[6] = Identity()
    elif args.ext_model == 'resnet50':
        net = tv_models.resnet50(pretrained=True).to(device=device)
        set_parameter_requires_grad(net, True)
        net.fc = Identity()

    # Dataset settings
    batchSize = 64
    input_size = 224

    # Load data
    data_transforms = {
        'train': transforms.Compose([ndl.Rescale(input_size), ndl.ToTensor()]),
        'test': transforms.Compose([ndl.Rescale(input_size), ndl.ToTensor()])
    }

    print("Initializing Datasets and Dataloaders...")

    # Create training and testing datasets
    image_datasets = {
        x: ndl.TopoplotLoader(args.image_folder, x,
                              transform=data_transforms[x],
                              index_exp=index_exp)
        for x in ['train', 'test']
    }

    # Create training and testing dataloaders
    dataloaders_dict = {
        'train': Data.DataLoader(image_datasets['train'], batch_size=batchSize,
                                 shuffle=False, num_workers=4),
        'test': Data.DataLoader(image_datasets['test'], batch_size=batchSize,
                                shuffle=False, num_workers=4)
    }

    # Extract features with the pretrained network
    net.eval()  # disable batchnorm and dropout
    X_train, Y_train = extract_layer(dataloaders_dict['train'], net)
    X_test, Y_test = extract_layer(dataloaders_dict['test'], net)

    # Standardize the data before PCA
    if args.scale:
        X_train, X_test = preprocessing.scale(X_train, X_test, mode=args.scale)

    # Apply PCA to reduce dimensionality; a fractional n_components is
    # interpreted by scikit-learn as the share of variance to keep
    if args.n_components > 1:
        args.n_components = int(args.n_components)
    pca = PCA(n_components=args.n_components, svd_solver='full')
    pca.fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    print('(X) Number of features after PCA: %d' % (X_train.shape[1]))
    print('(X) Explained variance ratio: %.3f' % (np.sum(pca.explained_variance_ratio_)))

    # Add conditional entropy as additional features
    if args.add_CE and args.data_cate == 2:
        print('Add conditional entropy as additional features...')
        with open('./raw_data/CE_sub%d_channel21_exp%d_train.data'
                  % (args.subject_ID, index_exp), 'rb') as fp:
            CE_train = pickle.load(fp)
        with open('./raw_data/CE_sub%d_channel21_exp%d_test.data'
                  % (args.subject_ID, index_exp), 'rb') as fp:
            CE_test = pickle.load(fp)

        # Scale CE
        CE_train, CE_test = preprocessing.scale(CE_train, CE_test)

        # Apply PCA
        pca = PCA(n_components=30, svd_solver='full')
        pca.fit(CE_train)
        CE_train = pca.transform(CE_train)
        CE_test = pca.transform(CE_test)
        print('(CE) Number of features after PCA: %d' % (CE_train.shape[1]))
        print('(CE) Explained variance ratio: %.3f' % (np.sum(pca.explained_variance_ratio_)))

        # Concatenate with X
        X_train = np.concatenate((X_train, CE_train), axis=1)
        X_test = np.concatenate((X_test, CE_test), axis=1)

    # Regression to predict solution latency
    X_train_Reg = X_train
    X_test_Reg = X_test
    if args.rgr_model == 'LR':
        rgr = linear_model.LinearRegression()
    elif args.rgr_model == 'Ridge':
        rgr = linear_model.Ridge(alpha=1)
    elif args.rgr_model == 'GPR':
        kernel = RBF(10, (1e-2, 1e2)) + ConstantKernel(10, (1e-2, 1e2))
        rgr = GaussianProcessRegressor(kernel=kernel, random_state=0)
    elif args.rgr_model == 'ELMK':
        rgr = elm.ELMKernel()
    elif args.rgr_model == 'ELMR':
        params = ["sigmoid", 1, 500, False]
        rgr = elm.ELMRandom(params)
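    # The scikit-learn regressors above (LR / Ridge / GPR) expose the usual
    # fit/predict API; the elm regressors instead train on a single array whose
    # first column is the target, hence the two branches below.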
    if args.rgr_model not in ['ELMK', 'ELMR']:
        rgr.fit(X_train_Reg, Y_train)
        pred_train = rgr.predict(X_train_Reg)
        pred_test = rgr.predict(X_test_Reg)
    else:
        # Scale the target into -1~1
        if args.scale_target == 2:
            scaler = TargetScaler(num_step=10)
            scaler.fit(Y_train)
            Y_train, Y_test = scaler.transform(Y_train), scaler.transform(Y_test)
        elif args.scale_target == 1:
            Y_train, Y_test = (Y_train - 30) / 30, (Y_test - 30) / 30

        # Concatenate targets and features for the extreme learning machine
        train_data = np.concatenate((Y_train[:, np.newaxis], X_train), axis=1)
        test_data = np.concatenate((Y_test[:, np.newaxis], X_test), axis=1)

        rgr.search_param(train_data, cv="kfold", of="rmse", eval=10)
        pred_train = rgr.train(train_data).predicted_targets
        pred_test = rgr.test(test_data).predicted_targets

        # Scale the target back to 0~60
        if args.scale_target == 2:
            [Y_train, Y_test, pred_train, pred_test] = \
                [scaler.transform(x, mode='inverse') for x in
                 [Y_train, Y_test, pred_train, pred_test]]
        elif args.scale_target == 1:
            [Y_train, Y_test, pred_train, pred_test] = \
                [x * 30 + 30 for x in [Y_train, Y_test, pred_train, pred_test]]

    evaluate_result.plot_scatter(Y_test, pred_test, dirName, fileName)

    # Report the root-mean-squared error of the predictions
    print('Train RMSE: %.3f' % (mean_squared_error(Y_train, pred_train) ** 0.5))
    print('Test RMSE: %.3f' % (mean_squared_error(Y_test, pred_test) ** 0.5))

    # Save targets and predictions
    dict_target = {}
    dict_target['target'], dict_target['pred'] = Y_test, pred_test
    with open('./results/%s/%s.data' % (dirName, fileName), 'wb') as fp:
        pickle.dump(dict_target, fp)

    return
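# Hedged sketch of an entry point, assuming args is parsed at module level (as
# elsewhere in this repo) and that main() is run once per train/test split; the
# number of splits below is a placeholder, not taken from the project:
if __name__ == '__main__':
    for index_exp in range(2):  # placeholder split count
        main(index_exp=index_exp)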