def remove_outliers_one():
    # Currently a pass-through: each complete dataset is copied into the
    # "final" folder unchanged. An earlier experiment that clipped Close to
    # the [0.20, 0.95] quantile range and backfilled clipped values from the
    # previous day is disabled (the original block was left truncated).
    folder_creator(PATH_CLEANED_FOLDER + "final/", 1)
    for crypto in os.listdir(PATH_COMPLETE_FOLDER):
        df = pd.read_csv(PATH_COMPLETE_FOLDER + crypto, sep=",", header=0)
        df.to_csv(PATH_CLEANED_FOLDER + "final/" + crypto, sep=",", index=False)
def missing_values(PATH_DATASET):
    folder_creator(PATH_DATA_UNDERSTANDING, 1)
    folder_creator(PATH_DATA_UNDERSTANDING + "missing_values_by_year/", 1)
    count_missing_values(PATH_DATASET)
    count_missing_values_by_year(PATH_DATASET)
    generate_bar_chart_by_year(PATH_DATA_UNDERSTANDING + "missing_values_by_year/")
def remove_outliers_dbscan():
    folder_creator(PATH_CLEANED_FOLDER + "final/", 1)
    excluded_features = ['Date']
    for crypto in os.listdir(PATH_COMPLETE_FOLDER):
        # cluster on all features, scaled robustly so extreme values
        # do not dominate the distance metric
        df = pd.read_csv(PATH_COMPLETE_FOLDER + crypto, sep=",", header=0)
        scaler = RobustScaler()
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(-1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        model = DBSCAN(eps=0.1, min_samples=18).fit(df.drop('Date', axis=1))
        labels = model.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        print("number of clusters: " + str(n_clusters_))
        print("number of outliers: " + str(n_noise_))
        # re-read the original (unscaled) dataset and keep only the rows
        # DBSCAN did not flag as noise (label -1); an alternative that
        # replaced OHLC outliers with the median is disabled
        df = pd.read_csv(PATH_COMPLETE_FOLDER + crypto, sep=",", header=0)
        df[labels != -1].to_csv(PATH_CLEANED_FOLDER + "final/" + crypto, sep=",", index=False)
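# A minimal, self-contained sketch of the outlier-removal idea above: scale
# robustly, fit DBSCAN, and keep only the rows not labelled as noise (-1).
# The synthetic data and the eps/min_samples values here are illustrative
# assumptions, not the project's tuned parameters.
#
# import numpy as np
# import pandas as pd
# from sklearn.cluster import DBSCAN
# from sklearn.preprocessing import RobustScaler
#
# def drop_dbscan_outliers(df, eps=0.5, min_samples=5):
#     scaled = RobustScaler().fit_transform(df.values)
#     labels = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled).labels_
#     return df[labels != -1]  # -1 marks noise points
#
# rng = np.random.default_rng(0)
# prices = pd.DataFrame({'Close': rng.normal(100, 2, 200)})
# prices.loc[10, 'Close'] = 500  # injected outlier
# print(len(prices), "->", len(drop_dbscan_outliers(prices)))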
def cut_datasets_for_clustering(input_path, output_path, start_date, end_date_for_clustering):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = cut_dataset_by_range(input_path, crypto.replace(".csv", ""),
                                  start_date, end_date_for_clustering)
        df.to_csv(output_path + crypto, index=False)
def comparison_macro_avg_recall_single_vs_baseline(input_path_single, input_path_baseline, output_path):
    folder_creator(output_path, 0)
    df_report = pd.DataFrame()
    for crypto in os.listdir(input_path_single):
        # read the baseline score
        with open(input_path_baseline + crypto + "_macro_avg_recall.txt", "r") as file:
            macro_avg_recall_baseline = file.read()
        # find the best single-target configuration
        max_macro_avg_recall = -1
        config = ""
        for configuration in os.listdir(os.path.join(input_path_single, crypto)):
            df = pd.read_csv(os.path.join(input_path_single, crypto, configuration,
                                          "stats/macro_avg_recall.csv"), header=0)
            if df["macro_avg_recall"][0] > max_macro_avg_recall:
                max_macro_avg_recall = df["macro_avg_recall"][0]
                config = configuration
        # add both models to the report for this crypto
        df_report = df_report.append(
            {'crypto': crypto, 'model type': 'single_target',
             'macro_avg_recall': float(max_macro_avg_recall), 'config': config},
            ignore_index=True)
        df_report = df_report.append(
            {'crypto': crypto, 'model type': 'baseline',
             'macro_avg_recall': float(macro_avg_recall_baseline), 'config': 'standard'},
            ignore_index=True)
    df_report.to_csv(os.path.join(output_path, "single_target_vs_baseline_report.csv"), index=False)
    comparison_macro_avg_recall_single_vs_baseline_plot(df_report, output_path)
def quantile_transform2(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature not in ['Date', 'Open', 'High', 'Close', 'Low', 'Adj Close']:
                # only transform features that fail the normality test
                stat, p = stats.normaltest(df[feature])
                if p <= 0.05:
                    print('transforming: ' + feature)
                    # search for the smallest number of quantiles (up to 100)
                    # that makes the transformed feature pass the test
                    p = -1
                    n_t = 1
                    while p <= 0.05:
                        qt = QuantileTransformer(n_quantiles=n_t, random_state=0,
                                                 output_distribution="normal")
                        transformed = qt.fit_transform(df[feature].values.reshape(-1, 1))
                        new_values = pd.Series(transformed.reshape(-1))
                        stat, p = stats.normaltest(new_values)
                        if p > 0.05:
                            df[feature] = new_values
                            print('num_quantiles: ' + str(n_t))
                        elif n_t < 100:
                            n_t += 1
                        else:
                            break
        df.to_csv(output_path + crypto, sep=",", index=False)
def quantile_transform(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        print(crypto)
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature != "Date":
                print('transforming: ' + feature)
                # NOTE: unlike quantile_transform2, this search has no upper
                # bound on n_t, so it does not terminate if no quantile count
                # ever passes the normality test
                p = -1
                n_t = 1
                while p <= 0.05:
                    qt = QuantileTransformer(n_quantiles=n_t, random_state=0,
                                             output_distribution="normal")
                    transformed = qt.fit_transform(df[feature].values.reshape(-1, 1))
                    new_values = pd.Series(transformed.reshape(-1))
                    stat, p = stats.normaltest(new_values)
                    if p > 0.05:
                        df[feature] = new_values
                        print('num_quantiles: ' + str(n_t))
                    else:
                        n_t += 1
        df.to_csv(output_path + crypto, sep=",", index=False)
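# Self-contained sketch of the search performed by the two functions above:
# increase n_quantiles until D'Agostino's normality test (scipy.stats.normaltest)
# no longer rejects normality (p > 0.05), with a cap to guarantee termination.
# The starting value of 10 and the synthetic data are illustrative assumptions.
#
# import numpy as np
# from scipy import stats
# from sklearn.preprocessing import QuantileTransformer
#
# def to_normal(values, max_quantiles=100):
#     for n_t in range(10, max_quantiles + 1):
#         qt = QuantileTransformer(n_quantiles=n_t, random_state=0,
#                                  output_distribution="normal")
#         transformed = qt.fit_transform(values.reshape(-1, 1)).reshape(-1)
#         _, p = stats.normaltest(transformed)
#         if p > 0.05:
#             return transformed, n_t
#     return values, None  # give up: keep the original values
#
# skewed = np.random.default_rng(0).exponential(scale=2.0, size=500)
# _, n_used = to_normal(skewed)
# print("quantiles used:", n_used)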
def report_multi_target_k_oriented(path_single_target, types, output_path, percent):
    output_path = output_path + "multi_target_k_oriented/"
    df_report = pd.DataFrame()
    df = pd.read_csv(os.path.join(path_single_target, "single_vs_baseline_report.csv"))
    # average macro-avg recall across cryptos, for baseline and single target
    avg_baseline = df.loc[df['model type'] == "baseline"]['macro_avg_recall'].mean()
    avg_single_target = df.loc[df['model type'] == "single_target"]['macro_avg_recall'].mean()
    df_report = df_report.append({'Model': "baseline", 'value': float(avg_baseline)}, ignore_index=True)
    df_report = df_report.append({'Model': "single target", 'value': float(avg_single_target)}, ignore_index=True)
    # now the average for each k
    for k in types:
        path_multi_target = "../modelling/techniques/forecasting/outputs_multi_" + str(percent) + "/" + k + "/multi_target/"
        highest_by_crypto = []
        for cluster in os.listdir(os.path.join(path_multi_target, "clusters/")):
            for crypto in os.listdir(os.path.join(path_multi_target, "clusters", cluster, "result")):
                highest_macro_avg_recall = -1
                for configuration in os.listdir(os.path.join(path_multi_target, "clusters", cluster,
                                                             "result", crypto)):
                    df = pd.read_csv(os.path.join(path_multi_target, "clusters", cluster, "result",
                                                  crypto, configuration, "stats/macro_avg_recall.csv"))
                    value = df['macro_avg_recall'][0]
                    if value > highest_macro_avg_recall:
                        highest_macro_avg_recall = value
                highest_by_crypto.append(highest_macro_avg_recall)
        df_report = df_report.append({'Model': k, 'value': np.average(highest_by_crypto)}, ignore_index=True)
    folder_creator(output_path, 0)
    df_report.to_csv(os.path.join(output_path, "multi_target_k_oriented.csv"), index=False)
    report_multi_target_k_oriented_plot(df_report, output_path)
def save_clusters(input_path, clusters, k_used, CLUSTERING_PATH):
    dict_symbol_id = get_dict_symbol_id(CLUSTERING_PATH)
    folder_creator(CLUSTERING_PATH + "clusters/", 0)
    folder_creator(CLUSTERING_PATH + "clusters/" + k_used + "/", 1)
    df = pd.DataFrame(columns=['cluster_id', 'cryptos'])
    i = 0
    for cluster in clusters:
        # map each crypto id back to its symbol
        cryptocurrencies = [dict_symbol_id.symbol[crypto_id] for crypto_id in cluster]
        # (copying each crypto's dataset into a per-cluster folder is disabled)
        df = df.append({'cluster_id': str(i), 'cryptos': cryptocurrencies}, ignore_index=True)
        i += 1
    df.to_csv(CLUSTERING_PATH + "clusters/" + k_used + "/" + k_used + ".csv", sep=",", index=False)
def power_transformation(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature not in ['Date']:
                # the +0.1 shift keeps the input strictly positive, as Box-Cox requires
                df[feature], lam = boxcox(df[feature] + 0.1)
        df.to_csv(output_path + crypto, sep=",", index=False)
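# Box-Cox requires strictly positive input, which is why the functions above
# add a small shift before transforming. A minimal sketch on synthetic,
# right-skewed data (the shift value mirrors the one used above):
#
# import numpy as np
# from scipy.stats import boxcox
#
# rng = np.random.default_rng(0)
# volume = rng.exponential(scale=1000, size=365)  # right-skewed, like volumes
# shifted = volume + 0.1                          # guarantee positivity
# transformed, lam = boxcox(shifted)              # lambda fitted by maximum likelihood
# print("estimated lambda: %f" % lam)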
def report_multi_target_crypto_oriented(path_baseline, path_single_target, types, output_path,
                                        cryptocurrencies, percent):
    output_path = output_path + "multi_target_crypto_oriented/"
    i = 0
    while i < len(cryptocurrencies):
        report = str(i)
        save_baseline_single_target = True
        df_report = pd.DataFrame()
        for k in types:
            path_multi_target = "../modelling/techniques/forecasting/outputs_multi_" + str(percent) + "/" + k + "/multi_target/"
            for cluster in os.listdir(os.path.join(path_multi_target, "clusters/")):
                for crypto in os.listdir(os.path.join(path_multi_target, "clusters", cluster, "result")):
                    if crypto in cryptocurrencies[i]:
                        if save_baseline_single_target:
                            # baseline score
                            with open(path_baseline + crypto + "_macro_avg_recall.txt", "r") as file:
                                macro_avg_recall_baseline = file.read()
                            df_report = df_report.append(
                                {'crypto': crypto, 'model type': "baseline",
                                 'macro_avg_recall': float(macro_avg_recall_baseline),
                                 'config': "standard"},
                                ignore_index=True)
                            # best single-target configuration (kept in a separate
                            # variable so the loop variable does not overwrite it)
                            highest_macro_avg_recall_single = -1
                            best_config_single = ""
                            for config_single in os.listdir(os.path.join(path_single_target, crypto)):
                                df = pd.read_csv(os.path.join(path_single_target, crypto, config_single,
                                                              "stats/macro_avg_recall.csv"), header=0)
                                if df["macro_avg_recall"][0] > highest_macro_avg_recall_single:
                                    highest_macro_avg_recall_single = df["macro_avg_recall"][0]
                                    best_config_single = config_single
                            df_report = df_report.append(
                                {'crypto': crypto, 'model type': "single_target",
                                 'macro_avg_recall': float(highest_macro_avg_recall_single),
                                 'config': best_config_single},
                                ignore_index=True)
                        # best multi-target configuration for this k
                        highest_macro_avg_recall = -1
                        best_conf = ""
                        for configuration in os.listdir(os.path.join(path_multi_target, "clusters",
                                                                     cluster, "result", crypto)):
                            df = pd.read_csv(os.path.join(path_multi_target, "clusters", cluster,
                                                          "result", crypto, configuration,
                                                          "stats/macro_avg_recall.csv"))
                            value = df['macro_avg_recall'][0]
                            if value > highest_macro_avg_recall:
                                highest_macro_avg_recall = value
                                best_conf = configuration
                        df_report = df_report.append(
                            {'crypto': crypto, 'model type': k,
                             'macro_avg_recall': float(highest_macro_avg_recall),
                             'config': best_conf},
                            ignore_index=True)
            # baseline and single target are recorded only once, during the first k
            save_baseline_single_target = False
        i += 1
        folder_creator(output_path, 0)
        df_report.to_csv(os.path.join(output_path, "multi_target_crypto_oriented_" + report + ".csv"),
                         index=False)
        report_multi_target_crypto_oriented_plot(df_report, output_path, report)
def remove_features(features_to_remove):
    folder_creator(PATH_PREPARATION_FOLDER + "selected/", 1)
    folder_creator(PATH_PREPARATION_FOLDER + "selected/less_features", 1)
    for crypto in os.listdir(PATH_MAIN_FOLDER):
        df = pd.read_csv(PATH_MAIN_FOLDER + crypto, delimiter=',', header=0)
        for feature in features_to_remove:
            del df[feature]
        df.to_csv(PATH_PREPARATION_FOLDER + "selected/less_features/" + crypto, sep=",", index=False)
def find_by_dead_before():
    folder_creator(PATH_PREPARATION_FOLDER + "selected/" + "dead/", 1)
    for file in os.listdir(PATH_LESS_FEATURES):
        df = pd.read_csv(PATH_LESS_FEATURES + file, delimiter=',', header=0)
        df = df.set_index("Date")
        # a crypto is "dead" if its series ends before the end of the observation period
        last_date = df.index[-1]
        if last_date != '2019-12-31':
            shutil.copy(PATH_LESS_FEATURES + file,
                        PATH_PREPARATION_FOLDER + "selected/dead/" + file)
def integrate_with_lag(input_path):
    folder_creator(PATH_INTEGRATED_FOLDER, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=',', header=0)
        df["Date"] = pd.to_datetime(df["Date"])
        df['lag_1'] = df['Close'].shift(1)
        df['lag_2'] = df['Close'].shift(2)
        df['lag_3'] = df['Close'].shift(3)
        df['lag_7'] = df['Close'].shift(7)
        # the first 7 rows have incomplete lag values and are dropped
        df = df.iloc[7:]
        df.to_csv(PATH_INTEGRATED_FOLDER + "/" + crypto, sep=",", index=False)
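# Sketch of the lag construction above: Series.shift(k) aligns each row with
# the Close of k days earlier, and rows without complete history are dropped.
# The toy frame is illustrative only.
#
# import pandas as pd
#
# df = pd.DataFrame({'Date': pd.date_range('2019-01-01', periods=10),
#                    'Close': range(10)})
# for k in (1, 2, 3, 7):
#     df['lag_' + str(k)] = df['Close'].shift(k)
# df = df.iloc[7:]  # the largest lag is 7, so the first 7 rows are incomplete
# print(df)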
def standardization(input_path, output_path):
    folder_creator(output_path, 1)
    excluded_features = ['Date']
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, delimiter=',', header=0)
        scaler = StandardScaler()
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(-1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        df.to_csv(output_path + crypto, sep=",", index=False)
def power_transformation2(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature not in ['Date', 'Open', 'High', 'Close', 'Low', 'Adj Close', 'Volume']:
                # the small shift keeps the input strictly positive, as Box-Cox requires
                df[feature], lam = boxcox(df[feature] + 0.0000001)
        df.to_csv(output_path + crypto, sep=",", index=False)
def get_most_important_cryptos(cryptocurrencies, startdate, enddate):
    DATASET_NAME = "original"
    folder_creator("../acquisition/dataset", 1)
    DATASET_DIR = "../acquisition/dataset/" + DATASET_NAME
    folder_creator(DATASET_DIR, 1)
    currency = "-USD"
    # (the symbols can alternatively be read from crypto_symbols.txt; disabled)
    for crypto in cryptocurrencies:
        print("getting info about " + crypto)
        df = yahoo_finance_history(crypto + currency, startdate, enddate)
        df.to_csv(DATASET_DIR + "/" + crypto + ".csv", index=False)
def min_max_one_minusone_scaling(input_path, output_path):
    folder_creator(output_path, 1)
    excluded_features = ['Date']
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, delimiter=',', header=0)
        scaler = MinMaxScaler(feature_range=(-1, 1))
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(-1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        # TODO: round to 8 decimals (df.round(8)), since the neural network
        # takes floating-point numbers with this precision limit
        df.to_csv(output_path + crypto, sep=",", index=False)
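# The scaling functions above (standardization, min-max, max-abs) all follow
# the same per-column pattern. A compact, self-contained version with
# MinMaxScaler to [-1, 1]; the toy data is an illustrative assumption.
#
# import pandas as pd
# from sklearn.preprocessing import MinMaxScaler
#
# def scale_columns(df, excluded=('Date',)):
#     out = df.copy()
#     scaler = MinMaxScaler(feature_range=(-1, 1))
#     for col in out.columns:
#         if col not in excluded:
#             # fit_transform expects a 2-D array, hence the double brackets
#             out[col] = scaler.fit_transform(out[[col]]).reshape(-1)
#     return out
#
# toy = pd.DataFrame({'Date': ['2019-01-01', '2019-01-02', '2019-01-03'],
#                     'Close': [10.0, 20.0, 30.0]})
# print(scale_columns(toy))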
def create_horizontal_dataset(data_path, output_path, test_set):
    cryptocurrencies_with_date_to_pred = os.listdir(data_path)
    cryptos_in_the_cluster = []
    already_created = False
    folder_creator(output_path + "horizontal_datasets" + "/", 0)
    print("Creating horizontal version")
    for date_to_predict in test_set:
        dictionary_m = {}
        dataframes = []
        # take the Date column just once
        for dataset_name in cryptocurrencies_with_date_to_pred:
            splitted = dataset_name.split("_")
            date_to_predict_crypto = str(splitted[1]).replace(".csv", "")
            if date_to_predict == date_to_predict_crypto:
                df_date = pd.read_csv(os.path.join(data_path, dataset_name))
                dataframes.append(df_date['Date'])
                break
        # create Close_1, Open_1, etc. for each dataframe
        i = 1
        for dataset_name in cryptocurrencies_with_date_to_pred:
            splitted = dataset_name.split("_")
            crypto_name = splitted[0]
            date_to_predict_crypto = str(splitted[1]).replace(".csv", "")
            if date_to_predict == date_to_predict_crypto:
                df = pd.read_csv(os.path.join(data_path, dataset_name), header=0)
                if not already_created:
                    cryptos_in_the_cluster.append(crypto_name)
                df = df.drop('Date', axis=1)
                df['symbol'] = crypto_name
                df = df.add_suffix('_' + str(i))
                i += 1
                dictionary_m[crypto_name + 'dataframe'] = df
        for crypt in cryptos_in_the_cluster:
            dataframes.append(dictionary_m.get(crypt + 'dataframe'))
        already_created = True
        # concatenate all the dataframes horizontally
        horizontal = pd.concat(dataframes, axis=1)
        # serialization
        horizontal.to_csv(output_path + "horizontal_datasets/horizontal_" + date_to_predict + ".csv",
                          sep=",", index=False)
        del horizontal
        del dataframes
        del dictionary_m
        print("Horizontal version created for the date: " + str(date_to_predict))
    return list(cryptos_in_the_cluster)
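# The core of the horizontal dataset is add_suffix plus pd.concat(axis=1):
# each crypto's columns get a numeric suffix and the frames sit side by side.
# A minimal sketch with two toy frames (names are illustrative):
#
# import pandas as pd
#
# btc = pd.DataFrame({'Close': [1.0, 2.0]})
# eth = pd.DataFrame({'Close': [3.0, 4.0]})
# frames = [df.add_suffix('_' + str(i + 1)) for i, df in enumerate([btc, eth])]
# horizontal = pd.concat(frames, axis=1)
# print(horizontal)  # columns: Close_1, Close_2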
def power_transformation_1(input_path, output_path):
    folder_creator(output_path, 1)
    for type_of_normalization in os.listdir(input_path):
        # TODO: remove this filter
        if type_of_normalization == "min_max_normalized":
            for crypto in os.listdir(input_path + type_of_normalization):
                df = pd.read_csv(input_path + type_of_normalization + "/" + crypto, sep=",", header=0)
                for feature in df.columns.values:
                    if feature != "Date":
                        # lambda fixed to 0.0, i.e. a log transform
                        df[feature] = boxcox(df[feature] + 0.0000001, 0.0)
                df.to_csv(output_path + crypto, sep=",", index=False)
def generate_line_chart(experiment_folder, list_temporal_sequences, list_neurons):
    cryptocurrencies = get_crypto_symbols_from_folder(experiment_folder + "result/")
    merge_predictions(experiment_folder, "result")
    # create the folder which will contain the line charts
    for crypto in cryptocurrencies:
        folder_creator(experiment_folder + "/report/line_chart_images/" + crypto, 1)
        plot_actual_vs_predicted(experiment_folder + "/result/merged_predictions.csv",
                                 crypto, list_neurons, list_temporal_sequences,
                                 experiment_folder + "/report/line_chart_images/" + crypto + "/")
def feature_selection(df, features, crypto_name, crypto_symbol, output_path):
    # Plots every pair of homonymous features across cryptos (e.g. Close_1 vs
    # Close_2) as a visual aid for feature selection. The numeric suffix of a
    # column maps back to a symbol via the crypto_symbol lookup (id -> symbol,
    # offset by one).
    dfnew = pd.DataFrame()
    dfnew['Date'] = df['Date']
    df = df.drop("Date", axis=1)
    for f1, f2 in product(df.columns.values, df.columns.values):
        f1_splitted = f1.split("_")
        index = f1_splitted[-1]
        current_symbol = crypto_symbol.symbol[int(index) - 1]
        f1_replaced = f1.replace("_" + index, "")
        folder_creator(output_path + current_symbol + "/", 0)
        f2_splitted = f2.split("_")
        index2 = f2_splitted[-1]
        f2_replaced = f2.replace("_" + index2, "")
        if f1 != f2 and f1_replaced == f2_replaced:
            second_symbol = crypto_symbol.symbol[int(index2) - 1]
            print(current_symbol + "-" + second_symbol)
            folder_creator(output_path + current_symbol + "/" + second_symbol + "/", 0)
            dfnew[str(f1)] = df[str(f1)]
            dfnew[str(f2)] = df[str(f2)]
            fig = plt.figure(figsize=(55, 8))
            ax = fig.add_subplot(1, 1, 1)
            dfnew = dfnew.set_index("Date")
            dfnew.plot(kind='line', ax=ax)
            plt.title(current_symbol + " VS " + second_symbol, fontsize=20)
            plt.savefig(output_path + current_symbol + "/" + second_symbol + "/" + f1 + "_" + f2 + ".png",
                        dpi=200)
            plt.clf()
            dfnew = dfnew.reset_index()
            dfnew = dfnew.drop(f1, axis=1)
            dfnew = dfnew.drop(f2, axis=1)
def describe_new(PATH_DATASET, output_path, name_folder_res=None, features_to_use=None):
    if name_folder_res is None:
        PATH_OUT = output_path + "descriptions/"
    else:
        PATH_OUT = output_path + "descriptions/" + name_folder_res + "/"
    folder_creator(PATH_OUT, 1)
    df = pd.read_csv(PATH_DATASET + "horizontal.csv", delimiter=',', header=0)
    crypto_symbol = pd.read_csv(PATH_DATASET + "symbol_id.csv", index_col=1)
    PATH_CRYPTO = PATH_OUT + "horizontal/"
    folder_creator(PATH_CRYPTO + "feature_selection/", 1)
    feature_selection(df, df.columns.values, "horizontal", crypto_symbol,
                      PATH_CRYPTO + "feature_selection/")
def input_missing_values():
    # Imputes missing values via time-based interpolation, then merges in the
    # already-complete datasets. (An earlier variant that also treated a
    # "partial" folder is disabled.)
    folder_creator(PATH_CLEANED_FOLDER + "final", 1)
    for crypto_symbol in os.listdir(PATH_UNCOMPLETE_FOLDER):
        df = pd.read_csv(PATH_UNCOMPLETE_FOLDER + crypto_symbol, delimiter=',', header=0)
        df = interpolate_with_time(df)
        df.to_csv(PATH_CLEANED_FOLDER + "final/" + crypto_symbol, sep=",", index=False)
    # merge with the complete datasets
    for crypto_symbol in os.listdir(PATH_COMPLETE_FOLDER):
        shutil.copy(PATH_COMPLETE_FOLDER + crypto_symbol,
                    PATH_CLEANED_FOLDER + "final/" + crypto_symbol)
def max_abs_scaling(input_path, output_path):
    folder_creator(output_path, 1)
    excluded_features = ['Date', 'trend']
    for crypto in os.listdir(input_path):
        splitted = crypto.split("_")
        crypto_name = splitted[0]
        folder_creator(os.path.join(output_path, crypto_name), 0)
        df = pd.read_csv(os.path.join(input_path, crypto), delimiter=',', header=0)
        # set aside the day to predict: it must not influence the scaler
        day_to_predict = df.loc[len(df.Date) - 1]
        df = df[:-1]
        scaler = MaxAbsScaler()
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(-1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        df = df.append(day_to_predict, ignore_index=True)
        df.to_csv(os.path.join(output_path, crypto_name, crypto), sep=",", index=False)
def find_uncomplete():
    folder_creator(PATH_PREPARATION_FOLDER + "selected/" + "uncomplete", 1)
    folder_creator(PATH_PREPARATION_FOLDER + "selected/" + "complete", 1)
    for file in os.listdir(PATH_LESS_FEATURES):
        df = pd.read_csv(PATH_LESS_FEATURES + file, delimiter=',', header=0)
        df = df.rename({'Adj Close': 'Adj_Close'}, axis=1)
        df = df.set_index("Date")
        # datasets with null Close values go to "uncomplete"
        if df["Close"].isnull().any():
            try:
                df.to_csv(PATH_PREPARATION_FOLDER + "selected/uncomplete/" + file)
            except Exception:
                pass
        else:
            try:
                df.to_csv(PATH_PREPARATION_FOLDER + "selected/complete/" + file)
            except Exception:
                pass
def folders_setup():
    # Set the name of the folder in which to save all intermediate results
    folder_creator(PATH_PREPROCESSED, 0)
def single_target(EXPERIMENT_PATH, DATA_PATH, TENSOR_DATA_PATH, window_sequences, list_num_neurons,
                  learning_rate, testing_set, features_to_use, DROPOUT, EPOCHS, PATIENCE,
                  number_of_days_to_predict, start_date, end_date):
    #################### FOLDER SETUP ####################
    MODELS_PATH = "models"
    RESULT_PATH = "result"
    TIME_PATH = "time"
    for crypto in os.listdir(DATA_PATH):
        crypto_name = crypto.replace(".csv", "")
        # folders for data in tensor format, results and time spent
        folder_creator(TENSOR_DATA_PATH + "/" + crypto_name, 0)
        folder_creator(EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name, 1)
        folder_creator(EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name, 1)
        folder_creator(EXPERIMENT_PATH + "/" + TIME_PATH + "/" + crypto_name, 1)
        dataset, features, features_without_date = \
            prepare_input_forecasting(PREPROCESSED_PATH, DATA_PATH, crypto_name, start_date,
                                      end_date, None, features_to_use)
        start_time = time.time()
        for window, num_neurons in product(window_sequences, list_num_neurons):
            print('Current configuration: ')
            print("Crypto_symbol: ", crypto, "\t", "Window_sequence: ", window, "\t",
                  "Neurons: ", num_neurons)
            dataset_tensor_format = fromtemporal_totensor(np.array(dataset), window,
                                                          TENSOR_DATA_PATH + "/" + crypto_name + "/",
                                                          crypto_name)
            # dictionaries for statistics
            predictions_file = {'symbol': [], 'date': [], 'observed_class': [], 'predicted_class': []}
            macro_avg_recall_file = {'symbol': [], 'macro_avg_recall': []}
            # folders for this configuration: best model checkpoint and statistics (results)
            configuration_name = "LSTM_" + str(num_neurons) + "_neurons_" + str(window) + "_days"
            statistics = "stats"
            model_path = EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name + "/" + configuration_name + "/"
            results_path = EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name + "/" + configuration_name + "/" + statistics + "/"
            folder_creator(model_path, 1)
            folder_creator(results_path, 1)
            for date_to_predict in testing_set:
                # train and test are lists of windows, i.e. [[Row1, Row2], [Row1, Row2], ...];
                # element access uses three indexes: [window, row within window, feature]
                train, test = get_training_validation_testing_set(dataset_tensor_format,
                                                                  date_to_predict,
                                                                  number_of_days_to_predict)
                # drop the date column (index 0)
                train = train[:, :, 1:]
                test = test[:, :, 1:]
                index_of_target_feature = features_without_date.index('Close')
                # inputs: every day of each window except the last
                # number_of_days_to_predict days; e.g. with date to predict
                # 2019-01-07, the data about 2019-01-06 is excluded from the inputs
                x_train = train[:, :-number_of_days_to_predict, :]
                # targets: the Close values of the last number_of_days_to_predict
                # days of each window
                y_train = train[:, -number_of_days_to_predict:, index_of_target_feature]
                # the testing windows are sliced the same way: the days to
                # forecast never appear among the inputs
                x_test = test[:, :-number_of_days_to_predict, :]
                y_test = test[:, -number_of_days_to_predict:, index_of_target_feature]
                # change the data type, from object to float
                x_train = x_train.astype('float')
                y_train = y_train.astype('float')
                x_test = x_test.astype('float')
                y_test = y_test.astype('float')
                # batch size must be a factor of the number of training elements
                BATCH_SIZE = x_train.shape[0]
                model, history = train_single_target_model(x_train, y_train,
                                                           num_neurons=num_neurons,
                                                           learning_rate=learning_rate,
                                                           dropout=DROPOUT,
                                                           epochs=EPOCHS,
                                                           batch_size=BATCH_SIZE,
                                                           patience=PATIENCE,
                                                           num_categories=len(y_train[0]),
                                                           date_to_predict=date_to_predict,
                                                           model_path=model_path)
                # NOTE: in this variant everything downstream of training is
                # disabled: architecture and loss/accuracy plots, prediction,
                # per-day argmax decoding, macro-avg-recall computation, the
                # serialization of predictions.csv / macro_avg_recall.csv and
                # the time-spent log. The second single_target implementation
                # below is the active one.
    return
def vector_autoregression(input_path, test_set, output_path, crypto_in_the_cluster):
    folder_creator(output_path + partial_folder + "/", 1)
    folder_creator(output_path + final_folder, 1)
    df = pd.read_csv(input_path, sep=',', header=0)
    df = df.set_index('Date')
    # keep only the "Close_X" columns (Date survives because it is the index)
    features = [feature for feature in df.columns if feature.startswith('Close')]
    df = df[features]
    # note: the inputs are already normalized, so no re-normalization here
    dataframes_out = []
    for crypto in crypto_in_the_cluster:
        df_out = pd.DataFrame(columns=["date", "observed_value", "predicted_value"])
        dataframes_out.append(df_out)
    for test_date in test_set:
        try:
            test_date = str_to_datetime(test_date)
            # the training set ends the day before the test date
            train_date = test_date - timedelta(days=1)
            train_date = datetime_to_str(train_date)
            test_date = datetime_to_str(test_date)
            # split into train and test using the date index
            df_train = df[:train_date]
            df_test = df[test_date:test_date].values[0]
            model = VAR(df_train)
            # TODO: proper selection of the best lag; for now AIC with maxlags=4
            results = model.fit(maxlags=4, ic='aic')
            lag_order = results.k_ar
            # to forecast, VAR expects the last lag_order observations;
            # "values" turns the dataframe into an ndarray
            data_for_forecasting = df_train.values[-lag_order:]
            num_of_days_to_predict = 1
            y_predicted = results.forecast(data_for_forecasting, steps=num_of_days_to_predict)[0]
            # collect observed and predicted values, per crypto
            i = 0
            for df_out in dataframes_out:
                dataframes_out[i] = df_out.append(
                    {'date': test_date,
                     'observed_value': df_test[i],
                     'predicted_value': y_predicted[i]},
                    ignore_index=True)
                i += 1
        except Exception as e:
            print('Error, possible cause: {}'.format(e))
    i = 0
    for df_out in dataframes_out:
        df_out.to_csv(output_path + partial_folder + "/" + crypto_in_the_cluster[i] + ".csv",
                      sep=",", index=False)
        i += 1
    # serialization: RMSE per crypto, then the average across the cluster
    rmses = []
    for crypto in os.listdir(output_path + partial_folder + "/"):
        df1 = pd.read_csv(output_path + partial_folder + "/" + crypto)
        rmse = get_rmse(df1['observed_value'], df1['predicted_value'])
        rmses.append(rmse)
        with open(os.path.join(output_path, final_folder, crypto.replace(".csv", "")), 'w+') as out:
            out.write(str(rmse))
    with open(os.path.join(output_path, final_folder, "average_rmse.txt"), 'w+') as out:
        out.write(str(np.mean(rmses)))
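# Minimal, self-contained sketch of the VAR step above: fit on the training
# slice, read the selected lag order, and forecast one step ahead from the
# last lag_order observations. The synthetic two-series data is an
# illustrative assumption.
#
# import numpy as np
# import pandas as pd
# from statsmodels.tsa.api import VAR
#
# rng = np.random.default_rng(0)
# data = pd.DataFrame({'Close_1': rng.normal(size=100).cumsum(),
#                      'Close_2': rng.normal(size=100).cumsum()})
# train, test_row = data.iloc[:-1], data.iloc[-1]
# results = VAR(train).fit(maxlags=4, ic='aic')
# lag_order = results.k_ar
# forecast = results.forecast(train.values[-lag_order:], steps=1)[0]
# print("observed:", test_row.values, "predicted:", forecast)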
def single_target(EXPERIMENT_PATH, DATA_PATH, TENSOR_DATA_PATH, window_sequences, list_num_neurons,
                  learning_rate, features_to_use, DROPOUT, EPOCHS, PATIENCE, BATCH_SIZE, test_set):
    #################### FOLDER SETUP ####################
    MODELS_PATH = "models"
    RESULT_PATH = "result"
    for crypto_name in os.listdir(DATA_PATH):
        # folders for data in tensor format and for results
        folder_creator(TENSOR_DATA_PATH + "/" + crypto_name, 0)
        folder_creator(EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name, 0)
        folder_creator(EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name, 0)
        for window, num_neurons in product(window_sequences, list_num_neurons):
            print('Current configuration: ')
            print("Crypto: ", crypto_name, "\t", "Window_sequence: ", window, "\t",
                  "Neurons: ", num_neurons)
            predictions_file = {'symbol': [], 'date': [], 'observed_class': [], 'predicted_class': []}
            macro_avg_recall_file = {'symbol': [], 'macro_avg_recall': []}
            # folders for this configuration: best model checkpoint and statistics (results)
            configuration_name = "LSTM_" + str(num_neurons) + "_neurons_" + str(window) + "_days"
            statistics = "stats"
            model_path = EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name + "/" + configuration_name + "/"
            results_path = EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name + "/" + configuration_name + "/" + statistics + "/"
            folder_creator(model_path, 0)
            folder_creator(results_path, 0)
            for date_to_predict in test_set:
                # dataset name format: CRYPTO_DATE_TO_PREDICT.csv
                dataset_name = crypto_name + "_" + str(date_to_predict) + ".csv"
                dataset, features_without_date = \
                    prepare_input_forecasting(os.path.join(DATA_PATH, crypto_name),
                                              dataset_name, features_to_use)
                dataset_tensor_format = fromtemporal_totensor(np.array(dataset), window,
                                                              TENSOR_DATA_PATH + "/" + crypto_name + "/",
                                                              crypto_name + "_" + date_to_predict)
                train, test = get_training_validation_testing_set(dataset_tensor_format, date_to_predict)
                index_of_target_feature = features_without_date.index('trend')
                # inputs: every day of the window except the last, and only the
                # features before the target column
                x_train = train[:, :-1, :index_of_target_feature]
                # target: the trend class of the last day of the window
                y_train = train[:, -1, index_of_target_feature]
                x_test = test[:, :-1, :index_of_target_feature]
                y_test = test[:, -1, index_of_target_feature]
                # change the data type, from object to float
                x_train = x_train.astype('float')
                x_test = x_test.astype('float')
                # one-hot encode the class labels
                y_train = to_categorical(y_train)
                y_test = to_categorical(y_test)
                # batch size must be a factor of the number of training elements
                if BATCH_SIZE is None:
                    BATCH_SIZE = x_train.shape[0]
                model, history = train_single_target_model(x_train, y_train,
                                                           num_neurons=num_neurons,
                                                           learning_rate=learning_rate,
                                                           dropout=DROPOUT,
                                                           epochs=EPOCHS,
                                                           batch_size=BATCH_SIZE,
                                                           patience=PATIENCE,
                                                           num_categories=len(y_train[0]),
                                                           date_to_predict=date_to_predict,
                                                           model_path=model_path)
                # plot the neural network's architecture
                plot_model(model, to_file=model_path + "neural_network.png", show_shapes=True,
                           show_layer_names=True, expand_nested=True, dpi=150)
                # (train/validation loss and accuracy plots are disabled)
                # predict for the current test date
                test_prediction = model.predict(x_test)
                # free memory between dates: this is important!
                K.clear_session()
                tf_core.random.set_seed(42)
                gc.collect()
                del model
                del dataset_tensor_format
                del dataset
                print("Num of entries for training: ", x_train.shape[0])
                # invert the one-hot encoding: argmax returns the class with
                # the highest value in the vector
                print("Predicting for: ", date_to_predict)
                print("Predicted: ", np.argmax(test_prediction))
                print("Actual: ", np.argmax(y_test))
                print("\n")
                # save the predictions into the dictionaries
                predictions_file['symbol'].append(crypto_name)
                predictions_file['date'].append(date_to_predict)
                predictions_file['observed_class'].append(np.argmax(y_test))
                predictions_file['predicted_class'].append(np.argmax(test_prediction))
            save_results(macro_avg_recall_file, crypto_name, predictions_file, results_path)
    return
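# The label round trip used above, in miniature: to_categorical one-hot
# encodes the trend classes, and np.argmax inverts the encoding on both the
# observed vector and the model's softmax output. The class coding and the
# probability values are illustrative assumptions.
#
# import numpy as np
# from tensorflow.keras.utils import to_categorical
#
# y = np.array([0, 2, 1])            # trend classes (assumed coding)
# y_onehot = to_categorical(y)       # shape (3, 3)
# probs = np.array([0.1, 0.2, 0.7])  # a softmax output for one test day
# print(np.argmax(y_onehot[1]))      # -> 2, recovers the original class
# print(np.argmax(probs))            # -> 2, the predicted class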