Example #1
def remove_outliers_one():
    folder_creator(PATH_CLEANED_FOLDER + "final/", 1)
    for crypto in os.listdir(PATH_COMPLETE_FOLDER):
        df=pd.read_csv(PATH_COMPLETE_FOLDER+crypto,sep=",",header=0)
        #df=cut_dataset_by_range(PATH_COMPLETE_FOLDER,crypto.replace(".csv",""),'2017-06-27','2019-12-31')
        #df_orig = cut_dataset_by_range(PATH_COMPLETE_FOLDER, crypto.replace(".csv", ""), '2017-08-22', '2019-12-31')
        df.to_csv(PATH_CLEANED_FOLDER + "final/" + crypto, sep=",", index=False)
        #df.to_csv(PATH_CLEANED_FOLDER + "final/" + crypto, sep=",", index=False)

        """low=0.20
        high=0.95
        res=df.Close.quantile([low,high])
        print(res)
        true_index=(res.loc[low] <= df.Close.values) & (df.Close.values <= res.loc[high])
        false_index=~true_index"""
        #df.Close=df.Close[true_index]
        i=0
        """for index in false_index:
            if index==True:
                if i!=0 and res.loc[low]<=df_orig.Close[i-1] and df_orig.Close[i-1]<=res.loc[high]:
                    df.Close[i]=df_orig.Close[i-1]
                elif i!=0 and df_orig.Close[i-1]<=res.loc[low]:
                    df.Close[i]=res.loc[low]
                elif i!=0 and df_orig.Close[i-1]>=res.loc[high]:
                    df.Close[i] = res.loc[high]
                else:
                    df.Close[i] =res.loc[low]
            i+=1"""
        #df[true_index]=df.Close[true_index]
        """print("Open")
Example #2
def missing_values(PATH_DATASET):
    folder_creator(PATH_DATA_UNDERSTANDING, 1)
    folder_creator(PATH_DATA_UNDERSTANDING + "missing_values_by_year/", 1)
    count_missing_values(PATH_DATASET)
    count_missing_values_by_year(PATH_DATASET)
    generate_bar_chart_by_year(PATH_DATA_UNDERSTANDING +
                               "missing_values_by_year/")
Example #3
def remove_outliers_dbscan():
    folder_creator(PATH_CLEANED_FOLDER+"/final",1)
    excluded_features = ['Date']
    for crypto in os.listdir(PATH_COMPLETE_FOLDER):
        #uses all features
        df=pd.read_csv(PATH_COMPLETE_FOLDER+crypto,sep=",",header=0)

        scaler = RobustScaler()
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(-1, 1))
                df[col] = pd.Series(normalized.reshape(-1))

        model = DBSCAN(eps=0.1, min_samples=18).fit(df.drop('Date',axis=1))

        print (len(df[model.labels_==-1].values))
        labels=model.labels_
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)
        print("numb of clusters: "+ str(n_clusters_))
        print("numb of outliers: "+ str(n_noise_) )

        #outliers
        #print(df[model.labels_==-1])

        #saving the not normalized one
        df = pd.read_csv(PATH_COMPLETE_FOLDER + crypto, sep=",", header=0)
        """df.Close[model.labels_ == -1]=np.median(df.Close[model.labels_ != -1])
        df.Open[model.labels_ == -1] = np.median(df.Open[model.labels_ != -1])
        df.High[model.labels_ == -1] = np.median(df.High[model.labels_ != -1])
        df.Low[model.labels_ == -1] = np.median(df.Low[model.labels_ != -1])"""
        #print(df[model.labels_==-1].Close)
        #print(model.labels_)
        df[model.labels_!=-1].to_csv(PATH_CLEANED_FOLDER+"final/"+crypto,sep=",",index=False)
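
# Illustrative sketch (an addition, not part of the original pipeline): DBSCAN
# labels noise points with -1, so outlier removal reduces to keeping the rows
# whose label is not -1. The toy data and the eps/min_samples values below are
# assumptions chosen only for this demonstration.
def _dbscan_outlier_demo():
    import pandas as pd
    from sklearn.cluster import DBSCAN
    toy = pd.DataFrame({"Close": [1.0, 1.1, 0.9, 1.05, 50.0],
                        "Volume": [10, 12, 11, 9, 500]})
    labels = DBSCAN(eps=3.0, min_samples=2).fit(toy).labels_
    # the last row is far from the others, gets label -1 and is dropped
    return toy[labels != -1]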
Example #4
def cut_datasets_for_clustering(input_path, output_path, start_date,
                                end_date_for_clustering):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = cut_dataset_by_range(input_path, crypto.replace(".csv", ""),
                                  start_date, end_date_for_clustering)
        df.to_csv(output_path + crypto, index=False)
Example #5
def comparison_macro_avg_recall_single_vs_baseline(input_path_single,input_path_baseline,output_path):
    folder_creator(output_path,0)
    df_report = pd.DataFrame()
    for crypto in os.listdir(input_path_single):
        # read baseline
        file = open(input_path_baseline + crypto+"_macro_avg_recall.txt", "r")
        macro_avg_recall_baseline = file.read()
        file.close()

        #find best configuration
        max_macro_avg_recall = -1
        config = ""
        for configuration in os.listdir(os.path.join(input_path_single, crypto)):
            df = pd.read_csv(os.path.join(input_path_single, crypto, configuration, "stats/macro_avg_recall.csv"), header=0)
            if df["macro_avg_recall"][0] > max_macro_avg_recall:
                max_macro_avg_recall = df["macro_avg_recall"][0]
                config = configuration

        #generate csv containing these info
        df_report = df_report.append(
            {'crypto': crypto, 'model type': 'single_target', 'macro_avg_recall': float(max_macro_avg_recall),
             'config': config},
            ignore_index=True)
        df_report=df_report.append({'crypto':crypto,'model type':'baseline','macro_avg_recall': float(macro_avg_recall_baseline),'config':'standard'},ignore_index=True)
    df_report.to_csv(os.path.join(output_path,"single_target_vs_baseline_report.csv"),index=False)
    comparison_macro_avg_recall_single_vs_baseline_plot(df_report,output_path)
Example #6
def quantile_transform2(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        qt = QuantileTransformer(n_quantiles=50,
                                 random_state=0,
                                 output_distribution="normal")
        for feature in df.columns.values:
            #todo: update this with the while loop here...
            if feature not in [
                    'Date', 'Open', 'High', 'Close', 'Low', 'Adj Close'
            ]:
                stat, p = stats.normaltest(df[feature])
                if p <= 0.05:
                    print('transforming:' + feature)
                    p = -1
                    n_t = 1
                    while p <= 0.05:
                        qt = QuantileTransformer(n_quantiles=n_t,
                                                 random_state=0,
                                                 output_distribution="normal")
                        quanrtil = qt.fit_transform(df[feature].values.reshape(
                            -1, 1))
                        new_values = pd.Series(quanrtil.reshape(-1))
                        stat, p = stats.normaltest(new_values)
                        if p > 0.05:
                            df[feature] = pd.Series(new_values)
                            print('num_quantiles:' + str(n_t))
                        elif (n_t < 100):
                            n_t += 1
                        else:
                            break

        df.to_csv(output_path + crypto, sep=",", index=False)
Example #7
def quantile_transform(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        print(crypto)
        df = pd.read_csv(input_path + crypto, sep=",", header=0)

        for feature in df.columns.values:
            if feature != "Date":
                print('transforming:' + feature)
                p = -1
                n_t = 1
                while p <= 0.05:
                    qt = QuantileTransformer(n_quantiles=n_t,
                                             random_state=0,
                                             output_distribution="normal")
                    quanrtil = qt.fit_transform(df[feature].values.reshape(
                        -1, 1))
                    new_values = pd.Series(quanrtil.reshape(-1))
                    stat, p = stats.normaltest(new_values)
                    if p > 0.05:
                        df[feature] = pd.Series(new_values)
                        print('num_quantiles:' + str(n_t))
                    else:
                        n_t += 1
        df.to_csv(output_path + crypto, sep=",", index=False)
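
# Minimal sketch of the idea behind quantile_transform above (an addition for
# clarity): n_quantiles is increased until scipy's normaltest no longer rejects
# normality (p > 0.05). The synthetic data, the starting value and the cap of
# 100 quantiles are assumptions for this demonstration only.
def _quantile_transform_demo():
    import numpy as np
    from scipy import stats
    from sklearn.preprocessing import QuantileTransformer
    values = np.random.exponential(scale=2.0, size=500).reshape(-1, 1)
    p, n_t = -1.0, 2
    while p <= 0.05 and n_t <= 100:
        qt = QuantileTransformer(n_quantiles=n_t, random_state=0,
                                 output_distribution="normal")
        transformed = qt.fit_transform(values).reshape(-1)
        _, p = stats.normaltest(transformed)
        n_t += 1
    return transformed, n_t - 1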
Example #8
def report_multi_target_k_oriented(path_single_target,types,output_path,percent):
    output_path = output_path + "multi_target_k_oriented/"
    df_report=pd.DataFrame()
    df=pd.read_csv(os.path.join(path_single_target,"single_vs_baseline_report.csv"))
    avg_baseline=df.loc[df['model type']=="baseline"]['macro_avg_recall']
    avg_single_target=df.loc[df['model type']=="single_target"]['macro_avg_recall']
    df_report=df_report.append({"Model":"baseline",'value':float(avg_baseline)},ignore_index=True)
    df_report=df_report.append({'Model': "single target", 'value': float(avg_single_target)},ignore_index=True)
    #now average for each k
    for k in types:
        path_multi_target = "../modelling/techniques/forecasting/outputs_multi_"+str(percent)+"/" + k + "/multi_target/"
        highest_by_crypto=[]
        for cluster in os.listdir(os.path.join(path_multi_target, "clusters/")):
            for crypto in os.listdir(os.path.join(path_multi_target, "clusters", cluster, "result")):
                highest_macro_avg_recall = -1
                for configuration in os.listdir(os.path.join(path_multi_target, "clusters", cluster, "result", crypto)):
                    df = pd.read_csv(
                        os.path.join(path_multi_target, "clusters", cluster, "result", crypto, configuration,
                                     "stats/macro_avg_recall.csv"))
                    value = df['macro_avg_recall'][0]
                    if value > highest_macro_avg_recall:
                        highest_macro_avg_recall = value
                highest_by_crypto.append(highest_macro_avg_recall)
        df_report = df_report.append({'Model': k, 'value': np.average(highest_by_crypto)},ignore_index=True)
    folder_creator(output_path, 0)
    df_report.to_csv(os.path.join(output_path, "multi_target_k_oriented.csv"), index=False)
    report_multi_target_k_oriented_plot(df_report,output_path)
Example #9
def save_clusters(input_path, clusters, k_used, CLUSTERING_PATH):
    dict_symbol_id = get_dict_symbol_id(CLUSTERING_PATH)
    folder_creator(CLUSTERING_PATH + "clusters/", 0)
    folder_creator(CLUSTERING_PATH + "clusters/" + k_used + "/", 1)
    df = pd.DataFrame(columns=['cluster_id', 'cryptos'])
    i = 0
    for cluster in clusters:
        cryptocurrencies = []
        for crypto_id in cluster:
            cryptocurrencies.append(dict_symbol_id.symbol[crypto_id])

        #folder_creator(CLUSTERING_PATH + "clusters/"+k_used+"/cluster_"+str(i)+"/", 1,)
        """for crypto in cryptocurrencies:
            for crypto_with_date in os.listdir(input_path):
                
                if crypto_with_date.startswith(crypto):
                    copyfile(input_path + crypto_with_date,
                         CLUSTERING_PATH + "clusters/" + k_used + "/cluster_"+str(i)+"/" + crypto_with_date)
        """
        df = df.append({
            'cluster_id': str(i),
            'cryptos': cryptocurrencies
        },
                       ignore_index=True)
        i += 1
    df.to_csv(CLUSTERING_PATH + "clusters/" + k_used + "/" + k_used + ".csv",
              sep=",",
              index=False)
Example #10
def power_transformation(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature not in ['Date']:
                df[feature], lam = boxcox(df[feature] + 0.1)
                #print('Feature: '+ feature + '\nLambda: %f' % lam)
        df.to_csv(output_path + crypto, sep=",", index=False)
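
# Illustrative note (added, not part of the original pipeline): scipy's boxcox
# requires strictly positive input, which is why a small offset (+0.1) is added
# above before transforming. Minimal sketch on synthetic values:
def _boxcox_demo():
    import numpy as np
    from scipy.stats import boxcox
    x = np.array([0.0, 1.5, 3.2, 7.8, 15.0])
    transformed, lam = boxcox(x + 0.1)  # the offset keeps every value > 0
    return transformed, lam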
Example #11
def report_multi_target_crypto_oriented(path_baseline,path_single_target,types,output_path,cryptocurrencies,percent):
    output_path=output_path + "multi_target_crypto_oriented/"
    i = 0
    report = str(i)
    while i< len(cryptocurrencies):
        save_baseline_single_target = True
        df_report = pd.DataFrame()
        for k in types:
            path_multi_target = "../modelling/techniques/forecasting/outputs_multi_"+str(percent)+"/" + k + "/multi_target/"
            for cluster in os.listdir(os.path.join(path_multi_target,"clusters/")):
                for crypto in os.listdir(os.path.join(path_multi_target,"clusters",cluster,"result")):
                    if crypto in cryptocurrencies[i]:
                        report=str(i)
                        if save_baseline_single_target:
                            file = open(path_baseline + crypto + "_macro_avg_recall.txt", "r")
                            macro_avg_recall_baseline = file.read()
                            df_report = df_report.append(
                                {'crypto': crypto, 'model type': "baseline",
                                 'macro_avg_recall': float(macro_avg_recall_baseline),
                                 'config':"standard"},
                                ignore_index=True)
                            file.close()

                            #find best for single target
                            highest_macro_avg_recall_single = -1
                            config_single = ""
                            for configuration in os.listdir(os.path.join(path_single_target, crypto)):
                                df = pd.read_csv(
                                    os.path.join(path_single_target, crypto, configuration, "stats/macro_avg_recall.csv"),
                                    header=0)
                                if df["macro_avg_recall"][0] > highest_macro_avg_recall_single:
                                    highest_macro_avg_recall_single = df["macro_avg_recall"][0]
                                    config_single = configuration
                            df_report =df_report.append(
                                {'crypto': crypto, 'model type': "single_target",
                                 'macro_avg_recall': float(highest_macro_avg_recall_single),
                                 'config': config_single},
                                ignore_index=True)
                        highest_macro_avg_recall=-1
                        best_conf=""
                        for configuration in os.listdir(os.path.join(path_multi_target,"clusters",cluster,"result",crypto)):
                            df= pd.read_csv(os.path.join(path_multi_target,"clusters",cluster,"result",crypto,configuration,"stats/macro_avg_recall.csv"))
                            value=df['macro_avg_recall'][0]
                            if value > highest_macro_avg_recall:
                                highest_macro_avg_recall=value
                                best_conf=configuration
                        df_report = df_report.append(
                            {'crypto': crypto, 'model type': k,
                             'macro_avg_recall': float(highest_macro_avg_recall),
                             'config': best_conf},
                            ignore_index=True)
            save_baseline_single_target = False
        i+=1
        folder_creator(output_path, 0)
        df_report.to_csv(os.path.join(output_path,"multi_target_crypto_oriented_"+report+".csv"),index=False)
        report_multi_target_crypto_oriented_plot(df_report,output_path,report)
Example #12
def remove_features(features_to_remove):
    folder_creator(PATH_PREPARATION_FOLDER + "selected/", 1)
    folder_creator(PATH_PREPARATION_FOLDER + "selected/less_features", 1)
    for crypto in os.listdir(PATH_MAIN_FOLDER):
        df = pd.read_csv(PATH_MAIN_FOLDER + crypto, delimiter=',', header=0)
        for feature in features_to_remove:
            del df[feature]
        df.to_csv(PATH_PREPARATION_FOLDER + "selected/less_features/" + crypto,
                  sep=",",
                  index=False)
Example #13
def find_by_dead_before():
    folder_creator(PATH_PREPARATION_FOLDER + "selected/" + "dead/", 1)
    for file in os.listdir(PATH_LESS_FEATURES):
        df = pd.read_csv(PATH_LESS_FEATURES + file, delimiter=',', header=0)
        df = df.set_index("Date")
        # dead before
        last_date = df.index[-1]
        if last_date != '2019-12-31':
            shutil.copy(PATH_LESS_FEATURES + file,
                        PATH_PREPARATION_FOLDER + "selected/dead/" + file)
Example #14
def integrate_with_lag(input_path):
    folder_creator(PATH_INTEGRATED_FOLDER, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=',', header=0)
        df["Date"] = pd.to_datetime(df["Date"])
        df['lag_1'] = df['Close'].shift(1)
        df['lag_2'] = df['Close'].shift(2)
        df['lag_3'] = df['Close'].shift(3)
        df['lag_7'] = df['Close'].shift(7)
        df = df.iloc[7:]
        df.to_csv(PATH_INTEGRATED_FOLDER + "/" + crypto, sep=",", index=False)
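
# Minimal sketch of the lag construction above (added for clarity): shift()
# produces NaN values in the first rows, which is why the head of the frame is
# dropped. Toy data only.
def _lag_features_demo():
    import pandas as pd
    toy = pd.DataFrame({"Close": [10, 11, 12, 13, 14, 15, 16, 17, 18]})
    toy["lag_1"] = toy["Close"].shift(1)
    toy["lag_7"] = toy["Close"].shift(7)
    return toy.iloc[7:]  # the first 7 rows contain NaNs from the largest shift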
Example #15
def standardization(input_path, output_path):
    folder_creator(output_path, 1)
    excluded_features = ['Date']
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, delimiter=',', header=0)
        scaler = StandardScaler()
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(
                    -1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        df.to_csv(output_path + crypto, sep=",", index=False)
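
# Minimal sketch of the per-column scaling pattern used above and in the other
# scaling helpers (added for clarity): fit_transform expects a 2-D array, hence
# the reshape(-1, 1) before scaling and reshape(-1) afterwards. Toy data only.
def _column_scaling_demo():
    import pandas as pd
    from sklearn.preprocessing import StandardScaler
    toy = pd.DataFrame({"Close": [1.0, 2.0, 3.0, 4.0]})
    scaled = StandardScaler().fit_transform(toy["Close"].values.reshape(-1, 1))
    toy["Close"] = pd.Series(scaled.reshape(-1))
    return toy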
Example #16
def power_transformation2(input_path, output_path):
    folder_creator(output_path, 1)
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, sep=",", header=0)
        for feature in df.columns.values:
            if feature not in [
                    'Date', 'Open', 'High', 'Close', 'Low', 'Adj Close',
                    'Volume'
            ]:
                df[feature], lam = boxcox(df[feature] + 0.0000001)
                """print("DUEEEE")
                print('Feature: '+ feature + '\nLambda: %f' % lam)"""
        df.to_csv(output_path + crypto, sep=",", index=False)
Example #17
def get_most_important_cryptos(cryptocurrencies, startdate, enddate):
    DATASET_NAME = "original"
    folder_creator("../acquisition/dataset", 1)
    DATASET_DIR = "../acquisition/dataset/" + DATASET_NAME
    folder_creator(DATASET_DIR, 1)
    currency = "-USD"
    #f = open("/crypto_symbols.txt", "r")
    #cryptos = f.readlines()
    for crypto in cryptocurrencies:
        #crypto = crypto.replace("\n", "")
        print("getting info about " + crypto)
        df = yahoo_finance_history(crypto + currency, startdate, enddate)
        df.to_csv(DATASET_DIR + "/" + crypto + ".csv", index=False)
Example #18
def min_max_one_minusone_scaling(input_path, output_path):
    folder_creator(output_path, 1)
    excluded_features = ['Date']
    for crypto in os.listdir(input_path):
        df = pd.read_csv(input_path + crypto, delimiter=',', header=0)
        scaler = MinMaxScaler(feature_range=(-1, 1))
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(
                    -1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        #todo: round to 8 decimal places, since the neural network takes floating-point numbers with this precision (df.round(8))
        df.to_csv(output_path + crypto, sep=",", index=False)
Example #19
def create_horizontal_dataset(data_path, output_path, test_set):
    cryptocurrencies_with_date_to_pred = os.listdir(data_path)
    cryptos_in_the_cluster = []
    already_created = False
    folder_creator(output_path + "horizontal_datasets" + "/", 0)
    print("Creating horizontal version")
    for date_to_predict in test_set:
        dictionary_m = {}
        dataframes = []
        #take the Date column just once
        for dataset_name in cryptocurrencies_with_date_to_pred:
            splitted = dataset_name.split("_")
            date_to_predict_crypto = str(splitted[1]).replace(".csv", "")
            if date_to_predict == date_to_predict_crypto:
                df_date = pd.read_csv(os.path.join(data_path, dataset_name))
                dataframes.append(df_date['Date'])
                break

        # creates Close_1, Open_1, etc. for each dataframe
        i = 1
        for dataset_name in cryptocurrencies_with_date_to_pred:
            splitted = dataset_name.split("_")
            crypto_name = splitted[0]
            date_to_predict_crypto = str(splitted[1]).replace(".csv", "")
            if date_to_predict == date_to_predict_crypto:
                df = pd.read_csv(os.path.join(data_path, dataset_name),
                                 header=0)
                if already_created == False:
                    cryptos_in_the_cluster.append(crypto_name)
                df = df.drop('Date', axis=1)
                df['symbol'] = crypto_name
                df = df.add_suffix('_' + str(i))
                i += 1
                #dictionary_m[crypto_name]=crypto_name
                dictionary_m[crypto_name + 'dataframe'] = df
        for crypt in cryptos_in_the_cluster:
            dataframes.append(dictionary_m.get(crypt + 'dataframe'))
        already_created = True
        #concat horizontally all the dataframes
        horizontal = pd.concat(dataframes, axis=1)
        #serialization
        horizontal.to_csv(output_path + "horizontal_datasets/horizontal_" +
                          date_to_predict + ".csv",
                          sep=",",
                          index=False)
        del horizontal
        del dataframes
        del dictionary_m
        print("Horizontal version created for the date: " +
              str(date_to_predict))
    return list(cryptos_in_the_cluster)
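
# Illustrative sketch of the horizontal merge above (added for clarity): each
# crypto's columns receive a numeric suffix and the frames are concatenated
# side by side. Toy data; the suffixes mirror the i counter used above.
def _horizontal_dataset_demo():
    import pandas as pd
    first = pd.DataFrame({"Close": [1.0, 1.2]}).add_suffix("_1")
    second = pd.DataFrame({"Close": [0.5, 0.6]}).add_suffix("_2")
    return pd.concat([first, second], axis=1)  # columns: Close_1, Close_2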
Example #20
def power_transformation_1(input_path, output_path):
    folder_creator(output_path, 1)
    for type_of_normalization in os.listdir(input_path):
        #todo remove this
        if type_of_normalization == "min_max_normalized":
            for crypto in os.listdir(input_path + type_of_normalization):
                df = pd.read_csv(input_path + type_of_normalization + "/" +
                                 crypto,
                                 sep=",",
                                 header=0)
                for feature in df.columns.values:
                    if feature != "Date":
                        df[feature] = boxcox(df[feature] + 0.0000001, 0.0)
                df.to_csv(output_path + crypto, sep=",", index=False)
Example #21
def generate_line_chart(experiment_folder, list_temporal_sequences,
                        list_neurons):
    cryptocurrencies = get_crypto_symbols_from_folder(experiment_folder +
                                                      "result/")

    merge_predictions(experiment_folder, "result")

    #create the folder which will contain the line chart
    for crypto in cryptocurrencies:
        folder_creator(
            experiment_folder + "/report/line_chart_images/" + crypto, 1)
        plot_actual_vs_predicted(
            experiment_folder + "/result/merged_predictions.csv", crypto,
            list_neurons, list_temporal_sequences,
            experiment_folder + "/report/line_chart_images/" + crypto + "/")
Example #22
def feature_selection(df, features, crypto_name, crypto_symbol, output_path):
    dfnew = pd.DataFrame()
    dfnew['Date'] = df['Date']
    #dfnew=dfnew.set_index("Date")
    #read the file that contains
    #add 1 to the number
    #crypto_symbol=crypto_symbol.set_index("id")
    df = df.drop("Date", axis=1)
    for f1, f2 in product(df.columns.values, df.columns.values):
        f1_splitted = f1.split("_")
        index = f1_splitted[len(f1_splitted) - 1]
        current_symbol = crypto_symbol.symbol[int(index) - 1]
        f1_replaced = f1.replace("_" + index, "")
        folder_creator(output_path + current_symbol + "/", 0)

        # folder_creator(output_path+"/", 1)
        f2_splitted = f2.split("_")
        index2 = f2_splitted[len(f2_splitted) - 1]
        f2_replaced = f2.replace("_" + index2, "")
        if f1 != f2 and f1_replaced == f2_replaced:
            second_symbol = crypto_symbol.symbol[int(index2) - 1]
            #print(f1[:-2]+"-"+f2[:-2])
            print(current_symbol + "-" + second_symbol)
            folder_creator(
                output_path + current_symbol + "/" + second_symbol + "/", 0)
            dfnew[str(f1)] = df[str(f1)]
            dfnew[str(f2)] = df[str(f2)]

            fig = plt.figure(figsize=(55, 8))
            ax = fig.add_subplot(1, 1, 1)
            dfnew = dfnew.set_index("Date")
            dfnew.plot(kind='line', ax=ax)

            #ax.set_xticklabels(dfnew.index.values, rotation=45, fontsize=11)
            plt.title(current_symbol + " VS " + second_symbol,
                      fontsize=20)  # for title
            #plt.xlabel("Date", fontsize=15)  # label for x-axis
            plt.savefig(output_path + current_symbol + "/" + second_symbol +
                        "/" + f1 + "_" + f2 + ".png",
                        dpi=200)
            plt.clf()
            # plt.ylabel(feature, fontsize=15)  # label for y-axis
            # plt.show()
            dfnew = dfnew.reset_index()
            dfnew = dfnew.drop(f1, axis=1)
            dfnew = dfnew.drop(f2, axis=1)
Example #23
def describe_new(PATH_DATASET,
                 output_path,
                 name_folder_res=None,
                 features_to_use=None):
    if name_folder_res == None:
        PATH_OUT = output_path + "descriptions/"
    else:
        PATH_OUT = output_path + "descriptions/" + name_folder_res + "/"

    folder_creator(PATH_OUT, 1)
    df = pd.read_csv(PATH_DATASET + "horizontal.csv", delimiter=',', header=0)
    crypto_symbol = pd.read_csv(PATH_DATASET + "symbol_id.csv", index_col=1)
    # df=cut_dataset_by_range(PATH_DATASET,crypto_name,'2017-07-20','2018-10-27')
    # if(crypto_name=="BTC"):
    PATH_CRYPTO = PATH_OUT + "horizontal/"
    folder_creator(PATH_CRYPTO + "feature_selection/", 1)
    feature_selection(df, df.columns.values, "horizontal", crypto_symbol,
                      PATH_CRYPTO + "feature_selection/")
Example #24
def input_missing_values():
    folder_creator(PATH_CLEANED_FOLDER+"final",1)
    #already_treated=['LKK.csv','FAIR.csv']
    """for crypto_symbol in os.listdir(PATH_CLEANED_FOLDER+"partial"):
        df = pd.read_csv(PATH_CLEANED_FOLDER+"partial/"+crypto_symbol, delimiter=',', header=0)
        already_treated.append(crypto_symbol)
        df=interpolate_with_time(df)
        df.to_csv(PATH_CLEANED_FOLDER + "final/" + crypto_symbol, sep=",", index=False)"""

    for crypto_symbol in os.listdir(PATH_UNCOMPLETE_FOLDER):
        df = pd.read_csv(PATH_UNCOMPLETE_FOLDER + crypto_symbol, delimiter=',', header=0)
        #if crypto_symbol not in already_treated:
        df = interpolate_with_time(df)
        df.to_csv(PATH_CLEANED_FOLDER + "final/" + crypto_symbol, sep=",", index=False)

    #merge with complete dataset
    for crypto_symbol in os.listdir(PATH_COMPLETE_FOLDER):
        shutil.copy(PATH_COMPLETE_FOLDER+ crypto_symbol, PATH_CLEANED_FOLDER+ "final/" + crypto_symbol)
Example #25
def max_abs_scaling(input_path, output_path):
    folder_creator(output_path, 1)
    excluded_features = ['Date', 'trend']
    for crypto in os.listdir(input_path):
        splitted = crypto.split("_")
        crypto_name = splitted[0]
        folder_creator(os.path.join(output_path, crypto_name), 0)
        df = pd.read_csv(os.path.join(input_path, crypto),
                         delimiter=',',
                         header=0)
        day_to_predict = df.loc[len(df.Date) - 1]
        df = df[:-1]  #remove the date to predict
        scaler = MaxAbsScaler()
        for col in df.columns:
            if col not in excluded_features:
                normalized = scaler.fit_transform(df[col].values.reshape(
                    -1, 1))
                df[col] = pd.Series(normalized.reshape(-1))
        df = df.append(day_to_predict, ignore_index=True)
        df.to_csv(os.path.join(output_path, crypto_name, crypto),
                  sep=",",
                  index=False)
Example #26
def find_uncomplete():
    folder_creator(PATH_PREPARATION_FOLDER + "selected/" + "uncomplete", 1)
    folder_creator(PATH_PREPARATION_FOLDER + "selected/" + "complete", 1)
    for file in os.listdir(PATH_LESS_FEATURES):
        df = pd.read_csv(PATH_LESS_FEATURES + file, delimiter=',', header=0)
        df = df.rename({'Adj Close': 'Adj_Close'}, axis=1)
        df = df.set_index("Date")
        #with null values
        if (df["Close"].isnull().any()):
            try:
                df.to_csv(PATH_PREPARATION_FOLDER + "selected/uncomplete/" +
                          file)
                #shutil.copy(PATH_LESS_FEATURES+file,PATH_PREPARATION_FOLDER+"selected/uncomplete/"+file)
            except:
                pass
        else:
            try:
                df.to_csv(PATH_PREPARATION_FOLDER + "selected/complete/" +
                          file)
                #shutil.copy(PATH_LESS_FEATURES + file, PATH_PREPARATION_FOLDER+ "selected/complete/" + file)
            except:
                pass
Example #27
def folders_setup():
    # Set the name of folder in which to save all intermediate results
    folder_creator(PATH_PREPROCESSED, 0)
Example #28
def single_target(EXPERIMENT_PATH, DATA_PATH, TENSOR_DATA_PATH,
                  window_sequences, list_num_neurons, learning_rate,
                  testing_set, features_to_use, DROPOUT, EPOCHS, PATIENCE,
                  number_of_days_to_predict, start_date, end_date):

    #################### FOLDER SETUP ####################
    MODELS_PATH = "models"
    RESULT_PATH = "result"
    TIME_PATH = "time"

    for crypto in os.listdir(DATA_PATH):

        crypto_name = crypto.replace(".csv", "")
        # create a folder for data in tensor format

        folder_creator(TENSOR_DATA_PATH + "/" + crypto_name, 0)
        # create a folder for results
        folder_creator(EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name,
                       1)
        folder_creator(EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name,
                       1)

        #create folder for time spent
        folder_creator(EXPERIMENT_PATH + "/" + TIME_PATH + "/" + crypto_name,
                       1)

        dataset, features, features_without_date = \
            prepare_input_forecasting(PREPROCESSED_PATH, DATA_PATH, crypto_name,start_date,end_date,None, features_to_use)

        start_time = time.time()
        for window, num_neurons in product(window_sequences, list_num_neurons):
            print('Current configuration: ')
            print("Crypto_symbol: ", crypto, "\t", "Window_sequence: ", window,
                  "\t", "Neurons: ", num_neurons)
            # print(np.array(dataset)[0]), takes the first row of the dataset (2018-01 2020...etc.)
            dataset_tensor_format = fromtemporal_totensor(
                np.array(dataset), window,
                TENSOR_DATA_PATH + "/" + crypto_name + "/", crypto_name)

            # DICTIONARY FOR STATISTICS
            predictions_file = {
                'symbol': [],
                'date': [],
                'observed_class': [],
                'predicted_class': []
            }
            macro_avg_recall_file = {'symbol': [], 'macro_avg_recall': []}

            # New folders for this configuration
            configuration_name = "LSTM_" + str(
                num_neurons) + "_neurons_" + str(window) + "_days"
            # Create a folder to save
            # - best model checkpoint
            # - statistics (results)
            statistics = "stats"
            model_path = EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name + "/" + configuration_name + "/"
            results_path = EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name + "/" + configuration_name + "/" + statistics + "/"
            folder_creator(model_path, 1)
            folder_creator(results_path, 1)

            accuracies = []
            # starting from the testing set
            for date_to_predict in testing_set:
                """the format of train and test is the following one:
                 [
                    [[Row1],[Row2]],
                    [[Row1],[Row2]],
                    ....
                    [[Row1],[Row2]],
                 ]
                thus for element accessing there are the following three indexes:
                  1)e.g [[items],[items]]
                  2)e.g [items],[items]
                  3)e.g items
                """
                #train, validation,test = get_training_validation_testing_set(dataset_tensor_format, date_to_predict)
                train, test = get_training_validation_testing_set(
                    dataset_tensor_format, date_to_predict,
                    number_of_days_to_predict)
                # e.g. each row looks like ['2018-01-01', <other values>]; the slicing below removes the date column.

                train = train[:, :, 1:]
                test = test[:, :, 1:]

                index_of_target_feature = features_without_date.index('Close')

                # print(index_of_target_feature)
                # remove the last day before the day to predict:
                # e.g. if the date to predict is 2019-01-07, the data about 2019-01-06 will be discarded.
                # e.g. [[items1],[items2],[items3]] becomes [[items1],[items2]]
                # also, the "Close" feature could be removed via the third index (1)
                # x_train= train[:, :-1, index_of_target_feature:]
                #todo: index_of_target_feature was used here, to be restored
                #index_of_target_feature+=1
                #x_train = train[:, :-number_of_days_to_predict, :index_of_target_feature]
                x_train = train[:, :-number_of_days_to_predict, :]
                print("X_TRAIN")
                print(x_train)
                print(x_train.shape)

                # keep only the last number_of_days_to_predict days as training targets
                # returns an array with the values of the target feature ("Close")

                y_train = train[:, -number_of_days_to_predict:,
                                index_of_target_feature]
                print("Y_TRAIN")
                print(y_train)
                print(y_train.shape)
                #print(y_train.shape)

                #x_val = validation[:, :-1, :]
                """print("X_VAL")
                print(x_val)
                print(x_val.shape)"""
                # remove the last day before the day to predict, by doing -1
                # returns an array with all the values of the feature close to predict!
                #y_val = validation[:, -1, index_of_target_feature]
                """print("Y_VAL")
                print(y_val)
                print(y_val.shape)"""
                # NOTE: the testing set must contain the dates used to evaluate the experiment, without the date to forecast
                # remove the day to predict
                # e.g. if the date to predict is 2019-01-07, the data about 2019-01-07 will be discarded.
                # e.g. [[items1],[items2],[items3]] becomes [[items1],[items2]]
                # todo: index_of_target_feature was used here
                #x_test = test[:, :-1, :]
                #x_test = test[:, :-number_of_days_to_predict, :index_of_target_feature]
                x_test = test[:, :-number_of_days_to_predict, :]
                print("X_TEST")
                print(x_test)
                #print(x_test.shape)
                # take the last number_of_days_to_predict days of the "Close" feature as the test targets
                y_test = test[:, -number_of_days_to_predict:,
                              index_of_target_feature]
                print("Y_TEST")
                print(y_test)
                #print(y_test.shape)

                # change the data type, from object to float
                # print(x_train[0][0])
                x_train = x_train.astype('float')
                # print(x_train[0][0])
                y_train = y_train.astype('float')
                x_test = x_test.astype('float')
                y_test = y_test.astype('float')
                #print(y_test)
                # one hot encode y
                #y_train  = to_categorical(y_train)
                #y_test = to_categorical(y_test)
                #print(y_train)
                #print(y_test)
                #print(np.argmax(y_test))
                #batch size must be a factor of the number of training elements
                BATCH_SIZE = x_train.shape[0]
                # if the date to predict is the first date in the testing_set
                #if date_to_predict == testing_set[0]:
                model, history = train_single_target_model(
                    x_train,
                    y_train,
                    num_neurons=num_neurons,
                    learning_rate=learning_rate,
                    dropout=DROPOUT,
                    epochs=EPOCHS,
                    batch_size=BATCH_SIZE,
                    patience=PATIENCE,
                    num_categories=len(y_train[0]),
                    date_to_predict=date_to_predict,
                    model_path=model_path)
                # plot neural network's architecture
                """plot_model(model, to_file=model_path + "neural_network.png", show_shapes=True,
                           show_layer_names=True, expand_nested=True, dpi=150)

                #plot loss
                filename="model_train_val_loss_bs_"+str(BATCH_SIZE)+"_target_"+str(date_to_predict)
                plot_train_and_validation_loss(pd.Series(history.history['loss']),pd.Series(history.history['val_loss']),model_path,filename)

                #plot accuracy
                filename = "model_train_val_accuracy_bs_" + str(BATCH_SIZE) + "_target_" + str(date_to_predict)
                plot_train_and_validation_accuracy(pd.Series(history.history['accuracy']),
                                               pd.Series(history.history['val_accuracy']), model_path, filename)"""

                # Predict for each date in the validation set
                test_prediction = model.predict(x_test)
                # this is important!!
                K.clear_session()
                tf_core.random.set_seed(42)

                # changing data types
                #test_prediction = float(test_prediction)
                #test_prediction=test_prediction.astype("float")

                print("Num of entries for training: ", x_train.shape[0])
                # print("Num of element for validation: ", x_test.shape[0])
                #print("Training until: ", pd.to_datetime(date_to_predict) - timedelta(days=3))

                days = []
                i = number_of_days_to_predict-1
                while i > 0:
                    d = pd.to_datetime(date_to_predict) - timedelta(days=i)
                    days.append(d)
                    i -= 1
                days.append(pd.to_datetime(date_to_predict))

                # invert encoding: numpy's argmax returns the index of the highest value in the array

                i=0
                for d in days:
                    print("Predicting for: ", d)
                    print("Predicted: ", np.argmax(test_prediction[i]))
                    print("Actual: ", np.argmax(y_test[i]))
                    i+=1
                print("\n")

                #todo RMSE AND ACCURACY
                # saving the accuracy on these predictions
                # Saving the predictions in the dictionaries
                i = 0
                for d in days:
                    predictions_file['symbol'].append(crypto_name)
                    predictions_file['date'].append(d)
                    predictions_file['observed_class'].append(np.argmax(y_test[i]))
                    predictions_file['predicted_class'].append(np.argmax(test_prediction[i]))
                    i += 1

            # Saving the accuracy into the dictionaries
            macro_avg_recall_file['symbol'].append(crypto_name)

            # accuracy
            performances= get_classification_stats(predictions_file['observed_class'], predictions_file['predicted_class'])
            macro_avg_recall_file['macro_avg_recall'].append(performances.get('macro avg').get('recall'))

            # serialization
            pd.DataFrame(data=predictions_file).to_csv(results_path + 'predictions.csv', index=False)
            pd.DataFrame(data=macro_avg_recall_file).to_csv(results_path + 'macro_avg_recall.csv', index=False)
            #confusion_matrix.to_csv(results_path + 'confusion_matrix.csv', index=False)
            #pd.DataFrame(data=performances).to_csv(results_path + 'performances.csv', index=False)
        time_spent=time.time() - start_time
        f=open(EXPERIMENT_PATH + "/" + TIME_PATH + "/" + crypto_name+"/"+"time_spent.txt","w+")
        f.write(str(time_spent))
        f.close()
    return
Example #29
def vector_autoregression(input_path, test_set, output_path,
                          crypto_in_the_cluster):
    folder_creator(output_path + partial_folder + "/", 1)
    folder_creator(output_path + final_folder, 1)

    df = pd.read_csv(input_path, sep=',', header=0)
    df = df.set_index('Date')

    #it takes only "Close_X" columns (note that Date is not cut off since it is an index)
    features = df.columns
    features = [feature for feature in features if feature.startswith('Close')]
    df = df[features]

    #min-max normalization: the values are already normalized, so in my opinion it makes no sense to normalize them again
    #df = df.apply(lambda x: (x - x.min()) / (x.max() - x.min()))
    """ print(df.head())
    print('features:', len(features))
    print('they are:', features)"""
    dataframes_out = []
    for crypto in crypto_in_the_cluster:
        df_out = pd.DataFrame(
            columns=["date", "observed_value", "predicted_value"])
        dataframes_out.append(df_out)

    for test_date in test_set:
        try:
            test_date = str_to_datetime(test_date)
            #get previous day (just 1)
            train_date = test_date - timedelta(days=1)

            train_date = datetime_to_str(train_date)
            test_date = datetime_to_str(test_date)
            """print('Last training day: {}'.format(train_date))
            print('Testing day: {}'.format(test_date))"""

            #splitting the dataset in test and train set, based on the date index
            df_train = df[:train_date]
            #select the row of the dataframe subject to test
            df_test = df[test_date:test_date].values[0]

            model = VAR(df_train)
            #todo: selection of the best lag...
            """for i in [1, 2, 3, 4]:
                results = model.fit(maxlags=i)
                print('Lag Order =', i)
                print('AIC : ', results.aic)
                print('BIC : ', results.bic)
                print('FPE : ', results.fpe)
                print('HQIC: ', results.hqic, '\n')"""

            results = model.fit(maxlags=4, ic='aic')
            #get the lag order
            lag_order = results.k_ar
            #print(lag_order)
            #data to forecast; note that "values" transforms the dataframe into a numpy array
            #takes the last lag_order rows:
            #in order to forecast, the model expects the last lag_order observations of past data
            data_for_forecasting = df_train.values[-lag_order:]
            #print(data_for_forecasting.shape)
            num_of_days_to_predict = 1
            y_predicted = results.forecast(data_for_forecasting,
                                           steps=num_of_days_to_predict)[0]

            #serialization, for each date
            #filename=os.path.join(output_path, partial_folder, '{}.csv'.format(test_date))
            """print(df_test)
            print(y_predicted)"""
            """df_out=pd.DataFrame()
            df_out['observed_value']=df_test
            df_out['predicted_value'] = y_predicted"""

            i = 0
            for df_out in dataframes_out:
                dataframes_out[i] = df_out.append(
                    {
                        'date': test_date,
                        'observed_value': df_test[i],
                        'predicted_value': y_predicted[i]
                    },
                    ignore_index=True)
                i += 1
        except Exception as e:
            print('Error, possible cause: {}'.format(e))

    i = 0
    for df_out in dataframes_out:
        df_out.to_csv(output_path + partial_folder + "/" +
                      crypto_in_the_cluster[i] + ".csv",
                      sep=",",
                      index=False)
        i += 1

    # serialization
    rmses = []
    for crypto in os.listdir(output_path + partial_folder + "/"):
        df1 = pd.read_csv(output_path + partial_folder + "/" + crypto)
        # get rmse for each crypto
        rmse = get_rmse(df1['observed_value'], df1['predicted_value'])
        rmses.append(rmse)

        with open(
                os.path.join(output_path, final_folder,
                             crypto.replace(".csv", "")), 'w+') as out:
            out.write(str(rmse))

    with open(os.path.join(output_path, final_folder, "average_rmse.txt"),
              'w+') as out:
        final = np.mean(rmses)
        out.write(str(final))
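
# Minimal sketch of the VAR forecasting step above (added for clarity): fit on
# the training history, then feed the last k_ar observations to forecast one
# step ahead. Synthetic data and a fixed lag order are assumptions here; the
# function above lets AIC choose the lag via fit(maxlags=4, ic='aic').
def _var_forecast_demo():
    import numpy as np
    import pandas as pd
    from statsmodels.tsa.api import VAR
    rng = np.random.default_rng(0)
    toy = pd.DataFrame(rng.normal(size=(60, 2)), columns=["Close_1", "Close_2"])
    results = VAR(toy).fit(2)  # fixed lag order for the demo
    data_for_forecasting = toy.values[-results.k_ar:]
    return results.forecast(data_for_forecasting, steps=1)[0]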
Example #30
def single_target(EXPERIMENT_PATH, DATA_PATH, TENSOR_DATA_PATH, window_sequences, list_num_neurons, learning_rate,
                  features_to_use, DROPOUT, EPOCHS, PATIENCE,BATCH_SIZE,test_set):

    #################### FOLDER SETUP ####################
    MODELS_PATH = "models"
    RESULT_PATH = "result"
    # starting from the testing set
    for crypto_name in os.listdir(DATA_PATH):
        # create a folder for data in tensor format
        folder_creator(TENSOR_DATA_PATH + "/" + crypto_name, 0)
        # create a folder for results
        folder_creator(EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name, 0)
        folder_creator(EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name, 0)
        for window, num_neurons in product(window_sequences, list_num_neurons):
            print('Current configuration: ')
            print("Crypto: ",crypto_name,"\t","Window_sequence: ", window, "\t", "Neurons: ", num_neurons)
            predictions_file = {'symbol': [], 'date': [], 'observed_class': [], 'predicted_class': []}
            macro_avg_recall_file = {'symbol': [], 'macro_avg_recall': []}
            # New folders for this configuration
            configuration_name = "LSTM_" + str(num_neurons) + "_neurons_" + str(window) + "_days"
            # Create a folder to save
            # - best model checkpoint
            # - statistics (results)
            statistics = "stats"
            model_path = EXPERIMENT_PATH + "/" + MODELS_PATH + "/" + crypto_name + "/" + configuration_name + "/"
            results_path = EXPERIMENT_PATH + "/" + RESULT_PATH + "/" + crypto_name + "/" + configuration_name + "/" + statistics + "/"
            folder_creator(model_path, 0)
            folder_creator(results_path, 0)
            for date_to_predict in test_set:
                #format of dataset name: Crypto_DATE_TO_PREDICT.csv
                dataset_name=crypto_name+"_"+str(date_to_predict)+".csv"
                dataset, features_without_date = \
                    prepare_input_forecasting(os.path.join(DATA_PATH,crypto_name),dataset_name,features_to_use)
                #print(dataset.dtypes)
                dataset_tensor_format = fromtemporal_totensor(np.array(dataset), window,
                                                              TENSOR_DATA_PATH + "/" + crypto_name + "/",
                                                              crypto_name+"_"+date_to_predict)

                #train, validation,test = get_training_validation_testing_set(dataset_tensor_format, date_to_predict)
                train, test = get_training_validation_testing_set(dataset_tensor_format, date_to_predict)

                index_of_target_feature = features_without_date.index('trend')

                x_train = train[:, :-1, :index_of_target_feature]
                """print("X_TRAIN")
                print(x_train)
                print(x_train.shape)"""

                y_train = train[:, -1, index_of_target_feature]
                """print("Y_TRAIN")
                print(y_train)
                print(y_train.shape)"""

                x_test = test[:, :-1, :index_of_target_feature]
                """print("X_TEST")
                print(x_test)
                print(x_test.shape)"""

                y_test = test[:, -1, index_of_target_feature]
                """print("Y_TEST")
                print(y_test)
                print(y_test.shape)"""

                # change the data type, from object to float
                x_train = x_train.astype('float')
                x_test = x_test.astype('float')

                # one hot encode y
                y_train  = to_categorical(y_train)
                y_test = to_categorical(y_test)
                """print(y_train)
                print(y_test)"""

                #batch size must be a factor of the number of training elements
                if BATCH_SIZE == None:
                    BATCH_SIZE = x_train.shape[0]

                model, history = train_single_target_model(x_train, y_train,
                                             num_neurons=num_neurons,
                                             learning_rate=learning_rate,
                                             dropout=DROPOUT,
                                             epochs=EPOCHS,
                                             batch_size=BATCH_SIZE,
                                             patience=PATIENCE,
                                             num_categories=len(y_train[0]),
                                             date_to_predict=date_to_predict,
                                             model_path=model_path)
                # plot neural network's architecture
                plot_model(model, to_file=model_path + "neural_network.png", show_shapes=True,
                           show_layer_names=True, expand_nested=True, dpi=150)

                #plot loss
                """filename="model_train_val_loss_bs_"+str(BATCH_SIZE)+"_target_"+str(date_to_predict)
                plot_train_and_validation_loss(pd.Series(history.history['loss']),pd.Series(history.history['val_loss']),model_path,filename)

                #plot accuracy
                filename = "model_train_val_accuracy_bs_" + str(BATCH_SIZE) + "_target_" + str(date_to_predict)
                plot_train_and_validation_accuracy(pd.Series(history.history['accuracy']),
                                               pd.Series(history.history['val_accuracy']), model_path, filename)"""

                # Predict for each date in the validation set
                test_prediction = model.predict(x_test)
                # this is important!!
                K.clear_session()
                tf_core.random.set_seed(42)
                gc.collect()
                del model
                del dataset_tensor_format
                del dataset

                print("Num of entries for training: ", x_train.shape[0])
                # invert encoding: numpy's argmax returns the index of the highest value in the array
                print("Predicting for: ", date_to_predict)
                print("Predicted: ", np.argmax(test_prediction))
                print("Actual: ", np.argmax(y_test))
                print("\n")

                # Saving the predictions in the dictionaries
                predictions_file['symbol'].append(crypto_name)
                predictions_file['date'].append(date_to_predict)
                predictions_file['observed_class'].append(np.argmax(y_test))
                predictions_file['predicted_class'].append(np.argmax(test_prediction))
            save_results(macro_avg_recall_file, crypto_name, predictions_file, results_path)
    return
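
# Illustrative sketch of the encode/decode step above (added for clarity): the
# trend class is one-hot encoded with to_categorical for training, and the
# predicted class is recovered with np.argmax. The import path is an assumption
# (the snippet above uses to_categorical without showing the import).
def _one_hot_demo():
    import numpy as np
    from tensorflow.keras.utils import to_categorical
    y = np.array([0, 2, 1, 2])
    y_encoded = to_categorical(y)    # shape (4, 3)
    return np.argmax(y_encoded[1])   # -> 2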