def plot_correlation_matrix(features, image_save_directory, total_values):
    # Select column values to use in the correlation plot
    feature_plot = list(range(10))
    # Select outcomes to show
    feature_plot.extend([-4, -3, -2, -1])
    print(feature_plot)
    print(total_values.columns[feature_plot])
    # http://benalexkeen.com/correlation-in-python/
    # https://stackoverflow.com/questions/26975089/making-the-labels-of-the-scatterplot-vertical-and-horizontal-in-pandas

    # Check if the matrix is numerically singular via its condition number
    if np.linalg.cond(
            total_values.iloc[:, feature_plot]) < 1 / sys.float_info.epsilon:
        m.rc_file_defaults()  # Reset sns
        axs = pd.plotting.scatter_matrix(total_values.iloc[:, feature_plot],
                                         figsize=(15, 15),
                                         alpha=0.2,
                                         diagonal='kde')
        n = len(total_values.iloc[:, feature_plot].columns)
        for i in range(n):
            for j in range(n):
                # to get the axis of subplots
                ax = axs[i, j]
                # to make x axis name vertical
                ax.xaxis.label.set_rotation(90)
                # to make y axis name horizontal
                ax.yaxis.label.set_rotation(0)
                # to make sure y axis names are outside the plot area
                ax.yaxis.labelpad = 50
        # plt.yticks(rotation=90)
        vis.save_figure(plt.gcf(),
                        image_save_directory=image_save_directory,
                        filename="Scatter-Matrix")
    else:
        warnings.warn("Inputmatrix is singular and cannot be calculated. ")
def print_characteristics(features_raw, image_save_directory, dataset_name, save_graphs=False):
    for i, d in enumerate(features_raw.dtypes):
        if is_string_dtype(d):
            print("Column {} is a categorical string".format(features_raw.columns[i]))
            s = features_raw[features_raw.columns[i]].value_counts() / features_raw.shape[0]
            fig = vis.paintBarChartForCategorical(s.index, s)
        else:
            print("Column {} is a numerical value".format(features_raw.columns[i]))
            fig = vis.paintHistogram(features_raw, features_raw.columns[i])

        plt.figure(fig.number)

        vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='feature_{}-{}'.format(i, features_raw.columns[i]))
def plot_correlation_bar(X_scaled, conf, image_save_directory, y_scaled):
    m.rc_file_defaults()  # Reset sns
    corr = X_scaled.corrwith(y_scaled[conf['Common'].get('class_name')],
                             axis=0)
    corr.sort_values().plot.barh(color='blue',
                                 title='Strength of Correlation',
                                 figsize=(10, 25))
    print(corr)
    plt.gcf()

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='Correlation_Strength')
def plot_spearman_correlation_matrix(image_save_directory, total_values):
    # http://benalexkeen.com/correlation-in-python/
    matfig = plt.figure(figsize=(20, 20))
    # Use Spearman correlation instead of Pearson to get a robust, rank-based correlation
    plt.matshow(
        total_values.corr(method='spearman'),
        fignum=matfig.number,
        cmap=plt.get_cmap('coolwarm')
    )
    plt.xticks(range(len(total_values.columns)), total_values.columns)
    plt.yticks(range(len(total_values.columns)), total_values.columns)
    plt.xticks(rotation=90)
    plt.colorbar()

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='Spearman_Correlation_Plot')
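def _example_spearman_correlation():
    # Hedged sketch (illustrative only, not part of the original module):
    # computing a Spearman (rank-based) correlation matrix with pandas, as used
    # above. The DataFrame is a made-up placeholder.
    import numpy as np
    import pandas as pd
    df = pd.DataFrame(np.random.rand(50, 3), columns=['a', 'b', 'c'])
    corr = df.corr(method='spearman')
    print(corr)
    return corr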
def execute_treebased_feature_selection(X_scaled, y, conf,
                                        image_save_directory):
    '''
    Select features with an ExtraTreesClassifier: fit the classifier, rank the
    features by impurity-based importance, plot the ranking and return the list
    of feature names whose importance exceeds a fixed threshold.
    '''

    print("Tree based feature selection")
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(X_scaled, y)
    print(clf.feature_importances_)
    print("Best score: %f" % clf.score(X_scaled, y))
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_scaled)
    print("Shape after SelectFromModel: {}".format(X_new.shape))

    threshold = 0.010
    tree_coef = pd.Series(clf.feature_importances_, index=X_scaled.columns)

    print("Tree search picked " + str(sum(tree_coef >= threshold)) +
          " variables and eliminated the other " +
          str(sum(tree_coef < threshold)) + " variables")
    imp_treecoef = tree_coef.sort_values()
    treecoefList = list(imp_treecoef[imp_treecoef >= threshold].index)

    print("Tree based coefficent list:\n", treecoefList)

    plt.figure()
    m.rcParams['figure.figsize'] = (8.0, 20.0)
    imp_treecoef.plot(kind="barh")
    plt.title("Feature importance using Tree Search Model")
    plt.vlines(threshold, 0, len(X_scaled.columns), color='red')
    plt.tight_layout()

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename="Tree_Based_Importance")

    #if image_save_directory:
    #    if not os.path.isdir(image_save_directory):
    #        os.makedirs(image_save_directory)
    #    plt.savefig(os.path.join(image_save_directory, conf['Common'].get('dataset_name') + '_Tree_Based_Importance'), dpi=300)

    #plt.show(block = False)

    return treecoefList
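def _example_treebased_selection_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # how the tree-based selection above might be called. X_scaled and y are
    # made-up placeholders; conf is unused inside the function body, and
    # vis.save_figure is assumed to tolerate image_save_directory=None.
    import numpy as np
    import pandas as pd
    X_scaled = pd.DataFrame(np.random.rand(200, 6),
                            columns=['f{}'.format(i) for i in range(6)])
    y = np.random.randint(0, 2, size=200)
    selected_columns = execute_treebased_feature_selection(
        X_scaled, y, conf=None, image_save_directory=None)
    return X_scaled[selected_columns]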
def find_tsne_parmeters(X_scaled_subset, y_subset, class_labels, conf,
                        image_save_directory):
    # Optimize t-sne plot
    #tne_gridsearch = False
    # Create a TSNE grid search with two variables
    perplex = [5, 10, 30, 50, 100]
    exaggregation = [5, 12, 20, 50, 100]
    # learning_rate = [10, 50, 200]
    fig, axarr = plt.subplots(len(perplex),
                              len(exaggregation),
                              figsize=(15, 15))
    #if tne_gridsearch == True:
    # for m,l in enumerate(learning_rate):
    for k, p in enumerate(perplex):
        # print("i {}, p {}".format(i, p))
        for j, e in enumerate(exaggregation):
            # print("j {}, e {}".format(j, e))
            X_embedded = TSNE(n_components=2,
                              perplexity=p,
                              early_exaggeration=e,
                              n_iter=5000,
                              n_iter_without_progress=1000,
                              learning_rate=10).fit_transform(X_scaled_subset)

            for i, t in enumerate(set(y_subset)):
                idx = y_subset == t
                axarr[k, j].scatter(X_embedded[idx, 0],
                                    X_embedded[idx, 1],
                                    label=class_labels[t])

            axarr[k, j].set_title("p={}, e={}".format(p, e))

            # clear_output(wait=True)
            print(
                'perplexity parameter k={}/{}, exaggeration parameter e={}/{}'
                .format(k, len(perplex), j, len(exaggregation)))
    fig.subplots_adjust(hspace=0.3)

    tsne_param_fig = plt.gcf()

    vis.save_figure(tsne_param_fig,
                    image_save_directory=image_save_directory,
                    filename=conf['Common'].get('dataset_name') +
                    '_TSNE_Calibration_Plot')
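def _example_tsne_parameter_search_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # the grid search above fits one t-SNE embedding per perplexity/exaggeration
    # pair, so keep the subset small. Data, labels and the minimal conf dict are
    # made-up placeholders; vis.save_figure is assumed to tolerate
    # image_save_directory=None.
    import numpy as np
    X_subset = np.random.rand(150, 8)
    y_subset = np.random.randint(0, 2, size=150)
    class_labels = {0: 'negative', 1: 'positive'}
    conf = {'Common': {'dataset_name': 'demo'}}
    find_tsne_parmeters(X_subset, y_subset, class_labels, conf,
                        image_save_directory=None)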
def generate_features_outcomes(image_save_directory, source):
    '''
    Outcome construction: generate the class values (the y for the data) from
    the stock source by detecting tops/bottoms and short/long trends, and merge
    them into a single outcomes DataFrame aligned with the source index.
    '''

    # Outcome and Feature Construction
    # Generate the class values, i.e. the y for the data. The resulting outcomes
    # DataFrame contains the 1d/5d/20d/long trends and the tops/bottoms signal.

    #Load only a subset of the whole raw data to create a debug dataset
    #source = custom(conf['source_path']).iloc[0:1000, :]

    #Plot source
    #plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    #plt.plot(source['Date'], source['Close'])
    #plt.title(conf['source_path'])
    #plt.show()

    bottoms, tops, latestTops, latestBottoms, fig_tops_bot, fig_latest_tops_latest_bot = stock.find_tops_bottoms(source)
    vis.save_figure(fig_tops_bot, image_save_directory=image_save_directory, filename=str(fig_tops_bot.axes[0].get_title()).replace(' ', '_'))
    vis.save_figure(fig_latest_tops_latest_bot, image_save_directory=image_save_directory, filename=str(fig_latest_tops_latest_bot.axes[0].get_title()).replace(' ', '_'))

    topsBottoms = define_tops_bottoms(bottoms, tops)

    pos_trend_long, fig_long = stock.calculate_lowess(source, 300)
    plt.gca()

    vis.save_figure(fig_long, image_save_directory=image_save_directory, filename=str(fig_long.axes[0].get_title()).replace(' ', '_'))
    #plt.show(block = False)

    pos_trend_short, fig_short = stock.calculate_lowess(source, 10)
    plt.gca()
    #plt.show(block = False)
    vis.save_figure(fig_short, image_save_directory=image_save_directory, filename=str(fig_short.axes[0].get_title()).replace(' ', '_'))

    y1day, y5day, y20day, ylong = calculate_y_signals(source, bottoms, tops, latestBottoms, latestTops, pos_trend_long, pos_trend_short)
    y1day, y5day, y20day, ylong = clean_bad_signals_1(y1day, y5day, y20day, ylong, source['Close'], latestBottoms, latestTops)
    y1day, y5day, y20day, ylong = clean_bad_signals_2(y1day, y5day, y20day, ylong, source['Close'], latestBottoms, latestTops)
    y1day, y5day, y20day, ylong = clean_bad_signals_3(y1day, y5day, y20day, ylong, source['Close'], latestBottoms, latestTops)

    # Merge all y values to the series start
    outcomes = pd.DataFrame(index=source.index).join(
        pd.Series(y1day, name="1dTrend").astype('int64')).join(
        pd.Series(y5day, name="5dTrend").astype('int64')).join(
        pd.Series(y20day, name="20dTrend").astype('int64')).join(
        pd.Series(ylong, name="LongTrend").astype('int64')).join(
        pd.Series(topsBottoms, name="TopsBottoms").astype('int64'))

    return outcomes
def plot_umap(X_scaled, class_labels, image_save_directory, y):
    # Use a supervised / unsupervised analysis to make the clusters

    sns.set(style='white', context='poster')
    # import umap
    embeddingUnsupervised = umap.UMAP(n_neighbors=5,
                                      random_state=42,
                                      init='random').fit_transform(X_scaled)

    if y is not None:
        embeddingSupervised = umap.UMAP(n_neighbors=5,
                                        random_state=42,
                                        init='random').fit_transform(X_scaled,
                                                                     y=y)
        vis.plotUmap(embeddingSupervised, y, list(class_labels.values()),
                     'Dataset supervised clustering')

        vis.save_figure(plt.gcf(),
                        image_save_directory=image_save_directory,
                        filename='UMAP_Supervised')
        print("Plot UMAP supervised")

        vis.plotUmap(embeddingUnsupervised,
                     y,
                     list(class_labels.values()),
                     'Dataset unsupervised clustering',
                     cmapString='RdYlGn')
        print("Plot UMAP unsupervised with class labels")
    else:
        warnings.warn("No y values.")
        vis.plotUmap(embeddingUnsupervised,
                     None,
                     None,
                     'Dataset unsupervised clustering',
                     cmapString='RdYlGn')
        print("Plot UMAP unsupervised without class labels")

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='UMAP_Unsupervised')
    print("Plot UMAP unsupervised")
def plot_t_sne(X_scaled_subset, y_scaled_subset, class_labels,
               image_save_directory):
    ### Visualize Data with t-SNE
    # Select a random subset to visualize
    import random
    # Reduce the training set with the number of samples randomly chosen
    # X_train_index_subset = sup.get_data_subset_index(1000, X_scaled)
    np.random.seed(0)
    # X_embedded = TSNE(n_components=2, perplexity=5.0, early_exaggeration=12.0, n_iter=5000,
    #                  n_iter_without_progress=1000, learning_rate=10).fit_transform(embedded)
    X_embedded = TSNE(n_components=2,
                      perplexity=10.0,
                      early_exaggeration=100.0,
                      n_iter=5000,
                      n_iter_without_progress=1000,
                      learning_rate=10).fit_transform(X_scaled_subset)
    #### Plot t-SNE with best parameters
    m.rc_file_defaults()  # Reset sns
    # Plot with texts added to the graphs
    # from adjustText import adjust_text
    #targets = np.array(y[X_train_index_subset]).flatten()
    plt.figure(figsize=(10, 10))
    texts = []

    if y_scaled_subset is not None and class_labels is not None:
        print("Plot t-sne with known classes")
        for i, t in enumerate(set(y_scaled_subset)):
            idx = y_scaled_subset == t
            # for x, y in zip(X_embedded[idx, 0], X_embedded[idx, 1]):
            # texts.append(plt.text(x, y, t))
            plt.scatter(X_embedded[idx, 0],
                        X_embedded[idx, 1],
                        label=class_labels[t])
        # adjust_text(texts, force_points=0.2, force_text=0.2, expand_points=(1,1), expand_text=(1,1), arrowprops=dict(arrowstyle="-", color='black', lw=0.5))
        plt.legend(bbox_to_anchor=(1, 1))
    else:
        print("Plot t-sne without known classes")
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1])

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='T-SNE_Plot')
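def _example_tsne_plot_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # plot_t_sne embeds an already subsampled and scaled dataset with fixed
    # t-SNE parameters and colours the points by class. The arrays below are
    # made-up placeholders; vis.save_figure is assumed to tolerate
    # image_save_directory=None.
    import numpy as np
    X_subset = np.random.rand(200, 8)
    y_subset = np.random.randint(0, 3, size=200)
    class_labels = {0: 'down', 1: 'flat', 2: 'up'}
    plot_t_sne(X_subset, y_subset, class_labels, image_save_directory=None)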
def plot_correlation_matrix2(conf, image_save_directory, total_values):

    # https://blog.insightdatascience.com/data-visualization-in-python-advanced-functionality-in-seaborn-20d217f1a9a6
    feature_plot = list(range(0, 10, 1))
    feature_plot.extend([-1])
    g = sns.pairplot(total_values.iloc[0:1000, feature_plot],
                     hue=conf['Common'].get('class_name'),
                     diag_kind="hist")
    # total_values.columns[-1]
    g.map_upper(sns.regplot)
    g.map_lower(sns.residplot)
    g.map_diag(plt.hist)
    for ax in g.axes.flat:
        plt.setp(ax.get_xticklabels(), rotation=45)
    # FIXME: The legend is shown incorrectly: only class numbers instead of class names
    g.add_legend()
    g.set(alpha=0.5)

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='Pairplot')
def plot_hierarchical_linkage(X_scaled, conf, image_save_directory):
    '''
    Plot a hierarchically clustered heatmap (clustermap) of the feature
    correlation matrix, grouping features with similar correlation patterns.
    '''
    corr_matrix = X_scaled.corr()
    correlations_array = np.asarray(corr_matrix)
    linkage = hierarchy.linkage(distance.pdist(correlations_array),
                                method='average')
    g = sns.clustermap(corr_matrix,
                       row_linkage=linkage,
                       col_linkage=linkage,
                       row_cluster=True,
                       col_cluster=True,
                       figsize=(8, 8),
                       cmap=plt.get_cmap('coolwarm'))
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)

    label_order = corr_matrix.iloc[:, g.dendrogram_row.reordered_ind].columns

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='Hierarchical_Linkage')
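def _example_hierarchical_linkage_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # the clustermap above groups features by the similarity of their pairwise
    # correlations. X_scaled is a made-up placeholder; conf is unused inside the
    # function body, and vis.save_figure is assumed to tolerate
    # image_save_directory=None.
    import numpy as np
    import pandas as pd
    X_scaled = pd.DataFrame(np.random.rand(100, 6),
                            columns=['f{}'.format(i) for i in range(6)])
    plot_hierarchical_linkage(X_scaled, conf=None, image_save_directory=None)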
def execute_lasso_feature_selection(X_scaled, y, conf, image_save_directory):
    '''
    Select features with LassoCV: fit a cross-validated Lasso regression, keep
    the features with non-zero coefficients, plot the coefficient ranking and
    return the list of selected feature names.
    '''

    print("Feature selection with lasso regression")
    reg = LassoCV(cv=10, max_iter=100000)
    reg.fit(X_scaled, y)
    coef = pd.Series(reg.coef_, index=X_scaled.columns)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X_scaled, y))
    print("Lasso picked " + str(sum(coef != 0)) +
          " variables and eliminated the other " + str(sum(coef == 0)) +
          " variables")
    imp_coef = coef.sort_values()
    coefList = list(imp_coef[imp_coef != 0].index)
    print("Lasso coefficient list\n:", coefList)

    # plt.figure()
    m.rcParams['figure.figsize'] = (8.0, 20.0)
    imp_coef.plot(kind="barh")
    plt.title("Feature importance using Lasso Model")
    plt.tight_layout()

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename="Lasso_Model_Weights")

    #if image_save_directory:
    #    if not os.path.isdir(image_save_directory):
    #        os.makedirs(image_save_directory)
    #    plt.savefig(os.path.join(image_save_directory, conf['Common'].get('dataset_name') + '_Lasso_Model_Weights'), dpi=300)

    #plt.show(block = False)

    return coefList
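def _example_lasso_selection_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # how the Lasso-based selection above might be called. X_scaled and y are
    # made-up placeholders; conf is unused inside the function body, and
    # vis.save_figure is assumed to tolerate image_save_directory=None.
    import numpy as np
    import pandas as pd
    X_scaled = pd.DataFrame(np.random.rand(200, 5),
                            columns=['f{}'.format(i) for i in range(5)])
    y = np.random.randint(0, 2, size=200)
    selected_columns = execute_lasso_feature_selection(
        X_scaled, y, conf=None, image_save_directory=None)
    return X_scaled[selected_columns]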
def plot_pca(X_scaled, class_labels, image_save_directory, y):

    m.rc_file_defaults()  # Reset sns
    pca_trafo = PCA().fit(X_scaled)
    pca_values = pca_trafo.transform(X_scaled)
    # from adjustText import adjust_text
    targets = np.array(y).flatten()
    fig, ax1 = plt.subplots(figsize=(10, 8))
    plt.semilogy(pca_trafo.explained_variance_ratio_, '--o')
    ax2 = ax1.twinx()  # instantiate a second axes that shares the same x-axis
    plt.semilogy(pca_trafo.explained_variance_ratio_.cumsum(),
                 '--o',
                 color='green')
    plt.xlabel("Principal Component")
    plt.ylabel("Explained variance")
    plt.xticks(np.arange(0, len(pca_trafo.explained_variance_ratio_)))
    plt.hlines(0.95,
               0,
               len(pca_trafo.explained_variance_ratio_.cumsum()),
               colors='red',
               linestyles='solid',
               label='95% variance covered')

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='PCA_Variance_Coverage')

    fig = plt.figure()
    sns.heatmap(np.log(pca_trafo.inverse_transform(np.eye(X_scaled.shape[1]))),
                cmap="hot",
                cbar=True)
    necessary_components = pca_trafo.explained_variance_ratio_.cumsum()[
        pca_trafo.explained_variance_ratio_.cumsum() < 0.95]
    print(
        "95% variance covered with the {} first components. Values={}".format(
            len(necessary_components), necessary_components))

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='PCA_Heatmap')

    plt.figure(figsize=(10, 10))
    # plt.scatter(pca_values[:,0], pca_values[:,1], c=targets, edgecolor='none', label=class_labels.values(), alpha=0.5)
    for i, t in enumerate(set(targets)):
        idx = targets == t
        plt.scatter(pca_values[idx, 0],
                    pca_values[idx, 1],
                    label=class_labels[t],
                    edgecolor='none',
                    alpha=0.5)
    plt.legend(labels=class_labels.values(), bbox_to_anchor=(1, 1))
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='PCA_Plot')
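def _example_pca_variance_threshold():
    # Hedged sketch (illustrative only, not part of the original module): a
    # minimal version of the 95%-variance check used above, on made-up data.
    import numpy as np
    from sklearn.decomposition import PCA
    data = np.random.rand(200, 10)
    pca = PCA().fit(data)
    cumulative = pca.explained_variance_ratio_.cumsum()
    n_components = int(np.argmax(cumulative >= 0.95)) + 1
    print("{} components cover 95% of the variance".format(n_components))
    return n_components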
def execute_search_iterations_random_search_SVM(X_train,
                                                y_train,
                                                init_parameter_svm,
                                                pipe_run_random,
                                                scorers,
                                                refit_scorer_name,
                                                iter_setup,
                                                save_fig_prefix=None):
    '''
    Iterated search for SVM parameters. Set the sample size, k-folds, number of
    iterations and top-result selection per run. Execute a randomized search CV
    for each configuration and narrow the parameter ranges from the best
    results. As a result, the best C and gamma are extracted.

    :args:
        X_train: Training data, features X
        y_train: Training labels, ground truth y
        init_parameter_svm: Initial SVM parameter ranges for C and gamma
        pipe_run_random: ML pipeline
        scorers: Scorers to use
        refit_scorer_name: Refit scorer
        iter_setup: Dict with the lists 'samples', 'kfolds', 'iter' and 'selection'
        save_fig_prefix: Directory prefix for images from the analysis

    :return:
        param_final: Final parameters C and gamma
        results_random_search: Results of the last random search
    '''

    # Iterated pipeline with increasing number of tries

    sample_size = list(
        (np.array(iter_setup['samples']) * X_train.shape[0]).astype(int))
    kfolds = iter_setup['kfolds']
    number_of_iterations = iter_setup['iter']
    select_from_best = iter_setup['selection']

    combined_parameters = zip(sample_size, kfolds, number_of_iterations,
                              select_from_best)

    new_parameter_rand = init_parameter_svm  # Initialize the system with the parameter borders

    for i, combination in enumerate(combined_parameters):
        sample_size, folds, iterations, selection = combination
        print(
            "Start random optimization run {} with the following parameters: ".
            format(i))
        print("Sample size: ", sample_size)
        print("Number of folds: ", folds)
        print("Number of tries: ", iterations)
        print("Number of best results to select from: ", selection)

        # Run random search
        new_parameter_rand, results_random_search, clf = exe.run_random_cv_for_SVM(
            X_train,
            y_train,
            new_parameter_rand,
            pipe_run_random,
            scorers,
            refit_scorer_name,
            number_of_samples=sample_size,
            kfolds=folds,
            n_iter_search=iterations,
            plot_best=selection)
        print("Got best parameters: ")
        print(new_parameter_rand)

        # Display random search results
        ax = svmvis.visualize_random_search_results(
            clf,
            refit_scorer_name,
            param_x='param_model__C',
            param_y='param_model__gamma')
        ax_enhanced = svmvis.add_best_results_to_random_search_visualization(
            ax, results_random_search, selection)

        plt.gca()
        plt.tight_layout()

        vis.save_figure(plt.gcf(),
                        image_save_directory=save_fig_prefix,
                        filename='run2_subrun_' + str(i) + '_samples' +
                        str(sample_size) + '_fold' + str(folds) + '_iter' +
                        str(iterations) + '_sel' + str(selection))

        # plt.savefig(save_fig_prefix + '_' + 'run2_subrun_' + str(i) + '_samples' + str(sample_size) + '_fold'
        #            + str(folds) + '_iter' + str(iterations) + '_sel' + str(selection), dpi=300)
        # plt.show(block = False)
        # plt.pause(0.01)
        # plt.close()

        print(
            "===============================================================")

    ##
    print("Best parameter limits: ")
    print(new_parameter_rand)

    print("Best results: ")
    print(results_random_search.round(3).head(10))

    param_final = {}
    param_final['C'] = results_random_search.iloc[0]['param_model__C']
    param_final['gamma'] = results_random_search.iloc[0]['param_model__gamma']

    # param_final = new_parameter_rand[0]
    print("Hyper parameters found")
    print(param_final)

    return param_final, results_random_search
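def _example_random_search_iter_setup():
    # Hedged sketch (illustrative only, not part of the original module): the
    # iter_setup dictionary consumed above zips four equally long lists; the
    # keys follow the accesses in the function ('samples', 'kfolds', 'iter',
    # 'selection') and the concrete values here are made up.
    iter_setup = {
        'samples': [0.1, 0.3, 1.0],   # fractions of X_train used per run
        'kfolds': [3, 3, 5],          # cross-validation folds per run
        'iter': [20, 50, 100],        # random-search iterations per run
        'selection': [10, 10, 5]      # number of best results to select from
    }
    return iter_setup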
def run_training_estimation(X_train,
                            y_train,
                            X_test,
                            y_test,
                            scorer,
                            model_clf,
                            image_save_directory=None):
    '''
    Estimate the scorer value (default f1) and the training duration as a
    function of the training subset size.

    :args:
        X_train: Training data
        y_train: Training labels as numbers
        X_test: Test data
        y_test: Test labels as numbers
        scorer: Scorer for the evaluation, default f1
        model_clf: Model/classifier to evaluate
        image_save_directory: Directory for saving the plots
    :return:
        Nothing

    '''
    # Estimate training duration
    # run_training_estimation = True

    #if run_training_estimation==True:
    #Set test range
    test_range = list(range(100, 6500 + 1, 500))
    #test_range = list(range(100, 1000, 200))
    print("Test range", test_range)

    # SVM model
    # Define the model

    xaxis, durations, scores = exe.estimate_training_duration(
        model_clf, X_train, y_train, X_test, y_test, test_range, scorer)

    # Paint figure
    plt.figure()
    plt.plot(xaxis, durations)
    plt.xlabel('Number of training examples')
    plt.ylabel('Duration [s]')
    plt.title("Training Duration")

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='Duration_Samples')

    #if image_save_directory:
    #    if not os.path.isdir(image_save_directory):
    #        os.makedirs(image_save_directory)
    #    plt.savefig(os.path.join(image_save_directory, 'SVM_Duration_Samples'), dpi=300)

    #plt.show(block = False)
    #plt.pause(0.1)
    #plt.close()

    plt.figure()
    plt.plot(xaxis, scores)
    plt.xlabel('Number of training examples')
    plt.ylabel('F1-Score on cross validation set (=the rest). Size={}'.format(
        X_test.shape[0]))
    plt.title("F1 Score Improvement With More Data")

    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='F1_Samples')
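def _example_training_estimation_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # estimating score and training duration over growing training subsets with
    # an SVM. Data, model and scorer are made-up placeholders; the fixed
    # test_range inside the function assumes a few thousand training samples,
    # and vis.save_figure is assumed to tolerate image_save_directory=None.
    import numpy as np
    from sklearn.svm import SVC
    from sklearn.metrics import make_scorer, f1_score
    X_train = np.random.rand(7000, 5)
    y_train = np.random.randint(0, 2, size=7000)
    X_test = np.random.rand(1000, 5)
    y_test = np.random.randint(0, 2, size=1000)
    scorer = make_scorer(f1_score)
    run_training_estimation(X_train, y_train, X_test, y_test, scorer,
                            SVC(kernel='rbf'), image_save_directory=None)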
def adapt_features_for_model(features_cleaned1, outcomes_cleaned1, result_dir,
                             class_labels, conf):

    ## Prepare the Feature Columns

    # === Replace placeholder signs for missing values or other special values ===
    features = features_cleaned1.copy()

    # Custom replacements: only replace if there is something to replace, otherwise the map produces NaN
    # value_replacements = {
    #    'n': 0,
    #    'y': 1,
    #    'unknown': np.NAN
    # }

    # === Replace all custom values and missing values with content from the value_replacement
    for col in features.columns:
        # df_dig[col] = df[col].map(value_replacements)
        # df_dig[col] = df[col].replace('?', np.nan)

        # Everything to numeric
        features[col] = pd.to_numeric(features[col])
        # df_dig[col] = np.int64(df_dig[col])

    print(features.head(5))

    # Create one-hot-encoding for certain classes and replace the original class
    # onehotlabels = pd.get_dummies(df_dig.iloc[:,1])

    # Add one-hot-encondig columns to the dataset
    # for i, name in enumerate(onehotlabels.columns):
    #    df_dig.insert(i+1, column='Cylinder' + str(name), value=onehotlabels.loc[:,name])

    # Remove the original columns
    # df_dig.drop(columns=['cylinders'], inplace=True)

    ## Prepare the Outcomes if they exist
    if outcomes_cleaned1 is not None:
        # Replace classes with digital values
        outcomes = outcomes_cleaned1.copy()
        outcomes = outcomes.astype(int)
        print("Outcome types")
        print(outcomes.dtypes)

        ### Binarize Multiclass Dataset
        # If the binarize setting is used, then binarize the class of the outcome.
        if conf['Preparation'].getboolean('binarize_labels'):
            binarized_outcome = (
                outcomes[conf['Common'].get('class_name')] ==
                conf['Preparation'].getint('class_number')).astype(int)
            y = binarized_outcome.values.flatten()
            print("y was binarized. Classes before: {}. Classes after: {}".
                  format(np.unique(outcomes[conf['Common'].get('class_name')]),
                         np.unique(y)))

            # Redefine class labels
            class_labels = {
                0: conf['Preparation'].get('binary_0_label'),
                1: conf['Preparation'].get('binary_1_label')
            }

            print("Class labels redefined to: {}".format(class_labels))
            print("y labels: {}".format(class_labels))
        else:
            y = outcomes[conf['Common'].get('class_name')].values.flatten()
            print("No binarization was made. Classes: {}".format(np.unique(y)))

        print("y shape: {}".format(y.shape))
        print("y unique classes: {}".format(np.unique(y, axis=0)))
    else:
        y = None
        class_labels = None

    ## Determine Missing Data
    #Missing data is only visualized here as it is handled in the training algorithm in S40.

    # Check if there are any nulls in the data
    print("Missing data in the features: ", features.isnull().values.sum())
    # Show rows that contain at least one missing value
    print(features[features.isna().any(axis=1)])

    # Missing data part
    print("Number of missing values per feature")
    missingValueShare = []
    for col in features.columns:
        # if is_string_dtype(df_dig[col]):
        missingValueShare.append(sum(features[col].isna()) / features.shape[0])

    # Print missing value graph
    vis.paintBarChartForMissingValues(features.columns, missingValueShare)
    barplot = plt.gcf()
    vis.save_figure(plt.gcf(),
                    image_save_directory=result_dir,
                    filename=str(barplot.axes[0].get_title()).replace(
                        ' ', '_'))

    # Visualize missing data with missingno
    #fig = plt.figure(num=None, figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')
    msno.matrix(features)
    fig_matrix = plt.gcf()
    vis.save_figure(fig_matrix,
                    image_save_directory=result_dir,
                    filename='missing_numbers_matrix')

    #plt.savefig(os.path.join(result_dir,'_missing_numbers_matrix'))
    #plt.show(block = False)

    if features.isnull().values.sum() > 0:
        plt.gcf()
        msno.heatmap(features)
        vis.save_figure(plt.gcf(),
                        image_save_directory=result_dir,
                        filename='missing_numbers_heatmap')
        #plt.savefig(os.path.join(result_dir, '_missing_numbers_heatmap'))
        #plt.show(block = False)

    #### View Prepared Binary Features
    # We need some more plots for the binary data types.

    # vis.plotBinaryValues(df_dig, df_dig.columns) #0:-1
    # plt.savefig(image_save_directory + "/BinaryFeatures.png", dpi=70)

    return features, y, class_labels
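def _example_label_binarization():
    # Hedged sketch (illustrative only, not part of the original module): a
    # minimal version of the label binarization performed above, turning a
    # multiclass outcome column into a 0/1 vector for one chosen class. The
    # data is a made-up placeholder.
    import numpy as np
    import pandas as pd
    outcomes = pd.DataFrame({'LongTrend': [1, 2, 3, 2, 1, 3]})
    class_number = 2
    y = (outcomes['LongTrend'] == class_number).astype(int).values.flatten()
    print("Classes before: {}. Classes after: {}".format(
        np.unique(outcomes['LongTrend']), np.unique(y)))
    return y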
def main(config_path):
    conf = sup.load_config(config_path)
    # Load annotations file
    y_labels = pd.read_csv(conf['Paths'].get('source_path'), sep=';', header=None).set_index(0).to_dict()[1]

    # Generating filenames for saving the files
    image_save_directory = os.path.join(conf['Paths'].get('results_directory'), "data_generation")
    outcomes_filename_raw = os.path.join(conf['Paths'].get('prepared_data_directory'), "temp", "temp_outcomes_uncut" + ".csv")

    #if os.path.isdir(conf['Paths'].get('prepared_data_directory'))==False:
    os.makedirs(os.path.dirname(outcomes_filename_raw), exist_ok=True)
    #    print("Created directory ", conf['Paths'].get('training_data_directory'))

    #if os.path.isdir(conf['Paths'].get('result_directory'))==False:
    #os.makedirs(conf['Paths'].get('result_directory'), exist_ok=True)
    #    print("Created directory ", conf['Paths'].get('result_directory'))

    #Load only a subset of the whole raw data to create a debug dataset
    source = custom.load_source(conf['Paths'].get('source_path')) #.iloc[0:1000, :]

    #Plot source
    plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(source['Date'], source['Close'])
    plt.title(conf['Paths'].get('source_path'))
    #plt.show(block = False)

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename="Source_data")

    #y_labels = annotations #generate_custom_class_labels()
    outcomes = generate_features_outcomes(image_save_directory, source)

    # Drop the last 50 rows: they cannot be used for training because the prediction looks 50 days ahead
    source_cut = source.drop(source.tail(50).index, inplace=False)
    outcomes_cut = outcomes.drop(outcomes.tail(50).index, inplace=False)

    vis.plot_three_class_graph(outcomes_cut['1dTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_1dTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['5dTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_5dTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['20dTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_20dTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['LongTrend'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_LongTrend',
                               save_fig_prefix=image_save_directory)

    vis.plot_three_class_graph(outcomes_cut['TopsBottoms'].values,
                               source_cut['Close'], source_cut['Date'],
                               0,0,0, ('close', 'neutral', 'top', 'bottom'),
                               title=conf['Common'].get('dataset_name') + '_GT_TopsBottoms',
                               save_fig_prefix=image_save_directory)

    def binarize(outcomes, class_number):
        return (outcomes == class_number).astype(int)

    vis.plot_two_class_graph(binarize(outcomes_cut['1dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_1dTrend',
                             save_fig_prefix=image_save_directory)

    vis.plot_two_class_graph(binarize(outcomes_cut['5dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_5dTrend',
                             save_fig_prefix=image_save_directory)

    vis.plot_two_class_graph(binarize(outcomes_cut['20dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_20dTrend',
                             save_fig_prefix=image_save_directory)

    vis.plot_two_class_graph(binarize(outcomes_cut['LongTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'],
                             0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_LongTrend',
                             save_fig_prefix=image_save_directory)

    # Save file
    # Save outcomes to a csv file
    print("Outcomes shape {}".format(outcomes_cut.shape))
    outcomes_cut.to_csv(outcomes_filename_raw, sep=';', index=True, header=True)
    print("Saved outcomes to " + outcomes_filename_raw)
def plot_parallel_coordinates(df, cols, colours, comparison_name, conf,
                              image_save_directory):
    x = [i for i, _ in enumerate(cols)]

    # create dict of categories: colours
    colours = {
        df[comparison_name].astype('category').cat.categories[i]: colours[i]
        for i, _ in enumerate(df[comparison_name].astype(
            'category').cat.categories)
    }

    # Create len(x) - 1 subplots along the x axis
    fig, axes = plt.subplots(1, len(x) - 1, sharey=False, figsize=(15, 5))

    # Get min, max and range for each column
    # Normalize the data for each column
    min_max_range = {}
    for col in cols:
        min_max_range[col] = [df[col].min(), df[col].max(), np.ptp(df[col])]
        df[col] = np.true_divide(df[col] - df[col].min(), np.ptp(df[col]))

    # Plot each row
    for i, ax in enumerate(axes):
        for idx in df.index:
            mpg_category = df.loc[idx, comparison_name]
            ax.plot(x, df.loc[idx, cols], colours[mpg_category])
        ax.set_xlim([x[i], x[i + 1]])

    # Set the tick positions and labels on y axis for each plot
    # Tick positions based on normalised data
    # Tick labels are based on original data
    def set_ticks_for_axis(dim, ax, ticks):
        min_val, max_val, val_range = min_max_range[cols[dim]]
        step = val_range / float(ticks - 1)
        tick_labels = [round(min_val + step * i, 2) for i in range(ticks)]
        norm_min = df[cols[dim]].min()
        norm_range = np.ptp(df[cols[dim]])
        norm_step = norm_range / float(ticks - 1)
        ticks = [round(norm_min + norm_step * i, 2) for i in range(ticks)]
        ax.yaxis.set_ticks(ticks)
        ax.set_yticklabels(tick_labels)

    for dim, ax in enumerate(axes):
        ax.xaxis.set_major_locator(ticker.FixedLocator([dim]))
        set_ticks_for_axis(dim, ax, ticks=6)
        ax.set_xticklabels([cols[dim]])

    # Move the final axis' ticks to the right-hand side
    ax = plt.twinx(axes[-1])
    dim = len(axes)
    ax.xaxis.set_major_locator(ticker.FixedLocator([x[-2], x[-1]]))
    set_ticks_for_axis(dim, ax, ticks=6)
    ax.set_xticklabels([cols[-2], cols[-1]])

    # Remove space between subplots
    plt.subplots_adjust(wspace=0)

    # Add legend to plot
    plt.legend([
        plt.Line2D((0, 1), (0, 0), color=colours[cat])
        for cat in df[comparison_name].astype('category').cat.categories
    ],
               df[comparison_name].astype('category').cat.categories,
               bbox_to_anchor=(1.2, 1),
               loc=2,
               borderaxespad=0.)

    plt.title("Values of attributes by category")
    vis.save_figure(plt.gcf(),
                    image_save_directory=image_save_directory,
                    filename='Parallel_Coordinates')
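def _example_parallel_coordinates_usage():
    # Hedged usage sketch (illustrative only, not part of the original module):
    # plot_parallel_coordinates normalizes the given columns in place, so pass a
    # copy. DataFrame, colours and conf are made-up placeholders; conf is unused
    # inside the function body.
    import numpy as np
    import pandas as pd
    df = pd.DataFrame({
        'f1': np.random.rand(30),
        'f2': np.random.rand(30),
        'f3': np.random.rand(30),
        'label': np.random.randint(0, 2, size=30)
    })
    plot_parallel_coordinates(df.copy(), cols=['f1', 'f2', 'f3'],
                              colours=['#2e8ad8', '#cd3785'],
                              comparison_name='label', conf=None,
                              image_save_directory=None)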