# Module-level imports assumed by the functions below. The helper modules
# (vis, sup, exe, stock, custom, svmvis) are project-local; their exact import
# paths are assumptions and may differ in this repository.
import os
import sys
import warnings

import matplotlib as m
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
import umap
from pandas.api.types import is_string_dtype
from scipy.cluster import hierarchy
from scipy.spatial import distance
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.manifold import TSNE


def plot_correlation_matrix(features, image_save_directory, total_values):
    # Select the column values to use in the correlation plot
    feature_plot = list(range(0, 10, 1))
    # Select the outcomes to show
    feature_plot.extend([-4, -3, -2, -1])
    print(feature_plot)
    print(total_values.columns[feature_plot])

    # http://benalexkeen.com/correlation-in-python/
    # https://stackoverflow.com/questions/26975089/making-the-labels-of-the-scatterplot-vertical-and-horizontal-in-pandas

    # Check if the matrix is singular
    if np.linalg.cond(total_values.iloc[:, feature_plot]) < 1 / sys.float_info.epsilon:
        m.rc_file_defaults()  # Reset sns
        axs = pd.plotting.scatter_matrix(total_values.iloc[:, feature_plot],
                                         figsize=(15, 15), alpha=0.2, diagonal='kde')
        n = len(features.iloc[:, feature_plot].columns)
        for i in range(n):
            for j in range(n):
                # Get the axis of the subplot
                ax = axs[i, j]
                # Make the x axis name vertical
                ax.xaxis.label.set_rotation(90)
                # Make the y axis name horizontal
                ax.yaxis.label.set_rotation(0)
                # Make sure the y axis names are outside the plot area
                ax.yaxis.labelpad = 50

        vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename="Scatter-Matrix")
    else:
        warnings.warn("Input matrix is singular; the scatter matrix cannot be computed.")
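# A minimal, hypothetical usage sketch for plot_correlation_matrix. The synthetic
# frame stands in for the project's prepared data: ten feature columns followed by
# four outcome columns, matching the column indices selected above. Passing
# image_save_directory=None assumes vis.save_figure skips saving in that case, as
# the commented-out guards elsewhere in this module suggest.
def _demo_plot_correlation_matrix():
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 10)), columns=["feat_{}".format(i) for i in range(10)])
    outcomes = pd.DataFrame(rng.integers(0, 2, size=(200, 4)),
                            columns=["1dTrend", "5dTrend", "20dTrend", "LongTrend"])
    total = pd.concat([X, outcomes], axis=1)
    plot_correlation_matrix(total, image_save_directory=None, total_values=total)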
def print_characteristics(features_raw, image_save_directory, dataset_name, save_graphs=False):
    for i, d in enumerate(features_raw.dtypes):
        if is_string_dtype(d):
            print("Column {} is a categorical string".format(features_raw.columns[i]))
            s = features_raw[features_raw.columns[i]].value_counts() / features_raw.shape[0]
            fig = vis.paintBarChartForCategorical(s.index, s)
        else:
            print("Column {} is a numerical value".format(features_raw.columns[i]))
            fig = vis.paintHistogram(features_raw, features_raw.columns[i])
        plt.figure(fig.number)
        vis.save_figure(plt.gcf(), image_save_directory=image_save_directory,
                        filename='feature_{}-{}'.format(i, features_raw.columns[i]))
def plot_correlation_bar(X_scaled, conf, image_save_directory, y_scaled):
    m.rc_file_defaults()  # Reset sns
    corr = X_scaled.corrwith(y_scaled[conf['Common'].get('class_name')], axis=0)
    corr.sort_values().plot.barh(color='blue', title='Strength of Correlation', figsize=(10, 25))
    print(corr)
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='Correlation_Strength')
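# A minimal, hypothetical usage sketch for plot_correlation_bar; conf only has to
# supply the name of the outcome column to correlate against.
def _demo_plot_correlation_bar():
    import configparser
    conf = configparser.ConfigParser()
    conf['Common'] = {'class_name': 'LongTrend'}
    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 5)), columns=list('abcde'))
    y = pd.DataFrame({'LongTrend': (X['a'] + rng.normal(size=200) > 0).astype(int)})
    plot_correlation_bar(X, conf, image_save_directory=None, y_scaled=y)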
def plot_spearman_correlation_matrix(image_save_directory, total_values):
    # http://benalexkeen.com/correlation-in-python/
    matfig = plt.figure(figsize=(20, 20))
    # Use the Spearman correlation instead of Pearson to get a rank-based, more robust correlation
    plt.matshow(total_values.corr(method='spearman'), fignum=matfig.number, cmap=plt.get_cmap('coolwarm'))
    plt.xticks(range(len(total_values.columns)), total_values.columns, rotation=90)
    plt.yticks(range(len(total_values.columns)), total_values.columns)
    plt.colorbar()
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='Spearman_Correlation_Plot')
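# Why Spearman here: it correlates ranks, so it is robust against outliers and it
# captures monotonic but non-linear relations. A quick sanity check:
def _demo_spearman_vs_pearson():
    s = pd.Series(range(100), dtype=float)
    t = s ** 3  # monotonic but non-linear
    print(s.corr(t, method='pearson'))   # clearly below 1.0
    print(s.corr(t, method='spearman'))  # exactly 1.0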
def execute_treebased_feature_selection(X_scaled, y, conf, image_save_directory):
    '''Select features with an extra-trees classifier and plot the feature importances.'''
    print("Tree based feature selection")
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(X_scaled, y)
    print(clf.feature_importances_)
    print("Best score: %f" % clf.score(X_scaled, y))

    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(X_scaled)
    print("Shape of the reduced feature matrix: ", X_new.shape)

    threshold = 0.010
    tree_coef = pd.Series(clf.feature_importances_, index=X_scaled.columns)

    print("Tree search picked " + str(sum(tree_coef >= threshold)) + " variables and eliminated the other " +
          str(sum(tree_coef < threshold)) + " variables")
    imp_treecoef = tree_coef.sort_values()
    treecoefList = list(imp_treecoef[imp_treecoef > threshold].index)
    print("Tree based coefficient list:\n", treecoefList)

    plt.figure()
    m.rcParams['figure.figsize'] = (8.0, 20.0)
    imp_treecoef.plot(kind="barh")
    plt.title("Feature importance using Tree Search Model")
    plt.vlines(threshold, 0, len(X_scaled.columns), color='red')
    plt.tight_layout()
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename="Tree_Based_Importance")

    return treecoefList
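# A minimal, hypothetical usage sketch for the tree-based selection on a synthetic
# two-class problem; the conf argument is not used by the function body, so an
# empty dict is sufficient here.
def _demo_execute_treebased_feature_selection():
    from sklearn.datasets import make_classification
    X, y = make_classification(n_samples=300, n_features=8, n_informative=3, random_state=0)
    X = pd.DataFrame(X, columns=["feat_{}".format(i) for i in range(8)])
    selected = execute_treebased_feature_selection(X, y, conf={}, image_save_directory=None)
    print("Selected features:", selected)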
def find_tsne_parmeters(X_scaled_subset, y_subset, class_labels, conf, image_save_directory):
    # Optimize the t-SNE plot with a grid search over two parameters
    perplex = [5, 10, 30, 50, 100]
    exaggeration = [5, 12, 20, 50, 100]
    # learning_rate = [10, 50, 200]

    fig, axarr = plt.subplots(len(perplex), len(exaggeration), figsize=(15, 15))

    for k, p in enumerate(perplex):
        for j, e in enumerate(exaggeration):
            X_embedded = TSNE(n_components=2, perplexity=p, early_exaggeration=e, n_iter=5000,
                              n_iter_without_progress=1000, learning_rate=10).fit_transform(X_scaled_subset)
            for i, t in enumerate(set(y_subset)):
                idx = y_subset == t
                axarr[k, j].scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=class_labels[t])
            axarr[k, j].set_title("p={}, e={}".format(p, e))
            print('Perplexity parameter k={}/{}, exaggeration parameter j={}/{}'
                  .format(k, len(perplex), j, len(exaggeration)))

    fig.subplots_adjust(hspace=0.3)
    tsne_param_fig = plt.gcf()
    vis.save_figure(tsne_param_fig, image_save_directory=image_save_directory,
                    filename=conf['Common'].get('dataset_name') + '_TSNE_Calibration_Plot')
def generate_features_outcomes(image_save_directory, source):
    '''
    Outcome construction: generate the class values, i.e. the y for the data, from the raw source.
    Tops/bottoms and short- and long-term trends are detected and merged into an outcomes dataframe
    aligned with the source index.
    '''
    bottoms, tops, latestTops, latestBottoms, fig_tops_bot, fig_latest_tops_latest_bot = \
        stock.find_tops_bottoms(source)
    vis.save_figure(fig_tops_bot, image_save_directory=image_save_directory,
                    filename=str(fig_tops_bot.axes[0].get_title()).replace(' ', '_'))
    vis.save_figure(fig_latest_tops_latest_bot, image_save_directory=image_save_directory,
                    filename=str(fig_latest_tops_latest_bot.axes[0].get_title()).replace(' ', '_'))

    topsBottoms = define_tops_bottoms(bottoms, tops)

    # Long-term trend (LOWESS smoothing over 300 samples)
    pos_trend_long, fig_long = stock.calculate_lowess(source, 300)
    vis.save_figure(fig_long, image_save_directory=image_save_directory,
                    filename=str(fig_long.axes[0].get_title()).replace(' ', '_'))

    # Short-term trend (LOWESS smoothing over 10 samples)
    pos_trend_short, fig_short = stock.calculate_lowess(source, 10)
    vis.save_figure(fig_short, image_save_directory=image_save_directory,
                    filename=str(fig_short.axes[0].get_title()).replace(' ', '_'))

    y1day, y5day, y20day, ylong = calculate_y_signals(source, bottoms, tops, latestBottoms, latestTops,
                                                      pos_trend_long, pos_trend_short)

    y1day, y5day, y20day, ylong = clean_bad_signals_1(y1day, y5day, y20day, ylong, source['Close'],
                                                      latestBottoms, latestTops)
    y1day, y5day, y20day, ylong = clean_bad_signals_2(y1day, y5day, y20day, ylong, source['Close'],
                                                      latestBottoms, latestTops)
    y1day, y5day, y20day, ylong = clean_bad_signals_3(y1day, y5day, y20day, ylong, source['Close'],
                                                      latestBottoms, latestTops)

    # Merge all y values into one outcomes dataframe aligned with the series start
    outcomes = pd.DataFrame(index=source.index).join(
        pd.Series(y1day, name="1dTrend").astype('int64')).join(
        pd.Series(y5day, name="5dTrend").astype('int64')).join(
        pd.Series(y20day, name="20dTrend").astype('int64')).join(
        pd.Series(ylong, name="LongTrend").astype('int64')).join(
        pd.Series(topsBottoms, name="TopsBottoms").astype('int64'))

    return outcomes
def plot_umap(X_scaled, class_labels, image_save_directory, y):
    # Use a supervised and an unsupervised embedding to analyze the clusters
    sns.set(style='white', context='poster')

    embeddingUnsupervised = umap.UMAP(n_neighbors=5, random_state=42, init='random').fit_transform(X_scaled)

    if y is not None:
        embeddingSupervised = umap.UMAP(n_neighbors=5, random_state=42, init='random').fit_transform(X_scaled, y=y)
        vis.plotUmap(embeddingSupervised, y, list(class_labels.values()), 'Dataset supervised clustering')
        vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='UMAP_Supervised')
        print("Plot UMAP supervised")

        vis.plotUmap(embeddingUnsupervised, y, list(class_labels.values()),
                     'Dataset unsupervised clustering', cmapString='RdYlGn')
        print("Plot UMAP unsupervised with class labels")
    else:
        warnings.warn("No y values available; only the unsupervised embedding is plotted.")
        vis.plotUmap(embeddingUnsupervised, None, None, 'Dataset unsupervised clustering', cmapString='RdYlGn')
        print("Plot UMAP unsupervised without class labels")

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='UMAP_Unsupervised')
    print("Plot UMAP unsupervised")
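# A minimal, hypothetical usage sketch for plot_umap; class_labels maps the
# integer class values to display names, as the function expects.
def _demo_plot_umap():
    from sklearn.datasets import load_iris
    data = load_iris()
    X = pd.DataFrame(data.data, columns=data.feature_names)
    plot_umap(X, class_labels={0: 'setosa', 1: 'versicolor', 2: 'virginica'},
              image_save_directory=None, y=data.target)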
def plot_t_sne(X_scaled_subset, y_scaled_subset, class_labels, image_save_directory):
    ### Visualize the data with t-SNE; the callers pass a randomly chosen subset of the data
    np.random.seed(0)

    X_embedded = TSNE(n_components=2, perplexity=10.0, early_exaggeration=100.0, n_iter=5000,
                      n_iter_without_progress=1000, learning_rate=10).fit_transform(X_scaled_subset)

    #### Plot t-SNE with the best parameters
    m.rc_file_defaults()  # Reset sns

    plt.figure(figsize=(10, 10))
    if y_scaled_subset is not None and class_labels is not None:
        print("Plot t-sne with known classes")
        for i, t in enumerate(set(y_scaled_subset)):
            idx = y_scaled_subset == t
            plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=class_labels[t])
        plt.legend(bbox_to_anchor=(1, 1))
    else:
        print("Plot t-sne without known classes")
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1])

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='T-SNE_Plot')
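# A minimal, hypothetical usage sketch for plot_t_sne on a small subset; t-SNE
# scales poorly with the sample count, which is why the callers subsample first.
def _demo_plot_t_sne():
    from sklearn.datasets import load_digits
    digits = load_digits()
    idx = np.random.RandomState(0).choice(len(digits.data), size=500, replace=False)
    plot_t_sne(digits.data[idx], digits.target[idx],
               class_labels={i: str(i) for i in range(10)}, image_save_directory=None)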
def plot_correlation_matrix2(conf, image_save_directory, total_values):
    # https://blog.insightdatascience.com/data-visualization-in-python-advanced-functionality-in-seaborn-20d217f1a9a6
    feature_plot = list(range(0, 10, 1))
    feature_plot.extend([-1])
    g = sns.pairplot(total_values.iloc[0:1000, feature_plot],
                     hue=conf['Common'].get('class_name'), diag_kind="hist")
    g.map_upper(sns.regplot)
    g.map_lower(sns.residplot)
    g.map_diag(plt.hist)
    for ax in g.axes.flat:
        plt.setp(ax.get_xticklabels(), rotation=45)
    # FIXME: The legend is shown incorrectly: it contains only numbers instead of class names
    g.add_legend()
    g.set(alpha=0.5)

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='Pairplot')
def plot_hierarchical_linkage(X_scaled, conf, image_save_directory):
    '''Plot a clustermap of the feature correlations with hierarchical linkage.'''
    corr_matrix = X_scaled.corr()
    correlations_array = np.asarray(corr_matrix)

    linkage = hierarchy.linkage(distance.pdist(correlations_array), method='average')

    g = sns.clustermap(corr_matrix, row_linkage=linkage, col_linkage=linkage,
                       row_cluster=True, col_cluster=True, figsize=(8, 8),
                       cmap=plt.get_cmap('coolwarm'))
    plt.setp(g.ax_heatmap.yaxis.get_majorticklabels(), rotation=0)
    # Column order after the hierarchical clustering
    label_order = corr_matrix.iloc[:, g.dendrogram_row.reordered_ind].columns

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='Hierarchical_Linkage')
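# A minimal, hypothetical usage sketch: the clustermap groups features whose
# correlation profiles are similar, which helps to spot redundant feature blocks.
# Here the first three columns are noisy copies of one latent signal.
def _demo_plot_hierarchical_linkage():
    rng = np.random.default_rng(0)
    base = rng.normal(size=(300, 1))
    X = pd.DataFrame(np.hstack([base + rng.normal(scale=0.1, size=(300, 3)),
                                rng.normal(size=(300, 3))]),
                     columns=['a1', 'a2', 'a3', 'n1', 'n2', 'n3'])
    plot_hierarchical_linkage(X, conf={}, image_save_directory=None)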
def execute_lasso_feature_selection(X_scaled, y, conf, image_save_directory):
    '''Select features with a cross-validated lasso regression and plot the model weights.'''
    print("Feature selection with lasso regression")
    reg = LassoCV(cv=10, max_iter=100000)
    reg.fit(X_scaled, y)
    coef = pd.Series(reg.coef_, index=X_scaled.columns)
    print("Best alpha using built-in LassoCV: %f" % reg.alpha_)
    print("Best score using built-in LassoCV: %f" % reg.score(X_scaled, y))
    print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +
          str(sum(coef == 0)) + " variables")
    imp_coef = coef.sort_values()
    coefList = list(imp_coef[imp_coef != 0].index)
    print("Lasso coefficient list:\n", coefList)

    m.rcParams['figure.figsize'] = (8.0, 20.0)
    imp_coef.plot(kind="barh")
    plt.title("Feature importance using Lasso Model")
    plt.tight_layout()
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename="Lasso_Model_Weights")

    return coefList
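# A minimal, hypothetical usage sketch for the lasso selection on a synthetic
# regression problem in which only three of eight features carry signal.
def _demo_execute_lasso_feature_selection():
    from sklearn.datasets import make_regression
    X, y = make_regression(n_samples=300, n_features=8, n_informative=3, random_state=0)
    X = pd.DataFrame(X, columns=["feat_{}".format(i) for i in range(8)])
    selected = execute_lasso_feature_selection(X, y, conf={}, image_save_directory=None)
    print("Selected features:", selected)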
def plot_pca(X_scaled, class_labels, image_save_directory, y):
    m.rc_file_defaults()  # Reset sns

    pca_trafo = PCA().fit(X_scaled)
    pca_values = pca_trafo.transform(X_scaled)

    targets = np.array(y).flatten()

    # Explained variance per component and its cumulative sum
    fig, ax1 = plt.subplots(figsize=(10, 8))
    plt.semilogy(pca_trafo.explained_variance_ratio_, '--o')
    ax2 = ax1.twinx()  # Instantiate a second axes that shares the same x-axis
    plt.semilogy(pca_trafo.explained_variance_ratio_.cumsum(), '--o', color='green')
    plt.xlabel("Principal Component")
    plt.ylabel("Explained variance")
    plt.xticks(np.arange(0, len(pca_trafo.explained_variance_ratio_)))
    plt.hlines(0.95, 0, len(pca_trafo.explained_variance_ratio_.cumsum()),
               colors='red', linestyles='solid', label='95% variance covered')
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='PCA_Variance_Coverage')

    # Heatmap of the component loadings
    fig = plt.figure()
    sns.heatmap(np.log(pca_trafo.inverse_transform(np.eye(X_scaled.shape[1]))), cmap="hot", cbar=True)
    necessary_components = pca_trafo.explained_variance_ratio_.cumsum()[
        pca_trafo.explained_variance_ratio_.cumsum() < 0.95]
    print("95% variance covered with the first {} components. Values={}".format(
        len(necessary_components), necessary_components))
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='PCA_Heatmap')

    # Scatter plot of the first two principal components
    plt.figure(figsize=(10, 10))
    for i, t in enumerate(set(targets)):
        idx = targets == t
        plt.scatter(pca_values[idx, 0], pca_values[idx, 1],
                    label=class_labels[t], edgecolor='none', alpha=0.5)
    plt.legend(labels=class_labels.values(), bbox_to_anchor=(1, 1))
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='PCA_Plot')
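# A minimal, hypothetical usage sketch for plot_pca; standardizing first matters,
# because the principal components follow the columns with the largest variance.
def _demo_plot_pca():
    from sklearn.datasets import load_iris
    from sklearn.preprocessing import StandardScaler
    data = load_iris()
    X = pd.DataFrame(StandardScaler().fit_transform(data.data), columns=data.feature_names)
    plot_pca(X, class_labels={0: 'setosa', 1: 'versicolor', 2: 'virginica'},
             image_save_directory=None, y=data.target)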
def execute_search_iterations_random_search_SVM(X_train, y_train, init_parameter_svm, pipe_run_random, scorers,
                                                refit_scorer_name, iter_setup, save_fig_prefix=None):
    '''
    Iterated random search for SVM hyperparameters. For each run, set the sample size, kfolds, number of
    iterations and top result selection, execute a randomized search cv, and narrow the parameter ranges
    from the best results of that search. As a result, the best C and gamma are extracted.

    :args:
        X_train: Training data, features X
        y_train: Training labels, ground truth y
        init_parameter_svm: Initial SVM parameter ranges for C and gamma
        pipe_run_random: ML pipeline
        scorers: Scorers to use
        refit_scorer_name: Refit scorer
        iter_setup: Dict with the sample sizes, kfolds, iterations and selection per run
        save_fig_prefix: Prefix for images from the analysis

    :return:
        param_final: Final parameters C and gamma
        results_random_search: Results of the last random search
    '''
    # Iterated pipeline with an increasing number of tries
    sample_sizes = list((np.array(iter_setup['samples']) * X_train.shape[0]).astype(int))
    kfolds = iter_setup['kfolds']
    number_of_iterations = iter_setup['iter']
    select_from_best = iter_setup['selection']

    combined_parameters = zip(sample_sizes, kfolds, number_of_iterations, select_from_best)

    new_parameter_rand = init_parameter_svm  # Initialize the system with the parameter borders

    for i, combination in enumerate(combined_parameters):
        sample_size, folds, iterations, selection = combination
        print("Start random optimization run {} with the following parameters: ".format(i))
        print("Sample size: ", sample_size)
        print("Number of folds: ", folds)
        print("Number of tries: ", iterations)
        print("Number of best results to select from: ", selection)

        # Run the random search
        new_parameter_rand, results_random_search, clf = exe.run_random_cv_for_SVM(
            X_train, y_train, new_parameter_rand, pipe_run_random, scorers, refit_scorer_name,
            number_of_samples=sample_size, kfolds=folds, n_iter_search=iterations, plot_best=selection)

        print("Got best parameters: ")
        print(new_parameter_rand)

        # Display the random search results
        ax = svmvis.visualize_random_search_results(clf, refit_scorer_name,
                                                    param_x='param_model__C', param_y='param_model__gamma')
        ax_enhanced = svmvis.add_best_results_to_random_search_visualization(ax, results_random_search, selection)

        plt.tight_layout()
        vis.save_figure(plt.gcf(), image_save_directory=save_fig_prefix,
                        filename='run2_subrun_' + str(i) + '_samples' + str(sample_size) + '_fold' + str(folds) +
                                 '_iter' + str(iterations) + '_sel' + str(selection))

        print("===============================================================")

    print("Best parameter limits: ")
    print(new_parameter_rand)
    print("Best results: ")
    print(results_random_search.round(3).head(10))

    param_final = {}
    param_final['C'] = results_random_search.iloc[0]['param_model__C']
    param_final['gamma'] = results_random_search.iloc[0]['param_model__gamma']

    print("Hyper parameters found")
    print(param_final)

    return param_final, results_random_search
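# A hypothetical sketch of the iter_setup dict this function expects, derived from
# the keys read above; the values are illustrative only. The 'samples' entries are
# fractions of X_train, one entry per optimization run.
# iter_setup = {
#     'samples':   [0.05, 0.10, 0.25, 0.50],
#     'kfolds':    [3, 3, 5, 5],
#     'iter':      [100, 60, 40, 20],
#     'selection': [25, 15, 10, 5],
# }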
def run_training_estimation(X_train, y_train, X_test, y_test, scorer, model_clf, image_save_directory=None):
    '''
    Estimate the score (default f1) and the training duration as a function of the training subset size.

    :args:
        X_train: Training data
        y_train: Training labels as numbers
        X_test: Test data
        y_test: Test labels as numbers
        scorer: Scorer for the evaluation, default f1
        model_clf: Model/classifier to evaluate

    :return:
        Nothing
    '''
    # Set the test range of training subset sizes
    test_range = list(range(100, 6500 + 1, 500))
    print("Test range", test_range)

    # Estimate the training duration and score for each subset size
    xaxis, durations, scores = exe.estimate_training_duration(model_clf, X_train, y_train, X_test, y_test,
                                                              test_range, scorer)

    # Plot the training duration
    plt.figure()
    plt.plot(xaxis, durations)
    plt.xlabel('Number of training examples')
    plt.ylabel('Duration [s]')
    plt.title("Training Duration")
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='Duration_Samples')

    # Plot the score improvement with more data
    plt.figure()
    plt.plot(xaxis, scores)
    plt.xlabel('Number of training examples')
    plt.ylabel('F1-Score on cross validation set (=the rest). Size={}'.format(X_test.shape[0]))
    plt.title("F1 Score Improvement With More Data")
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='F1_Samples')
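# A minimal, hypothetical usage sketch; exe.estimate_training_duration is a
# project-local helper, so this only illustrates the expected call shape. The
# sample count is chosen so that the largest subset in test_range is available.
def _demo_run_training_estimation():
    from sklearn.datasets import make_classification
    from sklearn.metrics import make_scorer, f1_score
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    X, y = make_classification(n_samples=8000, n_features=10, random_state=0)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    scorer = make_scorer(f1_score, average='micro')
    run_training_estimation(X_train, y_train, X_test, y_test, scorer, SVC(kernel='rbf'))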
def adapt_features_for_model(features_cleaned1, outcomes_cleaned1, result_dir, class_labels, conf):
    ## Prepare the feature columns
    features = features_cleaned1.copy()

    # Convert every column to a numeric type. Custom value replacements (e.g.
    # {'n': 0, 'y': 1, 'unknown': np.NaN}) or one-hot encoding of categorical
    # columns could be applied here before the conversion if needed.
    for col in features.columns:
        features[col] = pd.to_numeric(features[col])

    print(features.head(5))

    ## Prepare the outcomes if they exist
    if outcomes_cleaned1 is not None:
        # Replace classes with digital values
        outcomes = outcomes_cleaned1.copy()
        outcomes = outcomes.astype(int)
        print("Outcome types")
        print(outcomes.dtypes)

        ### Binarize a multiclass dataset
        # If the binarize setting is used, binarize the class of the outcome
        if conf['Preparation'].getboolean('binarize_labels') == True:
            binarized_outcome = (outcomes[conf['Common'].get('class_name')] ==
                                 conf['Preparation'].getint('class_number')).astype(int)
            y = binarized_outcome.values.flatten()
            print("y was binarized. Classes before: {}. Classes after: {}".format(
                np.unique(outcomes[conf['Common'].get('class_name')]), np.unique(y)))

            # Redefine the class labels
            class_labels = {
                0: conf['Preparation'].get('binary_0_label'),
                1: conf['Preparation'].get('binary_1_label')
            }
            print("Class labels redefined to: {}".format(class_labels))
        else:
            y = outcomes[conf['Common'].get('class_name')].values.flatten()
            print("No binarization was made. Classes: {}".format(np.unique(y)))

        print("y shape: {}".format(y.shape))
        print("y unique classes: {}".format(np.unique(y, axis=0)))
    else:
        y = None
        class_labels = None

    ## Determine missing data
    # Missing data is only visualized here, as it is handled by the training algorithm in S40.

    # Check if there are any nulls in the data
    print("Missing data in the features: ", features.isnull().values.sum())

    # Share of missing values per feature
    print("Number of missing values per feature")
    missingValueShare = []
    for col in features.columns:
        missingValueShare.append(sum(features[col].isna()) / features.shape[0])

    # Paint the missing value graph
    vis.paintBarChartForMissingValues(features.columns, missingValueShare)
    barplot = plt.gcf()
    vis.save_figure(plt.gcf(), image_save_directory=result_dir,
                    filename=str(barplot.axes[0].get_title()).replace(' ', '_'))

    # Visualize missing data with missingno
    msno.matrix(features)
    fig_matrix = plt.gcf()
    vis.save_figure(fig_matrix, image_save_directory=result_dir, filename='missing_numbers_matrix')

    if features.isnull().values.sum() > 0:
        msno.heatmap(features)
        vis.save_figure(plt.gcf(), image_save_directory=result_dir, filename='missing_numbers_heatmap')

    # TODO: additional plots for the binary feature types,
    # e.g. vis.plotBinaryValues(features, features.columns)

    return features, y, class_labels
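# A hypothetical sketch of the config entries adapt_features_for_model reads, in
# configparser form; the section and key names follow the calls above, the values
# are placeholders.
# [Common]
# class_name = LongTrend
# [Preparation]
# binarize_labels = True
# class_number = 1
# binary_0_label = negative
# binary_1_label = positive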
def main(config_path):
    conf = sup.load_config(config_path)

    # Load the annotations file
    y_labels = pd.read_csv(conf['Paths'].get('source_path'), sep=';', header=None).set_index(0).to_dict()[1]

    # Generate the file names for saving the results
    image_save_directory = os.path.join(conf['Paths'].get('results_directory'), "data_generation")
    outcomes_filename_raw = os.path.join(conf['Paths'].get('prepared_data_directory'),
                                         "temp", "temp_outcomes_uncut" + ".csv")
    os.makedirs(os.path.dirname(outcomes_filename_raw), exist_ok=True)

    # Load the raw data; a slice like .iloc[0:1000, :] can be used to create a debug dataset
    source = custom.load_source(conf['Paths'].get('source_path'))

    # Plot the source data
    plt.figure(num=None, figsize=(12.5, 7), dpi=80, facecolor='w', edgecolor='k')
    plt.plot(source['Date'], source['Close'])
    plt.title(conf['Paths'].get('source_path'))
    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename="Source_data")

    outcomes = generate_features_outcomes(image_save_directory, source)

    # Drop the last 50 values, as they cannot be used for prediction: the signals look +50 days ahead
    source_cut = source.drop(source.tail(50).index, inplace=False)
    outcomes_cut = outcomes.drop(outcomes.tail(50).index, inplace=False)

    # Plot the three-class ground truth for each outcome
    vis.plot_three_class_graph(outcomes_cut['1dTrend'].values, source_cut['Close'], source_cut['Date'], 0, 0, 0,
                               ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_1dTrend',
                               save_fig_prefix=image_save_directory)
    vis.plot_three_class_graph(outcomes_cut['5dTrend'].values, source_cut['Close'], source_cut['Date'], 0, 0, 0,
                               ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_5dTrend',
                               save_fig_prefix=image_save_directory)
    vis.plot_three_class_graph(outcomes_cut['20dTrend'].values, source_cut['Close'], source_cut['Date'], 0, 0, 0,
                               ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_20dTrend',
                               save_fig_prefix=image_save_directory)
    vis.plot_three_class_graph(outcomes_cut['LongTrend'].values, source_cut['Close'], source_cut['Date'], 0, 0, 0,
                               ('close', 'neutral', 'positive', 'negative'),
                               title=conf['Common'].get('dataset_name') + '_GT_LongTrend',
                               save_fig_prefix=image_save_directory)
    vis.plot_three_class_graph(outcomes_cut['TopsBottoms'].values, source_cut['Close'], source_cut['Date'], 0, 0, 0,
                               ('close', 'neutral', 'top', 'bottom'),
                               title=conf['Common'].get('dataset_name') + '_GT_TopsBottoms',
                               save_fig_prefix=image_save_directory)

    def binarize(outcomes, class_number):
        return (outcomes == class_number).astype(int)

    # Plot the binarized ground truth for each trend
    vis.plot_two_class_graph(binarize(outcomes_cut['1dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'], 0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_1dTrend',
                             save_fig_prefix=image_save_directory)
    vis.plot_two_class_graph(binarize(outcomes_cut['5dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'], 0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_5dTrend',
                             save_fig_prefix=image_save_directory)
    vis.plot_two_class_graph(binarize(outcomes_cut['20dTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'], 0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_20dTrend',
                             save_fig_prefix=image_save_directory)
    vis.plot_two_class_graph(binarize(outcomes_cut['LongTrend'], conf['Common'].getint('class_number')),
                             source_cut['Close'], source_cut['Date'], 0,
                             ('close', 'Positive Trend'),
                             title=conf['Common'].get('dataset_name') + '_GT_LongTrend',
                             save_fig_prefix=image_save_directory)

    # Save the outcomes to a csv file
    print("Outcomes shape {}".format(outcomes_cut.shape))
    outcomes_cut.to_csv(outcomes_filename_raw, sep=';', index=True, header=True)
    print("Saved outcomes to " + outcomes_filename_raw)
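# A hypothetical invocation sketch; the argument parsing (if any) lives outside
# this excerpt, so the config path below is only a placeholder.
# if __name__ == "__main__":
#     main("config/debug_timedata.ini")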
def plot_parallel_coordinates(df, cols, colours, comparison_name, conf, image_save_directory):
    x = [i for i, _ in enumerate(cols)]

    # Create a dict of categories -> colours
    colours = {
        df[comparison_name].astype('category').cat.categories[i]: colours[i]
        for i, _ in enumerate(df[comparison_name].astype('category').cat.categories)
    }

    # Create (X-1) subplots along the x axis
    fig, axes = plt.subplots(1, len(x) - 1, sharey=False, figsize=(15, 5))

    # Get the min, max and range for each column and normalize the data per column
    min_max_range = {}
    for col in cols:
        min_max_range[col] = [df[col].min(), df[col].max(), np.ptp(df[col])]
        df[col] = np.true_divide(df[col] - df[col].min(), np.ptp(df[col]))

    # Plot each data row on each subplot
    for i, ax in enumerate(axes):
        for idx in df.index:
            mpg_category = df.loc[idx, comparison_name]
            ax.plot(x, df.loc[idx, cols], colours[mpg_category])
        ax.set_xlim([x[i], x[i + 1]])

    # Set the tick positions and labels on the y axis for each plot.
    # Tick positions are based on the normalized data; tick labels are based on the original data.
    def set_ticks_for_axis(dim, ax, ticks):
        min_val, max_val, val_range = min_max_range[cols[dim]]
        step = val_range / float(ticks - 1)
        tick_labels = [round(min_val + step * i, 2) for i in range(ticks)]
        norm_min = df[cols[dim]].min()
        norm_range = np.ptp(df[cols[dim]])
        norm_step = norm_range / float(ticks - 1)
        ticks = [round(norm_min + norm_step * i, 2) for i in range(ticks)]
        ax.yaxis.set_ticks(ticks)
        ax.set_yticklabels(tick_labels)

    for dim, ax in enumerate(axes):
        ax.xaxis.set_major_locator(ticker.FixedLocator([dim]))
        set_ticks_for_axis(dim, ax, ticks=6)
        ax.set_xticklabels([cols[dim]])

    # Move the ticks of the final axis to the right-hand side
    ax = plt.twinx(axes[-1])
    dim = len(axes)
    ax.xaxis.set_major_locator(ticker.FixedLocator([x[-2], x[-1]]))
    set_ticks_for_axis(dim, ax, ticks=6)
    ax.set_xticklabels([cols[-2], cols[-1]])

    # Remove the space between subplots
    plt.subplots_adjust(wspace=0)

    # Add a legend to the plot
    plt.legend([
        plt.Line2D((0, 1), (0, 0), color=colours[cat])
        for cat in df[comparison_name].astype('category').cat.categories
    ], df[comparison_name].astype('category').cat.categories,
        bbox_to_anchor=(1.2, 1), loc=2, borderaxespad=0.)
    plt.title("Values of attributes by category")

    vis.save_figure(plt.gcf(), image_save_directory=image_save_directory, filename='Parallel_Coordinates')
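# A minimal, hypothetical usage sketch for the parallel-coordinates plot; the
# function normalizes the columns in place, so a copy of the frame is passed.
def _demo_plot_parallel_coordinates():
    from sklearn.datasets import load_iris
    data = load_iris()
    df = pd.DataFrame(data.data, columns=['sl', 'sw', 'pl', 'pw'])
    df['species'] = pd.Categorical.from_codes(data.target, data.target_names)
    plot_parallel_coordinates(df.copy(), cols=['sl', 'sw', 'pl', 'pw'],
                              colours=['r', 'g', 'b'], comparison_name='species',
                              conf=None, image_save_directory=None)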