def create_scatter_plot(dataset, featureX, featureY, filename, filepath='images/scatter/', color='red'):
    """Draw a scatter plot of two columns of ``dataset`` and save it as an image.

    Args:
        dataset: Object whose ``plot`` method accepts pandas-style scatter
            arguments (presumably a DataFrame -- confirm against the loader).
        featureX: Column name used for the x axis.
        featureY: Column name used for the y axis.
        filename: Name of the image file to write.
        filepath: Prefix concatenated directly in front of ``filename``.
        color: Marker colour passed through to the plot call.
    """
    figure, axes = plt.subplots(nrows=1, ncols=1)
    plot_options = {
        'kind': 'scatter',
        'x': featureX,
        'y': featureY,
        'color': color,
        'ax': axes,
    }
    dataset.plot(**plot_options)
    target = filepath + filename
    plt.savefig(target)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close(figure)


# the correlated features
create_scatter_plot(
    dataset=pull_features(training_data),
    featureX='feature14',
    featureY='feature44',
    filename='scatter_correlation_all_eras.png',
)
create_scatter_plot(
    dataset=pull_features(training_data, for_era=1),
    featureX='feature14',
    featureY='feature44',
    filename='scatter_correlation_era1.png',
    color='orange',
)

# the independent features
create_scatter_plot(
    dataset=pull_features(training_data),
    featureX='feature2',
    featureY='feature40',
    filename='scatter_independent_all_eras.png',
    color='deepskyblue',
)
# NOTE(review): the call below is cut off in the visible source; it is kept
# verbatim as a comment rather than completed by guesswork.
# create_scatter_plot(pull_features(training_data, for_era=1), 'feature2',
from pandas.plotting import scatter_matrix
from picipixi import create_gif
from loader import load_training, pull_features, pull_features_and_era_label


def create_scatter_plot(dataset, filename, featureX='feature14', featureY='feature44',
                        filepath='images/scatter/animation/', color='red'):
    """Save one animation frame: a scatter plot of two columns on fixed axes.

    Both axes are pinned to the range [0.0, 1.0], so every frame written by
    this function uses the same scale.

    Args:
        dataset: Object whose ``plot`` method accepts pandas-style scatter
            arguments (presumably a DataFrame -- confirm against the loader).
        filename: Name of the image file to write.
        featureX: Column name used for the x axis.
        featureY: Column name used for the y axis.
        filepath: Prefix concatenated directly in front of ``filename``.
        color: Marker colour passed through to the plot call.
    """
    # Own the figure explicitly so it can be saved and released by handle.
    fig, ax = plt.subplots()
    dataset.plot(kind='scatter', x=featureX, y=featureY, color=color, ax=ax)
    ax.set_ylim(0.0, 1.0)
    ax.set_xlim(0.0, 1.0)
    fig.savefig(filepath + filename)
    # This function is called once per era; without closing, every frame
    # would leave another figure open in pyplot's registry.
    plt.close(fig)


training_data = load_training()

# create the animation frames
for era in range(1, 121):
    X = pull_features(training_data, for_era=era)
    create_scatter_plot(X, 'scatter_correlation_frame_era' + str(era) + '.png')

# TODO: combine the frames into an animation
# create_gif('images/scatter/scatter_correlation_all_eras.gif', 'images/scatter/animation', 'scatter_correlation_frame_era')
from loader import load_training, pull_features

# Load the training set, extract its feature columns and print the summary
# statistics reported by ``describe``.
training_data = load_training()
X = pull_features(training_data)
summary = X.describe()
print(summary)
mpimg.imsave(filepath + diff_filename, diff_image)
# NOTE(review): the statement above looks like the tail of a function whose
# header lies outside the visible source; it is reproduced untouched.


def calc_mean_squared_diff(imageA_filename, imageB_filename, filepath='images/correlation/'):
    """Return the summed squared difference of two images divided by rows * columns.

    Args:
        imageA_filename: File name of the first image.
        imageB_filename: File name of the second image.
        filepath: Prefix concatenated directly in front of both file names.

    Returns:
        The sum of squared element-wise differences, divided by the product
        of the first two dimensions of the first image.
    """
    raw_a = mpimg.imread(filepath + imageA_filename)
    raw_b = mpimg.imread(filepath + imageB_filename)

    # Compare in floating point so the subtraction cannot wrap around.
    delta = raw_a.astype("float") - raw_b.astype("float")
    total = np.sum(np.square(delta))

    # The divisor is the pixel grid of the first image (rows * columns).
    pixel_count = float(raw_a.shape[0] * raw_a.shape[1])
    return total / pixel_count


training_data = load_training()
negative = pull_features(training_data, target_bernie_value=0, for_era=1)
positive = pull_features(training_data, target_bernie_value=1, for_era=1)

# Render one correlation image per target value, then compare the two images.
positive_png = 'correlation_matrix_positive.png'
negative_png = 'correlation_matrix_negative.png'
create_correlation_matrix(positive, positive_png)
create_correlation_matrix(negative, negative_png)
calc_image_diff(positive_png, negative_png, 'correlation_diff_era1.png')

mean_squared = calc_mean_squared_diff(positive_png, negative_png)
print('Mean squared difference', mean_squared)

# show image diff for all eras
import pandas as pd
import matplotlib.pyplot as plt
from loader import load_training, pull_features


def create_correlation_matrix(dataset, filename, filepath='images/correlation/', cmap=plt.cm.viridis):
    """Plot the correlation matrix of ``dataset`` as a heat map and save it.

    Args:
        dataset: Object providing ``corr()`` (presumably a pandas DataFrame
            -- confirm against the loader).
        filename: Name of the image file to write.
        filepath: Prefix concatenated directly in front of ``filename``.
        cmap: Colour map used for the heat map.
    """
    correlation = dataset.corr()

    fig, ax = plt.subplots()
    matrix = ax.imshow(correlation, cmap=cmap, interpolation='nearest')
    fig.colorbar(matrix)

    # Take the labels from the matrix that is actually drawn so ticks and
    # cells always line up, and set them on ``ax`` directly instead of on
    # pyplot's implicit current axes.
    labels = correlation.columns
    tick_marks = list(range(len(labels)))
    ax.set_xticks(tick_marks)
    ax.set_xticklabels(labels, rotation='vertical')
    ax.set_yticks(tick_marks)
    ax.set_yticklabels(labels)

    # now make the axes legible - we don't need them all
    for axis in (ax.xaxis, ax.yaxis):
        for label in axis.get_ticklabels()[1::2]:
            label.set_visible(False)

    fig.savefig(filepath + filename)
    # Close the figure rather than only clearing it, so that repeated calls
    # do not leave figures open.
    plt.close(fig)


create_correlation_matrix(pull_features(load_training()), 'correlation_matrix_all_eras.png')
def scatter_matrix_variation(dataset, select_features, filename_suffix='', frac=0.1):
    """Create a scatter matrix image from a random sample of selected columns.

    Args:
        dataset: Object indexable by a list of column names whose result
            offers ``sample(frac=...)`` (presumably a pandas DataFrame --
            confirm against the loader).
        select_features: Column names to include in the scatter matrix.
        filename_suffix: Text inserted between ``'scatter_matrix'`` and
            ``'.png'`` in the output file name.
        frac: Fraction of rows to sample; passed to ``sample`` exactly as
            given by the caller.
    """
    sampled = dataset[select_features].sample(frac=frac)
    create_scatter_matrix(sampled, 'scatter_matrix' + filename_suffix + '.png')


training_data = load_training()

# create a scatter plot matrix of all the features, but using a reduced sampling of each
scatter_matrix_variation(
    pull_features(training_data, for_era=1),
    ['feature' + str(feature) for feature in range(1, 50)],
    '_era1_1_50',
    frac=0.1)

# create a scatter plot matrix of only hand-selected features
scatter_matrix_variation(
    pull_features(training_data, target_bernie_value=0, for_era=1),
    ['feature' + str(feature) for feature in range(4, 17)],
    '_era1_4_17',
    frac=1.0)

# create a scatter plot matrix of only hand-selected features
# NOTE(review): the call below is cut off in the visible source; it is kept
# verbatim as a comment rather than completed by guesswork.
# scatter_matrix_variation(pull_features(training_data, target_bernie_value=0, for_era=1),