def Plot(self, name_col1=7, name_col2=8, house_class=['M', 'B']):
    """
    Draw a scatter plot of two numeric columns, one colour per class.

    The dataset is read from ``self.file_name`` on every call. The figure is
    saved to 'data/scatter_plot.png' and then displayed.

    :param name_col1: column for the x axis; a column index (int) or a column
        name (str) that is looked up in the header row.
    :param name_col2: column for the y axis; same conventions as name_col1.
    :param house_class: labels from column ``self.y_col`` that should be
        plotted. Only 'M' and 'B' have a colour assigned. The default list is
        never mutated by this method.
    :return: None. Invalid input is reported with print() and nothing is
        plotted.
    """
    color = {
        'M': 'b',
        'B': 'r',
    }
    # Reject labels without a colour before any drawing happens; previously
    # this surfaced as a KeyError after part of the figure was already drawn.
    for label in house_class:
        if label not in color:
            print('Error: bad name class')
            return

    ds = DataSet(self.file_name)
    columns = [name_col1, name_col2]

    # Translate column names into positions in the header row.
    for pos, column in enumerate(columns):
        if isinstance(column, str):
            if column not in ds.dataset[0]:
                print('Error: bad name column')
                return
            columns[pos] = ds.dataset[0].index(column)

    for column in columns:
        if column < 0 or column >= len(ds.dataset[0]):
            print("Error: This isn't column")
            return
        if not ds.isNumeric_columns(column):
            print("Error: Input column must is numerics")
            return

    # Never ask for more points than there are data rows (header excluded).
    # Note that this persists on the instance, as it did before.
    self.size = min(self.size, len(ds.dataset) - 1)

    col1 = ds.get_float_col(columns[0])
    col2 = ds.get_float_col(columns[1])

    feature1 = {label: [] for label in house_class}
    feature2 = {label: [] for label in house_class}
    # Row 0 of ds.dataset is the header, so data row k pairs with col[k].
    for idx, row in enumerate(ds.dataset[1:]):
        label = row[self.y_col]
        if label in feature1:
            feature1[label].append(col1[idx])
            feature2[label].append(col2[idx])

    for label in feature1:
        plot.scatter(feature1[label][:self.size],
                     feature2[label][:self.size],
                     c=color[label],
                     alpha=0.5,
                     label=label)

    # NOTE(review): the source indentation was lost; axis labels and title are
    # assumed to be controlled by the legend flag - confirm.
    if self.legend:
        plot.legend(loc='upper right')
        plot.ylabel('column is {}'.format(columns[1]))
        plot.xlabel('column is {}'.format(columns[0]))
        plot.title('Scatter Plot')
    plot.savefig('data/scatter_plot.png')
    plot.show()
def main():
    """
    Train a logistic regression on the CSV file named by the first CLI argument.

    The path is resolved relative to this script's directory. A 'results'
    directory is created next to the script if it does not exist yet
    (presumably where the training steps persist their output - confirm in
    MeanImputation / Scaling / LogisticRegression).
    """
    dirname = os.path.dirname(__file__)
    output_dirname = os.path.join(dirname, 'results')
    # Replaces a bare try/except around os.stat + os.mkdir: this only creates
    # the directory when it is missing and no longer hides unrelated errors.
    os.makedirs(output_dirname, exist_ok=True)

    file_name = os.path.join(dirname, sys.argv[1])
    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    dropped_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    # header.index raises ValueError when an expected column is absent.
    to_remove = {header.index(name) for name in dropped_columns}

    X = np.array([[row[j] for j in range(len(header)) if j not in to_remove]
                  for row in d.data_set])
    # Drop the header row before converting the cells to floats.
    X = convert_to_float(X[1:, ])

    y_col_nb = header.index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    m = MeanImputation(X)
    m.train()
    m.transform()

    sc = Scaling(X)
    sc.train()
    sc.transform()

    l = LogisticRegression(X=X, y=y)
    l.train()
def __init__(self,
             path_to_data_set='resources/dataset_train.csv',
             legend=True,
             granularity=100):
    """
    Load the dataset and store the histogram options on the instance.

    :param path_to_data_set: a string. Path of the dataset handed to DataSet.
    :param legend: a boolean stored as ``self.legend``. Presumably toggles the
        legend, title and axis labels in the plotting method - confirm there.
    :param granularity: an integer stored as ``self.granularity``; described
        by the original author as the number of bars in the histogram.
    """
    loaded = DataSet(path_to_data_set)
    loaded.loadDataSet()
    self.data_set = loaded
    self.legend = legend
    self.granularity = granularity
def __init__(self,
             path_to_data_set='resources/dataset_train.csv',
             legend=True,
             size=10):
    """
    Load the dataset and store the scatter-plot options on the instance.

    :param path_to_data_set: a string. Path of the dataset handed to DataSet.
    :param legend: a boolean stored as ``self.legend``. Presumably toggles the
        legend, title and axis labels in the plotting method - confirm there.
    :param size: an int stored as ``self.size``; described by the original
        author as the size of the points in the scatter plot.
    """
    loaded = DataSet(path_to_data_set)
    loaded.loadDataSet()
    self.data_set = loaded
    self.legend = legend
    self.size = size
def predict_file(self, theta=np.array([]), theta_exit=0):
    """
    Predict labels and probabilities for the rows of ``self.file``.

    :param theta: model weights (array-like). None or an empty sequence means
        "no weights supplied". The default array is never mutated here.
    :param theta_exit: when truthy, missing weights are an error and the
        process exits; when falsy, missing weights fall back to a vector of
        ones.
    :return: a two-element list ``[self.predict(X), self.predict_prob(X)]``.
    """
    df = DataSet(filename=self.file)
    df.find_numeric_label()
    X = self.get_x_y(df, return_y=False)
    X = self.__add_intercept(X)

    self.theta = np.array(theta)
    # "not theta" is not a valid emptiness test for arrays: it raises for more
    # than one element and reads array([0.]) as missing. Test the size instead.
    theta_missing = theta is None or self.theta.size == 0

    if theta_missing and theta_exit:
        print("Error: Have not theta")
        sys.exit()
    if theta_missing:
        self.theta = np.ones(X.shape[1])

    # A 0-d theta has no shape[0]; report it like any other size mismatch.
    if self.theta.ndim == 0 or self.theta.shape[0] != X.shape[1]:
        print('Error: bad theta or X')
        sys.exit()
    return [self.predict(X), self.predict_prob(X)]
def __init__(self,
             path_to_data_set='resources/dataset_train.csv',
             max_nb_features=4,
             fig_size=(8, 8)):
    """
    Load the dataset and store the pair-plot options on the instance.

    :param path_to_data_set: a string. Path of the dataset; kept as
        ``self.path_to_data_set`` and handed to DataSet.
    :param max_nb_features: an integer stored as ``self.max_nb_features``.
        Per the original author it caps how many features are analysed,
        starting from the leftmost one, to keep the grid readable (about 10
        numeric features would mean 10**2 = 100 plots).
    :param fig_size: an integer tuple stored as ``self.fig_size``; the size of
        the figure to output.
    """
    self.path_to_data_set = path_to_data_set
    loaded = DataSet(self.path_to_data_set)
    loaded.loadDataSet()
    self.data_set = loaded
    self.max_nb_features = max_nb_features
    self.fig_size = fig_size
    # Starts empty; this constructor does not fill it.
    self.numeric_features = []
def fit(self):
    """
    Fit ``self.theta`` on the data in ``self.file`` with batch gradient steps.

    Weights start from a standard-normal random draw and are updated
    ``self.num_iter`` times by ``self.lr`` times the mean gradient. When
    ``self.verbose`` is truthy the loss is printed every 10000 iterations.
    """
    data = DataSet(filename=self.file)
    data.find_numeric_label()
    X, y = self.get_x_y(data)

    if self.fit_intercept:
        X = self.__add_intercept(X)

    self.theta = np.random.randn(X.shape[1])

    for step in range(self.num_iter):
        predicted = self.__sigmoid(np.dot(X, self.theta))
        residual = predicted - y
        gradient = np.dot(X.T, residual) / y.size
        self.theta -= self.lr * gradient

        if not (self.verbose and step % 10000 == 0):
            continue
        # Report the loss with the weights that were just updated.
        z = np.dot(X, self.theta)
        h = self.__sigmoid(z)
        print(f'loss: {self.__loss(h, y)} \t')
def Plot(self, col_nb):
    """
    Plot overlaid histograms of one numeric column, one per Hogwarts house.

    The dataset is read from ``self.file_name``; the figure is saved to
    'datasets/histogram.png' and then shown.

    :param col_nb: column to plot; an index, or a column name (str) that is
        looked up in the header row.
    :return: None. Invalid input is reported with print() and nothing is
        plotted.
    """
    ds = DataSet(self.file_name)

    if type(col_nb) is str:
        if col_nb not in ds.dataset[0]:
            print('Error with name column')
            return
        col_nb = ds.dataset[0].index(col_nb)

    if not ds.isNumeric_columns(col_nb):
        print("Input column must is numerics")
        return

    values = ds.get_float_col(col_nb)
    stats = Math_calculat(values)
    # Quartile(0) / Quartile(1) are presumably the minimum and maximum -
    # confirm in Math_calculat.
    lower = stats.Quartile(0)
    upper = stats.Quartile(1)
    bins = np.linspace(lower, upper, self.size)

    color = {
        'Ravenclaw': 'b',
        'Gryffindor': 'r',
        'Slytherin': 'g',
        'Hufflepuff': 'yellow'
    }

    per_house = {house: [] for house in set(ds.get_col(self.y_col))}
    # Row 0 is the header, so dataset row k pairs with values[k - 1].
    for row_idx in range(1, len(ds.dataset)):
        house = ds.dataset[row_idx][self.y_col]
        per_house[house].append(values[row_idx - 1])

    for house, grades in per_house.items():
        plot.hist(grades, bins, facecolor=color[house], alpha=0.5,
                  label=house)

    # NOTE(review): the source indentation was lost; axis labels and title are
    # assumed to be controlled by the legend flag - confirm.
    if self.legend:
        plot.legend(loc='upper right')
        plot.ylabel('Frequency')
        plot.xlabel('Value')
        plot.title('Histogram')
    plot.savefig('datasets/histogram.png')
    plot.show()
def Plot(self):
    """
    Draw an N x N pair plot of the first numeric columns, coloured by house.

    N is ``self.max_nb_columns`` capped at the number of numeric columns found
    by ``ds.find_numeric_label()``; the capped value is written back to the
    instance. Diagonal cells hold per-house histograms, the other cells hold
    scatter plots. The figure is saved to 'datasets/pair_plot.png' and shown.

    :return: None. If there is nothing to plot a message is printed instead.
    """
    ds = DataSet(self.file_name)
    ds.find_numeric_label()
    self.max_nb_columns = min(self.max_nb_columns, len(ds.numeric_columns))
    N = self.max_nb_columns
    if N < 1:
        # plot.subplots rejects an empty grid; fail with a readable message.
        print('Error: no numeric columns to plot')
        return

    color = {
        'Ravenclaw': 'b',
        'Gryffindor': 'r',
        'Slytherin': 'g',
        'Hufflepuff': 'yellow'
    }
    # squeeze=False keeps ax two-dimensional even when N == 1, so ax[i, j]
    # is always a valid index.
    fig, ax = plot.subplots(N, N, figsize=self.fig_size, squeeze=False)
    fig.tight_layout()

    # Loop-invariant data, extracted once instead of once per cell.
    header = ds.dataset[0]
    rows = ds.dataset[:self.size]
    houses = set(ds.get_col(self.y_col))
    columns = [
        ds.get_float_col(ds.numeric_columns[idx])[:self.size]
        for idx in range(N)
    ]

    for i in range(N):
        col1 = columns[i]
        for j in range(N):
            col2 = columns[j]
            feature1 = {house: [] for house in houses}
            feature2 = {house: [] for house in houses}
            # rows[0] is the header, so rows[k] pairs with col[k - 1].
            for k in range(1, len(rows)):
                house = rows[k][self.y_col]
                feature1[house].append(col1[k - 1])
                feature2[house].append(col2[k - 1])

            cell = ax[i, j]
            if i == 0:
                cell.xaxis.set_label_position('top')
                cell.set_xlabel(header[ds.numeric_columns[j]], rotation=0)
            if j == 0:
                cell.set_ylabel(header[ds.numeric_columns[i]], rotation=0)

            if i == j:
                statistic = Math_calculat(col1)
                bins = np.linspace(statistic.Quartile(0),
                                   statistic.Quartile(1))
                for house in feature1:
                    cell.hist(feature1[house], bins, facecolor=color[house],
                              alpha=0.5, label=house)
            else:
                for house in feature1:
                    cell.scatter(feature1[house], feature2[house],
                                 c=color[house], alpha=0.5, label=house)
            cell.tick_params(labelbottom=False)
            cell.tick_params(labelleft=False)

    if self.legend:
        plot.legend(loc='lower right')
    plot.savefig('datasets/pair_plot.png')
    plot.show()
def main():
    '''
    Use this script to run experiments and fine-tune the algorithms.

    Reads the CSV named by the first CLI argument (relative to this script),
    trains a logistic regression on a train split and prints the confusion
    matrices for the training and the testing set.
    '''
    # Load the dataset
    script_dir = os.path.dirname(__file__)
    d = DataSet(os.path.join(script_dir, sys.argv[1]))
    d.loadDataSet()

    # Remove useless features (not numeric + bad regressors).
    dropped_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    to_remove = [d.data_set[0].index(name) for name in dropped_columns]

    kept_cells = []
    for row in d.data_set:
        kept_cells.append([
            row[j] for j in range(len(d.data_set[0])) if j not in to_remove
        ])
    X = np.array(kept_cells)
    # The first row is the header; only the data rows are converted.
    X = convert_to_float(X[1:, ])

    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Impute missing values
    imputer = MeanImputation(X)
    imputer.train()
    imputer.transform()

    # Scale the variables
    scaler = Scaling(X)
    scaler.train()
    scaler.transform()

    # Split the dataset in a training and testing set
    splitter = SplitTrainTest(X, y)
    splitter.Split()
    X_train, y_train = splitter.X_train, splitter.y_train
    X_test, y_test = splitter.X_test, splitter.y_test

    # Train a logistic regression model
    model = LogisticRegression(X=X_train, y=y_train)
    model.train()

    # Compute the confusion matrix over the training set
    train_matrix = ConfusionMatrix(y_train, model.predict())
    train_matrix.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the training set ****************'
    )
    print('\n')
    train_matrix.Print()

    # Compute the confusion matrix over the testing set, reusing the labels
    # of the training matrix.
    test_matrix = ConfusionMatrix(y_test, model.predict(X_test),
                                  train_matrix.unique_labels)
    test_matrix.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the testing set ****************'
    )
    print('\n')
    test_matrix.Print()
class HistogramPerHouse:
    """
    - A class to plot the histogram of the Hogwarts features
    - Example to run:
        from histogram import HistogramPerHouse
        import matplotlib.pyplot as plt
        h = HistogramPerHouse()
        h.Plot(8)
        plt.show()
    """

    def __init__(self,
                 path_to_data_set='resources/dataset_train.csv',
                 legend=True,
                 granularity=100):
        """
        :param path_to_data_set: a string. The path to the dataset.
        :param legend: a boolean. If legend is False, only the histogram is
            plotted. If legend is True, the legend, title and axis labels are
            plotted as well.
        :param granularity: an integer. The number of points passed to
            np.linspace to build the histogram bin edges.
        """
        self.data_set = DataSet(path_to_data_set)
        self.data_set.loadDataSet()
        self.legend = legend
        self.granularity = granularity

    @staticmethod
    def _group_by_house(houses, feature):
        """
        Group the values of a feature by house.

        :param houses: sequence of house labels, one per row.
        :param feature: sequence of values aligned with ``houses``.
        :return: dict mapping each house to the list of its values, with the
            houses in first-seen order.
        """
        grouped = {}
        for house, value in zip(houses, feature):
            grouped.setdefault(house, []).append(value)
        return grouped

    def Plot(self, col_nb):
        """
        The plotting function. Draws one histogram per house with pyplot.

        :param col_nb: integer. The position of the column / feature to plot.
        """
        feature = self.data_set.extractColumn(col_nb=col_nb,
                                              convert_to_float=True)[1:]
        # Column 1 holds the house labels; [1:] skips the header cell.
        houses = self.data_set.extractColumn(col_nb=1)[1:]
        to_plot = self._group_by_house(houses, feature)

        full_list = [value for values in to_plot.values() for value in values]
        s = Statistics(full_list)
        # Quartile(0) / Quartile(1) are presumably the minimum and maximum -
        # confirm in Statistics. Named so the builtins are not shadowed.
        lowest = s.Quartile(0)
        highest = s.Quartile(1)
        bins = np.linspace(lowest, highest, self.granularity)

        colors = {
            'Hufflepuff': 'c',
            'Ravenclaw': 'orange',
            'Slytherin': 'g',
            'Gryffindor': 'r',
        }
        # Iterating the dict (not a set) keeps the draw order stable per run.
        for house, values in to_plot.items():
            plt.hist(values, bins, alpha=0.5, label=house,
                     color=colors[house])

        if self.legend:
            plt.legend(loc='upper right')
            plt.title(
                'Histogram of "%s" grades among the different Hogwarts houses'
                % self.data_set.data_set[0][col_nb])
            plt.xlabel("Grade")
            plt.ylabel("Count")
class ScatterPlotPerHouse:
    """
    - A class to plot the scatter plot of a Hogwarts feature vs. another one
    - Example to run:
        from scatter_plot import ScatterPlotPerHouse
        import matplotlib.pyplot as plt
        sc = ScatterPlotPerHouse()
        sc.Plot(8,9)
        plt.show()
    """

    def __init__(self,
                 path_to_data_set='resources/dataset_train.csv',
                 legend=True,
                 size=10):
        """
        :param path_to_data_set: a string. The path to the dataset.
        :param legend: a boolean. If legend is False, only the scatter plot is
            drawn. If legend is True, the legend, title and axis labels are
            plotted as well.
        :param size: an int. The size of the points in the scatter plot.
        """
        self.data_set = DataSet(path_to_data_set)
        self.data_set.loadDataSet()
        self.legend = legend
        self.size = size

    @staticmethod
    def _is_present(value):
        """
        Tell whether a cell holds a usable value.

        Falsy cells are treated as missing (presumably None or '' - confirm
        in DataSet.extractColumn), except a numeric zero, which is real data.
        """
        return bool(value) or value == 0

    @classmethod
    def _group_pairs_by_house(cls, houses, feature_1, feature_2):
        """
        Group complete (feature_1, feature_2) pairs by house.

        :param houses: sequence of house labels, one per row.
        :param feature_1: sequence of x values aligned with ``houses``.
        :param feature_2: sequence of y values aligned with ``houses``.
        :return: dict mapping each house, in first-seen order, to a tuple
            ``(xs, ys)``. Rows where either value is missing are skipped; a
            house with no complete row maps to two empty lists.
        """
        grouped = {}
        for house, x, y in zip(houses, feature_1, feature_2):
            xs, ys = grouped.setdefault(house, ([], []))
            if cls._is_present(x) and cls._is_present(y):
                xs.append(x)
                ys.append(y)
        return grouped

    def Plot(self, col_nb_1, col_nb_2):
        """
        Plotting function. Draws one point cloud per house with pyplot.

        :param col_nb_1: integer. The position of the 1st column / feature
            to plot (x axis).
        :param col_nb_2: integer. The position of the 2nd column / feature
            to plot (y axis).
        """
        feature_1 = self.data_set.extractColumn(col_nb=col_nb_1,
                                                convert_to_float=True)[1:]
        feature_2 = self.data_set.extractColumn(col_nb=col_nb_2,
                                                convert_to_float=True)[1:]
        # Column 1 holds the house labels; [1:] skips the header cell.
        houses = self.data_set.extractColumn(col_nb=1)[1:]
        to_plot = self._group_pairs_by_house(houses, feature_1, feature_2)

        colors = {
            'Hufflepuff': 'c',
            'Ravenclaw': 'orange',
            'Slytherin': 'g',
            'Gryffindor': 'r',
        }
        for house, (xs, ys) in to_plot.items():
            plt.scatter(x=xs,
                        y=ys,
                        c=colors[house],
                        alpha=0.5,
                        label=house,
                        s=self.size)

        if self.legend:
            header = self.data_set.data_set[0]
            plt.legend(loc='upper right')
            plt.title(
                'Scatter plot of "%s" vs "%s" grades among the different Hogwarts houses'
                % (header[col_nb_1], header[col_nb_2]))
            plt.xlabel(header[col_nb_1])
            plt.ylabel(header[col_nb_2])
class PairPlot:
    """
    - A class to plot the pair plot of many Hogwarts features.
    - Example to run:
        from pair_plot import PairPlot
        import matplotlib.pyplot as plt
        pp = PairPlot()
        pp.Plot()
        plt.show()
    """

    def __init__(self,
                 path_to_data_set='resources/dataset_train.csv',
                 max_nb_features=4,
                 fig_size=(8, 8)):
        """
        :param path_to_data_set: a string. The path to the dataset.
        :param max_nb_features: an integer. The number of features to analyze
            - analysis will start from the first feature (on the left) and
            continue until reaching the number max of features. This was
            necessary for the sake of readability (there are ~10 numeric
            features, which would lead to 10**2 = 100 plots to do).
        :param fig_size: an integer tuple. The size of the figure to output.
        """
        self.path_to_data_set = path_to_data_set
        self.data_set = DataSet(self.path_to_data_set)
        self.data_set.loadDataSet()
        self.max_nb_features = max_nb_features
        self.fig_size = fig_size
        self.numeric_features = []

    def extractNumericFeatures(self):
        """
        Automatically extracts the numeric features in the dataset.

        Stores in ``self.numeric_features`` the positions of every column that
        ``isNumericFeature`` accepts, except the 'Index' column. The list is
        rebuilt on each call, so calling this twice does not duplicate
        entries.
        """
        header = self.data_set.data_set[0]
        self.numeric_features = [
            i for i, name in enumerate(header)
            if name != 'Index' and self.data_set.isNumericFeature(i)
        ]

    def Plot(self):
        """
        Plotting function.

        Draws an N x N grid over the first ``max_nb_features`` numeric
        features: histograms on the diagonal, scatter plots elsewhere, and a
        figure-level legend in the lower right corner.

        :return: None
        """
        # The class example calls Plot() directly, so detect the numeric
        # columns here when the caller has not done it yet.
        if not self.numeric_features:
            self.extractNumericFeatures()

        plt.figure(figsize=self.fig_size)
        SMALL_SIZE = 5
        plt.rc('xtick', labelsize=SMALL_SIZE)
        plt.rc('ytick', labelsize=SMALL_SIZE)
        plt.suptitle("Pair Plot")

        features = self.numeric_features[:self.max_nb_features]
        N = len(features)
        if N == 0:
            # No grid means no axes to take a legend from.
            return

        header = self.data_set.data_set[0]
        # Both helpers read the dataset from disk when constructed, so build
        # them once rather than once per cell.
        h = HistogramPerHouse(path_to_data_set=self.path_to_data_set,
                              legend=False,
                              granularity=30)
        sc = ScatterPlotPerHouse(path_to_data_set=self.path_to_data_set,
                                 legend=False,
                                 size=1)

        ax = None
        for i in range(N):
            for j in range(N):
                # The helpers draw into the current axes selected here.
                ax = plt.subplot(N, N, 1 + j + i * N)
                if i == 0:
                    ax.xaxis.set_label_position('top')
                    plt.xlabel(header[features[j]], fontsize=8)
                if j == 0:
                    plt.ylabel(header[features[i]], fontsize=8)
                if i == j:
                    h.Plot(features[i])
                else:
                    sc.Plot(features[j], features[i])

        # The last cell is on the diagonal; its per-house histogram labels
        # feed the shared legend.
        handles, labels = ax.get_legend_handles_labels()
        plt.figlegend(handles, labels, loc='lower right', prop={'size': 6})
def main():
    """
    Predict the house of every row of the CSV named by the first CLI argument.

    The input path is resolved relative to this script. Imputation, scaling
    and regression parameters are taken from JSON files in the 'results'
    directory next to the script. Predictions are written to
    'resources/houses.csv' with the columns 'Index' and 'Hogwarts House'.
    """
    dirname = os.path.dirname(__file__)
    dirname_prediction = os.path.join(dirname, 'results')
    file_name = os.path.join(dirname, sys.argv[1])
    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    dropped_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    # header.index raises ValueError when an expected column is absent.
    to_remove = {header.index(name) for name in dropped_columns}

    # Keep the row identifiers (header cell dropped) for the output file.
    index_position = header.index('Index')
    indexes = np.array([row[index_position] for row in d.data_set])[1:]

    X = np.array([[row[j] for j in range(len(header)) if j not in to_remove]
                  for row in d.data_set])
    # Drop the header row before converting the cells to floats.
    X = convert_to_float(X[1:, ])

    m = MeanImputation(X,
                       path_to_mean_imputation=os.path.join(
                           dirname_prediction, 'mean_imputation.json'))
    m.transform()

    sc = Scaling(X,
                 path_to_scaling=os.path.join(dirname_prediction,
                                              'scaling.json'))
    sc.transform()

    l = LogisticRegression(X=X,
                           path_to_beta=os.path.join(dirname_prediction,
                                                     'beta.json'))
    predictions = l.predict()

    output_name = os.path.join(dirname, 'resources/houses.csv')
    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); plain 'w' is enough because the file is never read back.
    with open(output_name, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter=',')
        writer.writerow(['Index', 'Hogwarts House'])
        for i in range(len(indexes)):
            writer.writerow([indexes[i], predictions[i]])
def main():
    """
    Train an SGD logistic regression and print its confusion matrices.

    Reads the CSV named by the first CLI argument (relative to this script),
    imputes and scales the kept columns, splits the data, trains with the
    'sgd' optimizer and prints the confusion matrix for the training and the
    testing set.
    """
    script_dir = os.path.dirname(__file__)
    d = DataSet(os.path.join(script_dir, sys.argv[1]))
    d.loadDataSet()

    dropped_columns = (
        'Index',
        'First Name',
        'Last Name',
        'Birthday',
        'Best Hand',
        'Hogwarts House',
        # Tests 7/10/18
        'Arithmancy',
        'Defense Against the Dark Arts',
        'Divination',
        'Muggle Studies',
        'History of Magic',
        'Transfiguration',
        'Potions',
        'Care of Magical Creatures',
        'Charms',
        'Flying',
    )
    to_remove = [d.data_set[0].index(name) for name in dropped_columns]

    kept_cells = []
    for row in d.data_set:
        kept_cells.append([
            row[j] for j in range(len(d.data_set[0])) if j not in to_remove
        ])
    X = np.array(kept_cells)
    # The first row is the header; only the data rows are converted.
    X = convert_to_float(X[1:, ])

    y_col_nb = d.data_set[0].index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    imputer = MeanImputation(X)
    imputer.train()
    imputer.transform()

    scaler = Scaling(X)
    scaler.train()
    scaler.transform()

    splitter = SplitTrainTest(X, y)
    splitter.Split()
    X_train, y_train = splitter.X_train, splitter.y_train
    X_test, y_test = splitter.X_test, splitter.y_test

    sgd_settings = {'alpha': 0.5, 'n': 5, 'batch_size': 16}
    model = LogisticRegression(X=X_train,
                               y=y_train,
                               optimizer='sgd',
                               optimizer_params=sgd_settings)
    model.train()

    train_matrix = ConfusionMatrix(y_train, model.predict())
    train_matrix.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the training set ****************'
    )
    print('\n')
    train_matrix.Print()

    # The testing matrix reuses the labels found on the training set.
    test_matrix = ConfusionMatrix(y_test, model.predict(X_test),
                                  train_matrix.unique_labels)
    test_matrix.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the testing set ****************'
    )
    print('\n')
    test_matrix.Print()