def main():
    ''' Use this script to run experiments and fine-tune the algoritms '''
    # Resolve the dataset path given on the command line relative to this
    # script's own directory, then load it.
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, sys.argv[1])
    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    # Remove useless features (not numeric + bad regressors).
    to_remove = [
        header.index(name)
        for name in (
            'Index',
            'First Name',
            'Last Name',
            'Birthday',
            'Best Hand',
            'Hogwarts House',
            # Tests 7/10/18
            'Arithmancy',
            'Defense Against the Dark Arts',
            'Divination',
            'Muggle Studies',
            'History of Magic',
            'Transfiguration',
            'Potions',
            'Care of Magical Creatures',
            'Charms',
            'Flying',
        )
    ]

    # Keep only the surviving columns (original order preserved), drop the
    # header row, and coerce everything to floats.
    kept_cols = [j for j in range(len(header)) if j not in to_remove]
    X = np.array([[row[j] for j in kept_cols] for row in d.data_set])
    X = convert_to_float(X[1:, ])

    # The target is the house label column, minus the header row.
    y_col_nb = header.index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Impute missing values
    imputer = MeanImputation(X)
    imputer.train()
    imputer.transform()

    # Scale the variables
    scaler = Scaling(X)
    scaler.train()
    scaler.transform()

    # Split the dataset in a training and testing set
    splitter = SplitTrainTest(X, y)
    splitter.Split()
    X_train, y_train = splitter.X_train, splitter.y_train
    X_test, y_test = splitter.X_test, splitter.y_test

    # Train a logistic regression model
    model = LogisticRegression(X=X_train, y=y_train)
    model.train()

    # Compute the confusion matrix over the training set
    y_predicted = model.predict()
    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the training set ****************'
    )
    print('\n')
    cm1.Print()

    # Compute the confusion matrix over the testing set, reusing the label
    # ordering discovered on the training set.
    y_predicted = model.predict(X_test)
    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the testing set ****************'
    )
    print('\n')
    cm2.Print()
def main():
    """Run the experiment pipeline with an SGD-configured logistic regression.

    Loads the dataset named by ``sys.argv[1]`` (resolved relative to this
    script), drops non-numeric and rejected feature columns, imputes and
    scales the features, splits into train/test sets, trains the model, and
    prints a confusion matrix for each split.
    """
    dirname = os.path.dirname(__file__)
    file_name = os.path.join(dirname, sys.argv[1])
    d = DataSet(file_name)
    d.loadDataSet()

    header = d.data_set[0]
    # Columns excluded from the feature matrix: identifiers, non-numeric
    # fields, the target itself, and regressors rejected in earlier tests.
    to_remove = [
        header.index(name)
        for name in (
            'Index',
            'First Name',
            'Last Name',
            'Birthday',
            'Best Hand',
            'Hogwarts House',
            # Tests 7/10/18
            'Arithmancy',
            'Defense Against the Dark Arts',
            'Divination',
            'Muggle Studies',
            'History of Magic',
            'Transfiguration',
            'Potions',
            'Care of Magical Creatures',
            'Charms',
            'Flying',
        )
    ]

    # Build the feature matrix from the surviving columns, strip the header
    # row, and convert the string cells to floats.
    kept_cols = [j for j in range(len(header)) if j not in to_remove]
    X = np.array([[row[j] for j in kept_cols] for row in d.data_set])
    X = convert_to_float(X[1:, ])

    # Target vector: the house label for every non-header row.
    y_col_nb = header.index('Hogwarts House')
    y = np.array(d.extractColumn(y_col_nb)[1:])

    # Mean-impute missing values in place.
    imputer = MeanImputation(X)
    imputer.train()
    imputer.transform()

    # Standardize the features in place.
    scaler = Scaling(X)
    scaler.train()
    scaler.transform()

    # Hold out a test split.
    splitter = SplitTrainTest(X, y)
    splitter.Split()
    X_train, y_train = splitter.X_train, splitter.y_train
    X_test, y_test = splitter.X_test, splitter.y_test

    # Fit logistic regression with stochastic gradient descent.
    model = LogisticRegression(X=X_train,
                               y=y_train,
                               optimizer='sgd',
                               optimizer_params={
                                   'alpha': 0.5,
                                   'n': 5,
                                   'batch_size': 16
                               })
    model.train()

    # Report performance on the training split.
    y_predicted = model.predict()
    cm1 = ConfusionMatrix(y_train, y_predicted)
    cm1.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the training set ****************'
    )
    print('\n')
    cm1.Print()

    # Report performance on the held-out split, reusing the training
    # split's label ordering so the two matrices line up.
    y_predicted = model.predict(X_test)
    cm2 = ConfusionMatrix(y_test, y_predicted, cm1.unique_labels)
    cm2.getMatrix()
    print('\n\n')
    print(
        '**************** Confusion Matrix on the testing set ****************'
    )
    print('\n')
    cm2.Print()