def run_logistic_regression(train_subset=45000, valid_size=5000, test=False):
    """Train a logistic regression classifier on the CIFAR-10 training data
    and report its training and validation accuracy. If test is True, also
    predict labels for the test set and write them to submission.csv.
    """
    train_dataset, train_labels = load_train_data()
    train_dataset = reformat_dataset(train_dataset)

    # Carve the validation set off the front, then take the training
    # subset from the remainder
    valid_dataset = train_dataset[:valid_size, :]
    valid_labels = train_labels[:valid_size]
    train_dataset = train_dataset[valid_size:valid_size + train_subset, :]
    train_labels = train_labels[valid_size:valid_size + train_subset]

    print 'Training set size:', train_dataset.shape, train_labels.shape
    print 'Validation set size:', valid_dataset.shape, valid_labels.shape

    print 'Training...'
    logreg = LogisticRegression()
    logreg.fit(train_dataset, train_labels)

    train_predict = logreg.predict(train_dataset)
    valid_predict = logreg.predict(valid_dataset)
    train_accuracy = accuracy(train_predict, train_labels)
    valid_accuracy = accuracy(valid_predict, valid_labels)
    print_accuracy(train_accuracy, valid_accuracy)

    if not test:
        return

    # Predict the test set and write the results to submission.csv
    print 'Predicting test dataset...'
    test_dataset = load_test_data()
    # Flatten the (n, width, height, channels) image tensor into an
    # (n, width * height * channels) matrix
    test_dataset = test_dataset.reshape(
        (test_dataset.shape[0],
         test_dataset.shape[1] * test_dataset.shape[2] * test_dataset.shape[3]))
    test_predict = logreg.predict(test_dataset)
    label_matrices_to_csv(test_predict, 'submission.csv')
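# A hedged sketch of the accuracy helper assumed above. The real accuracy
# and print_accuracy helpers are defined elsewhere in this project; this is
# only an illustrative guess, assuming predictions and labels are 1-D label
# vectors of equal length:
#
#   def accuracy(predictions, labels):
#       # Percentage of predicted labels that match the true labels
#       return 100.0 * np.mean(predictions == labels)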
def get_train_valid_data(train_subset=45000, valid_size=5000, reformat_data=True, reformat_label=True):
    """Load the dataset from the cifar10_train.pickle file, convert it to
    numpy.float32, and split it into a training set and a validation set.

    Note that train_subset + valid_size cannot be more than 50000.

    Keyword arguments:
    train_subset -- the number of training examples
    valid_size -- the number of validation examples
    reformat_data -- if True, reformat the dataset into a 2-D matrix;
        else keep the dataset as a 4-D matrix
    reformat_label -- if True, reformat the labels into an (n x num_labels)
        matrix; else keep the labels as a 2-D matrix
    """
    if train_subset + valid_size > 50000:
        raise ValueError('train_subset + valid_size cannot be more than 50000')

    train_dataset, train_labels = load_train_data()
    if reformat_data:
        train_dataset = reformat_dataset(train_dataset)
    if reformat_label:
        train_labels = reformat_labels(train_labels)
    train_dataset = train_dataset.astype(np.float32)
    train_labels = train_labels.astype(np.float32)

    # Carve the validation set off the front, then take the training
    # subset from the remainder
    valid_dataset = train_dataset[:valid_size]
    valid_labels = train_labels[:valid_size]
    train_dataset = train_dataset[valid_size:valid_size + train_subset]
    train_labels = train_labels[valid_size:valid_size + train_subset]

    print 'Training set size:', train_dataset.shape, train_labels.shape
    print 'Validation set size:', valid_dataset.shape, valid_labels.shape

    return train_dataset, train_labels, valid_dataset, valid_labels
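# A minimal usage sketch. The sizes below are hypothetical, chosen small for
# a quick smoke test; it assumes cifar10_train.pickle is available on disk
# for load_train_data to read.
if __name__ == '__main__':
    train_dataset, train_labels, valid_dataset, valid_labels = \
        get_train_valid_data(train_subset=10000, valid_size=2000,
                             reformat_data=False, reformat_label=True)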