def test_init_with_correct_file_input(self):
    """
    Parsing the settings file used by this test case must leave a
    non-empty plain dict in ``user_input``.
    """
    parsed = utils.Settings(self.check_settings_file).user_input
    # Exact type check: a dict subclass would not satisfy this.
    self.assertIs(parsed.__class__, dict)
    self.assertTrue(len(parsed) > 0)
def test_resolves_to_correct_dir(self):
    """
    Given two candidate data dirs, the settings parser must resolve
    ``data_dir`` to the one containing the test and train dirs.
    """
    candidates_json = u'{"data_dir": ["fake", "TestSettingsParserDir"]}'
    settings = utils.Settings(io.StringIO(candidates_json))
    expected_dir = os.path.abspath(self.data_dir)
    self.assertEqual(settings.data_dir, expected_dir)
def test_error_if_required_missing(self):
    """
    Settings that omit a required entry must be rejected with ValueError.
    """
    incomplete_json = u'{"foo": 5, "bar": "duck"}'
    incomplete_stream = io.StringIO(incomplete_json)
    # Callable form of assertRaises: only the Settings construction is
    # expected to raise, not the creation of the stream above.
    self.assertRaises(ValueError, utils.Settings, incomplete_stream)
def check_default_values_during_init(self):
    """
    Make sure default values are set for random_seed and classes.

    NOTE(review): the method name does not start with ``test``, so the
    default unittest loader will not collect it -- confirm whether that
    is intentional before renaming.
    """
    settings = utils.Settings(self.check_settings_file)
    # Compare the seed by value. An identity check against an int literal
    # only passes because CPython caches small integers, which is an
    # implementation detail rather than a language guarantee.
    self.assertEqual(settings.random_seed, 42)
    # Identity check kept as in the original: the default is expected to
    # be the very object exposed by the constants module.
    self.assertIs(settings.classes, constants.classes)
def test_init_with_correct_stringIO(self):
    """
    Parsing correct json supplied through an io.StringIO must leave a
    non-empty plain dict in ``user_input``.
    """
    json_stream = io.StringIO(u'{"data_dir": ["TestSettingsParserDir"]}')
    parsed = utils.Settings(json_stream).user_input
    # Exact type check: a dict subclass would not satisfy this.
    self.assertIs(parsed.__class__, dict)
    self.assertTrue(len(parsed) > 0)
def main():
    """
    Train a logistic regression on the configured image attributes.

    Loads the training data with the attribute processing applied, prints
    the log loss of each cross-validation split and their mean, then
    refits on all of the data and dumps the classifier to ``pkl_file``.
    """
    # Hardcoded for now; this should be parsed from json.
    # Earlier configurations, kept for reference:
    #   attributes_settings = ['width','height']
    #   pkl_file = 'imsizeLR.pkl'
    #   attributes_settings = ['numpixels','aspectratio']
    #   pkl_file = 'imsizeLR_alt.pkl'
    pkl_file = 'imattr1.pkl'
    attributes_settings = [
        'width',
        'height',
        'mean',
        'stderr',
        'propwhite',
        'propbool',
        'propblack',
    ]

    # Global settings
    settings = utils.Settings('settings.json')

    # Wrapper applied to every image while loading
    processing = highlevelfeatures.BasicAttributes(attributes_settings)

    # Training data with the processing applied
    X, y = utils.load_data(
        settings.image_fnames,
        classes=settings.classes,
        processing=processing,
    )

    # Replace the labels with their integer encoding
    y = sklearn.preprocessing.LabelEncoder().fit_transform(y)

    # Classifiers tried previously:
    #   sklearn.dummy.DummyClassifier(strategy='uniform')
    #   sklearn.linear_model.SGDClassifier(n_jobs=-1, loss='log')
    #   sklearn.ensemble.RandomForestClassifier(n_jobs=-1,
    #                                           n_estimators=100,
    #                                           verbose=1)
    #   sklearn.svm.SVC(probability=True)
    clf = sklearn.linear_model.LogisticRegression()

    cv = sklearn.cross_validation.StratifiedShuffleSplit(y)

    # Score each split by the log loss of its predicted probabilities
    fold_losses = []
    for train_idx, test_idx in cv:
        clf.fit(X[train_idx], y[train_idx])
        fold_probs = clf.predict_proba(X[test_idx])
        fold_losses.append(sklearn.metrics.log_loss(y[test_idx], fold_probs))

    print(fold_losses)
    print('CV average = {}'.format(np.mean(fold_losses)))

    # Refit on everything and keep the model for later
    clf.fit(X, y)
    joblib.dump(clf, pkl_file, compress=3)
def main(run_settings_path, verbose=False, force=False):
    """
    Load the global and run-specific settings, then call the training
    routine that matches the run's 'model type'.

    verbose and force are forwarded to the chosen training routine; force
    is also passed to the run settings loader.

    Raises NotImplementedError when the model type is neither 'sklearn'
    nor 'pylearn2'.
    """
    settings_path = 'settings.json'

    # settings that are not specific to this run
    settings = utils.Settings(settings_path)

    # settings specific to this run
    run_settings = utils.load_run_settings(
        run_settings_path,
        settings,
        settings_path=settings_path,
        force=force,
    )

    model_type = run_settings['model type']
    if model_type == 'sklearn':
        train_sklearn(run_settings, verbose=verbose, force=force)
        return
    if model_type == 'pylearn2':
        train_pylearn2(run_settings, verbose=verbose, force=force)
        return
    raise NotImplementedError("Unsupported model type.")
def main(run_settings_path, verbose=False, altdata=None, augment=1, split=1):
    """
    Load the settings and call the test routine that matches the run's
    'model type'.

    altdata, augment and split are only forwarded to test_pylearn2; the
    sklearn path receives just the run settings and verbose.

    Raises NotImplementedError when the model type is neither 'sklearn'
    nor 'pylearn2'.
    """
    settings_path = 'settings.json'
    settings = utils.Settings(settings_path)

    # the test script won't overwrite the pickle, so always force load
    run_settings = utils.load_run_settings(
        run_settings_path,
        settings,
        settings_path=settings_path,
        force=True,
    )

    # Same dispatch boilerplate as the training script
    model_type = run_settings['model type']
    if model_type == 'sklearn':
        test_sklearn(run_settings, verbose=verbose)
        return
    if model_type == 'pylearn2':
        test_pylearn2(
            run_settings,
            verbose=verbose,
            altdata=altdata,
            augment=augment,
            split=split,
        )
        return
    raise NotImplementedError("Unsupported model type.")
def main():
    """
    Fit the MLP wrapper on the combined MNIST train and test archives.

    Reads ``mnist_train.npz`` and ``mnist_test.npz`` from the settings'
    data_dir, stacks their ``arr_0`` entries into X and their ``arr_1``
    entries into y, and fits ``MLP_sk_interface`` on the result.
    """
    settings = utils.Settings('settings.json')

    # locations of the two mnist archives
    train_path = os.path.join(settings.data_dir, "mnist_train.npz")
    test_path = os.path.join(settings.data_dir, "mnist_test.npz")

    # np.load on an .npz archive returns an NpzFile that holds the file
    # open; the context managers close both handles once the arrays have
    # been copied out by vstack/hstack.
    with np.load(train_path) as train_npz, np.load(test_path) as test_npz:
        # sticking it all together
        X = np.vstack([train_npz['arr_0'], test_npz['arr_0']])
        y = np.hstack([train_npz['arr_1'], test_npz['arr_1']])

    # Testing out the mlp function
    mlp = neukrill_net.nk_mlp.MLP_sk_interface(verbose=True)
    mlp.fit(X, y)
def main():
    """
    Write a submission file from a previously pickled attribute classifier.

    Loads the data with the attribute processing applied, loads the
    classifier from ``pkl_file``, predicts class probabilities and hands
    them to ``utils.write_predictions`` together with the image names.
    """
    # Hardcoded for now; this should be parsed from json.
    # Earlier configurations, kept for reference:
    #   attributes_settings = ['width','height']
    #   pkl_file = 'imsizeLR.pkl'
    #   out_fname = 'submission_imsizeLR.csv'
    #
    #   attributes_settings = ['width','height']
    #   pkl_file = 'imsizeSVM.pkl'
    #   out_fname = 'submission_imsizeSVM.csv'
    #
    #   attributes_settings = ['numpixels','aspectratio']
    #   pkl_file = 'imsizeLR_alt.pkl'
    #   out_fname = 'submission_imsizeLR_alt.csv'
    pkl_file = 'imattr1.pkl'
    out_fname = 'submission_imattr1.csv'
    attributes_settings = [
        'width',
        'height',
        'mean',
        'stderr',
        'propwhite',
        'propbool',
        'propblack',
    ]

    # Global settings, which provide the file names of the test data
    settings = utils.Settings('settings.json')

    # Wrapper applied to every image while loading
    processing = highlevelfeatures.attributes_wrapper(attributes_settings)

    # Test data with the processing applied
    X, names = utils.load_data(
        settings.image_fnames,
        processing=processing,
        verbose=False,
    )

    # Predict with the stored classifier and write the submission
    probabilities = joblib.load(pkl_file).predict_proba(X)
    utils.write_predictions(out_fname, probabilities, names, settings.classes)
def main():
    """
    Write a submission that predicts the class priors for every test image.

    Prints the log loss the priors achieve on the training labels, writes
    ``submission_priorprobs.csv`` with one identical row of priors per
    test image, and writes a gzipped copy next to it.
    """
    out_fname = 'submission_priorprobs.csv'

    settings = utils.Settings('settings.json')

    # Get names of test data files
    names = [os.path.basename(fpath)
             for fpath in settings.image_fnames['test']]

    # Score expected from training data (not a CV score because no folds)
    labels = []
    for class_name in settings.classes:
        num_images = len(settings.image_fnames['train'][class_name])
        # generate the class labels and add them to the list
        labels += num_images * [class_name]

    # One copy of the priors per training label
    p = settings.class_priors[np.newaxis, :]
    p = np.tile(p, (len(labels), 1))

    # NOTE(review): LabelEncoder numbers labels in sorted order, so this
    # score presumably relies on settings.classes (and class_priors)
    # already being in that order -- confirm.
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(labels)

    cv = sklearn.metrics.log_loss(y, p)
    print('CV = {}'.format(cv))

    # Every image gets the same probabilities, so build that part once
    priors_row = list(settings.class_priors)

    # Write output
    with open(out_fname, 'w') as csv_out:
        out_writer = csv.writer(csv_out, delimiter=',')
        out_writer.writerow(['image'] + list(settings.classes))
        for name in names:
            out_writer.writerow([name] + priors_row)

    # Compress a copy; both handles are closed even if the copy fails
    with open(out_fname, 'rb') as f_in, \
            gzip.open(out_fname + '.gz', 'wb') as f_out:
        f_out.writelines(f_in)
def main():
    """
    Train and evaluate a bag-of-words logistic regression, then write a
    submission file.

    Fits the bag-of-words encoder on a random subsample of the raw
    training data, encodes all training data, prints the log loss of each
    cross-validation split and their mean, refits the classifier on all
    training data, and writes predictions for the raw test data to
    ``submission_bow_initial.csv``.
    """
    # this should be parsed from json, but hardcoded for now
    bow_options = {
        'verbose': True,
        'normalise_hist': False,
        'n_features_max': 100,
        'patch_size': 15,
        'clusteralgo': 'kmeans',
        'n_clusters': 20,
        'random_seed': 42,
    }

    # Load the settings
    settings = utils.Settings('settings.json')

    # Load the raw data
    print('Loading the raw training data')
    rawdata, labels = utils.load_rawdata(settings.image_fnames,
                                         classes=settings.classes)

    # Encode the labels
    label_encoder = sklearn.preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(labels)

    # Probably not the best classifier
    clf = sklearn.linear_model.LogisticRegression()

    cv = sklearn.cross_validation.StratifiedShuffleSplit(y)

    bow = highlevelfeatures.BagOfWords(**bow_options)

    # Subsample so we can do this in sensible time.
    # randint's upper bound is exclusive, so this draws from the same
    # range as the deprecated random_integers(0, len(rawdata) - 1).
    sample = np.random.randint(0, len(rawdata), size=1000)
    bow.fit([rawdata[i] for i in sample])
    # Alternative: fit on everything
    #bow.fit(rawdata)

    print('Bagging words for raw training data')
    X = bow.extractfeatures(rawdata)
    X = np.squeeze(X)

    # Try cross-validating
    print('Cross-validating')
    results = []
    for train, test in cv:
        # Alternative: make a new BOW encoding per split
        #bow = highlevelfeatures.BagOfWords(**bow_options)
        #bow.fit([rawdata[i] for i in train])
        #X = bow.extractfeatures(rawdata)
        clf.fit(X[train, :], y[train])
        p = clf.predict_proba(X[test])
        res = sklearn.metrics.log_loss(y[test], p)
        print(res)
        results.append(res)

    print(results)
    print('CV average = {}'.format(np.mean(results)))

    # Train on the whole thing
    # Alternative: refit the encoder on everything first
    #bow = highlevelfeatures.BagOfWords(**bow_options)
    #bow.fit(rawdata)
    #X = bow.extractfeatures(rawdata)
    clf.fit(X, y)

    print('Loading the raw test data')
    rawtest, names = utils.load_rawdata(settings.image_fnames)

    print('Bagging words for raw test data')
    # Encode the TEST images. The previous code re-encoded the training
    # data here, so the submission paired test names with predictions
    # made on training images.
    X2 = bow.extractfeatures(rawtest)
    X2 = np.squeeze(X2)

    p = clf.predict_proba(X2)

    utils.write_predictions('submission_bow_initial.csv', p, names,
                            settings.classes)
def test_error_if_file_does_not_exist(self):
    """
    Ensure a ValueError is thrown when Settings is given the name
    'fake_file' (presumably a path that does not exist when the tests run).
    """
    self.assertRaises(ValueError, utils.Settings, 'fake_file')