def smoke_build_model(build_model):
    """Smoke-test a model-builder: build on fake data, train briefly, and
    sanity-check that training produced a plausible best epoch/score."""
    set_log_file("testlog.log")
    fake_data = prepare_fake_data()
    # build_model hands back the compiled train/validate functions plus the
    # index and eval bookkeeping that train_model expects, as one tuple.
    (f_train, f_validate, train_idx, valid_idx,
     train_eval, valid_eval) = build_model(fake_data, batch_size=1)
    best_validation_loss, best_epoch = train_model(
        f_train, f_validate, train_idx, valid_idx, train_eval, valid_eval,
        n_epochs=100)
    # NOTE(review): asserting "loss" > 0.5 suggests this value is actually an
    # accuracy-like score — confirm against train_model's return contract.
    assert best_validation_loss > 0.5
    assert best_epoch > 0
help='the name of the parameter set that we want to use') parser.add_argument('-f', dest='file', type=str, default=None, help='the data file to use') parser.add_argument('-o', dest='outname', type=str, default=gen_log_name(), help='name for the log file to be generated') parser.add_argument( '-t', dest='task_num', type=int, default=0, help='a way to separate different runs of the same parameter-set') args = parser.parse_args() params = config.get_config(args.param_set) set_log_file(args.outname) if args.file: params['dataset_name'] = args.file elif 'dataset_name' not in params: params['dataset_name'] = default_dataset run(0, **params) print "finished" if sys.platform.startswith('win'): from win_utils import winalert winalert()
# NOTE(review): chunk starts mid-function — the enclosing `def run(...)` header
# (which supplies `kwargs` and `task_num`) is outside the visible source.
    dataset = smart_load_data(**kwargs)
    # 80/20 train/validation split; fold_index selects which slice is held out.
    train_idx, valid_idx = cv_split_binarized(dataset, percent=0.2,
                                              fold_index=task_num)
    prepared_data = (dataset, train_idx, valid_idx)

    # load classifiers to build ensemble out of
    # (every '*.model' file in the current working directory)
    saved_classifiers = filter(lambda(fn): os.path.splitext(fn)[1] == '.model',
                               os.listdir('.'))

    # build and train ensemble
    model = SelectedModel(prepared_data, saved_classifiers=saved_classifiers,
                          **kwargs)
    _, params = model.train_full(**kwargs)


if __name__ == '__main__':
    args = docopt(__doc__)
    params = {}
    log_filename = args['--out'] or gen_log_name()
    # --quiet redirects the log to the null device instead of a real file.
    if args['--quiet']:
        log_filename = os.devnull
        print("Not printing to log file.")
    set_log_file(log_filename)
    if args['--file']:
        params['dataset_name'] = args['--file']
    # The task number also selects which condition set this run uses.
    task_num = int(args['--task_number'])
    params['conds'] = COND_TYPES[task_num % len(COND_TYPES)]
    run(task_num=task_num, **params)
    print("Finished")
# NOTE(review): chunk starts mid-function — the `def _to_fast_with_idxs(...)`
# header (binding fname, headers, skills, data, idxs, single_skill) is outside
# the visible source.
    with open(fname, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(headers)
        for i in idxs:
            student = data['subject'][i]
            # 'correct' appears to be enum-coded; 2 encodes a correct response
            # — confirm against the data-preparation code.
            outcome = 'correct' if data['correct'][i] == 2 else 'incorrect'
            # single_skill collapses all knowledge components into one 'word' KC.
            KCs = 'word' if single_skill else data.orig['skill'][i]
            word_feats = skills[data['skill'][i]]
            row = [student, outcome, KCs] + list(word_feats)
            writer.writerow(row)


def to_fast(data, train_fname=None, valid_fname=None, fold=0, vector_length=150,
            single_skill=False):
    # Export one CV fold of `data` as FAST's tab-separated train/test files.
    train_fname = train_fname or 'FAST+deepkt_train{}.txt'.format(fold)
    valid_fname = valid_fname or 'FAST+deepkt_test{}.txt'.format(fold)
    # NOTE(review): a sibling copy of this function calls
    # cv_split(..., cv_fold=fold) — confirm which keyword (fold_index vs
    # cv_fold) cv_split actually accepts; one of the two copies is stale.
    valid_idx, train_idx = cv_split(data, fold_index=fold, no_new_skill=True)
    skills = gen_word_matrix(data.get_data('skill'), data['skill'].enum_pairs,
                             vector_length=vector_length)
    # create fast header
    feat_headers = ['*features_{}'.format(i) for i in xrange(vector_length)]
    headers = ['student', 'outcome', 'KCs'] + feat_headers
    _to_fast_with_idxs(train_fname, headers, skills, data, train_idx,
                       single_skill=single_skill)
    _to_fast_with_idxs(valid_fname, headers, skills, data, valid_idx,
                       single_skill=single_skill)


# Script entry: dump fold 0 of the dataset in FAST format.
set_log_file('temp.log')
#for fold in xrange(14):
#    to_fast(fold=fold, single_skill=False)
data = prepare_data(dataset_name='data/data5.gz', top_n=14)
to_fast(data, fold=0, single_skill=False)
prepared_data = chinese.prepare_data(top_n=40, **kwargs) train_idx, valid_idx = cv_split(prepared_data, percent=.1, fold_index=task_num, **kwargs) model = SelectedModel((prepared_data, train_idx, valid_idx), **kwargs) model.train_full() if __name__ == '__main__': default_dataset = 'raw_data/chinese_dictation.txt' parser = argparse.ArgumentParser(description="run an experiment on this computer") parser.add_argument('-p', dest='param_set', type=str, default='default', choices=config.all_param_set_keys, help='the name of the parameter set that we want to use') parser.add_argument('-f', dest='file', type=str, default=None, help='the data file to use') parser.add_argument('-o', dest='outname', type=str, default=gen_log_name(), help='name for the log file to be generated') parser.add_argument('-t', dest='task_num', type=int, default=0, help='a way to separate different runs of the same parameter-set') args = parser.parse_args() params = config.get_config(args.param_set) set_log_file(args.outname) if args.file: params['dataset_name'] = args.file elif 'dataset_name' not in params: params['dataset_name'] = default_dataset run(0, **params) print "finished"
# NOTE(review): chunk starts mid-function — the `def _to_fast_with_idxs(...)`
# header (binding fname, headers, skills, data, idxs, single_skill) is outside
# the visible source.
    with open(fname, 'w') as f:
        writer = csv.writer(f, delimiter='\t')
        writer.writerow(headers)
        for i in idxs:
            student = data['subject'][i]
            # 'correct' appears to be enum-coded; 2 encodes a correct response
            # — confirm against the data-preparation code.
            outcome = 'correct' if data['correct'][i] == 2 else 'incorrect'
            # single_skill collapses all knowledge components into one 'word' KC.
            KCs = 'word' if single_skill else data.orig['skill'][i]
            word_feats = skills[data['skill'][i]]
            row = [student, outcome, KCs] + list(word_feats)
            writer.writerow(row)


def to_fast(data, train_fname=None, valid_fname=None, fold=0, vector_length=150,
            single_skill=False):
    # Export one CV fold of `data` as FAST's tab-separated train/test files.
    train_fname = train_fname or 'FAST+deepkt_train{}.txt'.format(fold)
    valid_fname = valid_fname or 'FAST+deepkt_test{}.txt'.format(fold)
    # NOTE(review): a sibling copy of this function calls
    # cv_split(..., fold_index=fold) — confirm which keyword (cv_fold vs
    # fold_index) cv_split actually accepts; one of the two copies is stale.
    valid_idx, train_idx = cv_split(data, cv_fold=fold, no_new_skill=True)
    skills = gen_word_matrix(data.get_data('skill'), data['skill'].enum_pairs,
                             vector_length=vector_length)
    # create fast header
    feat_headers = ['*features_{}'.format(i) for i in xrange(vector_length)]
    headers = ['student', 'outcome', 'KCs'] + feat_headers
    _to_fast_with_idxs(train_fname, headers, skills, data, train_idx,
                       single_skill=single_skill)
    _to_fast_with_idxs(valid_fname, headers, skills, data, valid_idx,
                       single_skill=single_skill)


# Script entry: dump fold 0 of the dataset in FAST format.
set_log_file('temp.log')
#for fold in xrange(14):
#    to_fast(fold=fold, single_skill=False)
data = prepare_data(dataset_name='data/data5.gz', top_n=14)
to_fast(data, fold=0, single_skill=False)
def __enter__(self):
    """Enter the context: route logging to this object's log file.

    FIX: previously returned None, so ``with LogObj(...) as x:`` bound
    ``x = None``. Returning ``self`` follows the context-manager protocol
    and is backward-compatible for callers that ignore the bound name.
    """
    set_log_file(self.log_name)
    return self
with open('{}.model'.format(output_name), "wb") as f: pickle.dump(model, f) if __name__ == '__main__': args = docopt(__doc__) # load args params = config.get_config(args['--param_set']) err_filename = args['--err'] or gen_log_name() out_filename = args['--out'] or "{}.model".format(err_filename) if args['--quiet']: log_filename = os.devnull out_filename = os.devnull print("Suppressing logging and output.") set_log_file(err_filename) task_num = int(args['--task_number']) params['model'] = args['--model'] params['features'] = args['--feature'] params['conds'] = args['--cond'] params['dataset_name'] = args['--in'] params['data_mode'] = args['--data_mode'] params['output_name'] = out_filename params['task_num'] = int(args['--task_number']) if args['run']: run(**params) # TODO: figure out a good interface
# NOTE(review): chunk starts mid-function — this `else:` closes a model-type
# dispatch inside `def run(...)` whose earlier branches are outside the
# visible source.
    else:
        raise Exception("model type is not valid")
    dataset = prepare_data(**kwargs)
    # Hold out 10% of the data; fold_index picks which slice is validation.
    train_idx, valid_idx = cv_split(dataset, percent=0.1, fold_index=task_num)
    prepared_data = (dataset, train_idx, valid_idx)
    model = SelectedModel(prepared_data, **kwargs)
    model.train_full(**kwargs)


if __name__ == '__main__':
    default_dataset = "raw_data/all_siegle.txt"
    args = docopt(__doc__)
    params = config.get_config(args['--param_set'])
    log_filename = args['--out'] or gen_log_name()
    # --quiet redirects the log to the null device instead of a real file.
    if args['--quiet']:
        log_filename = os.devnull
        print("Not printing to log file.")
    set_log_file(log_filename)
    # --file overrides the dataset; otherwise fall back to the parameter
    # set's own dataset_name, and finally to the module default.
    if args['--file']:
        params['dataset_name'] = args['--file']
    elif 'dataset_name' not in params:
        params['dataset_name'] = default_dataset
    # Fixed condition labels for this experiment variant.
    params['conds'] = ['EyesClosed', 'EyesOpen']
    run(0, **params)
    print("Finished")