def percrank_train(args):
    """Command-line entry point: train a perceptron-based ranker.

    Recognized options: -d debug-output file, -s training-data portion,
    -c candidate generator model, -j parallel job count, -w working
    directory, -e experiment ID, -r random seed.  Expects exactly four
    positional arguments: ranker config, training DAs, training t-trees,
    and the output model file name.
    """
    opts, positional = getopt(args, 'c:d:s:j:w:e:r:')

    candgen_path = None
    data_portion = 1.0
    use_parallel = False
    num_jobs = 0
    working_dir = None
    exp_id = None

    for flag, value in opts:
        if flag == '-d':
            set_debug_stream(file_stream(value, mode='w'))
        elif flag == '-s':
            data_portion = float(value)
        elif flag == '-c':
            candgen_path = value
        elif flag == '-j':
            use_parallel = True
            num_jobs = int(value)
        elif flag == '-w':
            working_dir = value
        elif flag == '-e':
            exp_id = value
        elif flag == '-r' and value:
            rnd.seed(value)

    if len(positional) != 4:
        sys.exit(__doc__)
    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = positional

    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_path:
        rank_config['candgen_model'] = candgen_path

    # select the ranker implementation according to the 'nn' config setting
    # (NN rankers are imported lazily so the dependency is only needed here)
    if rank_config.get('nn'):
        from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
        if rank_config['nn'] in ['emb', 'emb_trees', 'emb_prev']:
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker
    log_info('Using %s for ranking' % ranker_class.__name__)

    if use_parallel:
        rank_config['jobs_number'] = num_jobs
        if working_dir is None:
            working_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, working_dir, exp_id, ranker_class)
    else:
        ranker = ranker_class(rank_config)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=data_portion)
    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    ranker.save_to_file(fname_rank_model)
def run_training(work_dir, config_file, train_file, model_file,
                 test_file=None, classif_file=None, memory=MEMORY, name='train'):
    """\
    Run the model training.

    :param work_dir: working directory (other paths are made relative to it \
        unless the configuration comes from a pickle)
    :param config_file: model configuration — either a '.pickle' file (already \
        relative to work_dir) or a Python config loaded via Config
    :param train_file: training data file name
    :param model_file: output path for the trained model
    :param test_file: optional evaluation data file name
    :param classif_file: optional classification output file name
    :param memory: memory limit passed on to created jobs / split-model training
    :param name: base name for jobs created when the config is unfolded
    """
    # initialization from the configuration file
    _, ext = os.path.splitext(config_file)
    if ext == '.pickle':
        # load configuration from a pickle (we're already in the working directory);
        # use a context manager so the file is closed even if unpickling fails
        with open(config_file, mode='rb') as fh:
            cfg = pickle.load(fh)
        demarshal_lambda(cfg, 'filter_attr')
        demarshal_lambda(cfg, 'postprocess')
    else:
        # load by running Python code (make paths relative to working directory)
        config_file = os.path.join(work_dir, config_file)
        cfg = Config(config_file)
    # training
    if cfg.get('unfold_pattern'):
        # the config describes a whole batch: unfold it into individual
        # configurations and spawn one job per configuration, then stop here
        pattern = cfg['unfold_pattern']
        del cfg['unfold_pattern']
        unfold_key = cfg.get('unfold_key', 'unfold_key')
        cfgs = cfg.unfold_lists(pattern, unfold_key)
        for cfg in cfgs:
            # sanitize the unfold key so it is usable in a job name
            key = re.sub(r'[^A-Za-z0-9_]', '', cfg[unfold_key])
            create_job(cfg, name + '-' + key, work_dir, train_file, model_file,
                       test_file, classif_file, memory)
        return
    if cfg.get('divide_func'):
        model = SplitModel(cfg)
        model.train(train_file, work_dir, memory)
    else:
        model = Model(cfg)
        model.train(train_file)
    # evaluation
    if test_file is not None and classif_file is not None:
        if ext != '.pickle':
            # this means we're not in the working directory
            classif_file = os.path.join(work_dir, classif_file)
        log_info('Evaluation on file: ' + test_file)
        score = model.evaluate(test_file, classif_file=classif_file)
        log_info('Score: ' + str(score))
    # save the model
    if ext != '.pickle':
        # we need to make the path relative to work_dir
        model_file = os.path.join(work_dir, model_file)
    model.save_to_file(model_file)
def percrank_train(args):
    """Command-line handler: train a perceptron ranker and save it to a file.

    Options: -c candidate generator model, -d debug-output file, -e experiment
    ID, -j parallel job count, -s training-data portion, -w working directory.
    Requires four positional arguments: ranker config, training DAs, training
    t-trees, output model file.
    """
    opts, files = getopt(args, 'c:d:s:j:w:e:')

    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-c':
            candgen_model = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-e':
            experiment_id = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-w':
            work_dir = arg

    if len(files) != 4:
        sys.exit(__doc__)
    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files

    log_info('Training perceptron ranker...')
    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model

    # choose the ranker class based on the 'nn' configuration value
    nn_setting = rank_config.get('nn')
    if not nn_setting:
        ranker_class = PerceptronRanker
    elif nn_setting == 'emb':
        ranker_class = EmbNNRanker
    else:
        ranker_class = SimpleNNRanker

    if parallel:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)
    else:
        ranker = ranker_class(rank_config)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)
    ranker.save_to_file(fname_rank_model)