Example #1
def percrank_train(args):
    opts, files = getopt(args, 'c:d:s:j:w:e:r:')
    candgen_model = None
    train_size = 1.0
    parallel = False
    jobs_number = 0
    work_dir = None
    experiment_id = None

    for opt, arg in opts:
        if opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-s':
            train_size = float(arg)
        elif opt == '-c':
            candgen_model = arg
        elif opt == '-j':
            parallel = True
            jobs_number = int(arg)
        elif opt == '-w':
            work_dir = arg
        elif opt == '-e':
            experiment_id = arg
        elif opt == '-r' and arg:
            rnd.seed(arg)

    if len(files) != 4:
        sys.exit(__doc__)

    fname_rank_config, fname_train_das, fname_train_ttrees, fname_rank_model = files
    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model
    if rank_config.get('nn'):
        from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
        if rank_config['nn'] in ['emb', 'emb_trees', 'emb_prev']:
            ranker_class = EmbNNRanker
        else:
            ranker_class = SimpleNNRanker
    else:
        ranker_class = PerceptronRanker

    log_info('Using %s for ranking' % ranker_class.__name__)

    if not parallel:
        ranker = ranker_class(rank_config)
    else:
        rank_config['jobs_number'] = jobs_number
        if work_dir is None:
            work_dir, _ = os.path.split(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id,
                                ranker_class)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=train_size)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    ranker.save_to_file(fname_rank_model)
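The option string 'c:d:s:j:w:e:r:' tells getopt that every one of these flags takes a value. Below is a minimal, self-contained sketch of how this call separates the flags from the four positional file arguments that percrank_train expects; the argument values are hypothetical and serve only to illustrate the parsing.

from getopt import getopt

argv = ['-j', '4', '-s', '0.5',
        'rank_config.py', 'train-das.txt', 'train-ttrees.yaml.gz', 'model.pickle.gz']
opts, files = getopt(argv, 'c:d:s:j:w:e:r:')
# opts  == [('-j', '4'), ('-s', '0.5')]
# files == ['rank_config.py', 'train-das.txt', 'train-ttrees.yaml.gz', 'model.pickle.gz']
for opt, arg in opts:
    if opt == '-j':
        jobs_number = int(arg)    # number of parallel jobs
    elif opt == '-s':
        train_size = float(arg)   # portion of the training data to use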
Example #3
def seq2seq_train(args):

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str, help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str,
                    help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: DAs, trees/sentences, contexts)')
    ap.add_argument('-l', '--lexic-data', type=str,
                    help='Lexicalization data paths (1-2 comma-separated files: surface forms, ' +
                    'training lexic. instructions)')
    ap.add_argument('-t', '--tb-summary-dir', '--tensorboard-summary-dir', '--tensorboard', type=str,
                    help='Directory where Tensorboard summaries are saved during training')

    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)

    if args.tb_summary_dir:  # override Tensorboard setting
        config['tb_summary_dir'] = args.tb_summary_dir
    if args.jobs:  # parallelize when training
        config['jobs_number'] = args.jobs
        if not args.work_dir:
            work_dir, _ = os.path.split(args.seq2seq_config_file)
        generator = ParallelSeq2SeqTraining(config, args.work_dir or work_dir, args.experiment_id)
    else:  # just a single training instance
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size, context_file=args.context_file,
                    validation_files=args.valid_data, lexic_files=args.lexic_data)

    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
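Since seq2seq_train() receives its argument list explicitly and hands it to ap.parse_args(args), it can be invoked directly with a list of strings. A hedged usage sketch; every file name below is a hypothetical placeholder.

seq2seq_train(['-j', '8',
               '-r', 'seed1',
               '-v', 'valid-das.txt,valid-trees.yaml.gz',
               'seq2seq_config.py',     # seq2seq_config_file
               'train-das.txt',         # da_train_file
               'train-trees.yaml.gz',   # tree_train_file
               'model.pickle.gz'])      # seq2seq_model_file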
Example #6
 def exposed_train(self, rnd_seed, das_file, ttree_file, data_portion, context_file, validation_files):
     """Run the whole training.
     """
     rnd.seed(rnd_seed)
     log_info('Random seed: %f' % rnd_seed)
     tstart = time.time()
     log_info('Starting training...')
     self.seq2seq.train(das_file, ttree_file, data_portion, context_file, validation_files)
     log_info('Training finished -- time taken: %f secs.' % (time.time() - tstart))
     top_cost = self.seq2seq.top_k_costs[0]
     log_info('Best cost: %f' % top_cost)
     return top_cost
 def exposed_training_pass(self, w, pass_no, rnd_seed, data_offset,
                           data_len):
     """(Worker) Run one pass over a part of the training data.
     @param w: initial perceptron weights (pickled)
     @param pass_no: pass number (for logging purposes)
     @param rnd_seed: random generator seed for shuffling training examples
     @param data_offset: training data portion start
     @param data_len: training data portion size
     @return: updated perceptron weights after passing the selected data portion (pickled)
     """
     log_info('Training pass %d with data portion %d + %d' %
              (pass_no, data_offset, data_len))
     # use the local ranker instance
     ranker = self.ranker_inst
     # import current feature weights
     tstart = time.time()
     ranker.set_weights(pickle.loads(w))
     log_info('Weights loading: %f secs.' % (time.time() - tstart))
     # save rest of the training data to temporary variables, set just the
     # required portion for computation
     all_train_das = ranker.train_das
     ranker.train_das = ranker.train_das[data_offset:data_offset + data_len]
     all_train_trees = ranker.train_trees
     ranker.train_trees = ranker.train_trees[data_offset:data_offset + data_len]
     all_train_feats = ranker.train_feats
     ranker.train_feats = ranker.train_feats[data_offset:data_offset + data_len]
     all_train_sents = ranker.train_sents
     ranker.train_sents = ranker.train_sents[data_offset:data_offset + data_len]
     all_train_order = ranker.train_order
     ranker.train_order = list(range(len(ranker.train_trees)))  # a list, so it can be shuffled in place
     if ranker.randomize:
         rnd.seed(rnd_seed)
         rnd.shuffle(ranker.train_order)
     # do the actual computation (update w)
     ranker._training_pass(pass_no)
     # return the rest of the training data to member variables
     ranker.train_das = all_train_das
     ranker.train_trees = all_train_trees
     ranker.train_feats = all_train_feats
     ranker.train_sents = all_train_sents
     ranker.train_order = all_train_order
     # return the result of the computation
     log_info('Training pass %d / %d / %d done.' %
              (pass_no, data_offset, data_len))
     tstart = time.time()
     dump = pickle.dumps((ranker.get_weights(), ranker.get_diagnostics()),
                         pickle.HIGHEST_PROTOCOL)
     log_info('Weights saving: %f secs.' % (time.time() - tstart))
     return dump
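The heart of exposed_training_pass is a save/slice/restore pattern: the worker unpickles the current weights, temporarily narrows the training data to its assigned portion, runs one pass, restores the full data, and returns the updated weights in pickled form. A stripped-down sketch of that pattern with illustrative names, without the RPC wrapper or diagnostics:

import pickle
import random as rnd

def run_portion(ranker, weights_dump, pass_no, seed, offset, length):
    ranker.set_weights(pickle.loads(weights_dump))   # import the current weights
    saved = (ranker.train_das, ranker.train_trees)   # keep the full data aside
    ranker.train_das = ranker.train_das[offset:offset + length]
    ranker.train_trees = ranker.train_trees[offset:offset + length]
    ranker.train_order = list(range(len(ranker.train_trees)))
    rnd.seed(seed)
    rnd.shuffle(ranker.train_order)                  # shuffle only this portion
    ranker._training_pass(pass_no)                   # update the weights in place
    ranker.train_das, ranker.train_trees = saved     # restore the full data
    return pickle.dumps(ranker.get_weights(), pickle.HIGHEST_PROTOCOL)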
Example #8
def seq2seq_train(args):

    ap = ArgumentParser()

    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str, help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str,
                    help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: DAs, trees/sentences, contexts)')

    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)
    if args.jobs:
        config['jobs_number'] = args.jobs
        if not args.work_dir:
            work_dir, _ = os.path.split(args.seq2seq_config_file)
        generator = ParallelSeq2SeqTraining(config, args.work_dir or work_dir, args.experiment_id)
    else:
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size, context_file=args.context_file,
                    validation_files=args.valid_data)

    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
Example #10
def train(args):

    if args.random_seed:  # set random seed if needed
        rnd.seed(args.random_seed)

    log_info("Loading configuration from %s..." % args.config_file)
    with codecs.open(args.config_file, 'r', 'UTF-8') as fh:
        cfg = yaml.load(fh)

    log_info("Initializing...")
    rp = RatingPredictor(cfg)
    if args.tensorboard_dir_id is not None:
        tb_dir, run_id = args.tensorboard_dir_id.split(':', 1)
        rp.set_tensorboard_logging(tb_dir, run_id)
    log_info("Training...")
    rp.train(args.train_data,
             valid_data_file=args.valid_data,
             data_portion=args.training_portion,
             model_fname=args.model_file)
    log_info("Saving model to %s..." % args.model_file)
    rp.save_to_file(args.model_file)
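A note on the config-loading step above: calling yaml.load() without an explicit Loader has been deprecated since PyYAML 5.1, so a current equivalent would typically use safe_load, assuming the configuration file contains only plain keys and values with no custom YAML tags. The file name below is a hypothetical placeholder.

import codecs
import yaml

with codecs.open('ratpred_config.yaml', 'r', 'UTF-8') as fh:
    cfg = yaml.safe_load(fh)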