Пример #1
0
def percrank_train(args):
    """Train a ranker according to command-line style arguments and save it.

    Recognized options in `args`:
        -d FILE   write debug output to FILE
        -s FLOAT  portion of the training data to use (default 1.0)
        -c MODEL  candidate generator model, stored in the ranker config
        -j N      train in parallel with N jobs
        -w DIR    working directory for parallel training (defaults to the
                  directory of the ranker configuration file)
        -e ID     experiment ID passed on to the parallel trainer
        -r SEED   random seed (ignored if empty)

    Exactly four positional arguments are required, in this order: ranker
    configuration file, training DAs, training t-trees, output model file.
    With any other count, the process exits with the module docstring.
    """
    parsed_opts, positional = getopt(args, 'c:d:s:j:w:e:r:')

    # defaults, possibly overridden by the options below
    candgen_model = None
    data_portion = 1.0
    use_parallel = False
    num_jobs = 0
    work_dir = None
    exp_id = None

    for flag, value in parsed_opts:
        if flag == '-c':
            candgen_model = value
        elif flag == '-d':
            set_debug_stream(file_stream(value, mode='w'))
        elif flag == '-e':
            exp_id = value
        elif flag == '-j':
            use_parallel = True
            num_jobs = int(value)
        elif flag == '-r' and value:
            rnd.seed(value)
        elif flag == '-s':
            data_portion = float(value)
        elif flag == '-w':
            work_dir = value

    if len(positional) != 4:
        sys.exit(__doc__)
    (fname_rank_config, fname_train_das,
     fname_train_ttrees, fname_rank_model) = positional

    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model

    # choose the ranker implementation based on the 'nn' configuration value
    if not rank_config.get('nn'):
        ranker_class = PerceptronRanker
    else:
        from tgen.rank_nn import SimpleNNRanker, EmbNNRanker
        wants_embeddings = rank_config['nn'] in ('emb', 'emb_trees', 'emb_prev')
        ranker_class = EmbNNRanker if wants_embeddings else SimpleNNRanker

    log_info('Using %s for ranking' % ranker_class.__name__)

    if use_parallel:
        rank_config['jobs_number'] = num_jobs
        if work_dir is None:
            # fall back to the directory holding the configuration file
            work_dir = os.path.dirname(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, exp_id, ranker_class)
    else:
        ranker = ranker_class(rank_config)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=data_portion)

    # avoid the "maximum recursion depth exceeded" error
    sys.setrecursionlimit(100000)
    ranker.save_to_file(fname_rank_model)
Пример #2
0
def percrank_train(args):
    """Train a ranker according to command-line style arguments and save it.

    Recognized options in `args`:
        -d FILE   write debug output to FILE
        -s FLOAT  portion of the training data to use (default 1.0)
        -c MODEL  candidate generator model, stored in the ranker config
        -j N      train in parallel with N jobs
        -w DIR    working directory for parallel training (defaults to the
                  directory of the ranker configuration file)
        -e ID     experiment ID passed on to the parallel trainer

    Exactly four positional arguments are required, in this order: ranker
    configuration file, training DAs, training t-trees, output model file.
    With any other count, the process exits with the module docstring.
    """
    option_pairs, file_args = getopt(args, 'c:d:s:j:w:e:')

    # defaults, possibly overridden by the options below
    candgen_model = work_dir = experiment_id = None
    portion = 1.0
    jobs = 0
    in_parallel = False

    for name, val in option_pairs:
        if name == '-c':
            candgen_model = val
        elif name == '-d':
            set_debug_stream(file_stream(val, mode='w'))
        elif name == '-e':
            experiment_id = val
        elif name == '-j':
            in_parallel = True
            jobs = int(val)
        elif name == '-s':
            portion = float(val)
        elif name == '-w':
            work_dir = val

    if len(file_args) != 4:
        sys.exit(__doc__)
    (fname_rank_config, fname_train_das,
     fname_train_ttrees, fname_rank_model) = file_args

    log_info('Training perceptron ranker...')

    rank_config = Config(fname_rank_config)
    if candgen_model:
        rank_config['candgen_model'] = candgen_model

    # choose the ranker implementation based on the 'nn' configuration value
    if not rank_config.get('nn'):
        ranker_class = PerceptronRanker
    elif rank_config['nn'] == 'emb':
        ranker_class = EmbNNRanker
    else:
        ranker_class = SimpleNNRanker

    if in_parallel:
        rank_config['jobs_number'] = jobs
        if work_dir is None:
            # fall back to the directory holding the configuration file
            work_dir = os.path.dirname(fname_rank_config)
        ranker = ParallelRanker(rank_config, work_dir, experiment_id, ranker_class)
    else:
        ranker = ranker_class(rank_config)

    ranker.train(fname_train_das, fname_train_ttrees, data_portion=portion)
    ranker.save_to_file(fname_rank_model)
Пример #3
0
def rerank_cl_train(args):
    """Train a reranking classifier and save it.

    The only option is `-a MODEL`: a Seq2Seq model to load. When given, the
    trained classifier is attached to the loaded model as `classif_filter`
    and the whole model is saved; otherwise just the classifier is saved.

    Exactly four positional arguments are required, in this order:
    configuration file, training DAs, training trees, output model file.
    With any other count, the process exits with a usage message.
    """
    parsed_opts, positional = getopt(args, 'a:')

    # if '-a' is repeated, the last occurrence wins
    given_models = [value for flag, value in parsed_opts if flag == '-a']
    seq2seq_model_path = given_models[-1] if given_models else None

    if len(positional) != 4:
        sys.exit("Invalid arguments.\n" + __doc__)
    fname_config, fname_da_train, fname_trees_train, fname_cl_model = positional

    # the generator (if any) is loaded before the classifier is trained
    generator = None
    if seq2seq_model_path:
        generator = Seq2SeqBase.load_from_file(seq2seq_model_path)

    classifier = RerankingClassifier(Config(fname_config))
    classifier.train(fname_da_train, fname_trees_train)

    if not seq2seq_model_path:
        classifier.save_to_file(fname_cl_model)
        return

    generator.classif_filter = classifier
    generator.save_to_file(fname_cl_model)
Пример #4
0
def seq2seq_train(args):
    """Train a sequence-to-sequence generator and save the resulting model.

    `args` is a list of command-line style arguments (parsed by argparse; on
    invalid arguments argparse terminates the process). Four positional
    arguments are required: the Seq2Seq configuration file, the training DAs,
    the training trees/sentences, and the output model file. See the option
    definitions below for the optional switches.

    If `--jobs` is given (and non-zero), training runs through
    ParallelSeq2SeqTraining; the working directory defaults to the directory
    of the configuration file. Otherwise a plain Seq2SeqGen is trained.
    """
    ap = ArgumentParser()

    ap.add_argument('-s', '--train-size', type=float,
                    help='Portion of the training data to use (default: 1.0)', default=1.0)
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-j', '--jobs', type=int, help='Number of parallel jobs to use')
    ap.add_argument('-w', '--work-dir', type=str,
                    help='Main working directory for parallel jobs')
    ap.add_argument('-e', '--experiment-id', type=str,
                    help='Experiment ID for parallel jobs (used as job name prefix)')
    ap.add_argument('-r', '--random-seed', type=str,
                    help='Initial random seed (used as string).')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')
    ap.add_argument('-v', '--valid-data', type=str,
                    help='Validation data paths (2-3 comma-separated files: DAs, trees/sentences, contexts)')
    ap.add_argument('-l', '--lexic-data', type=str,
                    help='Lexicalization data paths (1-2 comma-separated files: surface forms, ' +
                    'training lexic. instructions)')

    ap.add_argument('seq2seq_config_file', type=str, help='Seq2Seq generator configuration file')
    ap.add_argument('da_train_file', type=str, help='Input training DAs')
    ap.add_argument('tree_train_file', type=str, help='Input training trees/sentences')
    ap.add_argument('seq2seq_model_file', type=str,
                    help='File name where to save the trained Seq2Seq generator model')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))
    if args.random_seed:
        # Seed exactly once with the requested value. Feeding the result of
        # seed() into a second seed() call would re-seed with that return
        # value (presumably None, as with random.seed) and lose the seed.
        rnd.seed(args.random_seed)

    log_info('Training sequence-to-sequence generator...')

    config = Config(args.seq2seq_config_file)
    if args.jobs:
        config['jobs_number'] = args.jobs
        # default working directory: where the configuration file lives
        work_dir = args.work_dir or os.path.split(args.seq2seq_config_file)[0]
        generator = ParallelSeq2SeqTraining(config, work_dir, args.experiment_id)
    else:
        generator = Seq2SeqGen(config)

    generator.train(args.da_train_file, args.tree_train_file,
                    data_portion=args.train_size, context_file=args.context_file,
                    validation_files=args.valid_data, lexic_files=args.lexic_data)

    # raise the recursion limit before saving the (possibly deeply nested) model
    sys.setrecursionlimit(100000)
    generator.save_to_file(args.seq2seq_model_file)
Пример #5
0
def treecl_train(args):
    """Train a tree classifier and save it to a file.

    `args` must hold exactly four positional arguments, in this order:
    configuration file, training DAs, training trees, output model file.
    No options are accepted. With any other argument count, the process
    exits with a usage message.
    """
    from tgen.classif import TreeClassifier

    _, positional = getopt(args, '')
    if len(positional) != 4:
        sys.exit("Invalid arguments.\n" + __doc__)
    config_path, das_path, trees_path, model_path = positional

    classifier = TreeClassifier(Config(config_path))
    classifier.train(das_path, trees_path)
    classifier.save_to_file(model_path)
Пример #6
0
def main(argv):
    """Load a ConcatModel from several files and save it as a single file.

    Required options: `-c CONFIG` (configuration file) and `-o OUTPUT`
    (target file). All remaining arguments are the input file names; at
    least one is required. If anything is missing, the process exits with
    the module docstring.
    """
    parsed_opts, model_files = getopt(argv, 'c:o:')

    config_filename = output_filename = None
    for flag, value in parsed_opts:
        if flag == '-o':
            output_filename = value
        elif flag == '-c':
            config_filename = value

    # configuration, output name and at least one input file are all mandatory
    if not (config_filename and output_filename and model_files):
        sys.exit(__doc__)

    model = ConcatModel.load_from_files(Config(config_filename), model_files)
    model.save_to_file(output_filename)
Пример #7
0
def candgen_train(args):
    """Train a random candidate generator and save it to a file.

    Recognized options in `args`:
        -p N          pruning threshold (integer, default 1)
        -d FILE       write debug output to FILE
        -l            switch on 'parent_lemmas'
        -n            switch on 'node_limits'
        -c TYPE[:N]   compatible DAIs type, optionally with an integer limit
        -s            switch on 'compatible_slots'
        -t FILE       configuration file for 'tree_classif'

    Exactly three positional arguments are required, in this order:
    training DAs, training t-trees, output model file. With any other
    count, the process exits with a usage message.
    """
    parsed_opts, positional = getopt(args, 'p:lnc:sd:t:')

    # generator settings, starting from their defaults
    settings = {
        'prune_threshold': 1,
        'parent_lemmas': False,
        'node_limits': False,
        'compatible_dais_type': None,
        'compatible_dais_limit': None,
        'compatible_slots': False,
        'tree_classif': False,
    }

    for flag, value in parsed_opts:
        if flag == '-d':
            set_debug_stream(file_stream(value, mode='w'))
        elif flag == '-p':
            settings['prune_threshold'] = int(value)
        elif flag == '-l':
            settings['parent_lemmas'] = True
        elif flag == '-n':
            settings['node_limits'] = True
        elif flag == '-s':
            settings['compatible_slots'] = True
        elif flag == '-t':
            settings['tree_classif'] = Config(value)
        elif flag == '-c':
            # "type:limit" -- the limit part is optional and only
            # overwritten when it is actually present
            kind, colon, limit = value.partition(':')
            settings['compatible_dais_type'] = kind
            if colon:
                settings['compatible_dais_limit'] = int(limit)

    if len(positional) != 3:
        sys.exit("Invalid arguments.\n" + __doc__)
    fname_da_train, fname_ttrees_train, fname_cand_model = positional

    log_info('Training candidate generator...')
    candgen = RandomCandidateGenerator(settings)
    candgen.train(fname_da_train, fname_ttrees_train)
    candgen.save_to_file(fname_cand_model)
Пример #8
0
def run_training(work_dir, config_file, train_file, model_file,
                 test_file=None, classif_file=None, memory=MEMORY,
                 name='train'):
    """\
    Run the model training.

    The configuration is either unpickled (if `config_file` ends with
    '.pickle'; all paths are then used as given) or loaded via Config from
    a path relative to `work_dir` (the classification output and model
    paths are then made relative to `work_dir` as well).

    If the configuration has a non-empty 'unfold_pattern', one job is
    created per unfolded configuration and the function returns without
    training anything itself. Otherwise a SplitModel (if 'divide_func' is
    set) or a Model is trained, optionally evaluated (only when both
    `test_file` and `classif_file` are given) and saved to `model_file`.

    @param work_dir: working directory
    @param config_file: configuration file (pickle or Config source)
    @param train_file: training data file
    @param model_file: file name for the trained model
    @param test_file: evaluation data file (optional)
    @param classif_file: output file for the evaluation (optional)
    @param memory: passed on to created jobs / SplitModel training
    @param name: prefix for the names of the created jobs
    """
    # initialization from the configuration file
    _, ext = os.path.splitext(config_file)
    # load configuration from a pickle (we're already in the working directory)
    if ext == '.pickle':
        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only use this with configuration pickles from a trusted source.
        # The context manager closes the file even if unpickling fails.
        with open(config_file, mode='rb') as fh:
            cfg = pickle.load(fh)
        demarshal_lambda(cfg, 'filter_attr')
        demarshal_lambda(cfg, 'postprocess')
    # load by running Python code (make paths relative to working directory)
    else:
        config_file = os.path.join(work_dir, config_file)
        cfg = Config(config_file)
    # training
    if cfg.get('unfold_pattern'):
        pattern = cfg['unfold_pattern']
        del cfg['unfold_pattern']
        unfold_key = cfg.get('unfold_key', 'unfold_key')
        # one job per unfolded configuration; the job name suffix is the
        # unfold key value stripped of anything but [A-Za-z0-9_]
        for job_cfg in cfg.unfold_lists(pattern, unfold_key):
            key = re.sub(r'[^A-Za-z0-9_]', '', job_cfg[unfold_key])
            create_job(job_cfg, name + '-' + key, work_dir, train_file, model_file,
                       test_file, classif_file, memory)
        return
    if cfg.get('divide_func'):
        model = SplitModel(cfg)
        model.train(train_file, work_dir, memory)
    else:
        model = Model(cfg)
        model.train(train_file)
    # evaluation
    if test_file is not None and classif_file is not None:
        if ext != '.pickle':  # this means we're not in the working directory
            classif_file = os.path.join(work_dir, classif_file)
        log_info('Evaluation on file: ' + test_file)
        score = model.evaluate(test_file, classif_file=classif_file)
        log_info('Score: ' + str(score))
    # save the model
    if ext != '.pickle':  # we need to make the path relative to work_dir
        model_file = os.path.join(work_dir, model_file)
    model.save_to_file(model_file)
Пример #9
0
def asearch_gen(args):
    """A*search generation.

    Parses command-line style `args`, loads a candidate generator and a
    ranker, generates a tree for every input DA and, if requested,
    evaluates the output against gold trees.

    Recognized options:
        -e FILE  t-trees file with gold trees (switches evaluation on)
        -s SEL   selector of the gold trees in the evaluation file
        -d FILE  write debug output to FILE
        -w FILE  write the generated t-trees to FILE
        -c FILE  planner configuration file

    Exactly three positional arguments are required, in this order:
    candidate generator model, ranker model, file with the input DAs.
    With any other count, the process exits with a usage message.
    """
    from pytreex.core.document import Document

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into a fresh document when gold and generated trees share
        # the selector (presumably so they do not clash), otherwise add the
        # generated trees to the evaluation document itself
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(
                das, trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            gen_tree = tgen.generate_tree(da, gen_doc)
            lists_analyzer.append(gold_tree, tgen.open_list, tgen.close_list)
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" +
                          tgen.ranker.diffing_trees_with_scores(
                              da, gold_tree, gen_tree) + "\n")

        # all three ratios use 4 decimal places ('%4f' would only set a
        # minimum field width and print 6 decimals)
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' %
                 lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(
                eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(
                eval_bundle, tgen.language, tgen.selector + 'Xscore',
                "P: %.4f R: %.4f F1: %.4f" %
                p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree, gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" %
                 evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" %
                 evaler.score_stats())
        log_info(
            "Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
            evaler.common_substruct_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)
Пример #10
0
#!/usr/bin/env python

from flect.config import Config
from tgen.features import Features
from tgen.futil import trees_from_doc, read_ttrees, read_das
import sys
import timeit
import datetime

# Benchmark of feature extraction: expects exactly three arguments
# (features configuration, trees file, DAs file).
if len(sys.argv[1:]) != 3:
    sys.exit('Usage: ./bench_feats.py features_cfg.py trees.yaml.gz das.txt')

# sys.stderr.write() behaves the same on Python 2 and 3; the
# `print >> stream` form is Python 2 only and fails on Python 3.
sys.stderr.write('Loading...\n')

cfg = Config(sys.argv[1])
trees = trees_from_doc(read_ttrees(sys.argv[2]), 'en', '')
das = read_das(sys.argv[3])

feats = Features(cfg['features'])


def test_func():
    """Extract features once for every (tree, DA) pair of the loaded data."""
    for tree, da in zip(trees, das):
        feats.get_features(tree, {'da': da})


sys.stderr.write('Running test...\n')
# time 10 full passes over the data; timeit imports test_func from __main__,
# so the function name must stay as it is
secs = timeit.timeit('test_func()',
                     setup='from __main__ import test_func',
                     number=10)
# NOTE(review): `td` is not reported anywhere in the visible part of the script
td = datetime.timedelta(seconds=secs)
Пример #11
0
def asearch_gen(args):
    """A*search generation.

    Parses command-line style `args`, loads a candidate generator and a
    ranker, generates a tree for every input DA and, if requested,
    evaluates the output against gold trees.

    Recognized options:
        -e FILE  t-trees file with gold trees (switches evaluation on)
        -s SEL   selector of the gold trees in the evaluation file
        -d FILE  write debug output to FILE
        -w FILE  write the generated t-trees to FILE
        -c FILE  planner configuration file

    Exactly three positional arguments are required, in this order:
    candidate generator model, ranker model, file with the input DAs.
    With any other count, the process exits with a usage message.
    """

    opts, files = getopt(args, 'e:d:w:c:s:')
    eval_file = None
    fname_ttrees_out = None
    cfg_file = None
    eval_selector = ''

    for opt, arg in opts:
        if opt == '-e':
            eval_file = arg
        elif opt == '-s':
            eval_selector = arg
        elif opt == '-d':
            set_debug_stream(file_stream(arg, mode='w'))
        elif opt == '-w':
            fname_ttrees_out = arg
        elif opt == '-c':
            cfg_file = arg

    if len(files) != 3:
        sys.exit('Invalid arguments.\n' + __doc__)
    fname_cand_model, fname_rank_model, fname_da_test = files

    log_info('Initializing...')
    candgen = RandomCandidateGenerator.load_from_file(fname_cand_model)
    ranker = PerceptronRanker.load_from_file(fname_rank_model)
    cfg = Config(cfg_file) if cfg_file else {}
    cfg.update({'candgen': candgen, 'ranker': ranker})
    tgen = ASearchPlanner(cfg)

    log_info('Generating...')
    das = read_das(fname_da_test)

    if eval_file is None:
        gen_doc = Document()
    else:
        eval_doc = read_ttrees(eval_file)
        # generate into a fresh document when gold and generated trees share
        # the selector (presumably so they do not clash), otherwise add the
        # generated trees to the evaluation document itself
        if eval_selector == tgen.selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    # generate and evaluate
    if eval_file is not None:
        # generate + analyze open&close lists
        lists_analyzer = ASearchListsAnalyzer()
        for num, (da, gold_tree) in enumerate(zip(das,
                                                  trees_from_doc(eval_doc, tgen.language, eval_selector)),
                                              start=1):
            log_debug("\n\nTREE No. %03d" % num)
            open_list, close_list = tgen.generate_tree(da, gen_doc, return_lists=True)
            lists_analyzer.append(gold_tree, open_list, close_list)
            # the generated tree is taken from the head of the close list
            gen_tree = close_list.peek()[0]
            if gen_tree != gold_tree:
                log_debug("\nDIFFING TREES:\n" + tgen.ranker.diffing_trees_with_scores(da, gold_tree, gen_tree) + "\n")

        # all three ratios use 4 decimal places ('%4f' would only set a
        # minimum field width and print 6 decimals)
        log_info('Gold tree BEST: %.4f, on CLOSE: %.4f, on ANY list: %.4f' % lists_analyzer.stats())

        # evaluate the generated trees against golden trees
        eval_ttrees = ttrees_from_doc(eval_doc, tgen.language, eval_selector)
        gen_ttrees = ttrees_from_doc(gen_doc, tgen.language, tgen.selector)

        log_info('Evaluating...')
        evaler = Evaluator()
        for eval_bundle, eval_ttree, gen_ttree, da in zip(eval_doc.bundles, eval_ttrees, gen_ttrees, das):
            # add some stats about the tree directly into the output file
            add_bundle_text(eval_bundle, tgen.language, tgen.selector + 'Xscore',
                            "P: %.4f R: %.4f F1: %.4f" % p_r_f1_from_counts(*corr_pred_gold(eval_ttree, gen_ttree)))

            # collect overall stats
            evaler.append(eval_ttree,
                          gen_ttree,
                          ranker.score(TreeData.from_ttree(eval_ttree), da),
                          ranker.score(TreeData.from_ttree(gen_ttree), da))
        # print overall stats
        log_info("NODE precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1())
        log_info("DEP  precision: %.4f, Recall: %.4f, F1: %.4f" % evaler.p_r_f1(EvalTypes.DEP))
        log_info("Tree size stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.tree_size_stats())
        log_info("Score stats:\n * GOLD %s\n * PRED %s\n * DIFF %s" % evaler.score_stats())
        log_info("Common subtree stats:\n -- SIZE: %s\n -- ΔGLD: %s\n -- ΔPRD: %s" %
                 evaler.common_subtree_stats())
    # just generate
    else:
        for da in das:
            tgen.generate_tree(da, gen_doc)

    # write output
    if fname_ttrees_out is not None:
        log_info('Writing output...')
        write_ttrees(gen_doc, fname_ttrees_out)