예제 #1
0
파일: setup.py 프로젝트: emsrc/daeso-dutch
def create_sample_instances(setting):
    """
    Build the sample corpus instance files and sample true pgc files.

    The whole job is delegated to extract(), called with the given setting.

    Expected input:  pgc part files located in the ./part subdir.
    Expected output: inst/dev001.inst, ..., inst/dev004.inst in ./inst and
                     true/dev001_true.pgc, ..., true/dev004_true.pgc in ./true
    """
    extract(setting)
예제 #2
0
def exp(setting):
    """
    Run a complete alignment experiment.

    Each stage below is invoked once, in the listed order, and receives the
    same setting object.
    """
    stages = (
        exp_init,
        create_parts,
        extract,
        sample,
        classify,
        weight,
        match,
        merge,
        evaluate,
        exp_exit,
    )
    # Order matters: the stages are executed strictly front to back.
    for stage in stages:
        stage(setting)
예제 #3
0
def exp(setting):
    """
    Run a complete alignment experiment.

    The experiment is a fixed pipeline of stages; every stage is called
    exactly once with the given setting, in the order shown.
    """
    pipeline = (
        exp_init,
        create_parts,
        extract,
        sample,
        classify,
        weight,
        match,
        merge,
        evaluate,
        exp_exit,
    )
    # Stages run sequentially, first to last.
    for run_stage in pipeline:
        run_stage(setting)
예제 #4
0
def exp_dev_fast(setting):
    """
    Perform a fast alignment experiment on development data.

    Extraction, sampling and classification run for all corpora at once;
    weighting, matching and merging then take place per test corpus without
    writing intermediary results to a file.

    setting: experiment settings object. It must have ``develop`` set and
        ``validate`` unset. The ``weight``, ``match``, ``merge`` and
        ``evaluate`` flags select which of the per-corpus steps run, and
        ``n`` limits how many development corpora are processed.

    Raises AssertionError if the setting is not a pure development setting.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # Materialise the zip before slicing: on Python 3 zip() returns an
    # iterator, which cannot be subscripted. On Python 2 this is a no-op copy.
    scope = list(
        zip(setting.dev_inst_fns, setting.dev_clas_fns, setting.dev_true_fns)
    )[: setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Keep the file open until weighting is done (the parser may read
            # lazily), then close it deterministically instead of leaking the
            # handle.
            with open(out_fname) as timbl_file:
                timbl_out = parse_timbl_output(timbl_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname, graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus, setting.merger)

        if setting.evaluate:
            # NOTE(review): true_corpus and pred_corpus are only bound in the
            # merge branch above, so evaluating with setting.merge disabled
            # raises NameError here.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
예제 #5
0
def exp_dev_fast(setting):
    """
    Perform a fast alignment experiment on development data.

    Extraction, sampling and classification run for all corpora at once;
    weighting, matching and merging then take place per test corpus without
    writing intermediary results to a file.

    setting: experiment settings object. It must have ``develop`` set and
        ``validate`` unset. The ``weight``, ``match``, ``merge`` and
        ``evaluate`` flags select which of the per-corpus steps run, and
        ``n`` limits how many development corpora are processed.

    Raises AssertionError if the setting is not a pure development setting.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # zip() yields an iterator on Python 3 and iterators cannot be sliced,
    # so build a list first; on Python 2 the result is unchanged.
    scope = list(zip(setting.dev_inst_fns, setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # The classifier output file stays open while it is being
            # consumed by the weighting step (the parser may be lazy) and is
            # closed right after, rather than being left to the garbage
            # collector.
            with open(out_fname) as timbl_file:
                timbl_out = parse_timbl_output(timbl_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # NOTE(review): true_corpus and pred_corpus only exist when the
            # merge branch above ran; with setting.merge disabled this raises
            # NameError.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)