Exemplo n.º 1
0
def weight_files(inst_fns, out_fns, weight_func=entropy_weight,
                 descriptor=None, n=None, binary=False):
    """
    Weight corpus instance files
    
    For each pair of corpus instance file and Timbl output file, the corpus
    instances are loaded, weighted by calling weight_corpus with the parsed
    Timbl output, and saved again.
    
    @param inst_fns: list of corpus instance filenames
    
    @param out_fns: list of filenames containing Timbl output
    
    @keyword weight_func: weighting function
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format
    
    @keyword n: limit weighting to the first n files
    
    @keyword binary: if true, corpus instances are loaded with loadbin;
    otherwise they are loaded with loadtxt using descriptor.dtype
    """
    # Materialise the pairs before slicing: zip() returns a list only on
    # Python 2, so slicing its result directly fails on Python 3.
    file_pairs = list(zip(inst_fns, out_fns))[:n]

    for inst_fname, out_fname in file_pairs:
        corpus_inst = CorpusInst()
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)

        # Keep the file open until weighting is finished, in case the parser
        # reads the Timbl output lazily, then close it deterministically.
        with open(out_fname) as out_file:
            timbl_out = parse_timbl_output(out_file)
            weight_corpus(corpus_inst, timbl_out, weight_func)

        log.info("saving weighted corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
Exemplo n.º 2
0
    def test_weight_corpus(self):
        """
        Check that weight_corpus changes at least one predicted weight of
        the first graph instance in the first development instance file
        """
        st = create_setting()
        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
        graph_inst = corpus_inst[0]

        # clear predicted weights field
        graph_inst["pred_weight"] = 0.0
        # backup original for comparison later on
        weight_before = graph_inst["pred_weight"].copy()

        out_fname = st.dev_clas_fns[0]
        # Keep the file open while weighting, in case the parser reads the
        # Timbl output lazily, and make sure it is closed afterwards.
        with open(out_fname) as out_file:
            timbl_out = parse_timbl_output(out_file)
            weight_corpus(corpus_inst, timbl_out, entropy_weight)

        # check that at least one weight is different (i.e. not 0.0)
        self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
Exemplo n.º 3
0
    def test_weight_corpus(self):
        """
        Check that weighting the first development corpus with entropy_weight
        yields at least one predicted weight that differs from the cleared
        value
        """
        st = create_setting()
        inst_fname = st.dev_inst_fns[0]
        out_fname = st.dev_clas_fns[0]

        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
        graph_inst = corpus_inst[0]

        # clear predicted weights field
        graph_inst["pred_weight"] = 0.0
        # backup original for comparison later on
        weight_before = graph_inst["pred_weight"].copy()

        # The context manager closes the Timbl output file once weighting is
        # done; weighting stays inside the block in case parsing is lazy.
        with open(out_fname) as out_file:
            timbl_out = parse_timbl_output(out_file)
            weight_corpus(corpus_inst, timbl_out, entropy_weight)

        # check that at least one weight is different (i.e. not 0.0)
        self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
Exemplo n.º 4
0
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data
    
    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.
    
    @param setting: experiment setting; must have develop set and validate
    unset. Its weight, match, merge and evaluate flags select which steps
    are carried out, and its n attribute limits processing to the first n
    development files.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # Materialise the triples before slicing: zip() returns a list only on
    # Python 2, so slicing its result directly fails on Python 3.
    scope = list(
        zip(setting.dev_inst_fns, setting.dev_clas_fns, setting.dev_true_fns)
    )[: setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # Keep the file open until weighting is finished, in case the
            # parser reads lazily, then close it deterministically.
            with open(out_fname) as out_file:
                timbl_out = parse_timbl_output(out_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname, graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus, setting.merger)

        if setting.evaluate:
            # NOTE: true_corpus and pred_corpus are only bound by the merge
            # step above, so evaluation requires setting.merge to be enabled.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
Exemplo n.º 5
0
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data
    
    Weighting, matching and merging takes place per test corpus without
    writing intermediary results to a file.
    
    @param setting: experiment setting; must have develop set and validate
    unset. Its weight, match, merge and evaluate flags select which steps
    are carried out, and its n attribute limits processing to the first n
    development files.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus a time, because in order to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    # zip() returns a list only on Python 2; build a list explicitly so the
    # slice that limits processing to the first n files works everywhere.
    scope = list(zip(setting.dev_inst_fns, setting.dev_clas_fns,
                     setting.dev_true_fns))[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            # The file stays open while weighting, in case the parser reads
            # it lazily, and is closed as soon as the block is left.
            with open(out_fname) as out_file:
                timbl_out = parse_timbl_output(out_file)
                log.info("weighting...")
                weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname,
                                              graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus,
                                       setting.merger)

        if setting.evaluate:
            # NOTE: true_corpus and pred_corpus are only bound by the merge
            # step above, so evaluation requires setting.merge to be enabled.
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)