Example 1
    def test_extract_dev(self):
        st = create_setting()
        st.validate = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()

        extract(st)

        # check number of files
        self.assertEqual(len(st.dev_true_fns), len(st.dev_part_fns))
        self.assertEqual(len(st.dev_inst_fns), len(st.dev_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadtxt(st.dev_inst_fns[0], st.descriptor.dtype)
        self.assertEqual(len(corpus), len(inst))

        clean_inst(st)
        clean_true(st)
Example 2
    def test_extract_val_binary(self):
        st = create_setting()
        st.develop = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()
        st.binary = True

        extract(st)

        # check number of files
        self.assertEqual(len(st.val_true_fns), len(st.val_part_fns))
        self.assertEqual(len(st.val_inst_fns), len(st.val_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.val_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadbin(st.val_inst_fns[0])
        self.assertEqual(len(corpus), len(inst))

        clean_inst(st)
        clean_true(st)
Example 3
    def test_classify(self):
        corpus_inst = CorpusInst()
        descriptor = create_setting().descriptor
        corpus_inst.loadtxt("exp/inst/dev001.inst", 
                            descriptor.dtype)
        graph_inst = corpus_inst[0]
        
        # clear relevant fields
        graph_inst["pred_relation"] = "None"
        graph_inst["pred_weight"] = 0.0

        # backup original for comparison later on
        pred_before = graph_inst["pred_relation"].copy()
        weight_before = graph_inst["pred_weight"].copy()
        
        classifier = TimblClassifier(descriptor,
                                     "exp/inst/dev001.inst")
        classifier.classify(graph_inst)
        
        # delete classifier to make sure that server is killed,
        # even if test fails
        del classifier
        
        # check that at least one prediction is different (i.e. not None)
        self.assertTrue(any(graph_inst["pred_relation"] != pred_before))
        
        # check that at least one weight is different (i.e. not 0.0)
        self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
Example 4
    def test_merge(self):
        corpus_inst = CorpusInst()
        dtype = create_setting().descriptor.dtype
        corpus_inst.loadtxt("exp/inst/dev001.inst", dtype)
        graph_inst = corpus_inst[0]

        pgc = ParallelGraphCorpus(inf="exp/true/dev001_true.pgc")
        graph_pair = pgc[0]

        gm = Merger()
        gm.merge(graph_inst, graph_pair)
Example 5
    def test_extract_with_pp_graph_hooks(self):
        """
        test of extracting feature with preprocessing hook
        """
        st = create_setting()
        st.validate = False
        # create tmp dirs for extraction output
        st.inst_dir = tempfile.mkdtemp()
        st.true_dir = tempfile.mkdtemp()
        
        # a preprocessing function which inserts an attribute "x" with value
        # "y" on every node in the graphs
        def pp_hook1(graphs):
            for g in graphs:
                for attrs in g.node.values():
                    attrs[u"x"] = u"y"
        
        # a feature function which relies on the pp_hook above
        def ff_x(nodes, graphs, **kwargs):
            return graphs.source.node[nodes.source][u"x"]
        
        # create a feature description
        f = Feat(ff_x, "S1", pp_graph_hooks=[pp_hook1])
        
        # add to features; descriptor and extractor are automatically derived
        st.features = (f,)
        
        extract(st)
        
        # check number of files
        self.assertEqual(len(st.dev_true_fns), 
                         len(st.dev_part_fns))
        self.assertEqual(len(st.dev_inst_fns), 
                         len(st.dev_part_fns))

        # test loading a corpus file
        corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])

        # test loading an instances file
        inst = CorpusInst()
        inst.loadtxt(st.dev_inst_fns[0],
                     st.descriptor.dtype)
        self.assertEqual(len(corpus), len(inst))
        
        # check values produced by preprocessing function
        self.assertTrue(all(inst[0]["x"] == "y"))

        clean_inst(st)
        clean_true(st)
Example 6
def extract_corpus(extractor, selector, corpus):
    corpus_inst = CorpusInst()
    # create an empty copy, because append() is faster than __del__() or
    # remove()
    true_corpus = ParallelGraphCorpus(
        relations=corpus.get_relations(),
        meta_data=corpus.get_meta_data())
    
    for graph_pair in corpus:
        if selector(graph_pair):
            true_corpus.append(graph_pair)
            corpus_inst.append(
                extractor.extract(graph_pair))
            
    return corpus_inst, true_corpus
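
A minimal usage sketch for extract_corpus; the st.extractor attribute and the pass-all selector are assumptions (Example 5 suggests the extractor is derived from the setting's features):

# hypothetical usage; st.extractor and the pass-all selector are assumptions
st = create_setting()
corpus = ParallelGraphCorpus(inf=st.dev_true_fns[0])
corpus_inst, true_corpus = extract_corpus(st.extractor,
                                          lambda graph_pair: True,
                                          corpus)
# with a pass-all selector, both results cover every graph pair
assert len(corpus_inst) == len(true_corpus) == len(corpus)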
Example 7
    def test_match(self):
        corpus_inst = CorpusInst()
        dtype = create_setting().descriptor.dtype
        corpus_inst.loadtxt("exp/inst/dev001.inst", dtype)
        graph_inst = corpus_inst[0]

        graph_inst["match_relation"] = "None"
        match_before = graph_inst["match_relation"].copy()

        gm = Matcher()
        gm.match(graph_inst)

        # check that at least one match is different
        self.assertTrue(any(graph_inst["match_relation"] != match_before))
Example 8
    def test_match_corpus(self):
        st = create_setting()
        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
        graph_inst = corpus_inst[0]

        # clear match_relation field
        graph_inst["match_relation"] = str(None)
        # backup original for comparison later on
        match_before = graph_inst["match_relation"].copy()

        match_corpus(corpus_inst, Matcher())

        # check that at least one relation is different (i.e. not None)
        self.assertTrue(any(graph_inst["match_relation"] != match_before))
Example 9
def weight_files(inst_fns, out_fns, weight_func=entropy_weight,
                 descriptor=None, n=None, binary=False):
    """
    Weight corpus instance files
    
    @param inst_fns: list of corpus instance filenames
    
    @param out_fns: list of filenames containing Timbl output
    
    @keyword weight_func: weighting function
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format
    
    @keyword n: limit weighting to the first n files
    
    @keyword binary: corpus instances in binary rather than text format
    """
    for inst_fname, out_fname in zip(inst_fns, out_fns)[:n]:
        corpus_inst = CorpusInst()
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)
            
        timbl_out = parse_timbl_output(open(out_fname))
        weight_corpus(corpus_inst, timbl_out, weight_func)
        log.info("saving weighted corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
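
A usage sketch for weight_files, reusing the setting attributes from the test examples above; treating st.dev_clas_fns as the Timbl output files is an assumption:

st = create_setting()
# weight only the first two instance files, loaded in text format
weight_files(st.dev_inst_fns, st.dev_clas_fns,
             weight_func=entropy_weight,
             descriptor=st.descriptor,
             n=2)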
Example 10
    def test_merge_corpus(self):
        st = create_setting()

        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)

        true_fname = st.dev_true_fns[0]
        true_corpus = ParallelGraphCorpus(inf=true_fname, graph_loading=LOAD_NONE)
        pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger())
        self.assertTrue(len(pred_corpus))

        for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
            for inst in graph_inst:
                rel = inst["match_relation"]
                if rel != str(None):
                    nodes = Pair(inst["source_node"], inst["target_node"])
                    self.assertEqual(graph_pair.get_align(nodes), rel)
Example 11
    def test_weight_corpus(self):
        st = create_setting()
        corpus_inst = CorpusInst()
        inst_fname = st.dev_inst_fns[0]
        corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
        graph_inst = corpus_inst[0]

        # clear predicted weights field
        graph_inst["pred_weight"] = 0.0
        # backup original for comparison later on
        weight_before = graph_inst["pred_weight"].copy()

        out_fname = st.dev_clas_fns[0]
        timbl_out = parse_timbl_output(open(out_fname))

        weight_corpus(corpus_inst, timbl_out, entropy_weight)

        # check that at least one weight is different (i.e. not 0.0)
        self.assertTrue(any(graph_inst["pred_weight"] != weight_before))
Example 12
def update_feats(features,
                 org_inst_fns,
                 org_descriptor,
                 update_inst_fns,
                 update_descriptor):
    """
    A quick hack to facilitate second stage classification. Updates the values
    of specified features in original corpus instances with those from the
    update corpus instances. Modifies original files!
    
    @param features: tuple of Feat instances, which provides the names of the
    features to update
    
    @param org_inst_fns: list of original corpus instance filenames
    
    @param org_descriptor: feature description for original instances
    
    @param update_inst_fns: list of corpus instance filenames containing the
    updated features
    
    @param update_descriptor: feature description for update instances
    """
    assert len(org_inst_fns) == len(update_inst_fns)
    
    for org_fn, up_fn in zip(org_inst_fns, update_inst_fns):
        print "Updating %s with %s" % (org_fn, up_fn)
        org_corpus_inst = CorpusInst()
        org_corpus_inst.loadtxt(org_fn, org_descriptor.dtype)
        up_corpus_inst = CorpusInst()
        up_corpus_inst.loadtxt(up_fn, update_descriptor.dtype)
        assert len(org_corpus_inst) == len(up_corpus_inst)
        
        for org_inst, up_inst in zip(org_corpus_inst, up_corpus_inst):
            for feat in features:
                org_inst[feat.name] = up_inst[feat.name]
        
        org_corpus_inst.savetxt(org_fn)
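
A hedged sketch of calling update_feats; the second-stage setting st2 and the idea that its features tuple names the features to copy are assumptions for illustration:

st = create_setting()
st2 = create_setting()  # hypothetical second-stage setting
# overwrite the second-stage feature values in the original instance files
update_feats(st2.features, st.dev_inst_fns, st.descriptor,
             st2.dev_inst_fns, st2.descriptor)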
        
        
Example 13
def match_files(inst_fns, matcher, descriptor=None, n=None, binary=False):
    """
    Match corpus instance files
    
    @param inst_fns: list of corpus instance filenames
    
    @param matcher: a Matcher instance for matching source to target instances
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format
    
    @keyword binary: corpus instances in binary rather than text format

    @keyword n: limit matching to the first n files
    """
    for inst_fname in inst_fns[:n]:
        corpus_inst = CorpusInst()    
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)
        match_corpus(corpus_inst, matcher)
        log.info("saving matched corpus instances to {0}".format(inst_fname))
        corpus_inst.save()
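
A usage sketch for match_files with the Matcher from Examples 7 and 8:

st = create_setting()
# match only the first instance file, loaded in text format
match_files(st.dev_inst_fns, Matcher(),
            descriptor=st.descriptor,
            n=1)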
Example 14
def merge_files(inst_fns, true_fns, pred_fns, merger=Merger(),
                descriptor=None, n=None, binary=False):
    """
    Merge corpus instance files
    
    @param inst_fns: list of corpus instance filenames
    
    @param true_fns: list of corpus filenames containing the true alignments

    @param pred_fns: list of predicted corpus filenames to be created
    
    @param merger: instance of Merger class for merging instances into a graph
    pair
    
    @keyword descriptor: a Descriptor instance, required if corpus instances
    are loaded in text format
    
    @keyword n: limit merging to the first n files
    
    @keyword binary: corpus instances in binary rather than text format
    """
    assert isinstance(merger, Merger)
    assert len(inst_fns) == len(true_fns) == len(pred_fns) > 0
    
    for inst_fname, true_fname, pred_fname in zip(inst_fns,
                                                  true_fns,
                                                  pred_fns)[:n]:
        corpus_inst = CorpusInst()
        
        if binary:
            corpus_inst.loadbin(inst_fname)
        else:
            corpus_inst.loadtxt(inst_fname, descriptor.dtype)
            
        true_corpus = ParallelGraphCorpus(inf=true_fname,
                                          graph_loading=LOAD_NONE)
        pred_corpus = merge_corpus(corpus_inst, true_corpus, merger)
        log.info("saving predictd corpus {0}".format(inst_fname))
        pred_corpus.write(pred_fname)
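
A usage sketch for merge_files; st.dev_pred_fns is a hypothetical attribute standing in for the list of predicted corpus filenames to be created:

st = create_setting()
# st.dev_pred_fns is assumed; substitute your own output filenames
merge_files(st.dev_inst_fns, st.dev_true_fns, st.dev_pred_fns,
            descriptor=st.descriptor)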
Example 15
def exp_dev_fast(setting):
    """
    perform a fast alignment experiment on development data
    
    Weighting, matching and merging take place per test corpus without
    writing intermediate results to a file.
    """
    assert setting.develop and not setting.validate

    exp_init(setting)

    create_parts(setting)

    # It's impossible to do extraction one corpus at a time, because in order
    # to
    # classify a test corpus you need instances for all the other training
    # corpora! Moreover, since Timbl classification is file-based, we need to
    # write the corpus instance files to disk. These files can be huge and
    # keeping all of them in memory seems to offer little benefit.
    extract(setting)

    sample(setting)

    # Timbl writes its output to a file, which then needs to be parsed in
    # order to insert the class predictions and weights into the corpus
    # instances. That means there is no advantage to doing classification
    # one corpus at a time.
    classify(setting)

    log.info("\n" + header("WEIGHT/MATCH/MERGE STEP"))
    # reset evaluator
    if setting.evaluate:
        setting.evaluator.__init__()

    scope = zip(setting.dev_inst_fns, setting.dev_clas_fns,
                setting.dev_true_fns)[:setting.n]

    for inst_fname, out_fname, true_fname in scope:
        log.info("reading corpus instances {0}".format(inst_fname))
        corpus_inst = CorpusInst()
        corpus_inst.loadtxt(inst_fname, setting.descriptor.dtype)

        if setting.weight:
            log.info("reading classifier output {0}".format(out_fname))
            timbl_out = parse_timbl_output(open(out_fname))
            log.info("weighting...")
            weight_corpus(corpus_inst, timbl_out, setting.weight_func)

        if setting.match:
            log.info("matching...")
            match_corpus(corpus_inst, setting.matcher)

        if setting.merge:
            log.info("reading true corpus {0}".format(true_fname))
            true_corpus = ParallelGraphCorpus(inf=true_fname, graph_loading=LOAD_NONE)
            log.info("merging...")
            pred_corpus = merge_corpus(corpus_inst, true_corpus, setting.merger)

        if setting.evaluate:
            name = os.path.basename(true_fname).split("_")[0]
            setting.evaluator.add(true_corpus, pred_corpus, name)

    if setting.evaluate:
        log.info("evaluting...")
        setting.evaluator.run_eval()
        log.info("saving evaluation {0}".format(setting.dev_eval_fname))
        makedirs(setting.eval_dir)
        setting.evaluator.write(setting.dev_eval_fname)

    exp_exit(setting)
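
A sketch of running the fast development experiment; the default develop=True is inferred from Example 2, and limiting the run via setting.n follows the slicing in the loop above:

st = create_setting()
st.validate = False  # exp_dev_fast asserts develop and not validate
st.n = 2             # hypothetical: restrict the run to the first two parts
exp_dev_fast(st)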