Пример #1
0
    def extract(self, graph_pair):
        """
        Extract features from an aligned graph pair, limiting scope to the
        nodes accepted by self.node_selector.
        
        @param graph_pair: an instance of GraphMapping. NB Any node alignment
        involving a filtered node (cf. node_selector) is removed from the
        alignment as a side effect!
        
        @return: a numpy record array with one instance for each pair of a
        selected source node and a selected target node.
        """
        graphs = graph_pair.get_graphs()
        # run graph-level preprocessing hooks before feature extraction
        self._apply_pp_graph_hooks(graphs)
        # pre-allocated n x m record array; trimmed to inst_count at the end
        instances = self._empty_instances(graphs)
        # source and target node counters, counting from one
        n_count = Pair(0, 0)
        # instance counter
        inst_count= 0
            
        for source_node in graphs.source:
            source_selected = self.node_selector(source_node, graphs.source)
            
            if source_selected: 
                n_count.source += 1
                
            # target counter restarts for every source node
            n_count.target = 0
            
            for target_node in graphs.target:
                nodes = Pair(source_node, target_node)
                target_selected = self.node_selector(target_node,
                                                     graphs.target)
                
                if source_selected and target_selected:
                    self._apply_pp_node_hooks(nodes, graphs, graph_pair)
                    n_count.target += 1

                    for feat in self.descriptor:
                        # Each feature function is called with the node
                        # counters, a pair of nodes, a pair of graphs, and an
                        # alignment
                        instances[inst_count][feat.name] = feat.function(
                            n_count=n_count, 
                            nodes=nodes,
                            graphs=graphs, 
                            alignment=graph_pair)
                    inst_count += 1
                else:
                    # Remove alignment (if any) between skipped nodes from the
                    # alignment, because it should not occur in the true pgc.
                    # A not so elegant side effect, but more efficient than
                    # updating the alignment afterwards.
                    try:
                        graph_pair.del_align(nodes)
                    except networkx.NetworkXError:
                        pass
                    
        # original ndarray was n x m, but actual size may be less due to node
        # selection, so here we get rid of empty rows at the bottom
        return instances[:inst_count]
Пример #2
0
    def test_1(self):
        """
        Loading a SparseGraphBank must replace graph stubs by real graphs,
        and graphs must be garbage collected once no referring graph pair
        remains alive.
        """
        gb = SparseGraphBank("data/source-gb-1.xml", "alpino")

        # create a strong reference to the graph stub object,
        # otherwise it will vanish immediately :-)
        graph_stub1 = gb.get_graph_stub("s100")
        graph_stub2 = gb.get_graph_stub("s200")
        
        graph_pair = GraphPair(Pair(gb, gb), 
                               Pair(graph_stub1, graph_stub2))
        # add a backlink to graph_pair
        graph_stub1.add_client(graph_pair)
        graph_stub2.add_client(graph_pair)
        
        # before loading, the bank hands out stubs
        self.assertTrue(isinstance(gb.get_graph("s100"), GraphStub))
        self.assertTrue(isinstance(gb.get_graph("s200"), GraphStub))
        
        gb.load()
        
        # after loading, the stubs have been replaced by real AlpinoGraphs,
        # both in the bank and in the graph pair
        self.assertEqual(len(gb), 2)
        self.assertTrue(isinstance(graph_pair._graphs.source, AlpinoGraph))
        self.assertTrue(isinstance(graph_pair._graphs.target, AlpinoGraph))
        self.assertTrue(isinstance(gb.get_graph("s100"), AlpinoGraph))
        self.assertTrue(isinstance(gb.get_graph("s200"), AlpinoGraph))
        
        del graph_pair
        # force garbage collection
        gc.collect()
        
        # make sure the graphs are gone now that the referring graph pair is
        # no longer alive
        self.assertEqual(len(gb), 0)
        self.assertRaises(KeyError,  gb.get_graph, "s100")
        self.assertRaises(KeyError,  gb.get_graph, "s200")
Пример #3
0
    def test_roots_share_suffix(self):
        """
        Check ff_roots_share_suffix on a range of compound-root pairs.
        """
        graphs = Pair(AlpinoGraph(), AlpinoGraph())
        graphs.source.add_node(1, "x")
        graphs.target.add_node(2, "y")
        nodes = Pair(1, 2)

        # without "root" attributes the feature is undefined
        self.assertEqual(ff_roots_share_suffix(nodes, graphs), "-")

        # (source root, target root, expected value)
        cases = [
            ("woon_wagen", "eet_tafel", "F"),
            ("woon_wagen", "woon_wagen", "F"),
            ("woon_wagen", "wagen", "F"),
            ("woon_wagen_bewoner", "wagen_bewoner", "F"),
            ("woon_wagen", "mest_wagen", "T"),
            ("woon_wagen_trekker", "mest_wagen_trekker", "T"),
        ]

        for source_root, target_root, expected in cases:
            graphs.source.node[1]["root"] = source_root
            graphs.target.node[2]["root"] = target_root
            self.assertEqual(ff_roots_share_suffix(nodes, graphs), expected)
Пример #4
0
    def align_corpus(self, corpus, doc_trees=None, clear=True):
        """
        Align a parallel text corpus
        
        @param corpus: parallel text corpus instance (HitaextDoc)
        
        @keyword doc_trees: pair of source and target document trees; only
        useful in experiments to prevent repeatedly rereading of the document
        trees
        
        @keyword clear: if true all existing alignments involving elements
        with focus tags are removed
        
        Alignments are added to the <alignment> section of the corpus.
        """
        if clear:
            clear_alignments(corpus, self.focus_tags)
            
        if not doc_trees:
            doc_trees = Pair(
                get_doc_tree(corpus, "from", self.ignore_tags.source),
                get_doc_tree(corpus, "to", self.ignore_tags.target))
            
            # copy alignments from the <alignment> section in the corpus
            # to the "_alignments" attribute on elements
            corpus.inject_alignments(doc_trees.source, 
                                     doc_trees.target)
        
        if self.scope_tags:
            scope_tags = self.scope_tags
        else:
            # when scope is not specified, assume that scope tag is root tag,
            # and that roots are aligned
            source_root = doc_trees.source.getroot()
            target_root = doc_trees.target.getroot()
            scope_tags = Pair([source_root.tag], [target_root.tag])
            source_root.set("_alignments", [target_root])
            
        # TODO: semantics not entirely clear. 
        # - what happens if scope tags are embedded?
        # - What happens if scope elements are aligned 1-to-n?
        for source_scope_elem in doc_trees.source.getiterator():
            if source_scope_elem.tag not in scope_tags.source:
                continue
            
            for target_scope_elem in source_scope_elem.get("_alignments"):
                if target_scope_elem.tag not in scope_tags.target:
                    continue

                # align the focus elements within each pair of aligned
                # scope elements
                scope_elems = Pair(source_scope_elem, target_scope_elem)
                self._align_within_scope(doc_trees,
                                         scope_elems)
    
        # finally copy alignment from the "_alignments" attribute on elements
        # back to the <alignment> section in the corpus
        corpus.extract_alignments(doc_trees.source, 
                                  doc_trees.target)
Пример #5
0
def greedy_align_phrases(corpus):
    """
    Greedily align phrases over a parallel graph corpus: nodes with the same
    lower-cased token string are aligned as "equals"; failing that, nodes
    with the same lower-cased roots are aligned as "restates".

    Existing alignments in each graph pair are cleared first. Each target
    node is aligned at most once (greedy, first match wins).

    @param corpus: iterable of graph pairs (e.g. a ParallelGraphCorpus)
    """
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()
        # annotate nodes with lower-cased roots ("_lc_roots" attribute)
        lc_roots(graphs.source, graphs.source.root)
        lc_roots(graphs.target, graphs.target.root)

        # candidate target nodes: skip punctuation and empty nodes
        target_nodes = [ tn for tn in graphs.target
                         if ( not graphs.target.node_is_punct(tn) and 
                              not graphs.target.node_is_empty(tn) ) ]

        target_words = [ graphs.target.get_node_token_string(tn).lower()
                         for tn in target_nodes ]

        target_roots = [ graphs.target.node[tn].get("_lc_roots", [])
                         for tn in target_nodes ]

        for sn in graphs.source:
            if ( graphs.source.node_is_punct(sn) or 
                 graphs.source.node_is_empty(sn) ):
                continue

            sw = graphs.source.get_node_token_string(sn).lower()
            sr = graphs.source.node[sn].get("_lc_roots")

            # prefer a word match ("equals") over a root match ("restates");
            # list.index raises ValueError when there is no match
            try:
                j = target_words.index(sw)
            except ValueError:
                try:
                    j = target_roots.index(sr)
                except ValueError:
                    continue
                else:
                    relation = "restates"
            else:
                relation = "equals"

            tn = target_nodes[j]
            graph_pair.add_align(Pair(sn, tn), relation)

            # a target node can be aligned only once, so remove it from the
            # candidate lists
            del target_nodes[j]
            del target_words[j]
            del target_roots[j]

                
Пример #6
0
 def __init__(self):
     """
     Initialize an empty domain model: no corpus loaded, no current graph
     pair and no current node selection.
     """
     # the domain model: a corpus of aligned graph pairs
     self._corpus = ParallelGraphCorpus()
     # whether the corpus has unsaved changes
     self._changed = False
     self._filename = None
     # the currently selected graph pair and its index in the corpus
     self._graph_pair = None
     self._graph_pair_index = None
     self._graphs = Pair(None, None)
     # the currently selected source and target nodes
     self._nodes = Pair(None, None)
     # the special relation which stands for "no relation"
     self._no_relation = "none"
     # presumably: whether source/target node selection is coupled — TODO
     # confirm against the GUI code
     self._co_node_selection = False
Пример #7
0
class Test_(unittest.TestCase):
    """Tests for the basic Pair interface."""

    def setUp(self):
        # a fresh pair for every test
        self.p = Pair("x", "y")

    def test__repr__(self):
        # only checks that repr() does not raise
        print repr(self.p)

    def test__iter__(self):
        # iteration yields the members; a non-empty list proves it works
        self.assertTrue(list(iter(self.p)))

    def test__eq__(self):
        # pairs with equal members must compare equal
        p2 = Pair("x", "y")
        self.assertEqual(self.p, p2)

    def test_set(self):
        # set() replaces both members at once
        self.p.set(3, 4)
        self.assertEqual(self.p.source, 3)
Пример #8
0
def ff_same_parent_lc_phrase(nodes, graphs, **kwargs):
    """
    parent nodes have same lower-cased phrase
    """
    # look up the parents of both nodes and delegate the actual phrase
    # comparison to ff_same_lc_phrase
    source_parent = graphs.source.get_parent_node(nodes.source)
    target_parent = graphs.target.get_parent_node(nodes.target)
    return ff_same_lc_phrase(Pair(source_parent, target_parent), graphs)
Пример #9
0
class Test_(unittest.TestCase):
    """Tests for the basic Pair interface."""

    def setUp(self):
        # a fresh pair for every test
        self.p = Pair("x", "y")
        
    def test__repr__(self):
        # only checks that repr() does not raise
        print repr(self.p)
        
    def test__iter__(self):
        # iteration yields the members; a non-empty list proves it works
        self.assertTrue(list(iter(self.p)))
        
    def test__eq__(self):
        # pairs with equal members must compare equal
        p2 = Pair("x", "y")
        self.assertEqual(self.p, p2)
        
    def test_set(self):
        # set() replaces both members at once
        self.p.set(3, 4)
        self.assertEqual(self.p.source, 3)
Пример #10
0
    def goto_graph_pair(self, index):
        """
        Make the graph pair at position ``index`` the current one and notify
        the visualisation and GUI listeners.
        """
        # explicit bounds test rather than try-except, because a negative
        # index would silently address the corpus list from the end
        if not (0 <= index < len(self._corpus)):
            return

        self._graph_pair = self._corpus[index]
        self._graph_pair_index = index
        self._graphs = self._graph_pair.get_graphs()
        # reset the node selection for the new pair
        self._nodes = Pair(None, None)

        send(self.goto_graph_pair, "newGraphPair.viz")
        send(self.goto_graph_pair, "newGraphPair.gui")
Пример #11
0
    def __init__(self, tokenizer=None, alpino=None, graph_aligner=None):
        """
        Initialize all server components.

        @keyword tokenizer: tokenizer instance; optional
        @keyword alpino: proxy for the Alpino parse server; optional
        @keyword graph_aligner: graph aligner instance; optional
        """
        self.init_tokenizer(tokenizer)
        self.init_alpino(alpino)
        self.init_graph_xml_parser()
        self.init_graph_aligner(graph_aligner)
        self.init_others()

        # a pair of graphbank dummies, which are needed when creating a new
        # GraphMapping instance
        self._graphbanks = Pair(GraphBank("", "alpino"),
                                GraphBank("", "alpino"))
Пример #12
0
    def test_roots_subsumption(self):
        """
        Check ff_roots_subsumption on a range of compound-root pairs.
        """
        graphs = Pair(AlpinoGraph(), AlpinoGraph())
        graphs.source.add_node(1, "x")
        graphs.target.add_node(2, "y")
        nodes = Pair(1, 2)

        # without "root" attributes the feature is undefined
        self.assertEqual(ff_roots_subsumption(nodes, graphs), "-")

        # (source root, target root, expected subsumption relation)
        cases = [
            ("wagen", "wagen", "equals"),
            ("brandweer_wagen", "brandweer", "has_prefix"),
            ("brandweer", "brandweer_wagen", "is_prefix"),
            ("brandweer_wagen", "wagen", "has_suffix"),
            ("wagen", "brandweer_wagen", "is_suffix"),
            ("woon_wagen_bewoners_kamp_ingang", "wagen_bewoners",
             "has_infix"),
            ("wagen_bewoners", "woon_wagen_bewoners_kamp_ingang",
             "is_infix"),
            # no subsumption at all
            ("brandweer_wagen", "kamp_ingang", "none"),
        ]

        for source_root, target_root, expected in cases:
            graphs.source.node[1]["root"] = source_root
            graphs.target.node[2]["root"] = target_root
            self.assertEqual(ff_roots_subsumption(nodes, graphs), expected)
Пример #13
0
    def get_doc_trees(self, search=False, update=True):
        """
        Get pair of document trees
        """
        doc_trees = Pair(self.get_doc_tree("from", search=search),
                         self.get_doc_tree("to", search=search))

        # optionally refresh both trees before returning them
        if update:
            doc_trees.source.update()
            doc_trees.target.update()

        return doc_trees
Пример #14
0
 def _determine_focus_elems(self, scope_elems):
     """
     Collect the focus elements found within the given pair of scope
     elements, skipping elements marked with "_ignore".

     @param scope_elems: pair of source and target scope elements

     @return: pair of lists of focus elements
     """
     def collect(scope_elem, focus_tag):
         # all focus-tagged descendants that are not ignored
         return [ elem
                  for elem in scope_elem.findall(".//" + focus_tag)
                  if not elem.get("_ignore") ]

     return Pair(collect(scope_elems.source, self.focus_tags.source),
                 collect(scope_elems.target, self.focus_tags.target))
Пример #15
0
 def _score_sim(self, focus_elem_lists):
     """
     Score the similarity of every pair of source and target focus elements.

     @param focus_elem_lists: pair of lists of focus elements

     @return: list of (similarity score, pair of focus elements) tuples
     """
     scores = []

     for src_elem in focus_elem_lists.source:
         # the source terms and weights are invariant over the inner loop
         src_terms = src_elem.get("_terms")
         src_weights = src_elem.get("_weights")

         for tgt_elem in focus_elem_lists.target:
             sim = self.sim_func(src_terms,
                                 tgt_elem.get("_terms"),
                                 src_weights,
                                 tgt_elem.get("_weights"))
             scores.append((sim, Pair(src_elem, tgt_elem)))

     return scores
Пример #16
0
 def __init__(self, focus_tags, scope_tags=None, ignore_tags=None):
     """
     Create a new TextAligner instance
     
     @param focus_tags: a pair of soure and target focus tags
     
     @keyword scope_tags: a pair of source and target scope tag lists;
     defaults to the labels of the roots of the source and target document
     trees.
     
     @keyword ignore_tags: a pair of source and target ignore tag lists
     """
     self.focus_tags = focus_tags
     self.scope_tags = scope_tags
     # fall back to an empty pair of ignore tag lists
     if ignore_tags:
         self.ignore_tags = ignore_tags
     else:
         self.ignore_tags = Pair([], [])
Пример #17
0
    def merge(self, graph_inst, graph_pair):
        """
        Merges matched relations from graph instances into a graph pair as
        node alignments
        
        @param graph_inst: a Numpy record array containing the instances for a
        pair of graphs; it should contain the fields source_node, target_node
        and match_relation
        
        @param graph_pair: a GraphPair instance
        """
        assert isinstance(graph_inst, numpy.ndarray)
        assert isinstance(graph_pair, GraphPair)

        for inst in graph_inst:
            rel = inst["match_relation"]
            # skip instances predicted as "no relation"
            if rel == self.no_rel:
                continue
            graph_pair.add_align(
                Pair(inst["source_node"], inst["target_node"]), rel)
Пример #18
0
    def dump(self, graph_pair, instances):
        """
        Pretty-print the feature instances for a graph pair to stdout, one
        instance per block with its source node, target node and features.
        """
        graphs = graph_pair.get_graphs()
        # skip the first and last four fields of the record array; these are
        # presumably administrative fields (node ids etc.) — TODO confirm
        # against the descriptor definition
        feat_names = [t[0] for t in instances.dtype.descr[4:-4]]

        for i, inst in enumerate(instances):
            nodes = Pair(inst["source_node"], inst["target_node"])

            print "instance:", i
            print "source: %s: %s: %s" % (
                nodes.source, graphs.source.node[nodes.source]["label"],
                graphs.source.get_node_token_string(nodes.source))
            print "target: %s: %s: %s" % (
                nodes.target, graphs.target.node[nodes.target]["label"],
                graphs.target.get_node_token_string(nodes.target))

            for fn in feat_names:
                print "%s: %s" % (fn, inst[fn])

            print 40 * "-"
Пример #19
0
 def test_merge_corpus(self):
     """
     Merging predicted match relations into a corpus must yield graph pairs
     whose alignments match the predicted instances.
     """
     st = create_setting()
     
     corpus_inst = CorpusInst()
     inst_fname = st.dev_inst_fns[0]
     corpus_inst.loadtxt(inst_fname, st.descriptor.dtype)
     
     true_fname = st.dev_true_fns[0]
     true_corpus = ParallelGraphCorpus(inf=true_fname,
                                       graph_loading=LOAD_NONE)
     pred_corpus = merge_corpus(corpus_inst, true_corpus, Merger()) 
     self.assertTrue(len(pred_corpus))
     
     for graph_inst, graph_pair in zip(corpus_inst, pred_corpus):
         for inst in graph_inst:
             rel = inst["match_relation"]
             # the string "None" encodes "no relation" in the instances
             if rel != str(None):
                 nodes = Pair(inst["source_node"], inst["target_node"] )
                 self.assertEqual(graph_pair.get_align(nodes), rel)    
Пример #20
0
    def align(self, source_sent, target_sent):
        """
        Run the full alignment pipeline on a pair of raw sentences:
        tokenize, parse, load the parses as graphs, align the graphs, and
        return a dict with all intermediate results plus the resulting parse
        and phrase alignments.
        """
        # the strings received here after transport through XML-RPC are either
        # plain ascii or unicode
        sent_pair = Pair(source_sent, target_sent)

        tok_sent_pair = self._tokenize(sent_pair)

        parse_pair = self._parse(tok_sent_pair)

        graph_pair = self._load_graphs(parse_pair)

        instances = self._align_graphs(graph_pair)

        parse_align, phrase_align = self._get_alignment(instances, graph_pair)

        # return every intermediate stage so the client can inspect them
        return dict(source_sent=sent_pair.source,
                    target_sent=sent_pair.target,
                    source_tok=tok_sent_pair.source,
                    target_tok=tok_sent_pair.target,
                    source_parse=parse_pair.source,
                    target_parse=parse_pair.target,
                    parse_align=parse_align,
                    phrase_align=phrase_align)
Пример #21
0
def greedy_align_equal_words_roots(corpus):
    """
    Greedily align terminal nodes over a parallel graph corpus: if the
    lower-cased words are equal, align as "equals"; otherwise, if the roots
    are equal, align as "restates".

    Existing alignments in each graph pair are cleared first. Each target
    node is aligned at most once (greedy, first match wins).

    @param corpus: iterable of graph pairs (e.g. a ParallelGraphCorpus)
    """
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()

        # candidate target terminals, skipping punctuation and empty nodes
        target_nodes = graphs.target.terminals(with_punct=False,
                                               with_empty=False)
        target_words = [ graphs.target.node[tn]["word"].lower()
                         for tn in target_nodes ]
        target_roots = [ graphs.target.node[tn]["root"]
                         for tn in target_nodes ]

        for sn in graphs.source.terminals_iter(with_punct=False,
                                               with_empty=False):
            sw = graphs.source.node[sn]["word"].lower()
            sr = graphs.source.node[sn]["root"]

            # prefer a word match ("equals") over a root match ("restates");
            # list.index raises ValueError when there is no match
            try:
                j = target_words.index(sw)
            except ValueError:
                try:
                    j = target_roots.index(sr)
                except ValueError:
                    continue
                else:
                    relation = "restates"
            else:
                relation = "equals"

            tn = target_nodes[j]
            graph_pair.add_align(Pair(sn, tn), relation)

            # a target node can be aligned only once, so remove it from the
            # candidate lists
            del target_nodes[j]
            del target_words[j]
            del target_roots[j]
Пример #22
0
    def test_parser_node_pairs(self):
        """
        check if all node_pair are correctly read
        """
        parser = PGCParser()
        pg_corpus = parser.parse("data/corpus-2.pgc")

        expected = [
            (Pair("4", "4"), "equals"),
            (Pair("8", "11"), "equals"),
            (Pair("5", "5"), "equals"),
            (Pair("11", "10"), "intersects"),
            (Pair("19", "8"), "intersects"),
            (Pair("1", "1"), "restates"),
            (Pair("0", "0"), "restates"),
        ]

        actual = pg_corpus[1].alignments()

        self.assertEqual(len(actual), len(expected))

        # removing each parsed alignment must leave the expected list empty,
        # i.e. the two lists must be equal as multisets
        for alignment in actual:
            expected.remove(alignment)

        self.assertFalse(expected)
Пример #23
0
def greedy_align_equal_words(corpus):
    """
    Greedily align terminal nodes whose lower-cased words are equal as
    "equals".

    Existing alignments in each graph pair are cleared first. Each target
    node is aligned at most once (greedy, first match wins).

    @param corpus: iterable of graph pairs (e.g. a ParallelGraphCorpus)
    """
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()

        # candidate target terminals, skipping punctuation and empty nodes
        target_nodes = graphs.target.terminals(with_punct=False,
                                               with_empty=False)
        target_words = [ graphs.target.node[tn]["word"].lower()
                         for tn in target_nodes ]

        for sn in graphs.source.terminals_iter(with_punct=False,
                                               with_empty=False):
            sw = graphs.source.node[sn]["word"].lower()

            # list.index raises ValueError when there is no match
            try:
                j = target_words.index(sw)
            except ValueError:
                continue

            tn = target_nodes[j]
            graph_pair.add_align(Pair(sn, tn), "equals")

            # a target node can be aligned only once, so remove it from the
            # candidate lists
            del target_nodes[j]
            del target_words[j]
Пример #24
0
 def _parse(self, tok_sent_pair):
     """
     Parse the tokenized source and target sentences and return the pair of
     resulting parses.
     """
     source_parse = self._parse_single_sent(tok_sent_pair.source)
     target_parse = self._parse_single_sent(tok_sent_pair.target)
     return Pair(source_parse, target_parse)
Пример #25
0
 def setUp(self):
     # create a fresh pair before every test
     self.p = Pair("x", "y")
Пример #26
0
def pp_term_align(nodes, graphs, alignment, **kwargs):
    """
    A node preprocessing function that computes the number of aligned
    terminals for a given pair of source and target nodes.
    
    Assumes the "_yield" attribute on nodes as computed by the pp_yield
    graph preprocessing functions.

    Provides the node attributes:
    _inside:  terminals aligned to terminals inside the other node, as a
              dict mapping alignment relation to a list of terminals
    _outside: terminals aligned outside the other node or aligned to
              non-terminals
    _none:    unaligned terminals 
    
    """
    sn_attr = graphs.source.node[nodes.source]
    tn_attr = graphs.target.node[nodes.target]

    # handle source node
    sn_attr["_inside"] = {}
    sn_attr["_outside"] = []
    sn_attr["_none"] = []

    for st in sn_attr["_yield"]:
        # find aligned target node, if any
        tt = alignment.get_aligned_target_node(st)

        if tt:
            if tt in tn_attr["_yield"]:
                relation = alignment.get_align(Pair(st, tt))

                # group inside-aligned terminals by their alignment relation
                try:
                    sn_attr["_inside"][relation].append(st)
                except KeyError:
                    sn_attr["_inside"][relation] = [st]
            else:
                # if non-terminal alignments are available,
                # this includes all cases where a source terminal is aligned
                # to target *non-terminal*, even if it is within the scope of
                # nodes.target!
                sn_attr["_outside"].append(st)
        else:
            sn_attr["_none"].append(st)

    # handle target node
    # align inside count is by definition identical for source and target
    # node, so the dict is shared rather than recomputed
    tn_attr["_inside"] = sn_attr["_inside"]
    tn_attr["_outside"] = []
    tn_attr["_none"] = []

    for tt in tn_attr["_yield"]:
        # find aligned source node, if any
        st = alignment.get_aligned_source_node(tt)

        if st:
            if st not in sn_attr["_yield"]:
                # if non-terminal alignments are available,
                # this includes all case where a target terminal is aligned to
                # source *non-terminal*, even if it is within the scope of
                # nodes.source!
                tn_attr["_outside"].append(tt)
        else:
            tn_attr["_none"].append(tt)
Пример #27
0
    type=int,
    metavar="N",
    default=0,
    help='minimum difference in tokens allowed between the aligned sentences '
    '(default is 0)')

args = parser.parse_args()

# every parallel text corpus must come with exactly one source and one
# target graphbank
if len(args.parallel_text_corpora) != len(args.source_graphbanks):
    exit("Error: too few or too many source graphbanks")

if len(args.parallel_text_corpora) != len(args.target_graphbanks):
    exit("Error: too few or too many target graphbanks")

for text_corpus, source_graphbank, target_graphbank in zip(
        args.parallel_text_corpora, args.source_graphbanks,
        args.target_graphbanks):
    graph_corpus = pgc_from_ptc(text_corpus,
                                source_graphbank,
                                target_graphbank,
                                focus_tags=Pair(args.source_tag,
                                                args.target_tag),
                                graph_formats=Pair(args.source_format,
                                                   args.target_format),
                                relations=args.relations,
                                min_token_diff=args.min_token_diff,
                                max_token_len=args.max_token_len)

    # write the resulting corpus to the current directory, named after the
    # text corpus with a .pgc extension
    outfn = os.path.splitext(os.path.basename(text_corpus))[0] + ".pgc"
    graph_corpus.write(outfn, pprint=True)
Пример #28
0
def greedy_align_words(corpus):
    """
    Greedily align terminal nodes using word and root comparison:

    - if the words are equal -> "equals"
    - if the roots are equal -> "restates"
    - if the source root is contained in the target root and
      len(source word) > 3 -> "generalizes"
    - if the target root is contained in the source root and
      len(target word) > 3 -> "specifies"
    - if the source and target roots share a morphological segment
      -> "intersects"

    Existing alignments in each graph pair are cleared first. Each target
    node is aligned at most once (greedy, first match wins).
    """
    for graph_pair in corpus:
        graph_pair.clear()
        graphs = graph_pair.get_graphs()
        
        # candidate target terminals, skipping punctuation and empty nodes
        target_nodes = graphs.target.terminals(with_punct=False,
                                               with_empty=False)
        target_words = [ graphs.target.node[tn]["word"].lower()
                         for tn in target_nodes ]
        target_roots = [ graphs.target.node[tn]["root"]
                         for tn in target_nodes ]
        
        for sn in graphs.source.terminals_iter(with_punct=False,
                                               with_empty=False):
            sw = graphs.source.node[sn]["word"].lower()
            relation = None
            
            # align identical words
            for i, tw in enumerate(target_words):
                if sw == tw:
                    relation = "equals"
                    break
                    
            if not relation:
                sr = graphs.source.node[sn]["root"]
            
                # align identical roots
                for i, tr in enumerate(target_roots):
                    if sr == tr:
                        relation = "restates"
                        break
                        
            if not relation:
                sparts = set(sr.split("_"))
                
                # check for spec, gen, or intersect
                for i, tr in enumerate(target_roots):
                    tw = target_words[i]
                    
                    if sr in tr and len(sw) > 3:
                        relation = "generalizes"
                        break
                    elif tr in sr and len(tw) > 3:
                        relation = "specifies"
                        break
                    # check if roots share a morphological segment
                    elif sparts.intersection(tr.split("_")):
                        relation = "intersects"
                        break
            
            if relation:
                # NOTE(review): relies on the loop variable i leaking out of
                # the for loop in which the match was found
                tn = target_nodes[i]        
                graph_pair.add_align(Pair(sn, tn), relation)
                
                # a target node can be aligned only once
                del target_nodes[i]
                del target_words[i]
                del target_roots[i]     
Пример #29
0
def pgc_from_ptc(text_corpus_file,
                 source_graphbank_file,
                 target_graphbank_file,
                 focus_tags=None,
                 graph_formats=None,
                 relations=RELATIONS,
                 min_token_diff=0,
                 max_token_len=99999):
    """
    Create a new parallel graph corpus from a parallel text corpus and a pair of
    graphbanks
    
    @param text_corpus_file: parallel text corpus filename
    @param source_graphbank_file: source graphbank filename
    @param target_graphbank_file: target graphbank filename
        
    @keyword focus_tags: pair of focus tags; defaults to Pair("s", "s")
    @keyword graph_formats: pair of graphbank formats; defaults to
    Pair("alpino", "alpino")
    @keyword relations: list of alignment relations
    @keyword min_token_diff: minimum number of different tokens
    @keyword max_token_len: maximum number of tokens per focus element 
    
    @return: ParallelGraphCorpus object
    """
    # None defaults instead of shared mutable Pair instances, which would be
    # created once at function definition time and shared across all calls
    if focus_tags is None:
        focus_tags = Pair("s", "s")
    if graph_formats is None:
        graph_formats = Pair("alpino", "alpino")

    # read parallel text corpus
    text_corpus = HitaextDoc(file=text_corpus_file)
    doc_trees = text_corpus.get_doc_trees(search=True)

    # read graph banks
    source_bank = GraphBank(source_graphbank_file, graph_formats.source)
    source_bank.load()
    target_bank = GraphBank(target_graphbank_file, graph_formats.target)
    target_bank.load()
    graph_banks = Pair(source_bank, target_bank)

    # create an empty parallel graph corpus
    graph_corpus = ParallelGraphCorpus(relations=relations)

    for alignment in text_corpus.alignment:
        # only consider alignments between the requested focus tags
        if (alignment.get("from_tag") != focus_tags.source
                or alignment.get("to_tag") != focus_tags.target):
            continue

        source_tokens = _get_elem_tokens(doc_trees.source, focus_tags.source,
                                         alignment.get("from_id"))
        target_tokens = _get_elem_tokens(doc_trees.target, focus_tags.target,
                                         alignment.get("to_id"))

        # skip overly long focus elements
        if len(source_tokens) > max_token_len or len(
                target_tokens) > max_token_len:
            continue

        # optionally skip pairs that differ in fewer than min_token_diff
        # tokens
        if (min_token_diff and
                _token_diff(source_tokens, target_tokens) < min_token_diff):
            continue

        # the crucial assumption is that id's of the aligned focus
        # elements in the marked-up text have corresponding graphs with
        # the same id in the graph banks
        source_graph_id = alignment.get("from_id")
        target_graph_id = alignment.get("to_id")
        graphs = Pair(source_bank.get_graph(source_graph_id),
                      target_bank.get_graph(target_graph_id))

        graph_pair = GraphPair(graph_banks, graphs)
        graph_corpus.append(graph_pair)

    return graph_corpus
Пример #30
0
class AlignServer(object):
    """
    Service object that aligns a pair of sentences.

    The align() entry point tokenizes a source and a target sentence,
    parses both with an external Alpino parser reached over XML-RPC,
    loads the resulting parses as graphs, aligns the graphs, and returns
    all intermediate results plus the alignments as a plain dict.
    """

    # the maximum number of times the parser for Alpino XML output is reused
    # before a fresh instance is created (see the comment in _load_graphs)
    max_alpino_parser_reuse = 1000

    # regexp to detect an unescaped ampersand, that is, one which is not part
    # of an entity such as &amp;, &apos;, etc.
    regexp = re.compile(r"&(?!(?:[a-zA-Z][a-zA-Z0-9]*|#\d+);)")

    def __init__(self, tokenizer=None, alpino=None, graph_aligner=None):
        """
        Initialize tokenizer, Alpino proxy, XML parser and graph aligner.

        @keyword tokenizer: callable mapping a Pair of sentences to a Pair
            of tokenized sentences; None means sentences pass through
            untokenized (see _tokenize)
        @keyword alpino: proxy to an Alpino parse server; None means a new
            ServerProxy is created from ALPINO_HOST and ALPINO_PORT
        @keyword graph_aligner: a GraphAligner instance; None means a
            default GraphAligner is created
        """
        self.init_tokenizer(tokenizer)
        self.init_alpino(alpino)
        self.init_graph_xml_parser()
        self.init_graph_aligner(graph_aligner)
        self.init_others()

        # a pair of graphbank dummies, which are needed when creating a new
        # graph pair in _load_graphs (NOTE(review): the code there constructs
        # a GraphMatching, not a GraphMapping as an older comment said)
        self._graphbanks = Pair(GraphBank("", "alpino"),
                                GraphBank("", "alpino"))

    def init_tokenizer(self, tokenizer=None):
        # store the tokenizer callable (or None, meaning no tokenization)
        self._tokenizer = tokenizer

    def init_alpino(self, alpino):
        # "if alpino: ..." does not work because of peculiarities of the
        # xml-rpc implementation, so compare against None explicitly
        if alpino is None:
            host = "http://%s:%d" % (ALPINO_HOST, ALPINO_PORT)
            self._alpino = ServerProxy(host, encoding="iso-8859-1")
        else:
            self._alpino = alpino

        # smoke test: fail fast here if the Alpino server is unreachable
        self._alpino.parse("test")

    def init_graph_xml_parser(self):
        # create a fresh parser for Alpino's XML output and reset the
        # reuse counter checked in _load_graphs
        self._alpino_xml_parser = AlpinoParser()
        self._alpino_parser_reused = 0
        # feed fake root node to the xml parser, so later calls can feed
        # bare <alpino_ds> fragments without their own document root
        self._alpino_xml_parser.parse_string(
            '<?xml version="1.0" encoding="utf-8"?>\n<treebank>')

    def init_graph_aligner(self, graph_aligner):
        if graph_aligner:
            self._graph_aligner = graph_aligner
        else:
            self._graph_aligner = GraphAligner()

        # presumably the "no alignment relation" label; taken from the
        # aligner's descriptor -- TODO confirm against GraphAligner
        self.no_rel = self._graph_aligner.descriptor.no_rel

    def init_others(self):
        # hook for subclasses
        pass

    def align(self, source_sent, target_sent):
        """
        Align two sentences and return all intermediate results.

        @param source_sent: source sentence
        @param target_sent: target sentence

        @return: dict with the original sentences, their tokenized forms,
            their Alpino parses, and the parse/phrase alignments

        NOTE(review): _align_graphs and _get_alignment are not defined in
        this class body -- presumably supplied by a subclass or elsewhere.
        """
        # the strings received here after transport through XML-RPC are either
        # plain ascii or unicode
        sent_pair = Pair(source_sent, target_sent)

        tok_sent_pair = self._tokenize(sent_pair)

        parse_pair = self._parse(tok_sent_pair)

        graph_pair = self._load_graphs(parse_pair)

        instances = self._align_graphs(graph_pair)

        parse_align, phrase_align = self._get_alignment(instances, graph_pair)

        return dict(source_sent=sent_pair.source,
                    target_sent=sent_pair.target,
                    source_tok=tok_sent_pair.source,
                    target_tok=tok_sent_pair.target,
                    source_parse=parse_pair.source,
                    target_parse=parse_pair.target,
                    parse_align=parse_align,
                    phrase_align=phrase_align)

    def _tokenize(self, sent_pair):
        # tokenize both sentences, or pass them through unchanged when no
        # tokenizer was configured
        if self._tokenizer:
            return self._tokenizer(sent_pair)
        else:
            return sent_pair

    def _parse(self, tok_sent_pair):
        # parse source and target sentences independently
        return Pair(self._parse_single_sent(tok_sent_pair.source),
                    self._parse_single_sent(tok_sent_pair.target))

    def _parse_single_sent(self, tok_sent):
        # Sentence will be of type unicode if the original sentence passed
        # to the server proxy (client) contained any non-ascii chars, but
        # will be of type str otherwise. Input to the alpino server proxy
        # must be iso-8859-1 encoded, so we have to convert
        tok_sent = tok_sent.encode("iso-8859-1")

        graph = self._alpino.parse(tok_sent)

        # The returned parse is string of type unicode or str, regardless
        # of what the xml header produced by alpino says. First we get rid
        # of this xml header.
        return graph.split("\n", 1)[1]

    def _load_graphs(self, parse_pair):
        """
        Parse the pair of Alpino XML strings into a GraphMatching object.
        """
        # The AlpinoParser instance can be reused to avoid the overhead of
        # creating a new one. It seems that there is maximum to the number of
        # lines though. After that we get an error like:
        #
        # xml.parsers.expat.ExpatError: not well-formed (invalid token):
        # line 2654543, column 300
        #
        # We therefore count the number of reuses and create a new instance
        # when self.max_alpino_parser_reuse is reached.
        if self._alpino_parser_reused < self.max_alpino_parser_reuse:
            self._alpino_parser_reused += 1
        else:
            self.init_graph_xml_parser()

        # The xml parser for graphbanks wants utf-8,
        # so we encode as utf-8
        xml_string = (parse_pair.source.encode("utf-8") +
                      parse_pair.target.encode("utf-8"))

        # Alpino outputs ill-formed xml because some "&" are not escaped
        # e.g. <node begin="0" cat="mwu" end="3" id="1" mwu_root="erwin & mireille" mwu_sense="erwin & mireille" rel="--">
        # This is a hack to correct that.
        xml_string = self.regexp.sub("&amp;", xml_string)

        try:
            id2graph = self._alpino_xml_parser.parse_string(xml_string)
        except ExpatError, inst:
            sys.stderr.write("Error:%s\nInput:\n%s\n" % (inst, xml_string))
            # reset parser
            sys.stderr.write("Resetting Alpino output parser\n")
            self.init_graph_xml_parser()
            raise inst
            # the exception surfaces as an xmlrpc fault,
            # but subsequent calls to align method should work

        # NOTE(review): relies on id2graph.values() yielding the graphs in
        # (source, target) order; dict value order is not guaranteed in
        # Python 2 -- confirm that AlpinoParser returns an ordered mapping
        graph_pair = Pair(*id2graph.values())
        return GraphMatching(banks=self._graphbanks, graphs=graph_pair)
Пример #31
0
 def test__eq__(self):
     """Two Pairs built from the same members compare equal."""
     other = Pair("x", "y")
     self.assertEqual(self.p, other)
Пример #32
0
 def setUp(self):
     """Build the default Pair fixture used by each test."""
     fixture = Pair("x", "y")
     self.p = fixture