示例#1
0
    def align_with_trees(self, strict=False):
        """Map each EDU to the index of the syntactic tree it belongs to.

        Stores the mapping in ``self.edu2sent``. When syntactic trees are
        available, also stores each EDU's position counted from the start
        (``self.edu2idx_in_sent``) and from the end
        (``self.edu2rev_idx_in_sent``) of its sentence.

        Parameters
        ----------
        strict : bool, optional
            Passed through to ``align_edus_with_sentences``.

        Returns
        -------
        self
        """
        all_edus = self.edus
        trees = self.tkd_trees

        # a single tree means only the left padding is present: syntax
        # gives no sentence segmentation, so reuse the raw (bad) one
        # from the .out
        if len(trees) == 1:
            self.edu2sent = self.edu2raw_sent
            return self

        assert all_edus[0].is_left_padding()
        aligned = align_edus_with_sentences(all_edus[1:], trees[1:],
                                            strict=strict)
        # index 0 is reserved for the left padding, so every real
        # sentence index moves up by one; unaligned EDUs stay None
        shifted = [0]
        for sent_idx in aligned:
            shifted.append(None if sent_idx is None else sent_idx + 1)
        self.edu2sent = shifted

        # position of each EDU inside its sentence, counted from the
        # beginning and from the end
        from_start = relative_indices(shifted)
        from_end = relative_indices(shifted, reverse=True)
        self.edu2idx_in_sent = from_start
        self.edu2rev_idx_in_sent = from_end

        return self
示例#2
0
def test_relative_indices():
    """Check relative_indices on plain groups and on groups with None."""
    # common case: every element carries a group index
    grouped = [0, 1, 1, 1, 2, 3, 3]
    # case with None: each None has relative index 'valna', so with
    # valna=0 each None is considered to be a distinct subgroup
    with_none = [None, 0, 1, 1, None, None]

    # (input, keyword options, expected relative indices)
    cases = [
        (grouped, {}, [0, 0, 1, 2, 0, 0, 1]),
        (grouped, {'reverse': True}, [0, 2, 1, 0, 0, 1, 0]),
        (with_none, {'valna': 0}, [0, 0, 0, 1, 0, 0]),
        (with_none, {'reverse': True, 'valna': 0}, [0, 0, 1, 0, 0, 0]),
    ]
    for values, options, expected in cases:
        assert relative_indices(values, **options) == expected
示例#3
0
def test_relative_indices():
    """Exercise relative_indices forwards and backwards."""
    # common case: no missing group index
    no_gaps = [0, 1, 1, 1, 2, 3, 3]

    forward = relative_indices(no_gaps)
    assert forward == [0, 0, 1, 2, 0, 0, 1]

    backward = relative_indices(no_gaps, reverse=True)
    assert backward == [0, 2, 1, 0, 0, 1, 0]

    # case with None: each None has relative index 'valna', so with
    # valna=0 each None is considered to be a distinct subgroup
    gaps = [None, 0, 1, 1, None, None]

    forward = relative_indices(gaps, valna=0)
    assert forward == [0, 0, 0, 1, 0, 0]

    backward = relative_indices(gaps, reverse=True, valna=0)
    assert backward == [0, 0, 1, 0, 0, 0]
示例#4
0
    def align_with_doc_structure(self):
        """Align EDUs with the document structure (paragraph and sentence).

        Determine which paragraph and raw sentence (if any) surrounds
        each EDU. Try to accommodate the occasional off-by-a-smidgen
        error by folks marking these EDU boundaries, eg. original
        text:

        Para1: "Magazines are not providing us in-depth information on
        circulation," said Edgar Bronfman Jr., .. "How do readers feel
        about the magazine?...
        Research doesn't tell us whether people actually do read the
        magazines they subscribe to."

        Para2: Reuben Mark, chief executive of Colgate-Palmolive, said...

        Marked up EDU is wide to the left by three characters:
        "

        Reuben Mark, chief executive of Colgate-Palmolive, said...

        Sets the following attributes on ``self``:

        * ``edu2para``: paragraph alignment for each EDU, or a list of
          None when no paragraph alignment is available;
        * ``edu2idx_in_para`` / ``edu2rev_idx_in_para``: results of
          ``relative_indices`` on ``edu2para``, forward and reversed;
        * ``edu2raw_sent``: index in ``self.raw_sentences`` of the
          sentence containing each EDU (0 for the left padding EDU,
          None when no containing sentence is found).

        Returns
        -------
        self
        """
        text = self.text
        edus = self.edus

        # align EDUs with paragraphs; fall back to "no paragraph" for
        # every EDU when paragraphs are missing or the alignment
        # returns None
        paragraphs = self.paragraphs
        if paragraphs is None:
            edu2para = None
        else:
            edu2para = align_edus_with_paragraphs(edus, paragraphs, text)
        if edu2para is None:
            edu2para = [None] * len(edus)
        self.edu2para = edu2para

        # compute relative index of each EDU to the beginning (resp. to
        # the end) of the paragraph
        idxes_in_para = relative_indices(edu2para)
        rev_idxes_in_para = relative_indices(edu2para, reverse=True)
        self.edu2idx_in_para = idxes_in_para
        self.edu2rev_idx_in_para = rev_idxes_in_para

        # align EDUs with raw sentences
        # NB: this usually fails due to bad sentence segmentation, e.g.
        # ... Prof.\nHarold ... in wsj_##.out files,
        # or end of sentence missing in file## files.
        raw_sentences = self.raw_sentences
        if raw_sentences is None:
            edu2raw_sent = [None] * len(edus)
        else:
            edu2raw_sent = [0]  # left padding
            # align the other EDUs
            for edu in edus[1:]:
                espan = edu.text_span()
                # find enclosing raw sentence
                sent = _filter0(containing(espan), raw_sentences)
                # sloppy EDUs happen; try shaving off some characters
                # if we can't find a sentence
                if sent is None:
                    # work on a copy so the EDU's own span is untouched
                    espan = copy.copy(espan)
                    espan.char_start += 1
                    espan.char_end -= 1
                    etext = text[espan.char_start:espan.char_end]
                    # kill left whitespace
                    espan.char_start += len(etext) - len(etext.lstrip())
                    # the right trim is measured on the left-stripped
                    # text, so an all-whitespace span is only shrunk
                    # from the left
                    etext = etext.lstrip()
                    # kill right whitespace
                    espan.char_end -= len(etext) - len(etext.rstrip())
                    # try again
                    sent = _filter0(containing(espan), raw_sentences)

                # update edu to sentence mapping
                raw_sent_idx = (raw_sentences.index(sent) if sent is not None
                                else None)  # TODO or -1 or ... ?
                edu2raw_sent.append(raw_sent_idx)

        self.edu2raw_sent = edu2raw_sent

        return self
示例#5
0
    def align_with_trees(self, strict=False):
        """Compute for each EDU the overlapping trees.

        Stores in ``self.edu2sent`` the index (in ``self.tkd_trees``) of
        the tree overlapping each EDU: 0 for the left padding EDU, None
        when no tree overlaps. When several trees overlap an EDU, the
        one with the largest overlap is picked. Also stores each EDU's
        position from the start (``self.edu2idx_in_sent``) and from the
        end (``self.edu2rev_idx_in_sent``) of its sentence.

        Parameters
        ----------
        strict : bool, optional
            If True, raise instead of falling back when an EDU overlaps
            no tree or more than one tree.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            In strict mode, if an EDU overlaps zero or several trees.
        """
        syn_trees = self.tkd_trees

        # if there is no sentence segmentation from syntax,
        # use the raw (bad) one from the .out
        if len(syn_trees) == 1:  # only lpad
            self.edu2sent = self.edu2raw_sent
            return self

        edus = self.edus

        # left padding EDU
        assert edus[0].is_left_padding()
        edu2sent = [0]

        # regular EDUs
        for edu in edus[1:]:
            tree_idcs = [tree_idx
                         for tree_idx, tree
                         in enumerate(syn_trees[1:], start=1)
                         if tree is not None and tree.overlaps(edu)]

            if not tree_idcs:
                # "no tree at all" can happen when the EDU text is totally
                # absent from the list of sentences of this doc in the PTB
                # ex: wsj_0696.out, last sentence
                if strict:
                    print(edu)
                    raise ValueError('No PTB tree for this EDU')
                tree_idx = None
            elif len(tree_idcs) == 1:
                tree_idx = tree_idcs[0]
            else:
                # more than one PTB trees overlap with this EDU
                if strict:
                    print(edu)
                    for idx in tree_idcs:
                        print('    ',
                              [str(leaf)
                               for leaf in syn_trees[idx].leaves()])
                    raise ValueError('Segmentation mismatch: '
                                     'one EDU, more than one PTB tree')

                # heuristics: pick the PTB tree with maximal overlap
                # with the EDU span; the first maximal tree wins ties.
                # Overlap lengths are compared directly: dividing each
                # by the same EDU length would not change the argmax.
                tree_idx = max(
                    tree_idcs,
                    key=lambda idx: syn_trees[idx].overlaps(edu).length())

            edu2sent.append(tree_idx)

        self.edu2sent = edu2sent

        # compute relative index of each EDU from the beginning (resp. to
        # the end) of the sentence around
        idxes_in_sent = relative_indices(edu2sent)
        rev_idxes_in_sent = relative_indices(edu2sent, reverse=True)
        self.edu2idx_in_sent = idxes_in_sent
        self.edu2rev_idx_in_sent = rev_idxes_in_sent

        return self