Example #1
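Yields infobox entries from the pack: depending on the configured infobox_type, it returns WikiInfoBoxProperty entries, WikiInfoBoxMapped entries, or both.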
    def __get_info_boxes(self, pack: DataPack) -> Iterable[EntryType]:
        if self.config.infobox_type == "property":
            yield from pack.get(WikiInfoBoxProperty)
        elif self.config.infobox_type == "mapped":
            yield from pack.get(WikiInfoBoxMapped)
        else:
            yield from pack.get(WikiInfoBoxProperty)
            yield from pack.get(WikiInfoBoxMapped)
Example #2
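Collects a clue for each sentence of the target pack: whether any anchor in that sentence links back to the source page (a bidirectional link) and the n-gram overlap with the source sentence, then returns the clues sorted best-first.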
def sentence_clues(src_sent: Sentence, src_page: str, target_pack: DataPack):
    clues = []

    tgt_sent: Sentence
    for tgt_sent in target_pack.get(Sentence):
        bidirectional = False
        for target_anchor in target_pack.get(WikiAnchor, tgt_sent):
            if target_anchor.target_page_name == src_page:
                bidirectional = True
        overlap, all_grams = compute_overlap(src_sent, tgt_sent)
        clues.append((bidirectional, overlap, tgt_sent, all_grams))
    # Sort by bidirectionality, then overlap; the explicit key avoids
    # comparing Sentence objects when the first two fields tie.
    return sorted(clues, key=lambda c: (c[0], c[1]), reverse=True)
Example #3
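Builds event coreference chains from the Hopper entries in a pack, keeping only chains with more than one valid mention.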
def get_coref_chains(pack: DataPack) -> List[List[int]]:
    """

    Args:
        pack:

    Returns: Coref chains, where each chain is the indices of the mention.

    """
    evm_id2index = {}

    for idx, mention in enumerate(all_valid_events(pack)):
        evm_id2index[mention.tid] = idx

    chains: List[List[int]] = []

    hopper: Hopper
    for hopper in pack.get(Hopper):
        chain = []
        for mention in hopper.get_members():
            # Invalid mentions should be removed.
            if mention.tid in evm_id2index:
                idx = evm_id2index[mention.tid]
                chain.append(idx)
        if len(chain) > 1:
            chains.append(sorted(chain))
    return chains
Example #4
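Runs NLI inference on each NLIPair, treating the parent as the premise and the child as the hypothesis, and stores the per-label entailment scores on the instance.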
    def _process(self, input_pack: DataPack):
        instance: NLIPair
        for instance in input_pack.get(NLIPair):
            premise = instance.get_parent().text
            hypo = instance.get_child().text
            results = self._nli_inference(premise, hypo)

            for k, v in enumerate(results):
                instance.entailment[self.__id2label[k]] = v
Example #5
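Runs a predictor on every sentence and, depending on which processors are enabled, creates tokens and dependency parses from the result.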
    def _process(self, input_pack: DataPack):
        # handle existing entries
        self._process_existing_entries(input_pack)

        for sentence in input_pack.get(Sentence):
            result = self.predictor.predict(sentence=sentence.text)

            if "tokenize" in self.processors:
                # creating new tokens and dependencies
                tokens = self._create_tokens(input_pack, sentence, result)
                if "depparse" in self.processors:
                    self._create_dependencies(input_pack, tokens, result)
Example #6
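Writes the pack in TBF format: one line per event mention, then one @Coreference line per hopper that has more than one member.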
    def _process(self, input_pack: DataPack):
        self._tbf_out.write(f"#BeginOfDocument {input_pack.pack_name}\n")

        eids: Dict[int, str] = {}
        for i, evm in enumerate(input_pack.get(EventMention)):
            self._tbf_out.write("\t".join([
                self.configs.system_name, input_pack.pack_name, f"E{i}",
                f"{evm.begin},{evm.end}",
                evm.text.replace("\n", ""), evm.event_type, "Actual"
            ]) + "\n")
            eids[evm.tid] = f"E{i}"

        hopper: Hopper
        for i, hopper in enumerate(input_pack.get(Hopper)):
            if len(hopper.get_members()) <= 1:
                continue

            member_text = ",".join(
                [eids[evm.tid] for evm in hopper.get_members()])
            self._tbf_out.write(
                "\t".join(["@Coreference", f"R{i}", member_text]) + "\n")

        self._tbf_out.write("#EndOfDocument\n")
Example #7
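Indexes the event arguments of a pack by the tid of their event mention, keyed by argument role.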
def build_arguments(pack: DataPack) -> Dict[int, Dict[str, EntityMention]]:
    # Map each event mention's tid to its arguments, keyed by role name.
    all_args: Dict[int, Dict[str, EntityMention]] = {}

    argument: EventArgument
    for argument in pack.get(EventArgument):
        evm: EventMention = argument.get_parent()
        arg: EntityMention = argument.get_child()

        try:
            all_args[evm.tid][argument.role] = arg
        except KeyError:
            all_args[evm.tid] = {argument.role: arg}

    return all_args
Example #8
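Collects the event mentions that are covered by a sentence, ignoring events outside the filtered text.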
def all_valid_events(pack: DataPack) -> List[EventMention]:
    """
    Some events are not in filtered text. We ignore them.

    Args:
        pack:

    Returns:

    """
    all_events: List[EventMention] = []
    for sent in pack.get(Sentence):
        all_events.extend(sent.get(EventMention))
    return all_events
Example #9
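Fetches a single entry of the requested type from a pack, raising EntryNotFoundError when none exists.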
def get_single(pack: DataPack, entry_type: Type[EntryType]) -> EntryType:
    r"""Take a single entry of type :attr:`entry_type` from the provided data
    pack. This is useful when the target entry type normally appears only one
    time in the :class:`DataPack` for e.g., a Document entry.

    Args:
        pack: The provided data pack to take entries from.
        entry_type: The entry type to be retrieved.

    Returns:
        A single data entry.
    """
    for a in pack.get(entry_type):
        return a

    raise EntryNotFoundError(
        f"The entry {entry_type} is not found in the provided data pack.")
Example #10
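Groups WikiAnchor entries by character span and deletes the duplicates that share a span, logging an error when a span carries more than two copies.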
    def _process(self, input_pack: DataPack):
        all_anchors = defaultdict(list)
        anchor: WikiAnchor
        for anchor in input_pack.get(WikiAnchor):
            all_anchors[(anchor.span.begin, anchor.span.end)].append(anchor)

        l_a: List[WikiAnchor]
        for span, l_a in all_anchors.items():
            if len(l_a) > 1:
                if len(l_a) > 2:
                    logging.error(
                        "There are links that have more than 2 copies: "
                        "pack %s, target %s, %d copies.",
                        input_pack.pack_name, l_a[0].target_page_name,
                        len(l_a))
                # Keep the first anchor and delete the duplicates.
                for a in l_a[1:]:
                    input_pack.delete_entry(a)
Example #11
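Finds unanchored occurrences of known anchor texts with a keyword matcher (the API matches flashtext's KeywordProcessor) and creates new WikiAnchor entries that copy their target from the closest preceding anchor with the same text.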
    def _process(self, input_pack: DataPack):
        kp = KeywordProcessor(case_sensitive=True)
        anchor_entities = {}
        existing_anchors = set()

        anchor: WikiAnchor
        for anchor in input_pack.get(WikiAnchor):
            kp.add_keyword(anchor.text)
            existing_anchors.add((anchor.span.begin, anchor.span.end))

            try:
                anchor_entities[anchor.text].append(anchor)
            except KeyError:
                anchor_entities[anchor.text] = [anchor]

        for kw, b, e in kp.extract_keywords(input_pack.text, span_info=True):
            targets = anchor_entities[kw]

            if (b, e) in existing_anchors:
                # Ignore existing anchors.
                continue

            copy_from: WikiAnchor
            if len(targets) == 1:
                copy_from = targets[0]
            elif len(targets) > 1:
                # Several anchors share this text: copy from the closest
                # anchor that starts before the new span.
                latest_ = targets[0]
                for t in targets:
                    if t.begin < b:
                        latest_ = t
                copy_from = latest_
            else:
                raise RuntimeError(f"Unknown target length {len(targets)}")

            anchor = WikiAnchor(input_pack, b, e)
            anchor.target_page_name = copy_from.target_page_name
            anchor.is_external = copy_from.is_external
            input_pack.add_entry(anchor)
Example #12
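Deletes every Hopper entry from the pack.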
    def _process(self, input_pack: DataPack):
        # Materialize the list first so that deleting entries does not
        # mutate the iterator being consumed.
        hoppers = list(input_pack.get(Hopper))
        for h in hoppers:
            input_pack.delete_entry(h)
Example #13
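Maps the tid of each event mention to the sentence that contains it.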
def events2sentences(pack: DataPack) -> Dict[int, Sentence]:
    events2sents: Dict[int, Sentence] = {}
    for sent in pack.get(Sentence):
        for evm in sent.get(EventMention):
            events2sents[evm.tid] = sent
    return events2sents
Example #14
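Scores each sentence with a sentiment analyzer (the polarity_scores call matches VADER's SentimentIntensityAnalyzer) and stores the result on the sentence.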
    def _process(self, input_pack: DataPack):
        sentence: Sentence
        for sentence in input_pack.get(
                entry_type=Sentence, component=self.sentence_component):
            scores = self.analyzer.polarity_scores(sentence.text)
            sentence.sentiment = scores