def __get_info_boxes(self, pack: DataPack) -> Iterable[EntryType]:
    if self.config.infobox_type == "property":
        yield from pack.get(WikiInfoBoxProperty)
    elif self.config.infobox_type == "mapped":
        yield from pack.get(WikiInfoBoxMapped)
    else:
        yield from pack.get(WikiInfoBoxProperty)
        yield from pack.get(WikiInfoBoxMapped)
def sentence_clues(src_sent: Sentence, src_page: str, target_pack: DataPack):
    clues = []
    tgt_sent: Sentence
    for tgt_sent in target_pack.get(Sentence):
        bidirectional = False
        for target_anchor in target_pack.get(WikiAnchor, tgt_sent):
            if target_anchor.target_page_name == src_page:
                bidirectional = True
        overlap, all_grams = compute_overlap(src_sent, tgt_sent)
        clues.append((bidirectional, overlap, tgt_sent, all_grams))
    return sorted(clues, reverse=True)
def get_coref_chains(pack: DataPack) -> List[List[int]]:
    """
    Args:
        pack: The data pack containing the event mentions and hoppers.

    Returns:
        Coref chains, where each chain contains the indices of its member
        mentions.
    """
    evm_id2index = {}
    for idx, mention in enumerate(all_valid_events(pack)):
        evm_id2index[mention.tid] = idx

    chains: List[List[int]] = []
    hopper: Hopper
    for hopper in pack.get(Hopper):
        chain = []
        for mention in hopper.get_members():
            # Invalid mentions should be removed.
            if mention.tid in evm_id2index:
                idx = evm_id2index[mention.tid]
                chain.append(idx)
        if len(chain) > 1:
            chains.append(sorted(chain))
    return chains
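# A usage sketch of get_coref_chains with hypothetical pack contents: if the
# pack holds five valid event mentions (indices 0-4) and two hoppers whose
# surviving members map to indices {0, 2} and {1, 3, 4}, then:
#
#     chains = get_coref_chains(pack)
#     # chains == [[0, 2], [1, 3, 4]]
#
# Singleton chains are dropped by the `len(chain) > 1` check, and each chain
# is sorted so the output is deterministic regardless of hopper member order.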
def _process(self, input_pack: DataPack):
    instance: NLIPair
    for instance in input_pack.get(NLIPair):
        premise = instance.get_parent().text
        hypo = instance.get_child().text
        results = self._nli_inference(premise, hypo)
        for k, v in enumerate(results):
            instance.entailment[self.__id2label[k]] = v
def _process(self, input_pack: DataPack):
    # handle existing entries
    self._process_existing_entries(input_pack)

    for sentence in input_pack.get(Sentence):
        result = self.predictor.predict(sentence=sentence.text)

        if "tokenize" in self.processors:
            # creating new tokens and dependencies
            tokens = self._create_tokens(input_pack, sentence, result)
            if "depparse" in self.processors:
                self._create_dependencies(input_pack, tokens, result)
def _process(self, input_pack: DataPack):
    self._tbf_out.write(f"#BeginOfDocument {input_pack.pack_name}\n")

    eids: Dict[int, str] = {}
    for i, evm in enumerate(input_pack.get(EventMention)):
        self._tbf_out.write(
            "\t".join([
                self.configs.system_name,
                input_pack.pack_name,
                f"E{i}",
                f"{evm.begin},{evm.end}",
                evm.text.replace("\n", ""),
                evm.event_type,
                "Actual",
            ]) + "\n"
        )
        eids[evm.tid] = f"E{i}"

    hopper: Hopper
    for i, hopper in enumerate(input_pack.get(Hopper)):
        if len(hopper.get_members()) <= 1:
            continue
        member_text = ",".join(
            [eids[evm.tid] for evm in hopper.get_members()]
        )
        self._tbf_out.write(
            "\t".join(["@Coreference", f"R{i}", member_text]) + "\n"
        )

    self._tbf_out.write("#EndOfDocument\n")
def build_arguments(pack: DataPack):
    # Map each event mention's tid to its arguments, keyed by role name.
    all_args: Dict[int, Dict[str, EntityMention]] = {}
    argument: EventArgument
    for argument in pack.get(EventArgument):
        evm: EventMention = argument.get_parent()
        arg: EntityMention = argument.get_child()
        try:
            all_args[evm.tid][argument.role] = arg
        except KeyError:
            all_args[evm.tid] = {argument.role: arg}
    return all_args
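# A minimal usage sketch for build_arguments; the "Agent" role name is
# hypothetical and depends on the ontology that produced the EventArgument
# links:
#
#     args_by_event = build_arguments(pack)
#     for evm in all_valid_events(pack):
#         roles = args_by_event.get(evm.tid, {})
#         agent = roles.get("Agent")  # EntityMention, or None if absent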
def all_valid_events(pack: DataPack) -> List[EventMention]:
    """
    Some events are not in the filtered text. We ignore them.

    Args:
        pack: The data pack to collect event mentions from.

    Returns:
        The event mentions that fall inside a sentence of the filtered text.
    """
    all_events: List[EventMention] = []
    for sent in pack.get(Sentence):
        all_events.extend(sent.get(EventMention))
    return all_events
def get_single(pack: DataPack, entry_type: Type[EntryType]) -> EntryType:
    r"""Take a single entry of type :attr:`entry_type` from the provided data
    pack. This is useful when the target entry type normally appears only
    once in the :class:`DataPack`, e.g., a Document entry.

    Args:
        pack: The provided data pack to take entries from.
        entry_type: The entry type to be retrieved.

    Returns:
        A single data entry.
    """
    for a in pack.get(entry_type):
        return a

    raise EntryNotFoundError(
        f"The entry {entry_type} is not found in the provided data pack.")
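# A usage sketch for get_single, assuming `pack` was populated by an upstream
# reader with a single Document annotation (Document here refers to the base
# ontology type):
#
#     from ft.onto.base_ontology import Document
#
#     doc = get_single(pack, Document)
#     print(doc.text)
#
# Unlike `next(pack.get(Document), None)`, a missing entry raises
# EntryNotFoundError, so callers fail fast instead of handling None.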
def _process(self, input_pack: DataPack):
    all_anchors = defaultdict(list)
    anchor: WikiAnchor
    for anchor in input_pack.get(WikiAnchor):
        all_anchors[(anchor.span.begin, anchor.span.end)].append(anchor)

    for span in all_anchors.keys():
        l_a: List[WikiAnchor] = all_anchors[span]
        if len(l_a) > 1:
            if len(l_a) > 2:
                logging.error(
                    "There are links that have more than 2 copies: "
                    "%s has %d anchors to %s on the same span.",
                    input_pack.pack_name,
                    len(l_a),
                    l_a[0].target_page_name,
                )
            # Removing duplicates: keep only the first anchor on this span.
            for a in l_a[1:]:
                input_pack.delete_entry(a)
def _process(self, input_pack: DataPack):
    kp = KeywordProcessor(case_sensitive=True)

    anchor_entities = {}
    existing_anchors = set()

    anchor: WikiAnchor
    for anchor in input_pack.get(WikiAnchor):
        kp.add_keyword(anchor.text)
        existing_anchors.add((anchor.span.begin, anchor.span.end))
        try:
            anchor_entities[anchor.text].append(anchor)
        except KeyError:
            anchor_entities[anchor.text] = [anchor]

    for kw, b, e in kp.extract_keywords(input_pack.text, span_info=True):
        targets = anchor_entities[kw]

        if (b, e) in existing_anchors:
            # Ignore existing anchors.
            continue

        copy_from: WikiAnchor
        if len(targets) == 1:
            copy_from = targets[0]
        elif len(targets) > 1:
            # Multiple anchors share this surface form: copy from the latest
            # one that appears before the current match.
            latest_ = targets[0]
            for t in targets:
                if t.begin < b:
                    latest_ = t
            copy_from = latest_
        else:
            raise RuntimeError(f"Unknown target length {len(targets)}")

        anchor = WikiAnchor(input_pack, b, e)
        anchor.target_page_name = copy_from.target_page_name
        anchor.is_external = copy_from.is_external
        input_pack.add_entry(anchor)
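# Sketch of the flashtext behaviour the processor above relies on (the strings
# below are illustrative only):
#
#     from flashtext import KeywordProcessor
#
#     kp = KeywordProcessor(case_sensitive=True)
#     kp.add_keyword("Barack Obama")
#     kp.extract_keywords("Barack Obama visited Ohio.", span_info=True)
#     # -> [('Barack Obama', 0, 12)]   (text[0:12] == 'Barack Obama')
#
# Every occurrence of an already-linked surface form comes back with its
# character span, and spans not in `existing_anchors` receive a copied
# WikiAnchor pointing at the same target page.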
def _process(self, input_pack: DataPack):
    hoppers = list(input_pack.get(Hopper))
    for h in hoppers:
        input_pack.delete_entry(h)
def events2sentences(pack: DataPack) -> Dict[int, Sentence]:
    events2sents: Dict[int, Sentence] = {}
    for sent in pack.get(Sentence):
        for evm in sent.get(EventMention):
            events2sents[evm.tid] = sent
    return events2sents
def _process(self, input_pack: DataPack):
    sentence: Sentence
    for sentence in input_pack.get(
            entry_type=Sentence, component=self.sentence_component):
        scores = self.analyzer.polarity_scores(sentence.text)
        sentence.sentiment = scores