# Shared imports for all snippets below; the exact module paths assume
# Forte's layout (forte.data / ft.onto) and may need adjusting.
from collections import defaultdict
from typing import Any, DefaultDict, Dict, Iterator, List, Optional, Tuple

from forte.data.data_pack import DataPack
from forte.data.ontology.top import Annotation
from ft.onto.base_ontology import (
    CoreferenceGroup, Document, EntityMention, PredicateArgument,
    PredicateLink, PredicateMention, Sentence, Token)


def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack: DataPack = DataPack()
    text: str = ""
    offset: int = 0

    with open(file_path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if line != "":
                oie_component: List[str] = line.split("\t")

                # Add sentence.
                sentence: str = oie_component[0]
                Sentence(pack, offset, offset + len(sentence))
                offset += len(sentence) + 1
                text += sentence + " "

                head_predicate: str = oie_component[1]
                full_predicate: str = oie_component[2]

                # Add head predicate.
                token: Token = Token(pack, offset,
                                     offset + len(head_predicate))
                offset += len(head_predicate) + 1
                text += head_predicate + " "

                # Add full predicate.
                predicate_mention: PredicateMention = PredicateMention(
                    pack, offset, offset + len(full_predicate))
                predicate_mention.headword = token
                offset += len(full_predicate) + 1
                text += full_predicate + " "

                for arg in oie_component[3:]:
                    # Add predicate argument.
                    predicate_arg: PredicateArgument = PredicateArgument(
                        pack, offset, offset + len(arg))
                    offset += len(arg) + 1
                    text += arg + " "

                    # Add predicate link.
                    PredicateLink(pack, predicate_mention, predicate_arg)

    pack.set_text(text, replace_func=self.text_replace_operation)
    Document(pack, 0, len(text))
    pack.pack_name = file_path
    yield pack
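
# The reader above expects one tab-separated record per line: the
# sentence, the head predicate, the full predicate, then any number of
# arguments. A minimal illustration of that layout (the sentence and
# file are invented):
import tempfile

sample_record = "\t".join([
    "Rome was founded by Romulus",  # oie_component[0]: the sentence
    "founded",                      # oie_component[1]: head predicate
    "was founded by",               # oie_component[2]: full predicate
    "Rome",                         # oie_component[3:]: arguments...
    "Romulus",
])

with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as tmp:
    tmp.write(sample_record + "\n")
# The pack text becomes the columns joined by single spaces, which is
# why `offset` advances by len(...) + 1 after each field.
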
def _create_srl(input_pack: DataPack, tokens: List[Token],
                result: Dict[str, List[str]]) -> None:
    for tag in result["srl_tags"]:
        pred_span, arguments = parse_allennlp_srl_tags(tag)
        if not pred_span:
            continue
        # Spans returned by the tag parser are token indices; map them
        # to character offsets through the token list.
        pred = PredicateMention(input_pack,
                                tokens[pred_span.begin].begin,
                                tokens[pred_span.end].end)
        for arg_span, label in arguments:
            arg = PredicateArgument(input_pack,
                                    tokens[arg_span.begin].begin,
                                    tokens[arg_span.end].end)
            link = PredicateLink(input_pack, pred, arg)
            link.arg_type = label
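
# `parse_allennlp_srl_tags` is not shown above. A minimal sketch of
# what it plausibly does, assuming each entry of result["srl_tags"] is
# a space-separated BIO sequence as emitted by AllenNLP's SRL
# predictor (e.g. "B-ARG0 O B-V B-ARG1 I-ARG1") and that spans carry
# inclusive token indices; the names here are hypothetical.
from dataclasses import dataclass


@dataclass
class TokenSpan:
    begin: int  # index of the first token in the span
    end: int    # index of the last token in the span (inclusive)


def parse_srl_tags(
    tags: str,
) -> Tuple[Optional[TokenSpan], List[Tuple[TokenSpan, str]]]:
    pred_span: Optional[TokenSpan] = None
    arguments: List[Tuple[TokenSpan, str]] = []
    begin, label = None, None
    split_tags = tags.split()
    # The trailing "O" sentinel closes any span still open at the end.
    for i, tag in enumerate(split_tags + ["O"]):
        if tag == "O" or tag.startswith("B-"):
            if label is not None:
                span = TokenSpan(begin, i - 1)
                if label == "V":
                    pred_span = span
                else:
                    arguments.append((span, label))
            begin, label = (i, tag[2:]) if tag != "O" else (None, None)
    return pred_span, arguments
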
def pack(self, data_pack: DataPack,
         inputs: Dict[str, List[Prediction]]) -> None:
    batch_predictions = inputs["predictions"]
    for predictions in batch_predictions:
        for pred_span, arg_result in predictions:
            pred = PredicateMention(data_pack,
                                    pred_span.begin, pred_span.end)
            for arg_span, label in arg_result:
                arg = PredicateArgument(data_pack,
                                        arg_span.begin, arg_span.end)
                link = PredicateLink(data_pack, pred, arg)
                link.arg_type = label
def pack(
    self,
    pack: DataPack,
    predict_results: Dict[str, List[Prediction]],
    _: Optional[Annotation] = None,
):
    batch_predictions = predict_results["predictions"]
    for predictions in batch_predictions:
        for pred_span, arg_result in predictions:
            pred = PredicateMention(pack, pred_span.begin, pred_span.end)
            for arg_span, label in arg_result:
                arg = PredicateArgument(pack, arg_span.begin, arg_span.end)
                link = PredicateLink(pack, pred, arg)
                link.arg_type = label
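
# The two `pack` methods above differ only in signature: the second
# additionally accepts an optional context Annotation, which this
# implementation ignores. Both unpack the same payload shape, sketched
# below; `CharSpan` is a hypothetical stand-in for the span type, and
# the spans are already character offsets (unlike the token-index
# spans consumed by _create_srl).
from dataclasses import dataclass


@dataclass
class CharSpan:
    begin: int
    end: int


predict_results_example = {
    "predictions": [               # one entry per pack in the batch
        [                          # (predicate span, argument list) pairs
            (CharSpan(8, 15),
             [(CharSpan(0, 7), "ARG0"),
              (CharSpan(16, 30), "ARG1")]),
        ],
    ],
}
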
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = self.new_pack()

    with open(file_path, encoding="utf8") as doc:
        words = []
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # auxiliary structures
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []
        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []
        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                fields = self._parse_line(line)
                speaker = fields.speaker
                if fields.part_number is not None:
                    part_id = int(fields.part_number)
                document_id = fields.document_id

                assert fields.word is not None
                word_begin = offset
                word_end = offset + len(fields.word)

                # add tokens
                token = Token(pack, word_begin, word_end)
                if fields.pos_tag is not None:
                    token.pos = fields.pos_tag
                if fields.word_sense is not None:
                    token.sense = fields.word_sense

                # add entity mentions
                current_entity_mention = self._process_entity_annotations(
                    pack,
                    fields.entity_label,
                    word_begin,
                    word_end,
                    current_entity_mention,
                )

                # add predicate mentions
                if (fields.lemmatised_word is not None
                        and fields.lemmatised_word != "-"):
                    word_is_verbal_predicate = any(
                        "(V" in x for x in fields.predicate_labels)
                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.predicate_lemma = fields.lemmatised_word
                    pred_mention.is_verb = word_is_verbal_predicate
                    if fields.framenet_id is not None:
                        pred_mention.framenet_id = fields.framenet_id

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                # initialise per-sentence argument tracking on the
                # first word row of each sentence
                if not verbal_pred_args:
                    current_pred_arg = [None] * len(
                        fields.predicate_labels)
                    verbal_pred_args = [[] for _ in fields.predicate_labels]

                # add predicate arguments
                self._process_pred_annotations(
                    pack,
                    fields.predicate_labels,
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # add coreference mentions
                self._process_coref_annotations(
                    pack,
                    fields.coreference,
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                words.append(fields.word)
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    continue

                # add predicate links in the sentence
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        link = PredicateLink(pack, predicate, arg[0])
                        link.arg_type = arg[1]

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # add sentence
                sent = Sentence(pack, sentence_begin, offset - 1)
                if speaker is not None:
                    sent.speaker = speaker
                if part_id is not None:
                    sent.part_id = int(part_id)

                sentence_begin = offset
                has_rows = False

        # group the coreference mentions in the whole document
        for _, mention_list in groups.items():
            group = CoreferenceGroup(pack)
            group.add_members(mention_list)

        text = " ".join(words)
        pack.set_text(text, replace_func=self.text_replace_operation)

        _ = Document(pack, 0, len(text))
        if document_id is not None:
            pack.pack_name = document_id

        yield pack
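
# `_parse_line` is not shown. A plausible sketch, assuming the
# CoNLL-2012 column layout that the older variant below indexes
# directly; `ParsedFields` and the function name are hypothetical,
# with field names chosen to mirror the attribute accesses above.
from dataclasses import dataclass, field


@dataclass
class ParsedFields:
    document_id: Optional[str] = None
    part_number: Optional[str] = None
    word: Optional[str] = None
    pos_tag: Optional[str] = None
    lemmatised_word: Optional[str] = None
    framenet_id: Optional[str] = None
    word_sense: Optional[str] = None
    speaker: Optional[str] = None
    entity_label: Optional[str] = None
    predicate_labels: List[str] = field(default_factory=list)
    coreference: Optional[str] = None


def parse_line(line: str) -> ParsedFields:
    parts = line.split()
    return ParsedFields(
        document_id=parts[0],
        part_number=parts[1],
        # parts[2] is the word number within the sentence, unused here
        word=parts[3],
        pos_tag=parts[4],
        # parts[5] is the parse bit, unused here
        lemmatised_word=parts[6],
        framenet_id=parts[7],
        word_sense=parts[8],
        speaker=parts[9],
        entity_label=parts[10],
        predicate_labels=parts[11:-1],
        coreference=parts[-1],
    )
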
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()

    with open(file_path, encoding="utf8") as doc:
        text = ""
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # auxiliary structures
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []
        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []
        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                conll_components = line.split()
                document_id = conll_components[0]
                part_id = int(conll_components[1])
                word = conll_components[3]
                pos_tag = conll_components[4]
                lemmatised_word = conll_components[6]
                framenet_id = conll_components[7]
                word_sense = conll_components[8]
                speaker = conll_components[9]
                entity_label = conll_components[10]
                pred_labels = conll_components[11:-1]

                word_begin = offset
                word_end = offset + len(word)

                # add tokens
                kwargs_i: Dict[str, Any] = {
                    "pos": pos_tag, "sense": word_sense}
                token = Token(pack, word_begin, word_end)
                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                # add entity mentions
                current_entity_mention = self._process_entity_annotations(
                    pack, entity_label, word_begin, word_end,
                    current_entity_mention
                )

                # add predicate mentions
                if lemmatised_word != "-":
                    word_is_verbal_predicate = any(
                        "(V" in x for x in pred_labels
                    )
                    kwargs_i = {
                        "framenet_id": framenet_id,
                        "pred_lemma": lemmatised_word,
                        "pred_type": "verb" if word_is_verbal_predicate
                        else "other",
                    }
                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.set_fields(**kwargs_i)
                    pred_mention = pack.add_or_get_entry(pred_mention)

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                if not verbal_pred_args:
                    current_pred_arg = [None for _ in pred_labels]
                    verbal_pred_args = [[] for _ in pred_labels]

                # add predicate arguments
                self._process_pred_annotations(
                    pack,
                    conll_components[11:-1],
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # add coreference mentions
                self._process_coref_annotations(
                    pack,
                    conll_components[-1],
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                text += word + " "
                offset = word_end + 1
                has_rows = True
            else:
                if not has_rows:
                    continue

                # add predicate links in the sentence
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        kwargs_i = {"arg_type": arg[1]}
                        link = PredicateLink(pack, predicate, arg[0])
                        link.set_fields(**kwargs_i)
                        pack.add_or_get_entry(link)

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # add sentence
                kwargs_i = {"speaker": speaker, "part_id": part_id}
                sent = Sentence(pack, sentence_begin, offset - 1)
                sent.set_fields(**kwargs_i)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                has_rows = False

        # group the coreference mentions in the whole document
        for _, mention_list in groups.items():
            # kwargs_i = {"coref_type": group_id}
            group = CoreferenceGroup(pack)
            # group.set_fields(**kwargs_i)
            group.add_members(mention_list)
            pack.add_or_get_entry(group)

        document = Document(pack, 0, len(text))
        pack.add_or_get_entry(document)

        kwargs_i = {"doc_id": document_id}
        pack.set_meta(**kwargs_i)
        pack.set_text(text, replace_func=self.text_replace_operation)

        yield pack
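
# Both OntoNotes readers above appear to come from Forte; this last
# variant uses the older set_fields / add_or_get_entry calls, while
# the earlier one assigns attributes directly and registers entries on
# construction. A hypothetical end-to-end sketch, assuming Forte's
# pipeline API and an invented data path:
from forte.data.readers import OntonotesReader
from forte.pipeline import Pipeline

pipeline = Pipeline[DataPack]()
pipeline.set_reader(OntonotesReader())
pipeline.initialize()

for data_pack in pipeline.process_dataset("data/ontonotes/sample"):
    for link in data_pack.get(PredicateLink):
        print(link.get_parent().text, "->",
              link.get_child().text, link.arg_type)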