def _process_parser(self, sentences, input_pack: DataPack):
    """Parse the sentence. Default behaviour is to segment sentence, POSTag
    and Lemmatize.

    Args:
        sentences: Generator object which yields sentences in document.
        input_pack: input pack which needs to be modified.

    Returns:

    """
    for sentence in sentences:
        Sentence(input_pack, sentence.start_char, sentence.end_char)

        if "tokenize" in self.processors:
            # Iterating through spaCy token objects
            for word in sentence:
                begin_pos_word = word.idx
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack, begin_pos_word, end_pos_word)

                if "pos" in self.processors:
                    token.pos = word.tag_

                if "lemma" in self.processors:
                    token.lemma = word.lemma_

def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = self.new_pack()
    doc = codecs.open(file_path, "r", encoding="utf8")

    text = ""
    offset = 0
    has_rows = False

    sentence_begin = 0
    sentence_cnt = 0

    for line in doc:
        line = line.strip()

        if line != "" and not line.startswith("#"):
            conll_components = line.split()

            word = conll_components[1]
            pos = conll_components[2]
            chunk_id = conll_components[3]
            ner_tag = conll_components[4]

            word_begin = offset
            word_end = offset + len(word)

            # Add tokens.
            token = Token(pack, word_begin, word_end)
            token.pos = pos
            token.chunk = chunk_id
            token.ner = ner_tag

            text += word + " "
            offset = word_end + 1
            has_rows = True
        else:
            if not has_rows:
                # Skip consecutive empty lines.
                continue
            # add sentence
            Sentence(pack, sentence_begin, offset - 1)

            sentence_begin = offset
            sentence_cnt += 1
            has_rows = False

    if has_rows:
        # Add the last sentence if exists.
        Sentence(pack, sentence_begin, offset - 1)
        sentence_cnt += 1

    pack.set_text(text, replace_func=self.text_replace_operation)
    Document(pack, 0, len(text))
    pack.pack_name = file_path
    doc.close()

    yield pack

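# A standalone sketch (not part of the reader above) of the running-offset
# pattern these CoNLL-style readers share: each word is appended to the pack
# text followed by a single space, and the offset advances by len(word) + 1,
# so the recorded (begin, end) span points exactly at the word in the final
# text. The sample data is illustrative only.
words = ["John", "loves", "Mary"]
text = ""
offset = 0
spans = []
for word in words:
    spans.append((offset, offset + len(word)))
    text += word + " "
    offset += len(word) + 1  # +1 accounts for the separating space

assert [text[b:e] for b, e in spans] == words
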
def _space_token(pack: DataPack):
    begin = 0
    for i, c in enumerate(pack.text):
        if c == ' ':
            pack.add_entry(Token(pack, begin, i))
            begin = i + 1

    if begin < len(pack.text):
        pack.add_entry(Token(pack, begin, len(pack.text)))

def _process(self, input_pack: DataPack):
    doc = input_pack.text

    if len(doc) == 0:
        logging.warning("Find empty text in doc.")

    # sentence parsing
    sentences = self.nlp(doc).sentences

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        Sentence(
            input_pack,
            sentence.tokens[0].start_char,
            sentence.tokens[-1].end_char,
        )

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                misc = word.misc.split("|")

                t_start = -1
                t_end = -1

                for m in misc:
                    k, v = m.split("=")
                    if k == "start_char":
                        t_start = int(v)
                    elif k == "end_char":
                        t_end = int(v)

                if t_start < 0 or t_end < 0:
                    raise ValueError(
                        "Cannot determine word start or end for "
                        "stanfordnlp."
                    )

                token = Token(input_pack, t_start, t_end)

                if "pos" in self.processors:
                    token.pos = word.pos
                    token.ud_xpos = word.xpos

                if "lemma" in self.processors:
                    token.lemma = word.lemma

                tokens.append(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.head - 1]  # head token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.rel_type = word.deprel

def _process(self, input_pack: DataPack):
    pattern = r"\s+"
    start = 0

    for m in re.finditer(pattern, input_pack.text):
        input_pack.add_entry(Token(input_pack, start, m.start()))
        start = m.end()

    if start < len(input_pack.text):
        input_pack.add_entry(
            Token(input_pack, start, len(input_pack.text)))

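# A minimal standalone sketch of the span arithmetic the regex tokenizer
# above relies on: each maximal run of whitespace ends one token span, and
# the next span starts where that run ends. The helper name and sample text
# are illustrative only, not part of the processor.
import re

def whitespace_spans(text):
    """Yield (begin, end) character spans of whitespace-separated chunks."""
    start = 0
    for m in re.finditer(r"\s+", text):
        yield start, m.start()
        start = m.end()
    if start < len(text):
        yield start, len(text)

sample = "The lazy  fox"
assert [sample[b:e] for b, e in whitespace_spans(sample)] == \
    ["The", "lazy", "fox"]
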
def _create_tokens(self, input_pack, sentence, result):
    words, pos = result['words'], result['pos']
    tokens = []
    offset = sentence.span.begin
    word_end = 0

    for i, word in enumerate(words):
        word_begin = sentence.text.find(word, word_end)
        word_end = word_begin + len(word)
        token = Token(input_pack, offset + word_begin, offset + word_end)
        if "pos" in self.configs.processors:
            token.pos = pos[i]
        tokens.append(token)

    return tokens

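# Several processors here (the one above, and the stanfordnlp / NLTK ones
# further down) recover character offsets for pre-tokenized words by searching
# the sentence text with a moving cursor, so repeated words match left to
# right instead of always hitting the first occurrence. A standalone sketch
# of that pattern; the helper and sample sentence are illustrative only.
def align_words(sentence_text, words):
    """Return (begin, end) offsets of each word within sentence_text."""
    spans = []
    search_from = 0
    for word in words:
        begin = sentence_text.find(word, search_from)
        end = begin + len(word)
        spans.append((begin, end))
        search_from = end  # advance the cursor past this word
    return spans

# "the" occurs twice; the cursor keeps the second "the" from matching index 0.
assert align_words("the cat saw the dog",
                   ["the", "cat", "saw", "the", "dog"]) == \
    [(0, 3), (4, 7), (8, 11), (12, 15), (16, 19)]
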
def _parse_pack(self, data: dict) -> Iterator[DataPack]:
    r"""Extracts information from input `data` of one document output from
    Prodigy Annotator including the text, tokens and its annotations into a
    DataPack.

    Args:
        data: a dict that contains information for one document.

    Returns: DataPack containing information extracted from `data`.
    """
    pack = DataPack()
    text = data['text']
    pack.set_text(text, replace_func=self.text_replace_operation)

    Document(pack, 0, len(text))

    tokens = data['tokens']
    spans = data['spans']
    for token in tokens:
        begin = token['start']
        end = token['end']
        Token(pack, begin, end)

    for span_items in spans:
        begin = span_items['start']
        end = span_items['end']
        annotation_entry = EntityMention(pack, begin, end)
        annotation_entry.ner_type = span_items['label']

    pack.meta.doc_id = data['meta']['id']
    yield pack

def _parse_pack(self, data: dict) -> Iterator[DataPack]:
    r"""Extracts information from input `data` of one document output from
    Prodigy Annotator including the text, tokens and its annotations into a
    DataPack.

    Args:
        data: a dict that contains information for one document.

    Returns: DataPack containing information extracted from `data`.
    """
    pack = DataPack()
    text = data["text"]
    pack.set_text(text, replace_func=self.text_replace_operation)

    Document(pack, 0, len(text))

    tokens = data["tokens"]
    spans = data["spans"]
    for token in tokens:
        begin = token["start"]
        end = token["end"]
        Token(pack, begin, end)

    for span_items in spans:
        begin = span_items["start"]
        end = span_items["end"]
        annotation_entry = EntityMention(pack, begin, end)
        annotation_entry.ner_type = span_items["label"]

    pack.pack_name = data["meta"]["id"]
    yield pack

def test_multi_pack_copy_link_or_group(self):
    processor = ReplacementDataAugmentProcessor()
    m_pack = MultiPack()
    src_pack = m_pack.add_pack("src")
    tgt_pack = m_pack.add_pack("tgt")

    src_pack.set_text("input")
    tgt_pack.set_text("output")
    src_token = src_pack.add_entry(Token(src_pack, 0, len(src_pack.text)))
    tgt_token = tgt_pack.add_entry(Token(tgt_pack, 0, len(tgt_pack.text)))

    mpl = m_pack.add_entry(MultiPackLink(m_pack, src_token, tgt_token))
    # The MultiPackLink should not be copied, because its children are
    # not copied.
    self.assertEqual(
        processor._copy_multi_pack_link_or_group(mpl, m_pack), False)

    new_src_pack = processor._auto_align_annotations(src_pack, [])
    self.assertEqual(len(list(new_src_pack.get(Token))), 1)

def test_replace(self):
    data_pack = DataPack()
    data_pack.set_text("auxiliary colleague apple")
    token_1 = Token(data_pack, 0, 9)
    token_2 = Token(data_pack, 10, 19)
    token_3 = Token(data_pack, 20, 25)
    data_pack.add_entry(token_1)
    data_pack.add_entry(token_2)
    data_pack.add_entry(token_3)

    self.assertIn(
        self.tyre.replace(token_1)[1],
        ["auxilliary", "auxilary", "auxillary"],
    )
    self.assertIn(self.tyre.replace(token_2)[1], ["collegue", "colleaque"])
    self.assertIn(self.tyre.replace(token_3)[1], ["apple"])

def test_replace(self):
    data_pack = DataPack()
    data_pack.set_text("google")
    token_1 = Token(data_pack, 0, 6)
    data_pack.add_entry(token_1)

    is_replace, replaced_token = self.esa.replace(token_1)
    self.assertTrue(is_replace)
    self.assertIn(
        replaced_token, ["yahoo", "aol", "microsoft", "web", "internet"])

def setUp(self):
    data_pack = DataPack()
    self.word = "eat"
    data_pack.set_text(self.word)
    self.token = Token(data_pack, 0, 3)
    data_pack.add_all_remaining_entries()

    self.word_list = ["apple", "banana", "orange"]
    self.sampler = UniformSampler(self.word_list)

def _process(self, input_pack: DataPack):
    doc = input_pack.text
    end_pos = 0

    # sentence parsing
    sentences = self.nlp(doc).sentences  # type: ignore

    # Iterating through stanfordnlp sentence objects
    for sentence in sentences:
        begin_pos = doc.find(sentence.words[0].text, end_pos)
        end_pos = doc.find(sentence.words[-1].text, begin_pos) + len(
            sentence.words[-1].text)
        sentence_entry = Sentence(input_pack, begin_pos, end_pos)

        tokens: List[Token] = []
        if "tokenize" in self.processors:
            offset = sentence_entry.span.begin
            end_pos_word = 0

            # Iterating through stanfordnlp word objects
            for word in sentence.words:
                begin_pos_word = sentence_entry.text. \
                    find(word.text, end_pos_word)
                end_pos_word = begin_pos_word + len(word.text)
                token = Token(input_pack,
                              begin_pos_word + offset,
                              end_pos_word + offset)

                if "pos" in self.processors:
                    token.pos = word.pos
                    token.ud_xpos = word.xpos

                if "lemma" in self.processors:
                    token.lemma = word.lemma

                tokens.append(token)

        # For each sentence, get the dependency relations among tokens
        if "depparse" in self.processors:
            # Iterating through token entries in current sentence
            for token, word in zip(tokens, sentence.words):
                child = token  # current token
                parent = tokens[word.governor - 1]  # head (governor) token
                relation_entry = Dependency(input_pack, parent, child)
                relation_entry.rel_type = word.dependency_relation

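# In the Universal Dependencies convention that stanfordnlp follows,
# word.head / word.governor is a 1-based index into the sentence's word list,
# with 0 reserved for the artificial root; that is why the processors above
# look up the parent with tokens[word.head - 1]. A tiny standalone
# illustration with made-up data:
heads = [2, 0, 2]  # "dogs barked loudly": "barked" is the root
words = ["dogs", "barked", "loudly"]

for child_idx, head in enumerate(heads):
    if head == 0:
        print(f"{words[child_idx]} <- ROOT")
    else:
        # head is 1-based, so subtract 1 to index the parent word
        print(f"{words[child_idx]} <- {words[head - 1]}")
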
def test_replace(self):
    random.seed(42)
    data_pack = DataPack()
    test_string = "The lazy fox jumped over the fence"
    test_result = "T/-/3 lazy f0>< jumpe|) oveI2 th3 fe^ce"
    data_pack.set_text(test_string)
    token_1 = Token(data_pack, 0, len(test_string))
    data_pack.add_entry(token_1)

    self.assertIn(self.test.replace(token_1)[1], test_result)

def _process(self, input_pack: DataPack):
    for sentence in input_pack.get(entry_type=Sentence,
                                   component=self.sentence_component):
        offset = sentence.span.begin
        end_pos = 0
        for word in word_tokenize(sentence.text):
            begin_pos = sentence.text.find(word, end_pos)
            end_pos = begin_pos + len(word)
            token = Token(input_pack, begin_pos + offset, end_pos + offset)
            input_pack.add_or_get_entry(token)

def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack: DataPack = DataPack()
    text: str = ""
    offset: int = 0

    with open(file_path, "r", encoding="utf8") as f:
        for line in f:
            line = line.strip()

            if line != "":
                oie_component: List[str] = line.split("\t")

                sentence: str = oie_component[0]
                # Add sentence.
                Sentence(pack, offset, offset + len(sentence))
                offset += len(sentence) + 1
                text += sentence + " "

                head_predicate: str = oie_component[1]
                full_predicate: str = oie_component[2]

                # Add head predicate.
                token: Token = Token(pack,
                                     offset,
                                     offset + len(head_predicate))
                offset += len(head_predicate) + 1
                text += head_predicate + " "

                # Add full predicate.
                predicate_mention: PredicateMention = \
                    PredicateMention(pack,
                                     offset,
                                     offset + len(full_predicate))
                predicate_mention.headword = token
                offset += len(full_predicate) + 1
                text += full_predicate + " "

                for arg in oie_component[3:]:
                    # Add predicate argument.
                    predicate_arg: PredicateArgument = \
                        PredicateArgument(pack,
                                          offset,
                                          offset + len(arg))
                    offset += len(arg) + 1
                    text += arg + " "

                    # Add predicate link.
                    PredicateLink(pack, predicate_mention, predicate_arg)

    pack.set_text(text, replace_func=self.text_replace_operation)
    Document(pack, 0, len(text))
    pack.pack_name = file_path
    yield pack

def setUp(self):
    data_pack = DataPack()
    self.word = "eat"
    data_pack.set_text(self.word)
    self.token = Token(data_pack, 0, 3)
    data_pack.add_all_remaining_entries()

    self.word_list = ["apple", "banana", "orange"]
    self.word_dict = {
        "apple": 1,
        "banana": 2,
        "mango": 3,
    }

def test_segmenter(self):
    data_pack = DataPack()
    data_pack.set_text("eat phone")
    token_1 = Token(data_pack, 0, 3)
    token_2 = Token(data_pack, 4, 9)
    token_1.pos = "VB"
    token_2.pos = None
    data_pack.add_entry(token_1)
    data_pack.add_entry(token_2)

    self.assertIn(
        self.dra.replace(token_1)[1],
        [
            "eat", "feed", "eat on", "consume", "eat up", "use up",
            "deplete", "exhaust", "run through", "wipe out", "corrode",
            "rust",
        ],
    )
    self.assertIn(
        self.dra.replace(token_2)[1],
        [
            "telephone", "phone", "telephone set", "speech sound",
            "sound", "earphone", "earpiece", "headphone", "call",
            "telephone", "call up", "ring",
        ],
    )

def _process(self, input_pack: DataPack):
    for begin, end in self.tokenizer.span_tokenize(input_pack.text):
        Token(input_pack, begin, end)

def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
    # pylint: disable=no-self-use
    token_comp_fields = [
        "id", "form", "lemma", "pos", "ud_xpos", "features", "head",
        "label", "enhanced_dependency_relations", "ud_misc"
    ]

    token_multi_fields = [
        "features", "ud_misc", "enhanced_dependency_relations"
    ]

    token_feature_fields = ["features", "ud_misc"]

    token_entry_fields = ["lemma", "pos", "ud_xpos", "features", "ud_misc"]

    data_pack: DataPack = DataPack()
    doc_sent_begin: int = 0
    doc_num_sent: int = 0
    doc_text: str = ''
    doc_offset: int = 0
    doc_id: str

    sent_text: str
    sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

    for line in doc_lines:
        line = line.strip()
        line_comps = line.split()

        if line.startswith("# newdoc"):
            doc_id = line.split("=")[1].strip()

        elif line.startswith("# sent"):
            sent_text = ''

        elif len(line_comps) > 0 and \
                line_comps[0].strip().isdigit():
            # token
            token_comps: Dict[str, Any] = {}

            for index, key in enumerate(token_comp_fields):
                token_comps[key] = str(line_comps[index])

                if key in token_multi_fields:
                    values = str(token_comps[key]).split("|") \
                        if token_comps[key] != '_' else []
                    if key not in token_feature_fields:
                        token_comps[key] = values
                    else:
                        feature_lst = [
                            elem.split('=', 1) for elem in values
                        ]
                        feature_dict = {
                            elem[0]: elem[1] for elem in feature_lst
                        }
                        token_comps[key] = feature_dict

            word: str = token_comps["form"]
            word_begin = doc_offset
            word_end = doc_offset + len(word)

            token: Token \
                = Token(data_pack, word_begin, word_end)
            kwargs = {key: token_comps[key]
                      for key in token_entry_fields}

            # add token
            token.set_fields(**kwargs)
            data_pack.add_or_get_entry(token)

            sent_tokens[str(token_comps["id"])] = (token_comps, token)

            sent_text += word + " "
            doc_offset = word_end + 1

        elif line == "":
            # sentence ends
            sent_text = sent_text.strip()
            doc_text += ' ' + sent_text

            # add dependencies for a sentence when all the tokens have
            # been added
            for token_id in sent_tokens:
                token_comps, token = sent_tokens[token_id]

                def add_dependency(dep_parent, dep_child, dep_label,
                                   dep_type, data_pack_):
                    """Adds dependency to a data_pack

                    Args:
                        dep_parent: dependency parent token
                        dep_child: dependency child token
                        dep_label: dependency label
                        dep_type: "primary" or "enhanced" dependency
                        data_pack_: data_pack to which the dependency is
                            to be added
                    """
                    dependency = Dependency(data_pack, dep_parent,
                                            dep_child)
                    dependency.dep_label = dep_label
                    dependency.type = dep_type
                    data_pack_.add_or_get_entry(dependency)

                # add primary dependency
                label = token_comps["label"]

                if label == "root":
                    token.is_root = True
                else:
                    token.is_root = False
                    head = sent_tokens[token_comps["head"]][1]
                    add_dependency(head, token, label, "primary",
                                   data_pack)

                # add enhanced dependencies
                for dep in token_comps["enhanced_dependency_relations"]:
                    head_id, label = dep.split(":", 1)
                    if label != "root":
                        head = sent_tokens[head_id][1]
                        add_dependency(head, token, label, "enhanced",
                                       data_pack)

            # add sentence
            sent = Sentence(data_pack, doc_sent_begin, doc_offset - 1)
            data_pack.add_or_get_entry(sent)

            doc_sent_begin = doc_offset
            doc_num_sent += 1

    # add doc to data_pack
    document = Document(data_pack, 0, len(doc_text))
    data_pack.add_or_get_entry(document)
    data_pack.meta.doc_id = doc_id

    data_pack.set_text(doc_text.strip())

    yield data_pack

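# Both CoNLL-U readers in this file unpack multi-valued columns such as FEATS
# and MISC, where "Case=Nom|Number=Sing" packs several key=value pairs into
# one field and "_" means empty. A standalone sketch of that unpacking; the
# helper name is illustrative only.
def parse_feats(field):
    """Turn a CoNLL-U FEATS/MISC field into a dict ('_' becomes empty)."""
    if field == "_":
        return {}
    return dict(item.split("=", 1) for item in field.split("|"))

assert parse_feats("Case=Nom|Number=Sing") == {"Case": "Nom",
                                               "Number": "Sing"}
assert parse_feats("_") == {}
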
def _parse_pack(self, collection: str) -> Iterator[DataPack]:
    with open(collection, "r", encoding="utf8") as doc:
        pack_id: int = 0

        pack: DataPack = DataPack()
        text: str = ""
        offset: int = 0
        has_rows: bool = False

        sentence_begin: int = 0
        sentence_cnt: int = 0

        # NER tag is either "O" or in the format "X-Y",
        # where X is one of B, I,
        # Y is a tag like ORG, PER etc
        prev_y = None
        prev_x = None
        start_index = -1

        for line in doc:
            line = line.strip()

            if line.find("DOCSTART") != -1:
                # Skip the first DOCSTART.
                if offset == 0:
                    continue
                # Add remaining sentence.
                if has_rows:
                    # Add the last sentence if exists.
                    Sentence(pack, sentence_begin, offset - 1)
                    sentence_cnt += 1

                pack.set_text(
                    text, replace_func=self.text_replace_operation)
                Document(pack, 0, len(text))
                pack.pack_name = collection + "_%d" % pack_id
                pack_id += 1
                yield pack

                # Create a new datapack.
                pack = DataPack()
                text = ""
                offset = 0
                has_rows = False

                sentence_begin = 0
                sentence_cnt = 0

                prev_y = None
                prev_x = None
                start_index = -1

            elif line != "" and not line.startswith("#"):
                conll_components = line.split()

                word = conll_components[0]
                pos = conll_components[1]
                chunk_id = conll_components[2]
                ner_tag = conll_components[3]

                # A new ner tag occurs.
                if ner_tag == "O" or ner_tag.split("-")[0] == "B":
                    # Add previous ner tag to sentence if it exists.
                    if prev_y is not None:
                        entity_mention = EntityMention(
                            pack, start_index, offset - 1)
                        entity_mention.ner_type = prev_y

                    # Start process current ner tag.
                    if ner_tag == "O":
                        # Current ner tag is O, reset information.
                        prev_x = None
                        prev_y = None
                        start_index = -1
                    else:
                        # Current ner tag is B.
                        prev_x = "B"
                        prev_y = ner_tag.split("-")[1]
                        start_index = offset
                # This ner tag is connected to previous one.
                else:
                    x, y = ner_tag.split("-")
                    assert x == "I", "Unseen tag %s in the file." % x
                    assert y == prev_y, "Error in %s." % ner_tag
                    assert prev_x in ("B", "I"), "Error in %s." % ner_tag
                    prev_x = "I"

                word_begin = offset
                word_end = offset + len(word)

                # Add tokens.
                token = Token(pack, word_begin, word_end)
                token.pos = pos
                token.chunk = chunk_id

                text += word + " "
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    # Skip consecutive empty lines.
                    continue
                # Add sentence
                Sentence(pack, sentence_begin, offset - 1)

                # Handle the last ner tag if exists.
                if prev_x is not None:
                    entity_mention = EntityMention(
                        pack, start_index, offset - 1)
                    entity_mention.ner_type = prev_y

                # Reset information.
                sentence_cnt += 1
                has_rows = False
                prev_y = None
                prev_x = None
                sentence_begin = offset

        if has_rows:
            # Add the last sentence if exists.
            Sentence(pack, sentence_begin, offset - 1)
            sentence_cnt += 1

        pack.set_text(text, replace_func=self.text_replace_operation)
        Document(pack, 0, len(text))
        pack.pack_name = os.path.basename(collection)

        yield pack

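# A standalone sketch of the BIO bookkeeping the reader above performs while
# walking the NER column: "B-X" opens an entity, "I-X" extends it, and "O"
# (or a new "B-") closes the one currently open. The tag sequence below is
# made up for illustration; the reader itself tracks character offsets rather
# than token indices.
def bio_to_spans(tags):
    """Convert a list of BIO tags to (start_token, end_token, type) spans."""
    spans = []
    start, current = None, None
    for i, tag in enumerate(tags):
        if tag == "O" or tag.startswith("B-"):
            if current is not None:  # close the currently open entity
                spans.append((start, i, current))
                start, current = None, None
            if tag.startswith("B-"):  # open a new entity
                start, current = i, tag[2:]
        # an "I-" tag simply keeps the current entity open
    if current is not None:  # close an entity that runs to the end
        spans.append((start, len(tags), current))
    return spans

assert bio_to_spans(["B-PER", "I-PER", "O", "B-ORG"]) == \
    [(0, 2, "PER"), (3, 4, "ORG")]
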
def _parse_pack(self, doc_lines) -> Iterator[DataPack]:
    token_comp_fields = ["id", "form", "lemma", "pos",
                         "ud_xpos", "ud_features", "head", "label",
                         "enhanced_dependency_relations", "ud_misc"]

    token_multi_fields = ["ud_features", "ud_misc",
                          "enhanced_dependency_relations"]

    token_feature_fields = ["ud_features", "ud_misc"]

    data_pack: DataPack = DataPack()
    doc_sent_begin: int = 0
    doc_num_sent: int = 0
    doc_text: str = ''
    doc_offset: int = 0
    doc_id: str

    sent_text: str
    sent_tokens: Dict[str, Tuple[Dict[str, Any], Token]] = {}

    for line in doc_lines:
        line = line.strip()
        line_comps = line.split()

        if line.startswith("# newdoc"):
            doc_id = line.split("=")[1].strip()

        elif line.startswith("# sent"):
            sent_text = ''

        elif len(line_comps) > 0 and \
                line_comps[0].strip().isdigit():
            # token
            token_comps: Dict[str, Any] = {}

            for index, key in enumerate(token_comp_fields):
                token_comps[key] = str(line_comps[index])

                if key in token_multi_fields:
                    values = str(token_comps[key]).split("|") \
                        if token_comps[key] != '_' else []
                    if key not in token_feature_fields:
                        token_comps[key] = values
                    else:
                        feature_lst = [elem.split('=', 1)
                                       for elem in values]
                        feature_dict = {elem[0]: elem[1]
                                        for elem in feature_lst}
                        token_comps[key] = feature_dict

            word: str = token_comps["form"]
            word_begin = doc_offset
            word_end = doc_offset + len(word)

            # add token
            token: Token = Token(data_pack, word_begin, word_end)
            token.lemma = token_comps['lemma']
            token.pos = token_comps['pos']
            token.ud_xpos = token_comps['ud_xpos']
            token.ud_features = token_comps['ud_features']
            token.ud_misc = token_comps['ud_misc']

            sent_tokens[str(token_comps["id"])] = (token_comps, token)

            sent_text += word + " "
            doc_offset = word_end + 1

        elif line == "":
            # sentence ends
            sent_text = sent_text.strip()
            doc_text += ' ' + sent_text

            # add dependencies for a sentence when all the tokens have
            # been added
            for token_id in sent_tokens:
                token_comps, token = sent_tokens[token_id]

                # add primary dependency
                label = token_comps["label"]

                if label == "root":
                    token.is_root = True
                else:
                    token.is_root = False
                    head = sent_tokens[token_comps["head"]][1]
                    dependency = Dependency(data_pack, head, token)
                    dependency.dep_label = label

                # add enhanced dependencies
                for dep in token_comps["enhanced_dependency_relations"]:
                    head_id, label = dep.split(":", 1)
                    if label != "root":
                        head = sent_tokens[head_id][1]
                        enhanced_dependency = \
                            EnhancedDependency(data_pack, head, token)
                        enhanced_dependency.dep_label = label

            # add sentence
            Sentence(data_pack, doc_sent_begin, doc_offset - 1)

            doc_sent_begin = doc_offset
            doc_num_sent += 1

    doc_text = doc_text.strip()
    data_pack.set_text(doc_text)

    # add doc to data_pack
    Document(data_pack, 0, len(doc_text))
    data_pack.pack_name = doc_id

    yield data_pack

def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()

    with open(file_path, encoding="utf8") as doc:
        text = ""
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # auxiliary structures
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []

        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                conll_components = line.split()

                document_id = conll_components[0]
                part_id = int(conll_components[1])
                word = conll_components[3]
                pos_tag = conll_components[4]
                lemmatised_word = conll_components[6]
                framenet_id = conll_components[7]
                word_sense = conll_components[8]
                speaker = conll_components[9]
                entity_label = conll_components[10]
                pred_labels = conll_components[11:-1]

                word_begin = offset
                word_end = offset + len(word)

                # add tokens
                kwargs_i: Dict[str, Any] = {"pos": pos_tag,
                                            "sense": word_sense}
                token = Token(pack, word_begin, word_end)
                token.set_fields(**kwargs_i)
                pack.add_or_get_entry(token)

                # add entity mentions
                current_entity_mention = self._process_entity_annotations(
                    pack, entity_label, word_begin, word_end,
                    current_entity_mention
                )

                # add predicate mentions
                if lemmatised_word != "-":
                    word_is_verbal_predicate = any(
                        ["(V" in x for x in pred_labels]
                    )
                    kwargs_i = {
                        "framenet_id": framenet_id,
                        "pred_lemma": lemmatised_word,
                        "pred_type": "verb" if word_is_verbal_predicate
                        else "other"
                    }
                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.set_fields(**kwargs_i)
                    pred_mention = pack.add_or_get_entry(
                        pred_mention
                    )

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                if not verbal_pred_args:
                    current_pred_arg = [None for _ in pred_labels]
                    verbal_pred_args = [[] for _ in pred_labels]

                # add predicate arguments
                self._process_pred_annotations(
                    pack,
                    conll_components[11:-1],
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # add coreference mentions
                self._process_coref_annotations(
                    pack,
                    conll_components[-1],
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                text += word + " "
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    continue

                # add predicate links in the sentence
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        kwargs_i = {
                            "arg_type": arg[1],
                        }
                        link = PredicateLink(pack, predicate, arg[0])
                        link.set_fields(**kwargs_i)
                        pack.add_or_get_entry(link)

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # add sentence
                kwargs_i = {"speaker": speaker, "part_id": part_id}
                sent = Sentence(pack, sentence_begin, offset - 1)
                sent.set_fields(**kwargs_i)
                pack.add_or_get_entry(sent)

                sentence_begin = offset
                has_rows = False

        # group the coreference mentions in the whole document
        for _, mention_list in groups.items():
            # kwargs_i = {"coref_type": group_id}
            group = CoreferenceGroup(pack)
            # group.set_fields(**kwargs_i)
            group.add_members(mention_list)
            pack.add_or_get_entry(group)

        document = Document(pack, 0, len(text))
        pack.add_or_get_entry(document)

        kwargs_i = {"doc_id": document_id}
        pack.set_meta(**kwargs_i)
        pack.set_text(text, replace_func=self.text_replace_operation)

    yield pack

def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = self.new_pack()

    with open(file_path, encoding="utf8") as doc:
        words = []
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # auxiliary structures
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []

        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                fields = self._parse_line(line)
                speaker = fields.speaker
                if fields.part_number is not None:
                    part_id = int(fields.part_number)
                document_id = fields.document_id

                assert fields.word is not None
                word_begin = offset
                word_end = offset + len(fields.word)

                # add tokens
                token = Token(pack, word_begin, word_end)

                if fields.pos_tag is not None:
                    token.pos = fields.pos_tag
                if fields.word_sense is not None:
                    token.sense = fields.word_sense

                # add entity mentions
                current_entity_mention = self._process_entity_annotations(
                    pack,
                    fields.entity_label,
                    word_begin,
                    word_end,
                    current_entity_mention,
                )

                # add predicate mentions
                if (fields.lemmatised_word is not None
                        and fields.lemmatised_word != "-"):
                    word_is_verbal_predicate = any(
                        "(V" in x for x in fields.predicate_labels)

                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.predicate_lemma = fields.lemmatised_word
                    pred_mention.is_verb = word_is_verbal_predicate

                    if fields.framenet_id is not None:
                        pred_mention.framenet_id = fields.framenet_id

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                if not verbal_pred_args:
                    current_pred_arg = [None] * len(
                        fields.predicate_labels)
                    verbal_pred_args = [[] for _
                                        in fields.predicate_labels]

                # add predicate arguments
                self._process_pred_annotations(
                    pack,
                    fields.predicate_labels,
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # add coreference mentions
                self._process_coref_annotations(
                    pack,
                    fields.coreference,
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                words.append(fields.word)
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    continue

                # add predicate links in the sentence
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        link = PredicateLink(pack, predicate, arg[0])
                        link.arg_type = arg[1]

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # add sentence
                sent = Sentence(pack, sentence_begin, offset - 1)
                if speaker is not None:
                    sent.speaker = speaker
                if part_id is not None:
                    sent.part_id = int(part_id)

                sentence_begin = offset
                has_rows = False

        # group the coreference mentions in the whole document
        for _, mention_list in groups.items():
            group = CoreferenceGroup(pack)
            group.add_members(mention_list)

        text = " ".join(words)
        pack.set_text(text, replace_func=self.text_replace_operation)

        _ = Document(pack, 0, len(text))
        if document_id is not None:
            pack.pack_name = document_id

    yield pack

def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()
    doc = codecs.open(file_path, "r", encoding="utf8")

    text = ""
    offset = 0
    has_rows = False

    sentence_begin = 0
    sentence_cnt = 0

    for line in doc:
        line = line.strip()

        if line != "" and not line.startswith("#"):
            conll_components = line.split()

            word = conll_components[1]
            pos = conll_components[2]
            chunk_id = conll_components[3]
            ner_tag = conll_components[4]

            word_begin = offset
            word_end = offset + len(word)

            # Add tokens.
            kwargs_i = {"pos": pos, "chunk": chunk_id, "ner": ner_tag}
            token = Token(pack, word_begin, word_end)
            token.set_fields(**kwargs_i)
            pack.add_or_get_entry(token)

            text += word + " "
            offset = word_end + 1
            has_rows = True
        else:
            if not has_rows:
                # Skip consecutive empty lines.
                continue
            # add sentence
            sent = Sentence(pack, sentence_begin, offset - 1)
            pack.add_or_get_entry(sent)

            sentence_begin = offset
            sentence_cnt += 1
            has_rows = False

    if has_rows:
        # Add the last sentence if exists.
        sent = Sentence(pack, sentence_begin, offset - 1)
        sentence_cnt += 1
        pack.add_or_get_entry(sent)

    document = Document(pack, 0, len(text))
    pack.add_or_get_entry(document)

    pack.set_text(text, replace_func=self.text_replace_operation)
    pack.meta.doc_id = file_path
    doc.close()

    yield pack
