def _parse_pack(self, data_source: str) -> Iterator[MultiPack]: """ Takes a raw string and converts into a MultiPack Args: data_source: str that contains text of a document Returns: MultiPack containing a datapack for the current query """ multi_pack = MultiPack() # use context to build the query if self.resource.get("user_utterance"): user_pack = self.resource.get("user_utterance")[-1] multi_pack.update_pack({"user_utterance": user_pack}) if self.resource.get("bot_utterance"): bot_pack = self.resource.get("bot_utterance")[-1] multi_pack.update_pack({"bot_utterance": bot_pack}) pack = DataPack() utterance = Utterance(pack, 0, len(data_source)) pack.add_entry(utterance) pack.set_text(data_source, replace_func=self.text_replace_operation) multi_pack.update_pack({self.config.pack_name: pack}) yield multi_pack
def _process_entity_annotations(
        self,
        pack: DataPack,
        label: Optional[str],
        word_begin: int,
        word_end: int,
        current_entity_mention: Optional[Tuple[int, str]],
) -> Optional[Tuple[int, str]]:
    if label is None:
        return None

    ner_type = label.strip("()*")

    if "(" in label:
        # Entering a span for a particular NER type.
        current_entity_mention = (word_begin, ner_type)
    if ")" in label:
        if current_entity_mention is None:
            raise ValueError(
                "current_entity_mention is None when a right bracket "
                "is met.")
        # Exiting a span: add the mention, then reset the current span.
        kwargs_i = {"ner_type": current_entity_mention[1]}
        entity = EntityMention(pack, current_entity_mention[0], word_end)
        entity.set_fields(**kwargs_i)
        pack.add_entry(entity)
        current_entity_mention = None

    return current_entity_mention
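# The bracketed label shapes handled above follow the CoNLL convention;
# the entity types below are examples, not from the source.
_EXAMPLE_ENTITY_LABELS = [
    "(PERSON*",  # a PERSON mention opens at this word
    "*",         # the current mention continues (no bracket, no action)
    "*)",        # the current mention closes at this word
    "(ORG)",     # a single-word ORG mention opens and closes here
]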
def add_wiki_info(self, pack: DataPack, statements: List[state_type]):
    for _, _, o in statements:
        resource_name = get_resource_name(o)
        if resource_name is not None:
            wc = WikiCategory(pack)
            wc.values.append(resource_name)
            pack.add_entry(wc)
def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:
    m_pack: MultiPack = MultiPack()

    input_pack_name = self.config.input_pack_name
    output_pack_name = self.config.output_pack_name

    text = ""
    offset = 0
    with open(file_path, "r", encoding="utf8") as doc:
        input_pack = DataPack(doc_id=file_path)

        for line in doc:
            line = line.strip()
            if len(line) == 0:
                continue

            # add sentence
            sent = Sentence(input_pack, offset, offset + len(line))
            input_pack.add_entry(sent)
            text += line + '\n'
            offset = offset + len(line) + 1

        input_pack.set_text(text, replace_func=self.text_replace_operation)

        output_pack = DataPack()

        m_pack.update_pack({
            input_pack_name: input_pack,
            output_pack_name: output_pack
        })

        yield m_pack
def pack(self, data_pack: DataPack, output_dict: Optional[Dict] = None):
    entries = list(data_pack.get_entries_by_type(NewType))
    if len(entries) == 0:
        entry = NewType(pack=data_pack, value="[BATCH]")
        data_pack.add_entry(entry)
    else:
        entry = entries[0]  # type: ignore
        entry.value += "[BATCH]"
def _process(self, input_pack: DataPack):
    entries = list(input_pack.get_entries_by_type(NewType))
    if len(entries) == 0:
        entry = NewType(pack=input_pack, value="[PACK]")
        input_pack.add_entry(entry)
    else:
        entry = entries[0]  # type: ignore
        entry.value += "[PACK]"
def test_replace(self): data_pack = DataPack() data_pack.set_text("google") token_1 = Token(data_pack, 0, 6) data_pack.add_entry(token_1) is_replace, replaced_token = self.esa.replace(token_1) self.assertTrue(is_replace) self.assertIn(replaced_token, ["yahoo", "aol", "microsoft", "web", "internet"])
def _parse_pack(self, data_source: str) -> Iterator[MultiPack]:
    fields = data_source.split("\t")
    data_pack = DataPack(doc_id=fields[0])
    multi_pack = MultiPack()

    document = Document(pack=data_pack, begin=0, end=len(fields[1]))
    data_pack.add_entry(document)
    data_pack.set_text(fields[1])

    multi_pack.update_pack({self.config.pack_name: data_pack})
    yield multi_pack
def _space_token(pack: DataPack):
    begin = 0
    for i, c in enumerate(pack.text):
        if c == ' ':
            pack.add_entry(Token(pack, begin, i))
            begin = i + 1

    if begin < len(pack.text):
        pack.add_entry(Token(pack, begin, len(pack.text)))
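# A minimal usage sketch for _space_token (not from the source); it reuses
# DataPack, Token and get_entries_by_type as they appear elsewhere in this
# file. The example text is arbitrary.
def _space_token_demo():
    pack = DataPack()
    pack.set_text("forte makes pipelines simple")
    _space_token(pack)
    # One Token per whitespace-separated word is expected:
    # "forte", "makes", "pipelines", "simple".
    return list(pack.get_entries_by_type(Token))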
def test_back_translation(self):
    random.seed(0)
    data_pack = DataPack()
    text = "Natural Language Processing has never been made this simple!"
    data_pack.set_text(text)
    sent = Sentence(data_pack, 0, len(text))
    data_pack.add_entry(sent)
    translated_text = \
        "The treatment of natural language has never been easier!"
    self.assertEqual(translated_text, self.bta.replace(sent)[1])
def test_replace(self):
    random.seed(42)
    data_pack = DataPack()
    test_string = "The lazy fox jumped over the fence"
    test_result = "T/-/3 lazy f0>< jumpe|) oveI2 th3 fe^ce"
    data_pack.set_text(test_string)
    token_1 = Token(data_pack, 0, len(test_string))
    data_pack.add_entry(token_1)
    self.assertEqual(self.test.replace(token_1)[1], test_result)
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    with open(file_path, "r", encoding="utf8") as doc:
        for line in doc:
            pack = DataPack(doc_id=file_path)
            line = line.strip()
            if len(line) == 0:
                continue

            sent = Sentence(pack, 0, len(line))
            pack.add_entry(sent)
            pack.set_text(line)
            self.count += 1
            yield pack
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:  # type: ignore
    with open(file_path, "r", encoding="utf8") as doc:
        for line in doc:
            m_pack = MultiPack()
            pack = DataPack(doc_id=file_path)
            line = line.strip()
            if len(line) == 0:
                continue

            sent = Sentence(pack, 0, len(line))
            pack.add_entry(sent)
            pack.set_text(line)
            self.count += 1
            m_pack.update_pack({"pack": pack})
            yield m_pack  # type: ignore
def _process_coref_annotations(
        self,
        pack: DataPack,
        label: Optional[str],
        word_begin: int,
        word_end: int,
        coref_stacks: DefaultDict[int, List[int]],
        groups: DefaultDict[int, List[EntityMention]],
) -> None:
    if label is None or label == "-":
        return

    for segment in label.split("|"):
        # The CoNLL representation of coref spans allows spans to overlap.
        if segment[0] == "(":
            if segment[-1] == ")":
                # The span begins and ends at this word (single-word span).
                group_id = int(segment[1:-1])
                coref_mention = EntityMention(pack, word_begin, word_end)
                coref_mention = pack.add_entry(coref_mention)
                groups[group_id].append(coref_mention)
            else:
                # The span is starting, so we record the index of the word.
                group_id = int(segment[1:])
                coref_stacks[group_id].append(word_begin)
        else:
            # The span for this id is ending, but it did not start at
            # this word.
            group_id = int(segment[:-1])
            start = coref_stacks[group_id].pop()
            coref_mention = EntityMention(pack, start, word_end)
            coref_mention = pack.add_or_get_entry(coref_mention)
            groups[group_id].append(coref_mention)
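# Coreference label shapes handled above, following the CoNLL convention;
# the cluster ids below are invented for illustration.
_EXAMPLE_COREF_LABELS = [
    "(42",    # cluster 42 opens here: word_begin is pushed on its stack
    "42)",    # cluster 42 closes here: the start is popped, a mention made
    "(7)",    # a single-word mention for cluster 7
    "(1|8)",  # two segments on one word: cluster 1 opens, cluster 8 closes
]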
def test_replace(self): data_pack = DataPack() data_pack.set_text("auxiliary colleague apple") token_1 = Token(data_pack, 0, 9) token_2 = Token(data_pack, 10, 19) token_3 = Token(data_pack, 20, 25) data_pack.add_entry(token_1) data_pack.add_entry(token_2) data_pack.add_entry(token_3) self.assertIn( self.tyre.replace(token_1)[1], ["auxilliary", "auxilary", "auxillary"], ) self.assertIn(self.tyre.replace(token_2)[1], ["collegue", "colleaque"]) self.assertIn(self.tyre.replace(token_3)[1], ["apple"])
def _process_pred_annotations(
        self,
        pack: DataPack,
        labels: List[str],
        word_begin: int,
        word_end: int,
        current_pred_arg: List[Optional[Tuple[int, str]]],
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]],
) -> None:
    for label_index, label in enumerate(labels):
        if "(" in label:
            # Entering a span.
            arg_type = label.strip("()*")
            current_pred_arg[label_index] = (word_begin, arg_type)

        if ")" in label:
            # Exiting a span.
            if current_pred_arg[label_index] is None:
                raise ValueError(
                    "current_pred_arg is None when a right bracket "
                    "is met.")

            arg_begin = current_pred_arg[label_index][0]  # type: ignore
            arg_type = current_pred_arg[label_index][1]  # type: ignore

            if arg_type != "V":
                pred_arg = PredicateArgument(pack, arg_begin, word_end)
                pred_arg = pack.add_entry(pred_arg)
                verbal_pred_args[label_index].append((pred_arg, arg_type))
            current_pred_arg[label_index] = None
def test_segmenter(self):
    data_pack = DataPack()
    data_pack.set_text("eat phone")
    token_1 = Token(data_pack, 0, 3)
    token_2 = Token(data_pack, 4, 9)
    token_1.pos = "VB"
    token_2.pos = None
    data_pack.add_entry(token_1)
    data_pack.add_entry(token_2)
    self.assertIn(
        self.dra.replace(token_1)[1],
        [
            "eat", "feed", "eat on", "consume", "eat up", "use up",
            "deplete", "exhaust", "run through", "wipe out", "corrode",
            "rust",
        ],
    )
    self.assertIn(
        self.dra.replace(token_2)[1],
        [
            "telephone", "phone", "telephone set", "speech sound",
            "sound", "earphone", "earpiece", "headphone", "call",
            "telephone", "call up", "ring",
        ],
    )
def _insert_new_span(
        entry_class: str,
        insert_ind: int,
        inserted_annos: List[Tuple[int, int]],
        new_pack: DataPack,
        spans: List[Span],
        new_spans: List[Span],
):
    """
    An internal helper function for insertion.

    Args:
        entry_class: The new annotation type to be created.
        insert_ind: The index of the span to insert within
            `inserted_annos`.
        inserted_annos: The annotation span information to be inserted.
        new_pack: The new data pack to insert the annotation into.
        spans: The original spans before replacement; should be a sorted
            ascending list.
        new_spans: The new spans after replacement; should be a sorted
            ascending list.
    """
    pos: int
    length: int
    pos, length = inserted_annos[insert_ind]
    insert_end: int = modify_index(
        pos,
        spans,
        new_spans,
        is_begin=False,
        # Include the inserted span itself.
        is_inclusive=True,
    )
    insert_begin: int = insert_end - length
    new_anno = create_class_with_kwargs(
        entry_class,
        {"pack": new_pack, "begin": insert_begin, "end": insert_end},
    )
    new_pack.add_entry(new_anno)
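# A hypothetical call sketch for _insert_new_span: insert a 3-character
# annotation whose original position was 5. The class name
# "ft.onto.base_ontology.Token", the offsets and the spans are all
# assumptions for illustration only.
def _insert_new_span_demo(new_pack: DataPack):
    _insert_new_span(
        entry_class="ft.onto.base_ontology.Token",  # assumed type name
        insert_ind=0,
        inserted_annos=[(5, 3)],
        new_pack=new_pack,
        spans=[Span(0, 10)],
        new_spans=[Span(0, 13)],
    )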
def _parse_pack(self, file_path: str) -> Iterator[MultiPack]: m_pack: MultiPack = MultiPack() input_pack_name = "input_src" output_pack_name = "output_tgt" with open(file_path, "r", encoding="utf8") as doc: text = "" offset = 0 sentence_cnt = 0 input_pack = DataPack(doc_id=file_path) for line in doc: line = line.strip() if len(line) == 0: # skip empty lines continue # add sentence sent = Sentence(input_pack, offset, offset + len(line)) input_pack.add_entry(sent) text += line + '\n' offset = offset + len(line) + 1 sentence_cnt += 1 if sentence_cnt >= 20: break input_pack.set_text(text, replace_func=self.text_replace_operation) output_pack = DataPack() m_pack.update_pack({ input_pack_name: input_pack, output_pack_name: output_pack }) yield m_pack
def _insert_new_span(entry_class: str,
                     insert_ind: int,
                     inserted_annos: List[Tuple[int, int]],
                     new_pack: DataPack,
                     spans: List[Span],
                     new_spans: List[Span]):
    r"""
    An internal helper function for insertion.
    """
    pos: int
    length: int
    pos, length = inserted_annos[insert_ind]
    insert_end: int = modify_index(
        pos,
        spans,
        new_spans,
        is_begin=False,
        # Include the inserted span itself.
        is_inclusive=True)
    insert_begin: int = insert_end - length
    new_anno = create_class_with_kwargs(entry_class, {
        "pack": new_pack,
        "begin": insert_begin,
        "end": insert_end
    })
    new_pack.add_entry(new_anno)
def _parse_pack(self, file_path: str) -> Iterator[DataPack]: with open(file_path, "r", encoding="utf8", errors='ignore') as file: dataset = json.load(file) pack = DataPack() text: str = dataset['article'] article_end = len(text) article = Article(pack, 0, article_end) pack.add_entry(article) offset = article_end + 1 for qid, ques_text in enumerate(dataset['questions']): text += '\n' + ques_text ques_end = offset + len(ques_text) question = Question(pack, offset, ques_end) offset = ques_end + 1 options: List[Option] = [] options_text = dataset['options'][qid] for option_text in options_text: text += '\n' + option_text option_end = offset + len(option_text) option = Option(pack, offset, option_end) options.append(option) pack.add_entry(option) offset = option_end + 1 question.set_options(options) answers = dataset['answers'][qid] if not isinstance(answers, list): answers = [answers] answers = [self._convert_to_int(ans) for ans in answers] question.set_answers(answers) pack.add_entry(question) pack.set_text(text, replace_func=self.text_replace_operation) passage_id: str = dataset['id'] passage = Passage(pack, 0, len(pack.text)) passage.set_passage_id(passage_id) pack.add_entry(passage) pack.meta.doc_id = passage_id yield pack
def pack(self, data_pack: DataPack,
         inputs: Dict[str, List[Prediction]]) -> None:
    batch_predictions = inputs["predictions"]
    for predictions in batch_predictions:
        for pred_span, arg_result in predictions:
            pred = data_pack.add_entry(
                PredicateMention(data_pack, pred_span.begin,
                                 pred_span.end))

            for arg_span, label in arg_result:
                arg = data_pack.add_or_get_entry(
                    PredicateArgument(data_pack, arg_span.begin,
                                      arg_span.end))
                link = PredicateLink(data_pack, pred, arg)
                link.set_fields(arg_type=label)
                data_pack.add_or_get_entry(link)
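# Shape of the expected `inputs` argument, inferred from the loops above;
# Span(begin, end) mirrors its use elsewhere in this file, and the offsets
# and labels are invented.
_EXAMPLE_SRL_INPUTS = {
    "predictions": [
        [  # one list of (predicate span, argument results) per instance
            (Span(0, 5), [(Span(6, 10), "ARG0"), (Span(11, 20), "ARG1")]),
        ],
    ],
}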
def _copy_link_or_group(
        self,
        entry: Union[Link, Group],
        entry_map: Dict[int, int],
        new_pack: DataPack,
) -> bool:
    r"""
    This function copies a Link/Group in the data pack. If a child
    Link/Group has not been copied yet, it will recursively create that
    child first. If a child Annotation has not been copied, it will abort
    and return False.

    Args:
        entry: The Link/Group in the original data pack to copy.
        entry_map: The dictionary mapping the original entry to the
            copied entry.
        new_pack: The new data pack, which is the destination of the copy.

    Returns:
        A bool value indicating whether the copy happened.
    """
    # If the entry has been copied, return True.
    if entry.tid in entry_map:
        return True

    # The entry should be either a Link or a Group.
    is_link: bool = isinstance(entry, Link)

    # Get the children entries.
    children: List[Entry]
    if is_link:
        children = [entry.get_parent(), entry.get_child()]
    else:
        children = entry.get_members()

    # Copy the children entries.
    new_children: List[Entry] = []
    for child_entry in children:
        if isinstance(child_entry, (Link, Group)):
            # Recursively copy the children Links/Groups.
            if not self._copy_link_or_group(child_entry, entry_map,
                                            new_pack):
                return False
        else:
            # Children Annotations must have been copied already.
            if child_entry.tid not in entry_map:
                return False
        new_child: Entry = new_pack.get_entry(entry_map[child_entry.tid])
        new_children.append(new_child)

    # Create the new entry and add it to the new pack.
    new_entry: Entry
    if is_link:
        entry = cast(Link, entry)
        new_link_parent: Entry = new_children[0]
        new_link_child: Entry = new_children[1]
        new_entry = type(entry)(
            new_pack, new_link_parent, new_link_child)  # type: ignore
    else:
        entry = cast(Group, entry)
        new_entry = type(entry)(new_pack, new_children)  # type: ignore
    new_pack.add_entry(new_entry)
    entry_map[entry.tid] = new_entry.tid

    return True
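# A sketch of the assumed calling order (not from the source): annotations
# are copied first so entry_map holds their tid -> tid pairs, then every
# Link and Group is copied. Entries whose children are missing make
# _copy_link_or_group return False and are simply skipped here.
def _copy_all_links_and_groups_demo(self, original_pack: DataPack,
                                    new_pack: DataPack,
                                    entry_map: Dict[int, int]):
    for link in original_pack.get_entries_by_type(Link):
        self._copy_link_or_group(link, entry_map, new_pack)
    for group in original_pack.get_entries_by_type(Group):
        self._copy_link_or_group(group, entry_map, new_pack)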
def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    pack = DataPack()

    with open(file_path, encoding="utf8") as doc:
        words = []
        offset = 0
        has_rows = False

        speaker = part_id = document_id = None
        sentence_begin = 0

        # auxiliary structures
        current_entity_mention: Optional[Tuple[int, str]] = None
        verbal_predicates: List[PredicateMention] = []

        current_pred_arg: List[Optional[Tuple[int, str]]] = []
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]] = []

        groups: DefaultDict[int, List[EntityMention]] = defaultdict(list)
        coref_stacks: DefaultDict[int, List[int]] = defaultdict(list)

        for line in doc:
            line = line.strip()

            if line.startswith("#end document"):
                break

            if line != "" and not line.startswith("#"):
                fields = self._parse_line(line)
                speaker = fields.speaker
                if fields.part_number is not None:
                    part_id = int(fields.part_number)
                document_id = fields.document_id

                assert fields.word is not None
                word_begin = offset
                word_end = offset + len(fields.word)

                # add tokens
                token = Token(pack, word_begin, word_end)
                if fields.pos_tag is not None:
                    token.set_fields(pos=fields.pos_tag)
                if fields.word_sense is not None:
                    token.set_fields(sense=fields.word_sense)
                pack.add_entry(token)

                # add entity mentions
                current_entity_mention = self._process_entity_annotations(
                    pack,
                    fields.entity_label,
                    word_begin,
                    word_end,
                    current_entity_mention,
                )

                # add predicate mentions
                if (fields.lemmatised_word is not None
                        and fields.lemmatised_word != "-"):
                    word_is_verbal_predicate = any(
                        "(V" in x for x in fields.predicate_labels)
                    kwargs_i = {
                        "pred_lemma": fields.lemmatised_word,
                        "pred_type": ("verb" if word_is_verbal_predicate
                                      else "other")
                    }
                    pred_mention = PredicateMention(
                        pack, word_begin, word_end)
                    pred_mention.set_fields(**kwargs_i)
                    if fields.framenet_id is not None:
                        pred_mention.set_fields(
                            framenet_id=fields.framenet_id)
                    pack.add_entry(pred_mention)

                    if word_is_verbal_predicate:
                        verbal_predicates.append(pred_mention)

                if not verbal_pred_args:
                    current_pred_arg = [None] * len(
                        fields.predicate_labels)
                    verbal_pred_args = [
                        [] for _ in fields.predicate_labels]

                # add predicate arguments
                self._process_pred_annotations(
                    pack,
                    fields.predicate_labels,
                    word_begin,
                    word_end,
                    current_pred_arg,
                    verbal_pred_args,
                )

                # add coreference mentions
                self._process_coref_annotations(
                    pack,
                    fields.coreference,
                    word_begin,
                    word_end,
                    coref_stacks,
                    groups,
                )

                words.append(fields.word)
                offset = word_end + 1
                has_rows = True

            else:
                if not has_rows:
                    continue

                # add predicate links in the sentence
                for predicate, pred_arg in zip(verbal_predicates,
                                               verbal_pred_args):
                    for arg in pred_arg:
                        kwargs_i = {
                            "arg_type": arg[1],
                        }
                        link = PredicateLink(pack, predicate, arg[0])
                        link.set_fields(**kwargs_i)
                        pack.add_entry(link)

                verbal_predicates = []
                current_pred_arg = []
                verbal_pred_args = []

                # add sentence
                sent = Sentence(pack, sentence_begin, offset - 1)
                if speaker is not None:
                    sent.set_fields(speaker=speaker)
                if part_id is not None:
                    sent.set_fields(part_id=int(part_id))
                pack.add_entry(sent)

                sentence_begin = offset
                has_rows = False

        # group the coreference mentions in the whole document
        for _, mention_list in groups.items():
            group = CoreferenceGroup(pack)
            group.add_members(mention_list)
            pack.add_entry(group)

        text = " ".join(words)
        document = Document(pack, 0, len(text))
        pack.add_entry(document)
        if document_id is not None:
            pack.set_meta(doc_id=document_id)
        pack.set_text(text, replace_func=self.text_replace_operation)

    yield pack