def _parse_pack(self, file_path: str) -> Iterator[DataPack]:
    r"""Read one review file and yield a single data pack built from it.

    Each line of the file is passed through ``self.preprocess_reviews`` and
    split on newlines. Every non-blank piece is stripped and recorded as one
    ``Sentence``; the pack text is the concatenation of those pieces, each
    followed by a single space. The pack name is the name of the file's
    parent directory followed by the part of the file name before the first
    underscore. The number between that underscore and the first dot is
    divided by 10 and stored as the document sentiment.

    Args:
        file_path: Path of the review file. The base name must contain an
            underscore followed by a number, e.g. ``<id>_<score>.<ext>``.

    Returns:
        An iterator that yields exactly one data pack.
    """
    data_pack: DataPack = DataPack()
    sent_begin: int = 0
    pieces: List[str] = []

    with open(file_path, encoding="utf8") as doc:
        for para in doc:
            para = self.preprocess_reviews(para)
            for sent in para.split("\n"):
                # Strip before testing for emptiness so that whitespace-only
                # lines do not create zero-length sentences.
                sent = sent.strip()
                if not sent:
                    continue
                sent_end = sent_begin + len(sent)
                # Add sentences.
                Sentence(data_pack, sent_begin, sent_end)
                pieces.append(sent + " ")
                # Skip the single separating space added after the sentence.
                sent_begin = sent_end + 1

    doc_text: str = "".join(pieces)

    pos_dir: str = os.path.basename(os.path.dirname(file_path))
    movie_file: str = os.path.basename(file_path)
    title: List = movie_file.split('_')
    doc_id: str = pos_dir + title[0]
    score: float = float(title[1].split('.')[0]) / 10.0

    data_pack.pack_name = doc_id
    data_pack.set_text(doc_text)

    # Add documents.
    document: Document = Document(data_pack, 0, len(doc_text))
    document.sentiment = {doc_id: score}

    yield data_pack
def get_index(pack: DataPack, index_entries: List[Annotation],
              context_entry: Annotation):
    """Return the half-open index range of entries covered by a context.

    Args:
        pack: Object whose ``covers(context_entry, entry)`` decides whether
            an entry lies within the context.
        index_entries: Candidate entries, in index order.
        context_entry: The covering entry.

    Returns:
        ``[first, last + 1]`` where ``first`` and ``last`` are the positions
        of the first and last covered entries in ``index_entries``.

    Raises:
        IndexError: If no entry is covered.
    """
    covered = [
        position
        for position, candidate in enumerate(index_entries)
        if pack.covers(context_entry, candidate)
    ]
    first = covered[0]
    last = covered[-1]
    return [first, last + 1]
def pack(self, data_pack: DataPack, output_dict: Optional[Dict] = None):
    r"""Write predicted relations into ``data_pack`` as ``RelationLink`` s.

    ``output_dict["RelationLink"]`` is read as three parallel two-level
    lists, ``"parent.tid"``, ``"child.tid"`` and ``"rel_type"``. For each
    position a link is created, its parent and child are looked up in the
    pack by tid, and the link is added to the pack.

    Args:
        data_pack: The pack that receives the links.
        output_dict: Prediction output; nothing is done when it is ``None``.
    """
    if output_dict is None:
        return

    relations = output_dict["RelationLink"]
    for row, parent_tids in enumerate(relations["parent.tid"]):
        for col, parent_tid in enumerate(parent_tids):
            link = RelationLink(data_pack)
            link.rel_type = relations["rel_type"][row][col]

            parent: EntityMention = data_pack.get_entry(  # type: ignore
                parent_tid)
            link.set_parent(parent)

            child_tid = relations["child.tid"][row][col]
            child: EntityMention = data_pack.get_entry(  # type: ignore
                child_tid)
            link.set_child(child)

            data_pack.add_or_get_entry(link)
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Build the subword feature for one instance.

    Each subword found inside ``instance`` contributes its text, prefixed
    with ``"##"`` unless it is flagged as a first segment. The converted
    sequence is wrapped between the representations of ``"[CLS]"`` and
    ``"[SEP]"``.

    Args:
        pack (Datapack): The datapack that contains the current instance.
        instance (Annotation): The instance to extract the feature from.

    Returns:
        Feature: a feature that contains the extracted data.
    """
    body = [
        self.element2repr(
            piece.text if piece.is_first_segment else "##" + piece.text)
        for piece in pack.get(self._entry_type, instance)
    ]
    data = [self.element2repr("[CLS]"), *body, self.element2repr("[SEP]")]

    meta_data = dict(
        need_pad=self.vocab.use_pad,  # type: ignore
        pad_value=self.get_pad_value(),
        dim=1,
        dtype=int,
    )
    return Feature(data=data, metadata=meta_data, vocab=self.vocab)
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Build the character-level feature for one instance.

    Every entry of the configured type inside ``instance`` becomes a list
    of its characters, converted through the vocabulary when one is set.
    When the config carries a non-``None`` ``max_char_length`` smaller than
    the longest entry, every list is cut to that length.

    Args:
        pack (Datapack): The datapack that contains the current instance.
        instance (Annotation): The instance to extract the feature from.

    Returns:
        Feature: a feature that contains the extracted data.
    """
    data = []
    for word in pack.get(self._entry_type, instance):
        if self.vocab:
            chars = [self.element2repr(char) for char in word.text]
        else:
            chars = list(word.text)
        data.append(chars)

    longest = max((len(chars) for chars in data), default=-1)
    limit = getattr(self.config, "max_char_length", None)
    if limit is not None and limit < longest:
        data = [chars[:limit] for chars in data]

    meta_data = dict(
        need_pad=self.config.need_pad,
        pad_value=self.get_pad_value(),
        dim=2,
        dtype=int if self.vocab else str,
    )
    return Feature(data=data, metadata=meta_data, vocab=self.vocab)
def update_vocab(self, pack: DataPack,
                 context: Optional[Annotation] = None):
    r"""Get all attributes of one instance and add them into the
    vocabulary.

    Args:
        pack (DataPack): The data pack input to extract vocabulary.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then the
            whole data pack will be used as the context. Default is None.

    Raises:
        AttributeError: If an extracted attribute value is not hashable.
    """
    entry: Entry
    for entry in pack.get(self.config.entry_type, context):
        # The following pylint skip due to a bug:
        # https://github.com/PyCQA/pylint/issues/3507
        # Hashable is not recognized the type.
        # pylint: disable=isinstance-second-argument-not-valid-type
        element = self._get_attribute(entry, self.config.attribute)
        if not isinstance(element, Hashable):
            # Each fragment ends with a space so the implicitly
            # concatenated message reads as normal prose.
            raise AttributeError(
                "Only hashable element can be "
                "added into the vocabulary. Consider setting "
                "vocab_method to be raw and do not call update_vocab "
                "if you only need the raw attribute value without "
                "converting them into index.")
        self.add(element)
def _get_data_batch(
        self, data_pack: DataPack, context_type: Type[Annotation],
        requests: Optional[Dict[Type[Entry], Union[Dict, List]]] = None,
        offset: int = 0) -> Iterable[Tuple[Dict, int]]:
    r"""Try to get batches from a dataset with ``batch_size``, but will
    yield an incomplete batch if the data_pack is exhausted.

    ``self.batch_is_full`` is set to ``True`` just before a full batch is
    yielded and reset to ``False`` when the generator is resumed. The
    trailing partial batch is yielded without touching that flag.

    Args:
        data_pack: The pack whose ``get_data`` supplies the instances.
        context_type: Passed to ``get_data`` as its first argument.
        requests: Passed to ``get_data`` as its second argument.
        offset: Passed to ``get_data`` as its third argument.

    Returns:
        An iterator of tuples ``(batch, cnt)``, ``batch`` is a dict
        containing the required annotations and context, and ``cnt`` is
        the number of instances in the batch.
    """
    instances: List[Dict] = []
    # Total taken from ``self.current_batch_sources``; it is read once here
    # and not refreshed while iterating.
    current_size = sum(self.current_batch_sources)
    for data in data_pack.get_data(context_type, requests, offset):
        instances.append(data)
        # A batch is cut once it fills the room left after ``current_size``.
        if len(instances) == self.batch_size - current_size:
            batch = batch_instances(instances)
            self.batch_is_full = True
            yield (batch, len(instances))
            # Runs only after the consumer resumes the generator.
            instances = []
            self.batch_is_full = False

    # Flush the remaining data.
    if len(instances) > 0:
        batch = batch_instances(instances)
        yield (batch, len(instances))
def _process(self, data_pack: DataPack):
    """Collect vocabulary counts from one data pack.

    For every sentence, each token text updates the character counter (one
    increment per character) and the word counter (keyed by the normalized
    token). The ``pos``, ``chunk`` and ``ner`` values of the tokens update
    their own counters.

    Args:
        data_pack: The ner data to create vocabulary with.
    """
    wanted = {Token: ["chunk", "pos", "ner"]}
    for sentence_data in data_pack.get_data(context_type=Sentence,
                                            request=wanted):
        token_fields = sentence_data["Token"]

        for surface in token_fields["text"]:
            for symbol in surface:
                self.char_cnt[symbol] += 1
            normalized = self.normalize_func(surface)
            self.word_cnt[normalized] += 1

        for pos_label in token_fields["pos"]:
            self.pos_cnt[pos_label] += 1

        for chunk_label in token_fields["chunk"]:
            self.chunk_cnt[chunk_label] += 1

        for ner_label in token_fields["ner"]:
            self.ner_cnt[ner_label] += 1
def _parse_pack(self, doc_info: Tuple[str, str]) -> Iterator[DataPack]:
    r"""Turn one ``(doc_id, doc_text)`` pair into a data pack.

    The pack is named after ``doc_id``, holds ``doc_text`` as its text and
    gets one ``Document`` entry spanning the whole text.

    Args:
        doc_info: document info to be populated in the data_pack, as an
            ``(id, text)`` pair.

    Returns:
        An iterator yielding the single populated data pack.
    """
    name, content = doc_info

    result: DataPack = DataPack()
    result.pack_name = name
    result.set_text(content)

    # One document entry covering the full text.
    Document(result, 0, len(content))

    yield result
def _process(self, input_pack: DataPack):
    """Assign a part-of-speech tag to every token of the pack.

    The texts of all tokens produced by ``self.token_component`` are tagged
    in one ``pos_tag`` call; the second element of each tagging result is
    stored as the ``pos`` of the matching token.

    Args:
        input_pack: The pack whose tokens are tagged in place.
    """
    tokens = list(
        input_pack.get(entry_type=Token, components=self.token_component))
    tagged = pos_tag([tok.text for tok in tokens])

    for tok, pair in zip(tokens, tagged):
        tok.pos = pair[1]
def _process(self, input_pack: DataPack):
    """Mark the pack with ``"[PACK]"``.

    If the pack has no ``NewType`` entry, one is created with that value;
    otherwise the marker is appended to the value of the first entry.

    Args:
        input_pack: The pack to mark.
    """
    existing = list(input_pack.get_entries_by_type(NewType))
    if existing:
        first = existing[0]  # type: ignore
        first.value += "[PACK]"
    else:
        NewType(pack=input_pack, value="[PACK]")
def extract(self, pack: DataPack, instance: Annotation) -> Feature:
    r"""Collect one attribute from every entry inside an instance.

    For example, the text of tokens in one sentence. Values are converted
    through the vocabulary when one is set and kept raw otherwise.

    Args:
        pack (Datapack): The datapack that contains the current instance.
        instance (Annotation): The instance to extract the feature from.

    Returns:
        Feature: a feature that contains the extracted data.
    """
    def _represent(raw):
        # Keep the raw value when no vocabulary is configured.
        return self.element2repr(raw) if self.vocab else raw

    data = [
        _represent(self.get_attribute(item, self.config.attribute))
        for item in pack.get(self.config.entry_type, instance)
    ]

    meta_data = dict(
        need_pad=self.config.need_pad,
        pad_value=self.get_pad_value(),
        dim=1,
        dtype=int if self.vocab else Any,
    )
    return Feature(data=data, metadata=meta_data, vocab=self.vocab)
def add_to_pack(
    self,
    pack: DataPack,
    instance: Annotation,
    prediction: Iterable[Union[int, Any]],
):
    r"""Write predicted attribute values onto the entries of an instance.

    A non-iterable prediction is treated as a single-element prediction.
    Each element is cast to ``int``, mapped back through ``id2element`` and
    assigned, in order, to the entries found inside ``instance``. Surplus
    predictions or surplus entries are left untouched.

    Subclasses handling other prediction data types need to override this;
    the default implementation assumes integer indices.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        instance (Annotation): The instance to which the extractor adds
            the prediction.
        prediction (Iterable[Union[int, Any]]): The output of the model,
            holding the index of the attribute for each entry.
    """
    targets = list(pack.get(self._entry_type, instance))

    # The following pylint skip due to a bug:
    # https://github.com/PyCQA/pylint/issues/3507
    # Iterable is not recognized the type.
    # pylint: disable=isinstance-second-argument-not-valid-type
    prediction = (
        prediction if isinstance(prediction, Iterable) else [prediction])

    decoded = [self.id2element(int(raw)) for raw in prediction]
    attribute_name = self.config.attribute
    for target, decoded_value in zip(targets, decoded):
        self._set_attribute(target, attribute_name, decoded_value)
def _process_pred_annotations(
        self,
        pack: DataPack,
        labels: List[str],
        word_begin: int,
        word_end: int,
        current_pred_arg: List[Optional[Tuple[int, str]]],
        verbal_pred_args: List[List[Tuple[PredicateArgument, str]]],
) -> None:
    """Update the open and finished argument spans for one word.

    Each label belongs to one column. A ``"("`` in the label opens a span
    at ``word_begin`` for that column; a ``")"`` closes the column's open
    span at ``word_end``, creates a ``PredicateArgument`` for it and appends
    it with its type to ``verbal_pred_args``. Labels whose type is ``"V"``
    are ignored.

    Args:
        pack: The pack that receives the created arguments.
        labels: One label per column for the current word.
        word_begin: Begin offset of the current word.
        word_end: End offset of the current word.
        current_pred_arg: Per-column open span as ``(begin, type)`` or
            ``None``; updated in place.
        verbal_pred_args: Per-column list of finished arguments; updated in
            place.

    Raises:
        ValueError: If a span is closed in a column with no open span.
    """
    for column, raw_label in enumerate(labels):
        role = raw_label.strip("()*")
        if role == "V":
            continue

        if "(" in raw_label:
            # Entering into a span
            current_pred_arg[column] = (word_begin, role)

        if ")" not in raw_label:
            continue

        # Exiting a span
        pending = current_pred_arg[column]
        if pending is None:
            raise ValueError(
                "current_pred_arg is None when meet right blanket.")

        span_begin = pending[0]
        span_role = pending[1]

        argument = pack.add_or_get_entry(
            PredicateArgument(pack, span_begin, word_end))
        verbal_pred_args[column].append((argument, span_role))
        current_pred_arg[column] = None
def pack(self, data_pack: DataPack, output_dict: Optional[Dict] = None):
    """Mark the pack with ``"[BATCH]"``.

    If the pack has no ``NewType`` entry, one is created with that value;
    otherwise the marker is appended to the value of the first entry.

    Args:
        data_pack: The pack to mark.
        output_dict: Not used by this implementation.
    """
    existing = list(data_pack.get_entries_of(NewType))
    if existing:
        first = existing[0]  # type: ignore
        first.value += "[BATCH]"
    else:
        NewType(pack=data_pack, value="[BATCH]")
def _parse_pack(self, sent_lines) -> Iterator[DataPack]:
    """Build one data pack from a batch of sentence records.

    Each record is indexed as ``(sentence id, sentence text, parent pointer
    list)``. The pack text is the stripped sentence texts joined by
    newlines, and the pack is named after the first sentence id. Every
    sentence gets a ``Sentence`` entry and its parent pointer list is
    handed to ``self._parse_parent_pointer_list`` with the sentence's begin
    offset.

    Args:
        sent_lines: A sequence of records as described above. It is
            iterated more than once.

    Returns:
        An iterator yielding the single populated data pack.
    """
    data_pack: DataPack = DataPack()

    # Strip once and use the stripped texts both for the pack text and for
    # the offsets, so sentence spans always line up with the stored text.
    stripped_texts: List[str] = [line[1].strip() for line in sent_lines]
    data_pack.set_text("\n".join(stripped_texts))

    sent_bias: int = 0
    for i, (sent_line, sent_text) in enumerate(
            zip(sent_lines, stripped_texts)):
        sent_id: str = sent_line[0]
        parent_pointer_list: List[int] = sent_line[2]

        # Name the data_pack with the first sentence id.
        if i == 0:
            data_pack.pack_name = sent_id

        # Add sentence to data_pack.
        Sentence(data_pack, sent_bias, sent_bias + len(sent_text))

        self._parse_parent_pointer_list(
            data_pack, sent_bias, sent_text, parent_pointer_list
        )

        # +1 accounts for the newline joining consecutive sentences.
        sent_bias += len(sent_text) + 1

    yield data_pack
def _parse_pack(
        self, collection: Tuple[str, Dict[str, List[state_type]]]
) -> Iterator[DataPack]:
    """Attach info box data to the stored pack of one resource.

    The resource name is first mapped through ``self.redirects`` when it
    appears there. If the resulting name is in ``self.pack_index`` and the
    indexed file exists under ``self.pack_dir``, the pack is deserialized,
    enriched with the ``'literals'``, ``'objects'`` and ``'properties'``
    parts of the info box data, and yielded. If the name is not in the
    index, a notice is printed and a warning is logged. Nothing is yielded
    when the indexed file is missing.

    Args:
        collection: A ``(resource name, info box data)`` pair; the data is
            a dict with the keys ``'literals'``, ``'objects'`` and
            ``'properties'``.

    Returns:
        An iterator yielding at most one enriched data pack.
    """
    resource_name, info_box_data = collection

    # Follow a redirect to the target resource name, if there is one.
    if resource_name in self.redirects:
        resource_name = self.redirects[resource_name]

    if resource_name in self.pack_index:
        print_progress(f'Add infobox to resource: [{resource_name}]')

        pack_path = os.path.join(self.pack_dir,
                                 self.pack_index[resource_name])

        # An indexed pack whose file is missing is skipped silently.
        if os.path.exists(pack_path):
            with open(pack_path) as pack_file:
                pack = DataPack.deserialize(pack_file.read())

                add_info_boxes(pack, info_box_data['literals'])
                add_info_boxes(pack, info_box_data['objects'])
                add_property(pack, info_box_data['properties'])
                yield pack
    else:
        print_notice(f"Resource {resource_name} is not in the raw packs.")
        self.logger.warning("Resource %s is not in the raw packs.",
                            resource_name)
def _parse_pack(
        self, doc_data: Dict[str, str]
) -> Iterator[DataPack]:
    """Create a data pack holding one wiki page.

    The page name is ``doc_data['doc_name']``, replaced by its redirect
    target when one is registered. The pack text is ``doc_data['text']``,
    covered entirely by a ``WikiPage`` entry whose id is
    ``doc_data['oldid']``. The pack is named after the page.

    Args:
        doc_data: Dict with the keys ``'doc_name'``, ``'text'`` and
            ``'oldid'``.

    Returns:
        An iterator yielding the single populated data pack.
    """
    result = DataPack()

    page_name: str = doc_data['doc_name']
    if page_name in self.__redirects:
        page_name = self.__redirects[page_name]

    body: str = doc_data['text']
    result.set_text(body)

    wiki_page = WikiPage(result, 0, len(body))
    wiki_page.page_id = doc_data['oldid']
    wiki_page.page_name = page_name

    result.pack_name = page_name
    yield result
def extract(self, pack: DataPack,
            context: Optional[Annotation] = None) -> Feature:
    """Extract one attribute from each entry of the configured type.

    The entry type and the attribute name come from the extractor config
    (``entry_type`` and ``attribute``).

    Args:
        pack (DataPack): The datapack that contains the current instance.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then the
            whole data pack will be used as the context. Default is None.

    Returns:
        Features (attributes) for the entries within the provided context,
        converted through the vocabulary when one is set and kept as raw
        values otherwise.
    """
    data = []
    entry: Annotation
    for entry in pack.get(self.config.entry_type, context):
        raw_value = self._get_attribute(entry, self.config.attribute)
        if self.vocab:
            data.append(self.element2repr(raw_value))
        else:
            data.append(raw_value)

    meta_data = dict(
        need_pad=self.config.need_pad,
        pad_value=self.get_pad_value(),
        dim=1,
        dtype=int if self.vocab else Any,
    )
    return Feature(data=data, metadata=meta_data, vocab=self.vocab)
def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
    r"""Convert a file path or a raw string into a :class:`DataPack`.

    When ``self.init_with_fileloc`` is set, ``data_source`` is opened as a
    UTF-8 file (decoding errors ignored) and its content is used;
    otherwise ``data_source`` itself is the text.

    Args:
        data_source: str that contains text of a document or a filepath.

    Returns:
        An iterator yielding one DataPack containing a Document.
    """
    if self.init_with_fileloc:
        # data_source is a file path.
        with open(data_source, "r", encoding="utf8",
                  errors='ignore') as source_file:
            content = source_file.read()
    else:
        # data_source already holds the actual data.
        content = data_source

    result = DataPack()
    self.set_text(result, content)

    # Note that result.text can be different from the content passed in,
    # due to the text_replace_operation, so measure the stored text.
    Document(result, 0, len(result.text))

    yield result
def _parse_pack(self, data_source: str) -> Iterator[DataPack]:
    r"""Takes a raw string and converts into a :class:`DataPack`.

    The text is stored through ``self.set_text`` first and the ``Document``
    span is then taken from the stored ``pack.text``. The stored text may
    differ in length from ``data_source`` if ``set_text`` applies a text
    replacement, so measuring the raw input could give a span that does
    not match the pack text.

    Args:
        data_source: str that contains text of a document.

    Returns:
        An iterator yielding one :class:`DataPack` containing a Document.
    """
    pack = DataPack()
    self.set_text(pack, data_source)

    document = Document(pack, 0, len(pack.text))
    pack.add_or_get_entry(document)

    yield pack
def add_to_pack(
    self,
    pack: DataPack,
    predictions: List[int],
    context: Optional[Annotation] = None,
):
    r"""Add the prediction results to data pack.

    Each prediction is mapped through ``id2element`` to a pair whose first
    item is the tag type and whose second item is the ``"B"``/``"I"``/``"O"``
    marker. Consecutive tagging units that belong to the same tag are
    merged into one entry of the configured entry type, and the tag type is
    stored on the configured attribute.

    We make following assumptions for prediction.

    1. If we encounter "I" while its tag is different from the previous
       tag, we will consider this "I" as a "B" and start a new tag here.
    2. We will truncate the prediction according to the number of tagging
       units. If the prediction contains `<PAD>` elements, this should
       remove them.

    When ``config.is_bert`` is set, the first and last predictions are
    dropped before truncation.

    Args:
        pack (DataPack): The datapack that contains the current instance.
        predictions (List[int]): This is the output of the model, which
            contains the index for attributes of one instance. A
            ``Tensor`` is also accepted and moved to CPU first.
        context (Annotation): The context is an Annotation entry where
            features will be extracted within its range. If None, then the
            whole data pack will be used as the context. Default is None.
    """
    instance_tagging_unit: List[Annotation] = list(
        pack.get(self._tagging_unit, context))

    if self.config.is_bert:
        predictions = predictions[1:-1]

    predictions = predictions[:len(instance_tagging_unit)]
    if isinstance(predictions, Tensor):
        predictions = predictions.cpu().numpy()
    tags = [self.id2element(x) for x in predictions]

    tag_start = None
    tag_end = None
    tag_type = None
    for entry, tag in zip(instance_tagging_unit, tags):
        starts_new_span = (
            tag[1] == "O" or tag[1] == "B"
            or (tag[1] == "I" and tag[0] != tag_type))
        if starts_new_span:
            # Close the span that was open so far, if it carries a type.
            if tag_type:
                entity_mention = self._entry_type(pack, tag_start, tag_end)
                setattr(entity_mention, self._attribute, tag_type)
            tag_start = entry.begin
            tag_end = entry.end
            tag_type = tag[0]
        else:
            tag_end = entry.end

    # Handle the final tag. Offsets are compared against None because a
    # span that begins at offset 0 is valid but falsy.
    if tag_type and tag_start is not None and tag_end is not None:
        entity_mention = self._entry_type(pack, tag_start, tag_end)
        setattr(entity_mention, self._attribute, tag_type)
def _process(self, input_pack: DataPack):
    """Split every token of the pack into subword entries.

    Each token text is tokenized with the wordpiece tokenizer, which
    returns the subword together with its start and end relative to the
    token. A ``Subword`` entry is created at the corresponding pack offsets
    and flagged as a first segment unless the subword starts with ``"##"``.

    Args:
        input_pack: The pack whose tokens are split.
    """
    wordpiece = self.tokenizer.wordpiece_tokenizer
    for token in input_pack.get(Token):
        pieces = wordpiece.tokenize_with_span(token.text)
        for piece, rel_begin, rel_end in pieces:
            # Spans are relative to the token, so shift by its begin.
            entry = Subword(input_pack,
                            token.begin + rel_begin,
                            token.begin + rel_end)
            entry.is_first_segment = not piece.startswith("##")
def _process(self, input_pack: DataPack):
    """Lemmatize the tokens of every sentence in the pack.

    For each sentence, the tokens produced by ``self.token_component`` are
    collected, their ``pos`` values are converted with ``penn2morphy`` and
    each token text is lemmatized with the converted tag. The result is
    stored in the ``lemma`` field of the token.

    Args:
        input_pack: The pack whose tokens are lemmatized in place.
    """
    for sentence in input_pack.get(Sentence):
        tokens = list(
            input_pack.get(entry_type=Token, range_annotation=sentence,
                           component=self.token_component))

        surfaces = [tok.text for tok in tokens]
        morphy_tags = [
            penn2morphy(tok.pos)  # type: ignore
            for tok in tokens
        ]
        lemmas = [
            self.lemmatizer.lemmatize(surface, morphy_tag)
            for surface, morphy_tag in zip(surfaces, morphy_tags)
        ]

        for tok, lemma in zip(tokens, lemmas):
            tok.set_fields(lemma=lemma)
def pack(self, data_pack: DataPack,
         inputs: Dict[str, List[Prediction]]) -> None:
    """Write predicted predicates, arguments and their links to the pack.

    ``inputs["predictions"]`` holds one prediction list per batch item.
    Each prediction pairs a predicate span with a list of
    ``(argument span, label)`` pairs. For every argument a
    ``PredicateLink`` from the predicate to the argument is added, with
    the label stored as ``arg_type``.

    Args:
        data_pack: The pack that receives the entries.
        inputs: Dict with the key ``"predictions"``.
    """
    for item_predictions in inputs["predictions"]:
        for predicate_span, arguments in item_predictions:
            predicate = data_pack.add_entry(
                PredicateMention(data_pack,
                                 predicate_span.begin, predicate_span.end))

            for argument_span, role in arguments:
                argument = data_pack.add_or_get_entry(
                    PredicateArgument(data_pack,
                                      argument_span.begin,
                                      argument_span.end))
                relation = PredicateLink(data_pack, predicate, argument)
                relation.set_fields(arg_type=role)
                data_pack.add_or_get_entry(relation)
def _parse_pack(self, table: str) -> Iterator[DataPack]:
    """Wrap one table string into a data pack.

    The pack is named ``"table_"`` plus the part of the string before the
    first ``"|"``; the whole string is the pack text and is covered by one
    ``UtteranceContext`` entry.

    Args:
        table: The table content as a single string.

    Returns:
        An iterator yielding the single populated data pack.
    """
    table_key = table.split("|")[0]
    result: DataPack = DataPack(pack_name="table_" + table_key)
    result.set_text(table)

    # Create the table.
    UtteranceContext(result, 0, len(table))

    yield result
def _parse_pack(self, file_path: str) -> Iterator[MultiPack]:
    """Read a text file into the input pack of a new multi pack.

    Every non-blank line is stripped and added as a ``Sentence``; the pack
    text is those lines, each followed by a newline, set with
    ``self.text_replace_operation`` as the replace function. The multi
    pack holds this input pack and an empty output pack under the names
    taken from the config.

    Args:
        file_path: Path of the UTF-8 text file; also used as the doc id of
            the input pack.

    Returns:
        An iterator yielding the single multi pack.
    """
    m_pack: MultiPack = MultiPack()

    input_pack_name = self.config.input_pack_name
    output_pack_name = self.config.output_pack_name

    kept_lines = []
    cursor = 0
    with open(file_path, "r", encoding="utf8") as doc:
        input_pack = DataPack(doc_id=file_path)

        for raw_line in doc:
            content = raw_line.strip()
            if not content:
                continue

            # add sentence
            sentence = Sentence(input_pack, cursor, cursor + len(content))
            input_pack.add_entry(sentence)

            kept_lines.append(content)
            # The extra 1 is the newline that follows each line.
            cursor += len(content) + 1

        text = "".join(content + '\n' for content in kept_lines)
        input_pack.set_text(
            text, replace_func=self.text_replace_operation)

    output_pack = DataPack()

    m_pack.update_pack({
        input_pack_name: input_pack,
        output_pack_name: output_pack
    })

    yield m_pack
def _process(self, input_pack: DataPack):
    """Run the configured predictors over the sentences of a pack.

    Sentences are processed in chunks of ``configs["infer_batch_size"]``,
    or all at once when that size is not positive. For each chunk, every
    predictor in ``self.predictor`` is called once on the sentence texts
    and the per-sentence outputs are merged into one result dict (the
    ``"srl"`` output is first converted by ``parse_allennlp_srl_results``).
    When ``"tokenize"`` is among the configured processors, tokens are
    created from the result, followed by dependencies and SRL entries if
    ``"depparse"`` / ``"srl"`` are configured as well.

    Args:
        input_pack: The pack to annotate.
    """
    # handle existing entries
    self._process_existing_entries(input_pack)

    batch_size: int = self.configs["infer_batch_size"]
    batches: Iterator[Iterator[Sentence]]
    # Need a copy of the one-pass iterators to support a second loop on
    # them. All other ways around it like using `itertools.tee` and `list`
    # would require extra storage conflicting with the idea of using
    # iterators in the first place. `more_itertools.ichunked` uses
    # `itertools.tee` under the hood but our usage (reading iterators
    # in order) does not cause memory issues.
    batches_copy: Iterator[Iterator[Sentence]]
    if batch_size <= 0:
        # A non-positive size means one batch holding every sentence.
        batches = iter([input_pack.get(Sentence)])
        batches_copy = iter([input_pack.get(Sentence)])
    else:
        batches = more_itertools.ichunked(input_pack.get(Sentence),
                                          batch_size)
        batches_copy = more_itertools.ichunked(input_pack.get(Sentence),
                                               batch_size)
    # `sentences` is consumed to build the model input; `sentences_copy`
    # is consumed afterwards to attach the results to the same sentences.
    for sentences, sentences_copy in zip(batches, batches_copy):
        inputs: List[Dict[str, str]] = [{
            "sentence": s.text
        } for s in sentences]
        results: Dict[str, List[Dict[str, Any]]] = {
            k: p.predict_batch_json(inputs)
            for k, p in self.predictor.items()
        }
        for i, sent in enumerate(sentences_copy):
            # Merge the outputs of all predictors for sentence `i`.
            result: Dict[str, List[str]] = {}
            for key in self.predictor:
                if key == "srl":
                    result.update(
                        parse_allennlp_srl_results(
                            results[key][i]["verbs"]))
                else:
                    result.update(results[key][i])
            if "tokenize" in self.configs.processors:
                # creating new tokens and dependencies
                tokens = self._create_tokens(input_pack, sent, result)
                if "depparse" in self.configs.processors:
                    self._create_dependencies(input_pack, tokens, result)
                if "srl" in self.configs.processors:
                    self._create_srl(input_pack, tokens, result)
def _process(self, input_pack: DataPack):
    """Buffer one pack and flush the buffer when it reaches batch size.

    The pack is stored in ``self.documents`` as a tuple of its id (as a
    string), its text and its serialized form. When the buffer length
    equals ``config.batch_size``, ``self._bulk_process`` is called and the
    buffer is replaced by a new empty list.

    Args:
        input_pack: The pack to buffer.
    """
    serialized: str = input_pack.serialize()
    record = (str(input_pack.pack_id), input_pack.text, serialized)
    self.documents.append(record)

    if len(self.documents) != self.config.batch_size:
        return

    self._bulk_process()
    self.documents = []
def create_utterance(input_pack: DataPack, text: str, speaker: str):
    """Append an utterance to the data pack.

    This is composed of three steps:

    1. Append a newline and the utterance text to the data pack text.
    2. Create a :class:`~ft.onto.base_ontology.Utterance` entry over the
       appended utterance text, i.e. the last ``len(text)`` characters of
       the pack text.
    3. Set the speaker of the utterance to the provided `speaker`.

    Args:
        input_pack: The data pack to add utterance into.
        text: The text of the utterance.
        speaker: The speaker name to be associated with the utterance.
    """
    combined = input_pack.text + '\n' + text
    input_pack.set_text(combined)

    # Measure the stored text after the update to locate the new span.
    span_end = len(input_pack.text)
    span_begin = span_end - len(text)

    utterance = Utterance(input_pack, span_begin, span_end)
    utterance.speaker = speaker