def parse_and_save(self, formalism: str, input_file: str, output_file: str) -> None:
    """
    Parses an input file and saves the result to the given output file.
    Old content of the output file will be overwritten.

    :param formalism: the name of the formalism of the input_file (e.g. DM, PSD, ...)
    :param input_file: path to the amconll file to parse
    :param output_file: path the decoded output is written to
    :return: None
    """
    assert self.model, "model must be given, either to the constructor or to set_model"
    # The amconll dataset reader needs to know the formalism of the file it reads.
    instances = self.dataset_reader.read([[formalism, input_file]])

    prev_training_status = self.model.training
    self.model.train(False)
    try:
        predictions = self.dataset_reader.restore_order(
            forward_on_instances(self.model, instances, self.data_iterator))
    finally:
        # Restore the training status even if the forward pass raises, so a
        # failed parse does not leave the model stuck in eval mode.
        self.model.train(prev_training_status)

    # Map edge-label vocabulary indices back to their string labels.
    i2edge_label = [
        self.model.vocab.get_token_from_index(i, namespace=formalism + "_head_tags")
        for i in range(self.model.vocab.get_vocab_size(formalism + "_head_tags"))
    ]
    decoder = AMDecoder(output_file, i2edge_label)

    for pred in predictions:
        attributes = pred["attributes"]
        am_sentence = AMSentence(pred["words"], attributes)
        # Per-token tuples of (form, replacement, lemma, pos, ne, range).
        sentence = list(zip(am_sentence.get_tokens(shadow_art_root=False),
                            am_sentence.get_replacements(),
                            am_sentence.get_lemmas(),
                            am_sentence.get_pos(),
                            am_sentence.get_ner(),
                            am_sentence.get_ranges()))
        decoder.add_sentence(pred["root"], pred["predicted_heads"],
                             pred["label_logits"], pred["lexlabels"],
                             pred["supertags"], sentence,
                             am_sentence.attributes_to_list())
    decoder.decode(self.threads, self.k, self.give_up, self.give_up_k_1)
def text_to_instance(self,  # type: ignore
                     formalism: str,
                     position_in_corpus: int,
                     am_sentence: AMSentence) -> Instance:
    # pylint: disable=arguments-differ
    """
    Converts a single AM sentence into an AllenNLP ``Instance``.

    Parameters
    ----------
    formalism : str.
        The formalism of this instance (e.g. DM, PSD, ...)
    position_in_corpus : ``int``, required.
        The index of this sentence in the corpus.
    am_sentence : ``AMSentence``, required.
        The words in the sentence to be encoded.

    Returns
    -------
    An instance containing words, pos tags, dependency edge labels, head indices,
    supertags and lexical labels as fields.
    """
    fields: Dict[str, Field] = {}

    tokens = TextField([Token(w) for w in am_sentence.get_tokens(shadow_art_root=True)],
                       self._token_indexers)
    fields["words"] = tokens

    # Every token-level annotation becomes a SequenceLabelField over the same
    # token sequence; build them uniformly from (field name, labels, namespace).
    label_specs = [
        ("pos_tags", am_sentence.get_pos(), "pos"),
        ("ner_tags", am_sentence.get_ner(), "ner_labels"),
        ("lemmas", am_sentence.get_lemmas(), "lemmas"),
        ("supertags", am_sentence.get_supertags(), formalism + "_supertag_labels"),
        ("lexlabels", am_sentence.get_lexlabels(), formalism + "_lex_labels"),
        ("head_tags", am_sentence.get_edge_labels(), formalism + "_head_tags"),  # edge labels
        ("head_indices", am_sentence.get_heads(), "head_index_tags"),
    ]
    for field_name, labels, namespace in label_specs:
        fields[field_name] = SequenceLabelField(labels, tokens, label_namespace=namespace)

    fields["metadata"] = MetadataField({
        "words": am_sentence.words,
        "attributes": am_sentence.attributes,
        "formalism": formalism,
        "position_in_corpus": position_in_corpus,
        "token_ranges": am_sentence.get_ranges(),
        "is_annotated": am_sentence.is_annotated(),
    })
    return Instance(fields)