Пример #1
0
 def parse_and_save(self, formalism: str, input_file: str,
                    output_file: str) -> None:
     """
     Parses an input file with the model and decodes the predictions via an AMDecoder
     constructed for the given output file. Old content will be overwritten.

     The model is put into evaluation mode while the instances are forwarded; its
     previous training status is restored afterwards, even if the forward pass fails.

     :param formalism: the name of the formalism of the input_file; also selects the
         "<formalism>_head_tags" vocabulary namespace used for the edge labels
     :param input_file: the file handed to the dataset reader
     :param output_file: the file handed to the AMDecoder
     :return: None
     :raises AssertionError: if no model has been set
     """
     # Explicit raise instead of a bare ``assert`` so the check is not stripped
     # under ``python -O``; the exception type stays the same for callers.
     if not self.model:
         raise AssertionError(
             "model must be given, either to the constructor or to set_model")

     # the amconll dataset_reader needs the formalism next to the file name
     instances = self.dataset_reader.read([[formalism, input_file]])

     prev_training_status = self.model.training
     self.model.train(False)
     try:
         predictions = self.dataset_reader.restore_order(
             forward_on_instances(self.model, instances, self.data_iterator))
     finally:
         # reset training status to whatever it was before, also on failure
         self.model.train(prev_training_status)

     # index -> edge label lookup list for this formalism's head tag namespace
     head_tag_namespace = formalism + "_head_tags"
     vocab = self.model.vocab
     i2edge_label = [
         vocab.get_token_from_index(i, namespace=head_tag_namespace)
         for i in range(vocab.get_vocab_size(head_tag_namespace))
     ]

     decoder = AMDecoder(output_file, i2edge_label)
     for pred in predictions:
         attributes = pred["attributes"]
         am_sentence = AMSentence(pred["words"], attributes)
         # one tuple per token: (form, replacement, lemma, pos, ne, range)
         sentence = list(
             zip(am_sentence.get_tokens(shadow_art_root=False),
                 am_sentence.get_replacements(), am_sentence.get_lemmas(),
                 am_sentence.get_pos(), am_sentence.get_ner(),
                 am_sentence.get_ranges()))
         decoder.add_sentence(pred["root"], pred["predicted_heads"],
                              pred["label_logits"], pred["lexlabels"],
                              pred["supertags"], sentence,
                              am_sentence.attributes_to_list())
     decoder.decode(self.threads, self.k, self.give_up, self.give_up_k_1)
Пример #2
0
    def text_to_instance(self,  # type: ignore
                         formalism: str,
                         position_in_corpus : int,
                         am_sentence: AMSentence) -> Instance:
        # pylint: disable=arguments-differ
        """
        Build an ``Instance`` from a single ``AMSentence``.

        Parameters
        ----------
        formalism : ``str``, required.
            The formalism of this instance (e.g. DM, PSD, ...). It prefixes the
            label namespaces of the supertag, lexical label and head tag fields.
        position_in_corpus : ``int``, required.
            The index of this sentence in the corpus; stored in the metadata.
        am_sentence : ``AMSentence``, required.
            The sentence whose tokens and annotations are turned into fields.

        Returns
        -------
        An instance with the fields ``words``, ``pos_tags``, ``ner_tags``,
        ``lemmas``, ``supertags``, ``lexlabels``, ``head_tags``,
        ``head_indices`` and ``metadata``.
        """
        word_tokens = [Token(form) for form in am_sentence.get_tokens(shadow_art_root=True)]
        words_field = TextField(word_tokens, self._token_indexers)

        fields: Dict[str, Field] = {"words": words_field}

        # (field name, getter supplying the labels, label namespace);
        # the getters are invoked inside the loop, in this order.
        label_specs = (
            ("pos_tags", am_sentence.get_pos, "pos"),
            ("ner_tags", am_sentence.get_ner, "ner_labels"),
            ("lemmas", am_sentence.get_lemmas, "lemmas"),
            ("supertags", am_sentence.get_supertags, formalism + "_supertag_labels"),
            ("lexlabels", am_sentence.get_lexlabels, formalism + "_lex_labels"),
            ("head_tags", am_sentence.get_edge_labels, formalism + "_head_tags"),  # edge labels
            ("head_indices", am_sentence.get_heads, "head_index_tags"),
        )
        for field_name, get_labels, namespace in label_specs:
            fields[field_name] = SequenceLabelField(get_labels(), words_field,
                                                    label_namespace=namespace)

        # Everything that is carried along unindexed goes into the metadata field.
        metadata = {
            "words": am_sentence.words,
            "attributes": am_sentence.attributes,
            "formalism": formalism,
            "position_in_corpus": position_in_corpus,
            "token_ranges": am_sentence.get_ranges(),
            "is_annotated": am_sentence.is_annotated(),
        }
        fields["metadata"] = MetadataField(metadata)

        return Instance(fields)