Example #1
    def init_target_metadata(
        self,
        train_data: textdata.Dataset,
        eval_data: textdata.Dataset,
        test_data: textdata.Dataset,
    ):
        self.metadata.target = []
        # build vocabs for label fields
        for name, label in self.labels.items():
            if name in [Target.TARGET_PROB_FIELD, Target.TARGET_LOGITS_FIELD]:
                continue
            # Include test data so the vocab covers every label that appears in it.
            # This is particularly important when BIO tagging is enabled: B-[Label]
            # may appear in train and eval, while test can contain both B-[Label]
            # and I-[Label].
            weights = None
            if label.use_vocab:
                if not hasattr(label, "vocab"):  # Don't rebuild vocab
                    print("Building vocab for label {}".format(name))
                    label.build_vocab(
                        train_data,
                        eval_data,
                        test_data,
                        min_freq=getattr(label, "min_freq", 1),
                    )
                else:
                    print(f"Vocab for label {name} has been built. Not rebuilding.")
                print(f"{name} field's vocabulary size is {len(label.vocab.itos)}")
                pretrained_embeddings = None
                pretrained_embeddings_path = getattr(
                    label, "pretrained_embeddings_path", None
                )
                if pretrained_embeddings_path:
                    pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                        pretrained_embeddings_path
                    )
                if pretrained_embeddings:
                    weights = pretrained_embeddings.initialize_embeddings_weights(
                        label.vocab.stoi,
                        label.unk_token,
                        label.embed_dim,
                        label.embedding_init_strategy,
                    )  # this is of type torch.Tensor

            meta = label.get_meta()
            meta.pretrained_embeds_weight = weights
            self.metadata.target.append(meta)
        # If there is only one target, unwrap the list so metadata.target is a
        # single metadata object rather than a one-element list.
        if len(self.metadata.target) == 1:
            [self.metadata.target] = self.metadata.target
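
The essential step in this example is building the label vocabulary over the train, eval and test splits together. Below is a minimal standalone sketch of that pattern using the legacy torchtext Field/Dataset API that textdata refers to; the field name and the toy label data are assumptions made for illustration, not part of the original code.

# A minimal, standalone sketch of the vocab-building pattern above, based on
# the legacy torchtext Field/Dataset API (torchtext <= 0.8). The field name
# and the toy label data are illustrative assumptions.
from torchtext import data as textdata

label_field = textdata.Field(sequential=False, unk_token=None, pad_token=None)
fields = [("label", label_field)]

def make_split(labels):
    examples = [textdata.Example.fromlist([lab], fields) for lab in labels]
    return textdata.Dataset(examples, fields)

train_data = make_split(["B-PER", "O", "B-LOC"])
eval_data = make_split(["B-PER", "O"])
test_data = make_split(["I-PER", "O"])  # contains a label unseen in train/eval

# Passing all three splits ensures that labels seen only in test (here I-PER)
# still get an index, which is why the method above takes test_data as well.
label_field.build_vocab(train_data, eval_data, test_data, min_freq=1)
print(len(label_field.vocab.itos), label_field.vocab.stoi["I-PER"])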
Example #2
    def init_feature_metadata(
        self,
        train_data: textdata.Dataset,
        eval_data: textdata.Dataset,
        test_data: textdata.Dataset,
    ):
        # field metadata
        self.metadata.features = {}
        # build vocabs for features
        for name, feat in self.features.items():
            weights = None
            if feat.use_vocab:
                pretrained_embeddings = None
                pretrained_embeddings_path = getattr(
                    feat, "pretrained_embeddings_path", None
                )
                if pretrained_embeddings_path:
                    print(
                        f"Loading pretrained embeddings from {pretrained_embeddings_path}"
                    )
                    pretrained_embeddings = embeddings_utils.PretrainedEmbedding(
                        pretrained_embeddings_path, feat.lower
                    )

                if hasattr(feat, "vocab"):  # Don't rebuild vocab
                    print(f"Vocab for feature {name} has been built. Not rebuilding.")
                else:
                    print(f"Building vocab for feature {name}.")
                    # Choose which splits (and, possibly, pretrained-embedding
                    # tokens) the vocabulary should be built from.
                    vocab_data = self._get_data_to_build_vocab(
                        feat, train_data, eval_data, test_data, pretrained_embeddings
                    )
                    feat.build_vocab(*vocab_data, min_freq=feat.min_freq)
                print("{} field's vocabulary size is {}".format(name, len(feat.vocab)))

                # Initialize pretrained embedding weights.
                if pretrained_embeddings:
                    weights = pretrained_embeddings.initialize_embeddings_weights(
                        feat.vocab.stoi,
                        VocabMeta.UNK_TOKEN,
                        feat.embed_dim,
                        feat.embedding_init_strategy,
                    )  # this is of type torch.Tensor

            meta = feat.get_meta()
            meta.pretrained_embeds_weight = weights
            self.metadata.features[name] = meta
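
Both examples finish by converting a pretrained embedding into an initial weight tensor. The sketch below shows what such an initialization step typically looks like; it is an assumed illustration of the general pattern, not the actual embeddings_utils.PretrainedEmbedding.initialize_embeddings_weights implementation, and every name in it is hypothetical.

# Hypothetical sketch of an embedding-weight initialization step: rows for
# tokens present in the pretrained table are copied in, the rest are randomly
# initialized. This is NOT the actual embeddings_utils implementation.
import torch

def init_embedding_weights(stoi, pretrained_vectors, embed_dim):
    # stoi: token -> index mapping from the built vocabulary
    # pretrained_vectors: dict mapping token -> 1-D tensor of size embed_dim
    weights = torch.empty(len(stoi), embed_dim).normal_(0, 0.1)
    for token, idx in stoi.items():
        vec = pretrained_vectors.get(token)
        if vec is not None:
            weights[idx] = vec
    return weights  # torch.Tensor of shape (vocab_size, embed_dim)

vocab_stoi = {"<unk>": 0, "<pad>": 1, "hello": 2, "world": 3}
pretrained = {"hello": torch.ones(4), "world": torch.full((4,), 2.0)}
weights = init_embedding_weights(vocab_stoi, pretrained, embed_dim=4)
print(weights.shape)  # torch.Size([4, 4])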