示例#1
0
    def from_config(
        cls,
        config: Config,
        feature_config: ModelInputConfig,
        target_config: TargetConfig,
        **kwargs,
    ):
        """Build a data handler from its own config plus input/target configs.

        Feature fields are produced by ``create_fields`` and label fields by
        ``create_label_fields``; a fixed set of extra fields is always added.

        Args:
            config (Config): Handler configuration. ``config.columns_to_read``
                becomes ``raw_columns`` and every pair from ``config.items()``
                is forwarded to the constructor as a keyword argument,
                replacing any same-named entry in ``kwargs``.
            feature_config (ModelInputConfig): Configuration object specifying
                the model input.
            target_config (TargetConfig): Configuration object specifying the
                target.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.

        """
        # Field class to use for each model-input name that may be configured.
        input_field_types = {
            ModelInput.TEXT: TextFeatureField,
            ModelInput.DICT: DictFeatureField,
            ModelInput.CHAR: CharFeatureField,
            ModelInput.CONTEXTUAL_TOKEN_EMBEDDING: ContextualTokenEmbeddingField,
            ModelInput.SEQ: SeqFeatureField,
            ModelInput.DENSE: FloatVectorField,
        }
        feature_fields = create_fields(feature_config, input_field_types)

        # Field class to use for each label config name.
        label_field_types = {
            DocLabelConfig._name: DocLabelField,
            WordLabelConfig._name: WordLabelField,
        }
        label_fields = create_label_fields(target_config, label_field_types)

        auxiliary_fields = {
            ExtraField.DOC_WEIGHT: FloatField(),
            ExtraField.WORD_WEIGHT: FloatField(),
            ExtraField.RAW_WORD_LABEL: RawField(),
            ExtraField.TOKEN_RANGE: RawField(),
            ExtraField.UTTERANCE: RawField(),
        }

        # Config entries win over caller-supplied keyword arguments.
        kwargs.update(config.items())
        return cls(
            raw_columns=config.columns_to_read,
            labels=label_fields,
            features=feature_fields,
            extra_fields=auxiliary_fields,
            **kwargs,
        )
示例#2
0
    def from_config(cls, config: Config, feature_config: FeatureConfig, *args,
                    **kwargs):
        """Build a data handler whose action field is both a feature and label.

        Args:
            config (Config): Handler configuration; its paths, batch sizes,
                ``shuffle``, ``sort_within_batch``, ``column_mapping`` and
                ``columns_to_read`` are passed to the constructor.
            feature_config (FeatureConfig): Feature configuration. ``word_feat``
                parameterizes the text field; ``dict_feat`` and
                ``contextual_token_embedding`` add optional fields.
            *args: Accepted but not used by this method.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.
        """
        word_cfg = feature_config.word_feat
        text_field = TextFeatureFieldWithSpecialUnk(
            pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
            embed_dim=word_cfg.embed_dim,
            embedding_init_strategy=word_cfg.embedding_init_strategy,
            vocab_file=word_cfg.vocab_file,
            vocab_size=word_cfg.vocab_size,
            vocab_from_train_data=word_cfg.vocab_from_train_data,
            vocab_from_all_data=word_cfg.vocab_from_all_data,
            min_freq=word_cfg.min_freq,
            pad_token=None,
        )
        feature_fields = {DatasetFieldName.TEXT_FIELD: text_field}

        # The dictionary feature is only added when it has a positive
        # embedding dimension.
        if feature_config.dict_feat and feature_config.dict_feat.embed_dim > 0:
            feature_fields[DatasetFieldName.DICT_FIELD] = DictFeatureField()

        # The action field is listed among the features so that it can be
        # passed to RNNGParser's forward method during training time; the very
        # same field object is reused as the label below.
        shared_action_field = ActionField()
        feature_fields[ACTION_FEATURE_FIELD] = shared_action_field

        contextual_cfg = feature_config.contextual_token_embedding
        if contextual_cfg:
            contextual_field = ContextualTokenEmbeddingField(
                embed_dim=contextual_cfg.embed_dim
            )
            feature_fields[
                DatasetFieldName.CONTEXTUAL_TOKEN_EMBEDDING
            ] = contextual_field

        raw_passthrough_fields = {
            DatasetFieldName.TOKENS: RawField(),
            "text": RawField(),
        }

        return cls(
            raw_columns=config.columns_to_read,
            features=feature_fields,
            labels={ACTION_LABEL_FIELD: shared_action_field},
            extra_fields=raw_passthrough_fields,
            train_path=config.train_path,
            eval_path=config.eval_path,
            test_path=config.test_path,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            test_batch_size=config.test_batch_size,
            shuffle=config.shuffle,
            sort_within_batch=config.sort_within_batch,
            column_mapping=config.column_mapping,
            **kwargs,
        )
示例#3
0
    def from_config(
        cls,
        config: Config,
        model_input_config: ModelInputConfig,
        target_config: TargetConfig,
        **kwargs,
    ):
        """
        Factory method that builds an instance of ``cls`` from the handler
        config, the model input config and the target config.

        Args:
            config (Config): Handler configuration. ``config.columns_to_read``
                becomes ``raw_columns`` and every pair from ``config.items()``
                is forwarded to the constructor as a keyword argument,
                replacing any same-named entry in ``kwargs``.
            model_input_config (ModelInputConfig): Configuration object
                specifying the model inputs.
            target_config (TargetConfig): Configuration object specifying the
                target. When ``target_config.target_prob`` is truthy, raw
                fields for target probabilities and logits are added to the
                labels and a raw target-label field to the extra fields.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.
        """
        # Field class to use for each model-input name that may be configured.
        input_field_types = {
            ModelInput.WORD_FEAT: TextFeatureField,
            ModelInput.DICT_FEAT: DictFeatureField,
            ModelInput.CHAR_FEAT: CharFeatureField,
            ModelInput.PRETRAINED_MODEL_EMBEDDING: PretrainedModelEmbeddingField,
        }
        feature_fields = create_fields(model_input_config, input_field_types)
        label_fields = create_label_fields(
            target_config,
            {DocLabelConfig._name: DocLabelField},
        )
        auxiliary_fields = {ExtraField.RAW_TEXT: RawField()}

        if target_config.target_prob:
            label_fields[Target.TARGET_PROB_FIELD] = RawField()
            label_fields[Target.TARGET_LOGITS_FIELD] = RawField()
            auxiliary_fields[Target.TARGET_LABEL_FIELD] = RawField()

        # Config entries win over caller-supplied keyword arguments.
        kwargs.update(config.items())
        return cls(
            raw_columns=config.columns_to_read,
            labels=label_fields,
            features=feature_fields,
            extra_fields=auxiliary_fields,
            **kwargs,
        )
示例#4
0
    def from_config(
        cls,
        config: Config,
        feature_config: FeatureConfig,
        label_configs: Union[DocLabelConfig, WordLabelConfig,
                             List[TargetConfigBase]],
        **kwargs,
    ):
        """Build a data handler for document and/or word label targets.

        Args:
            config (Config): Handler configuration. ``config.columns_to_read``
                becomes ``raw_columns`` and every pair from ``config.items()``
                is forwarded to the constructor as a keyword argument,
                replacing any same-named entry in ``kwargs``.
            feature_config (FeatureConfig): Feature configuration handed to
                ``create_fields``.
            label_configs: One label config or a list of them, handed to
                ``create_label_fields``.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.
        """
        # Field class to use for each feature name that may be configured.
        feature_field_types = {
            DatasetFieldName.TEXT_FIELD: TextFeatureField,
            DatasetFieldName.DICT_FIELD: DictFeatureField,
            DatasetFieldName.CHAR_FIELD: CharFeatureField,
            DatasetFieldName.DENSE_FIELD: FloatVectorField,
            DatasetFieldName.PRETRAINED_MODEL_EMBEDDING: PretrainedModelEmbeddingField,
        }
        feature_fields = create_fields(feature_config, feature_field_types)

        # Field class to use for each label config name.
        label_field_types = {
            DocLabelConfig._name: DocLabelField,
            WordLabelConfig._name: WordLabelField,
        }
        label_fields = create_label_fields(label_configs, label_field_types)
        needs_raw_word_label = WordLabelConfig._name in label_fields

        auxiliary_fields = {
            DatasetFieldName.DOC_WEIGHT_FIELD: FloatField(),
            DatasetFieldName.WORD_WEIGHT_FIELD: FloatField(),
            DatasetFieldName.TOKEN_RANGE: RawField(),
            DatasetFieldName.UTTERANCE_FIELD: RawField(),
        }
        # The raw word label is only carried along when a word label field
        # was actually created.
        if needs_raw_word_label:
            auxiliary_fields[DatasetFieldName.RAW_WORD_LABEL] = RawField()

        # Config entries win over caller-supplied keyword arguments.
        kwargs.update(config.items())
        return cls(
            raw_columns=config.columns_to_read,
            labels=label_fields,
            features=feature_fields,
            extra_fields=auxiliary_fields,
            **kwargs,
        )
    def from_config(cls, config: Config, feature_config: FeatureConfig,
                    label_config: DocLabelConfig, **kwargs):
        """Build a data handler with one sequence text feature and a doc label.

        Args:
            config (Config): Handler configuration; its ``columns_to_read``,
                ``shuffle``, data paths and batch sizes are passed to the
                constructor.
            feature_config (FeatureConfig): Feature configuration; only
                ``word_feat`` is read, to parameterize the text field.
            label_config (DocLabelConfig): Accepted but not read by this
                method; the label field is always a default ``DocLabelField``.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.
        """
        word_cfg = feature_config.word_feat
        sequence_text_field = SeqFeatureField(
            pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
            embed_dim=word_cfg.embed_dim,
            embedding_init_strategy=word_cfg.embedding_init_strategy,
            vocab_file=word_cfg.vocab_file,
            vocab_size=word_cfg.vocab_size,
            vocab_from_train_data=word_cfg.vocab_from_train_data,
        )
        feature_fields = {DatasetFieldName.TEXT_FIELD: sequence_text_field}
        label_fields = {DocLabelConfig._name: DocLabelField()}
        auxiliary_fields = {DatasetFieldName.UTTERANCE_FIELD: RawField()}

        return cls(
            raw_columns=config.columns_to_read,
            labels=label_fields,
            features=feature_fields,
            extra_fields=auxiliary_fields,
            shuffle=config.shuffle,
            train_path=config.train_path,
            eval_path=config.eval_path,
            test_path=config.test_path,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            test_batch_size=config.test_batch_size,
            **kwargs,
        )
    def from_config(
        cls,
        config: Config,
        feature_config: ModelInputConfig,
        target_config: TargetConfig,
        **kwargs,
    ):
        """Build a data handler for a pair of text inputs sharing one field.

        Args:
            config (Config): Handler configuration. ``config.columns_to_read``
                becomes ``raw_columns`` and every pair from ``config.items()``
                is forwarded to the constructor as a keyword argument,
                replacing any same-named entry in ``kwargs``.
            feature_config (ModelInputConfig): Model input configuration
                handed to ``create_fields``; exactly two feature fields must
                result from it.
            target_config (TargetConfig): Target configuration handed to
                ``create_label_fields``.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.

        Raises:
            AssertionError: If ``create_fields`` does not yield two fields.
        """
        text_field_types = {
            ModelInput.TEXT1: TextFeatureField,
            ModelInput.TEXT2: TextFeatureField
        }
        feature_fields = create_fields(feature_config, text_field_types)
        assert len(feature_fields) == 2
        # Both inputs are processed by the very same field object: the second
        # entry is replaced by the first one.
        feature_fields[ModelInput.TEXT2] = feature_fields[ModelInput.TEXT1]

        label_fields = create_label_fields(
            target_config,
            {DocLabelConfig._name: DocLabelField},
        )
        auxiliary_fields = {ExtraField.UTTERANCE_PAIR: RawField()}

        # Config entries win over caller-supplied keyword arguments.
        kwargs.update(config.items())
        return cls(
            raw_columns=config.columns_to_read,
            labels=label_fields,
            features=feature_fields,
            extra_fields=auxiliary_fields,
            **kwargs,
        )
示例#7
0
  def from_config(cls, config: Config,
                  feature_config: ModelInputConfig,
                  target_config: ModelOutputConfig,
                  text_embedder_config: EmbedderInterface.Config,
                  **kwargs):
    """Build a data handler whose input and target share one text embedder.

    Args:
      config (Config): Handler configuration. Every pair from
        ``config.items()`` is forwarded to the constructor as a keyword
        argument (replacing same-named ``kwargs``), and several of its
        attributes are copied onto the new instance afterwards.
      feature_config (ModelInputConfig): Accepted but not read here.
      target_config (ModelOutputConfig): Accepted but not read here.
      text_embedder_config (EmbedderInterface.Config): Configuration used to
        create the text embedder; also stored on the instance.
      **kwargs: Extra keyword arguments forwarded to the constructor.

    Returns:
      The newly constructed instance of ``cls``.
    """
    text_embedder: EmbedderInterface = EmbedderInterface.from_config(text_embedder_config)

    input_fields = {ModelInput.SEQ: BPEField(text_embedder)}
    assert input_fields

    target_fields = {
      ModelOutputConfig._name: BPEField(
        text_embedder,
        is_target=True,
        all_responses=config.all_responses,
      ),
    }
    raw_passthrough_fields = {
      RAW_TEXT: RawField(),
      ModelInput.DLG_LEN: RawField(),
      ModelInput.DLG_ID: RawField(),
      ModelInput.DOMAIN_ID: RawField(),
      ModelInput.TASK_ID: RawField(),
    }

    # Config entries win over caller-supplied keyword arguments.
    kwargs.update(config.items())
    handler = cls(
      raw_columns=[],  # ignored in our read function
      features=input_fields,
      labels=target_fields,
      extra_fields=raw_passthrough_fields,
      **kwargs,
    )

    # Attach the remaining settings directly to the instance.
    handler.max_turns = config.max_turns
    handler.text_embedder_cfg = text_embedder_config
    handler.all_responses = config.all_responses
    handler.preproc_chunksize = config.preproc_chunksize
    handler.train_domains = config.train_domains
    handler.eval_domains = config.eval_domains
    handler.featurized_cache_dir = config.featurized_cache_dir
    handler.test_domains = config.test_domains
    handler.text_embedder = text_embedder
    handler.seed = config.seed
    return handler
示例#8
0
    def from_config(cls, config: Config, feature_config: FeatureConfig, *args,
                    **kwargs):
        """
        Factory method that builds an instance of ``cls`` from the handler
        config and the feature config.

        Args:
            config (Config): Handler configuration; its ``columns_to_read``,
                data paths and batch sizes are passed to the constructor, and
                ``append_eos`` / ``append_bos`` decide whether the text field
                gets an end / initial token.
            feature_config (FeatureConfig): Feature configuration; only
                ``word_feat`` is read, to parameterize the text field.
            *args: Accepted but not used by this method.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.
        """
        # Language modeling takes a collection of utterances as its only
        # input. No label field is declared here: per the original note, the
        # data handler derives inputs and labels itself, with the input at
        # time step t+1 serving as the label for the input at time step t.
        word_cfg = feature_config.word_feat
        eos_token = VocabMeta.EOS_TOKEN if config.append_eos else None
        init_token = VocabMeta.INIT_TOKEN if config.append_bos else None
        text_field = TextFeatureField(
            eos_token=eos_token,
            init_token=init_token,
            pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
            embed_dim=word_cfg.embed_dim,
            embedding_init_strategy=word_cfg.embedding_init_strategy,
            vocab_file=word_cfg.vocab_file,
            vocab_size=word_cfg.vocab_size,
            vocab_from_train_data=word_cfg.vocab_from_train_data,
        )

        feature_fields = {DatasetFieldName.TEXT_FIELD: text_field}
        label_fields = {}
        auxiliary_fields = {DatasetFieldName.UTTERANCE_FIELD: RawField()}

        return cls(
            raw_columns=config.columns_to_read,
            features=feature_fields,
            labels=label_fields,
            extra_fields=auxiliary_fields,
            train_path=config.train_path,
            eval_path=config.eval_path,
            test_path=config.test_path,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            test_batch_size=config.test_batch_size,
            **kwargs,
        )
示例#9
0
    def __init__(
        self,
        raw_columns: List[str],
        labels: Dict[str, Field],
        features: Dict[str, Field],
        featurizer: Featurizer,
        extra_fields: Dict[str, Field] = None,
        text_feature_name: str = DatasetFieldName.TEXT_FIELD,
        shuffle: bool = True,
        sort_within_batch: bool = True,
        train_path: str = "train.tsv",
        eval_path: str = "eval.tsv",
        test_path: str = "test.tsv",
        train_batch_size: int = 128,
        eval_batch_size: int = 128,
        test_batch_size: int = 128,
        max_seq_len: int = -1,
        pass_index: bool = True,
        column_mapping: Dict[str, str] = None,
        **kwargs,
    ) -> None:
        """Store the field definitions and data-loading settings.

        Args:
            raw_columns: Names of the raw columns; a falsy value is replaced
                by an empty list.
            labels: Label fields by name; a falsy value is replaced by an
                empty dict.
            features: Feature fields by name; a falsy value is replaced by an
                empty dict.
            featurizer: Featurizer object, stored as given.
            extra_fields: Additional fields by name. The mapping is copied, so
                the caller's dict is never modified.
            text_feature_name: Name of the text feature, stored as given.
            shuffle: Stored as ``self.shuffle``.
            sort_within_batch: Stored as ``self.sort_within_batch``.
            train_path: Stored as ``self.train_path``.
            eval_path: Stored as ``self.eval_path``.
            test_path: Stored as ``self.test_path``.
            train_batch_size: Stored as ``self.train_batch_size``.
            eval_batch_size: Stored as ``self.eval_batch_size``.
            test_batch_size: Stored as ``self.test_batch_size``.
            max_seq_len: Stored as ``self.max_seq_len``.
            pass_index: When true, a ``RawField`` is registered in the extra
                fields under ``BatchContext.INDEX``.
            column_mapping: Stored as ``self.column_mapping``.
            **kwargs: Accepted and ignored, so that callers may pass
                additional settings without raising.
        """
        self.raw_columns: List[str] = raw_columns or []
        self.labels: Dict[str, Field] = labels or {}
        self.features: Dict[str, Field] = features or {}
        self.featurizer = featurizer
        # Work on a shallow copy: the index field inserted below must not leak
        # into the dict object owned by the caller.
        self.extra_fields: Dict[str, Field] = (
            dict(extra_fields) if extra_fields else {}
        )
        if pass_index:
            self.extra_fields[BatchContext.INDEX] = RawField()
        self.text_feature_name: str = text_feature_name

        self.metadata_cls: Type = CommonMetadata
        self.metadata: CommonMetadata = CommonMetadata()
        self._data_cache: MutableMapping[str, Any] = {}
        self.shuffle = shuffle
        self.sort_within_batch = sort_within_batch
        # One worker per CPU reported by the operating system.
        self.num_workers = multiprocessing.cpu_count()
        self.max_seq_len = max_seq_len

        self.train_path = train_path
        self.eval_path = eval_path
        self.test_path = test_path
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size
        self.test_batch_size = test_batch_size
        self.column_mapping = column_mapping
        log_class_usage(__class__)
示例#10
0
    def from_config(cls, config: Config, feature_config: FeatureConfig,
                    label_config: DocLabelConfig, **kwargs):
        """Build a data handler with a sequence text feature, an optional
        dense feature and a doc label.

        Args:
            config (Config): Handler configuration; its ``columns_to_read``,
                ``shuffle``, data paths and batch sizes are passed to the
                constructor.
            feature_config (FeatureConfig): Feature configuration.
                ``word_feat`` parameterizes the text field; a truthy
                ``dense_feat`` adds a ``FloatVectorField``.
            label_config (DocLabelConfig): Accepted but not read by this
                method; the label field is always a default ``DocLabelField``.
            **kwargs: Extra keyword arguments forwarded to the constructor.

        Returns:
            The object returned by calling ``cls`` with the assembled fields.
        """
        word_cfg = feature_config.word_feat
        dense_cfg = feature_config.dense_feat

        sequence_text_field = SeqFeatureField(
            pretrained_embeddings_path=word_cfg.pretrained_embeddings_path,
            embed_dim=word_cfg.embed_dim,
            embedding_init_strategy=word_cfg.embedding_init_strategy,
            vocab_file=word_cfg.vocab_file,
            vocab_size=word_cfg.vocab_size,
            vocab_from_train_data=word_cfg.vocab_from_train_data,
        )
        feature_fields = {ModelInput.WORD_FEAT: sequence_text_field}

        # The dense feature is optional and only added when configured.
        if dense_cfg:
            dense_field = FloatVectorField(
                dim=dense_cfg.dim,
                dim_error_check=dense_cfg.dim_error_check,
            )
            feature_fields[ModelInput.DENSE_FEAT] = dense_field

        label_fields = {DocLabelConfig._name: DocLabelField()}
        auxiliary_fields = {DatasetFieldName.UTTERANCE_FIELD: RawField()}

        return cls(
            raw_columns=config.columns_to_read,
            labels=label_fields,
            features=feature_fields,
            extra_fields=auxiliary_fields,
            shuffle=config.shuffle,
            train_path=config.train_path,
            eval_path=config.eval_path,
            test_path=config.test_path,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            test_batch_size=config.test_batch_size,
            **kwargs,
        )