def __init__(self, config: Dict, vocab: Optional[Vocabulary] = None):
    super().__init__(vocab=vocab or vocabulary.create_empty_vocabulary())

    # saves the config in the pl checkpoints
    self.save_hyperparameters("config")

    config = PipelineConfiguration.from_dict(config)

    tokenizer = config.build_tokenizer()
    featurizer = config.features.compile_featurizer(tokenizer)
    embedder = config.build_embedder(self.vocab)
    head = config.head.compile(
        backbone=ModelBackbone(
            self.vocab,
            featurizer=featurizer,
            embedder=embedder,
            encoder=config.encoder,
        )
    )

    self.name = config.name
    self._head = None
    self.set_head(head)

    self.file_path: Optional[str] = None
    self.optimizer: Optional[torch.optim.Optimizer] = None
    # The lr_scheduler dict follows the Lightning format:
    # https://pytorch-lightning.readthedocs.io/en/stable/common/optimizers.html#learning-rate-scheduling
    self.lr_scheduler: Optional[Dict] = None
    self.best_metrics: Optional[Dict[str, torch.Tensor]] = None

    # These are set by our trainer to figure out the best_metrics:
    # which metric should be monitored?
    self.monitor: Optional[str] = None
    # should the metric increase ("max") or decrease ("min")?
    self.monitor_mode: Optional[str] = None

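# A minimal usage sketch (an assumption, not from the source): building a
# PipelineModel straight from a configuration dict. The dict schema shown
# (name/features/head) is illustrative of what PipelineConfiguration.from_dict
# expects; exact keys may differ.
example_config = {
    "name": "example_model",
    "features": {"word": {"embedding_dim": 2}},
    "head": {"type": "TextClassification", "labels": ["positive", "negative"]},
}
model = PipelineModel(config=example_config)
assert model.name == "example_model"
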
def test_pipeline_without_word_features():
    tokenizer_config = TokenizerConfiguration()
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_spec = TaskHeadConfiguration(
        type="TextClassification",
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="no_word_features",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)

    assert "word" not in pl.backbone.featurizer.indexer
    assert "char" in pl.backbone.featurizer.indexer

@classmethod
def from_config(
    cls,
    config: Union[PipelineConfiguration, dict],
) -> "Pipeline":
    """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

    Parameters
    ----------
    config
        A `PipelineConfiguration` object or a configuration dict

    Returns
    -------
    pipeline
        A configured pipeline
    """
    if isinstance(config, PipelineConfiguration):
        config = config.as_dict()

    model = PipelineModel(config=config)
    if not isinstance(model, PipelineModel):
        raise TypeError(f"Cannot load model. Wrong format of {model}")

    cls._add_transformers_vocab_if_needed(model)

    return cls(model, PipelineConfiguration.from_dict(config))

@classmethod
def from_config(
    cls,
    config: Union[PipelineConfiguration, dict],
    vocab_path: Optional[str] = None,
) -> "Pipeline":
    """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

    Parameters
    ----------
    config: `Union[PipelineConfiguration, dict]`
        A `PipelineConfiguration` object or a configuration dict
    vocab_path: `Optional[str]`
        If provided, the pipeline vocabulary will be loaded from this path

    Returns
    -------
    pipeline: `Pipeline`
        A configured pipeline
    """
    if isinstance(config, dict):
        config = PipelineConfiguration.from_dict(config)

    model = PipelineModel.from_params(
        Params({"config": config}),
        vocab=Vocabulary.from_files(vocab_path) if vocab_path is not None else None,
    )
    if not isinstance(model, PipelineModel):
        raise TypeError(f"Cannot load model. Wrong format of {model}")

    cls._add_transformers_vocab_if_needed(model)

    return cls(model, config)

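# A hedged usage sketch (not from the source): creating a pipeline from a plain
# dict, optionally restoring a vocabulary previously written with AllenNLP's
# `Vocabulary.save_to_files`. The config keys shown are illustrative.
pipeline = Pipeline.from_config(
    {
        "name": "example_pipeline",
        "features": {"word": {"embedding_dim": 2}},
        "head": {"type": "TextClassification", "labels": ["yes", "no"]},
    },
    vocab_path=None,  # or e.g. "output/vocabulary", a Vocabulary.save_to_files dir
)
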
def test_pipeline_config(pipeline_yaml):
    tokenizer_config = TokenizerConfiguration(
        text_cleaning={"rules": ["strip_spaces"]}, use_spacy_tokens=True
    )
    word_features = WordFeatures(embedding_dim=2, lowercase_tokens=True)
    char_features = CharFeatures(
        embedding_dim=2,
        encoder={
            "type": "gru",
            "hidden_size": 2,
            "num_layers": 1,
            "bidirectional": True,
        },
        dropout=0.1,
    )
    features_config = FeaturesConfiguration(word=word_features, char=char_features)
    encoder_spec = Seq2SeqEncoderConfiguration(
        type="gru", hidden_size=2, num_layers=1, bidirectional=True
    )
    head_spec = TaskHeadConfiguration(
        type=TextClassification,
        labels=["duplicate", "not_duplicate"],
        pooler={"type": "boe"},
    )

    pipeline_config = PipelineConfiguration(
        name="test_pipeline_config",
        head=head_spec,
        features=features_config,
        tokenizer=tokenizer_config,
        encoder=encoder_spec,
    )

    pl = Pipeline.from_config(pipeline_config)
    pl_yaml = Pipeline.from_yaml(pipeline_yaml)

    assert pl.named_trainable_parameters == pl_yaml.named_trainable_parameters
    assert pl.num_trainable_parameters == pl_yaml.num_trainable_parameters
    assert pl.num_parameters == pl_yaml.num_parameters

    sample_text = "My simple text"
    for instance in [
        pl.backbone.featurizer(sample_text),
        pl_yaml.backbone.featurizer(sample_text),
    ]:
        for key, value in instance.items():
            assert key == "record"
            assert isinstance(value, ListField)
            assert len(value) == 1
            for text in value:
                assert isinstance(text, TextField)
                assert all(map(lambda t: isinstance(t, Token), text.tokens))
                assert sample_text == " ".join([t.text for t in text.tokens])

@classmethod
def from_yaml(cls, path: str) -> "Pipeline":
    """Creates a pipeline from a config yaml file

    Parameters
    ----------
    path
        The path to a YAML configuration file

    Returns
    -------
    pipeline
        A configured pipeline
    """
    pipeline_configuration = PipelineConfiguration.from_yaml(path)

    return cls.from_config(pipeline_configuration)

@classmethod
def from_yaml(cls, path: str, vocab_path: Optional[str] = None) -> "Pipeline":
    """Creates a pipeline from a config yaml file

    Parameters
    ----------
    path : `str`
        The path to a YAML configuration file
    vocab_path : `Optional[str]`
        If provided, the pipeline vocabulary will be loaded from this path

    Returns
    -------
    pipeline: `Pipeline`
        A configured pipeline
    """
    pipeline_configuration = PipelineConfiguration.from_yaml(path)

    return cls.from_config(pipeline_configuration, vocab_path=vocab_path)

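# A hedged sketch (assumed, not from the source): the YAML counterpart of the
# configurations built programmatically in the tests above. The schema is
# presumed to mirror the PipelineConfiguration fields; this sample is
# illustrative only.
yaml_source = """
name: example_from_yaml
features:
  char:
    embedding_dim: 2
    encoder:
      type: gru
      hidden_size: 2
head:
  type: TextClassification
  labels: ["duplicate", "not_duplicate"]
"""
with open("pipeline.yml", "w") as f:
    f.write(yaml_source)

pipeline = Pipeline.from_yaml("pipeline.yml")
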
@classmethod
def from_config(
    cls,
    config: Union[PipelineConfiguration, dict],
    vocab_path: Optional[str] = None,
) -> "Pipeline":
    """Creates a pipeline from a `PipelineConfiguration` object or a configuration dictionary

    Parameters
    ----------
    config: `Union[PipelineConfiguration, dict]`
        A `PipelineConfiguration` object or a configuration dict
    vocab_path: `Optional[str]`
        If provided, the pipeline vocabulary will be loaded from this path

    Returns
    -------
    pipeline: `Pipeline`
        A configured pipeline
    """
    if isinstance(config, dict):
        config = PipelineConfiguration.from_dict(config)

    return _BlankPipeline(
        config=config, vocab=vocabulary.load_vocabulary(vocab_path)
    )

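# Hedged round-trip sketch (not from the source): persist the pipeline's
# vocabulary with AllenNLP's `Vocabulary.save_to_files`, then rebuild a blank
# pipeline from the same config with that vocabulary preloaded. The
# `backbone.vocab` attribute is assumed from the ModelBackbone construction above.
pipeline.backbone.vocab.save_to_files("output/vocabulary")
restored = Pipeline.from_config(pipeline_config, vocab_path="output/vocabulary")
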
@staticmethod
def _config_from_archive(archive: Archive) -> PipelineConfiguration:
    # The pipeline configuration is stored under the model section of the
    # archive's training config
    config = archive.config["model"]["config"]

    return PipelineConfiguration.from_params(config)

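# A hedged sketch (assumption, not from the source): recovering the pipeline
# configuration from a trained model archive. `load_archive` is AllenNLP's
# standard reader for `model.tar.gz` files; the archive path is illustrative.
from allennlp.models.archival import load_archive

archive = load_archive("model.tar.gz")
pipeline_config = Pipeline._config_from_archive(archive)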