def __init__(self, embedding_name, *args, **kwargs):
    """Use this if you want to use pretrained embedding. See description
    of IntersectedVocab to get a list of the embeddings available from
    torchtext

    Parameters
    ----------
    embedding_name : str
        Name of the pretrained alias for the embedding to be used
    """
    self.type = "pretrained"

    if embedding_name not in vocab.pretrained_aliases:
        from mmf.common.registry import registry

        writer = registry.get("writer")
        error = "Unknown embedding type: %s" % embedding_name
        if writer is not None:
            writer.write(error, "error")
        raise RuntimeError(error)

    vector_cache = get_mmf_cache_dir()

    # First test loading the vectors in master so that everybody doesn't
    # download it in case it doesn't exist
    if is_master():
        vocab.pretrained_aliases[embedding_name](cache=vector_cache)
    synchronize()

    embedding = vocab.pretrained_aliases[embedding_name](cache=vector_cache)

    self.UNK_INDEX = 3
    self.stoi = defaultdict(lambda: self.UNK_INDEX)
    self.itos = {}

    self.itos[self.PAD_INDEX] = self.PAD_TOKEN
    self.itos[self.SOS_INDEX] = self.SOS_TOKEN
    self.itos[self.EOS_INDEX] = self.EOS_TOKEN
    self.itos[self.UNK_INDEX] = self.UNK_TOKEN

    self.stoi[self.SOS_TOKEN] = self.SOS_INDEX
    self.stoi[self.EOS_TOKEN] = self.EOS_INDEX
    self.stoi[self.PAD_TOKEN] = self.PAD_INDEX
    self.stoi[self.UNK_TOKEN] = self.UNK_INDEX

    # Reserve the first four rows for the special tokens, then append the
    # full pretrained vocabulary after them.
    self.vectors = torch.FloatTensor(
        len(self.itos.keys()) + len(embedding.itos), len(embedding.vectors[0])
    )

    for i in range(4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    index = 4
    for word in embedding.stoi:
        self.itos[index] = word
        self.stoi[word] = index
        actual_index = embedding.stoi[word]
        self.vectors[index] = embedding.vectors[actual_index]
        index += 1
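# Hedged usage sketch: the enclosing class name (PretrainedVocab here) and the
# alias are assumptions for illustration; any alias listed in torchtext's
# vocab.pretrained_aliases should work the same way.
#
#     pretrained_vocab = PretrainedVocab("glove.6B.50d")
#     index = pretrained_vocab.stoi["hello"]          # UNK_INDEX if missing
#     vector = pretrained_vocab.vectors[index]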
def _try_download(self):
    _is_master = is_master()

    if self._already_downloaded:
        return

    needs_download = False

    if not hasattr(self.config, "model_file"):
        if _is_master:
            warnings.warn(
                "'model_file' key is required but missing "
                "from FastTextProcessor's config."
            )
        needs_download = True

    # Use .get so a missing 'model_file' key falls through to the download
    # branch below instead of raising an attribute error.
    model_file = self.config.get("model_file", "")
    # If model_file is already an existing path don't join to cache dir
    if not PathManager.exists(model_file):
        model_file = os.path.join(get_mmf_cache_dir(), model_file)

    if not PathManager.exists(model_file):
        if _is_master:
            warnings.warn(f"No model file present at {model_file}.")
        needs_download = True

    if needs_download:
        logger.info("Downloading FastText bin")
        model_file = self._download_model()

    self.model_file = model_file
    self._already_downloaded = True
    synchronize()
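# Hedged config sketch: _try_download expects a `model_file` entry on the
# processor config; the file name below is an assumption, not a value shipped
# with this module. Relative names are resolved against get_mmf_cache_dir(),
# and a missing file triggers _download_model().
#
#     processor_config = OmegaConf.create({"model_file": "wiki.en.bin"})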
def restore(self):
    synchronize()
    logger.info("Restoring checkpoint")
    best_path = os.path.join(self.ckpt_foldername, self.ckpt_prefix + "best.ckpt")

    if PathManager.exists(best_path):
        self._load(best_path, force=True)
def load_requirements(self, *args, **kwargs):
    # Only the master rank downloads; every other rank waits at the barrier.
    if is_master():
        requirements = self.config.get("zoo_requirements", [])
        if isinstance(requirements, str):
            requirements = [requirements]
        for item in requirements:
            download_pretrained_model(item, *args, **kwargs)
    synchronize()
def download_pretrained_model(model_name, *args, **kwargs):
    import omegaconf
    from omegaconf import OmegaConf

    from mmf.utils.configuration import get_mmf_env, load_yaml

    model_zoo = load_yaml(get_mmf_env(key="model_zoo"))
    OmegaConf.set_struct(model_zoo, True)
    OmegaConf.set_readonly(model_zoo, True)

    data_dir = get_absolute_path(get_mmf_env("data_dir"))
    model_data_dir = os.path.join(data_dir, "models")
    download_path = os.path.join(model_data_dir, model_name)

    try:
        model_config = OmegaConf.select(model_zoo, model_name)
    except omegaconf.errors.OmegaConfBaseException as e:
        print(f"No such model name {model_name} defined in mmf zoo")
        raise e

    if "version" not in model_config or "resources" not in model_config:
        # Version and resources are not present, so try the defaults
        try:
            model_config = model_config.defaults
            download_path = os.path.join(model_data_dir, model_name + ".defaults")
        except omegaconf.errors.OmegaConfBaseException as e:
            print(
                f"Model name {model_name} doesn't specify 'resources' and 'version' "
                "while no defaults have been provided"
            )
            raise e

    # Download requirements if any are specified by the "zoo_requirements" field.
    # This can either be a list or a string.
    if "zoo_requirements" in model_config:
        requirements = model_config.zoo_requirements
        if isinstance(requirements, str):
            requirements = [requirements]
        for item in requirements:
            download_pretrained_model(item, *args, **kwargs)

    version = model_config.version
    resources = model_config.resources

    # Only the master rank downloads; other ranks wait at the barrier.
    if is_master():
        download_resources(resources, download_path, version)
    synchronize()

    return download_path
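# Hedged sketch of a zoo entry this function can consume; the model name, URL
# and file name are made up for illustration and the exact resource fields may
# differ in a real model_zoo file. An entry either carries `version`/`resources`
# directly or nests them under `defaults`, and may pull in other entries via
# `zoo_requirements`.
#
#     example_zoo = OmegaConf.create({
#         "my_model": {
#             "defaults": {
#                 "version": "1.0_2021_01_01",
#                 "resources": [
#                     {"url": "http://example.com/my_model.tar.gz",
#                      "file_name": "my_model.tar.gz"},
#                 ],
#             },
#             "zoo_requirements": ["some_other_model"],
#         }
#     })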
def build_dataset(self, config, dataset_type="train", *args, **kwargs):
    """
    Similar to the load function, used by MMF to build a dataset for the
    first time when it is not available. This internally calls the ``build``
    function. Override that function in your child class.

    Args:
        config (DictConfig): Configuration of this dataset loaded from config.
        dataset_type (str): Type of dataset, train|val|test

    .. warning::

        DO NOT OVERRIDE in child class. Instead override ``build``.
    """
    # Only build in main process, so none of the others have to build
    if is_master():
        self.build(config, dataset_type, *args, **kwargs)
    synchronize()
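# Hedged sketch of the intended override pattern: a child builder implements
# `build` (and typically `load`) but never `build_dataset` itself. The builder
# name, base class and download helper below are hypothetical.
#
#     @registry.register_builder("my_dataset")
#     class MyDatasetBuilder(BaseDatasetBuilder):
#         def build(self, config, dataset_type="train", *args, **kwargs):
#             # Runs only on the main process; download/extract data here.
#             download_and_extract_my_dataset(config, dataset_type)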
def load(self):
    self.image_path = os.path.join(
        self._data_folder, _CONSTANTS["images_folder"], self._dataset_type
    )

    with open(
        os.path.join(
            self._data_folder,
            _CONSTANTS["questions_folder"],
            _TEMPLATES["question_json_file"].format(self._dataset_type),
        )
    ) as f:
        self.questions = json.load(f)[_CONSTANTS["questions_key"]]

        # Vocab should only be built in the main process, as building it
        # elsewhere would just repeat the same work.
        if is_master():
            self._build_vocab(self.questions, _CONSTANTS["question_key"])
            self._build_vocab(self.questions, _CONSTANTS["answer_key"])
        synchronize()
def build_lightning_model(
    config: Union[DictConfig, "mmf.models.base_model.BaseModel.Config"],
    checkpoint_path: str = None,
) -> "mmf.models.base_model.BaseModel":
    from mmf.models.base_model import BaseModel

    if not checkpoint_path:
        model = build_model(config)
        model.is_pl_enabled = True
        return model

    # If it is not an OmegaConf object, create the object
    if not isinstance(config, DictConfig) and isinstance(config, BaseModel.Config):
        config = OmegaConf.structured(config)

    model_name = config.model
    model_class = registry.get_model_class(model_name)

    if model_class is None:
        raise RuntimeError(f"No model registered for name: {model_name}")

    """
    model.build is called inside on_load_checkpoint as suggested here:
    https://github.com/PyTorchLightning/pytorch-lightning/issues/5410
    """
    if is_main():
        model_class.load_requirements(model_class, config=config)
        model = model_class.load_from_checkpoint(
            checkpoint_path, config=config, strict=False
        )
        synchronize()
    else:
        synchronize()
        model = model_class.load_from_checkpoint(
            checkpoint_path, config=config, strict=False
        )

    model.init_losses()
    model.is_pl_enabled = True
    return model
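# Hedged usage sketch: `model_config` is assumed to be a valid MMF model config
# whose `model` key matches a registered model class, and the checkpoint path
# is hypothetical.
#
#     model = build_lightning_model(model_config, "save/models/model_final.ckpt")
#     # Without a checkpoint path this just delegates to build_model(config).
#     fresh_model = build_lightning_model(model_config)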
def build_multiple_datamodules(
    dataset_list: List[str], all_dataset_config: DictConfig
) -> Dict[str, pl.LightningDataModule]:
    datamodules: Dict[str, pl.LightningDataModule] = {}
    for dataset in dataset_list:
        datamodule_instance = build_datamodule(dataset)
        if dataset in all_dataset_config:
            dataset_config = all_dataset_config[dataset]
        else:
            warnings.warn(
                f"Dataset {dataset} is missing from dataset_config"
                " in config. Proceeding with empty config."
            )
            dataset_config = OmegaConf.create()

        if is_master():
            datamodule_instance.prepare_data(dataset_config)

        synchronize()
        datamodule_instance.setup(config=dataset_config)
        if hasattr(datamodule_instance, "update_registry_for_model"):
            datamodule_instance.update_registry_for_model(dataset_config)
        datamodules[dataset] = datamodule_instance
    return datamodules
def build_model(
    config: Union[DictConfig, "mmf.models.base_model.BaseModel.Config"]
) -> "mmf.models.base_model.BaseModel":
    from mmf.models.base_model import BaseModel

    # If it is not an OmegaConf object, create the object
    if not isinstance(config, DictConfig) and isinstance(config, BaseModel.Config):
        config = OmegaConf.structured(config)

    model_name = config.model
    model_class = registry.get_model_class(model_name)

    if model_class is None:
        raise RuntimeError(f"No model registered for name: {model_name}")
    model = model_class(config)

    if hasattr(model, "build"):
        model.load_requirements()
        """
        Model build involves checkpoint loading. If the checkpoint is not
        available, the underlying methods try to download it. Let the master
        rank build the model (and download the checkpoint) while the other
        ranks wait for the sync message. Once the master has downloaded the
        checkpoint and built the model, it sends the sync message; the other
        ranks can then proceed to build the model using the already
        downloaded checkpoint.
        """
        if is_master():
            model.build()
            synchronize()
        else:
            synchronize()
            model.build()
        model.init_losses()

    return model
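# Hedged usage sketch: the config is assumed to carry a `model` key matching a
# registered model class plus whatever fields that model's constructor needs;
# the name used here is hypothetical.
#
#     model_config = OmegaConf.create({"model": "my_model"})
#     model = build_model(model_config)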
def __init__(self, vocab_file, embedding_name, *args, **kwargs):
    """Use this vocab class when you have a custom vocabulary class but you
    want to use pretrained embedding vectors for it. This will only load
    the vectors which intersect with your vocabulary. Use the
    embedding_name specified in torchtext's pretrained aliases:

    ['charngram.100d', 'fasttext.en.300d', 'fasttext.simple.300d',
    'glove.42B.300d', 'glove.840B.300d', 'glove.twitter.27B.25d',
    'glove.twitter.27B.50d', 'glove.twitter.27B.100d',
    'glove.twitter.27B.200d', 'glove.6B.50d', 'glove.6B.100d',
    'glove.6B.200d', 'glove.6B.300d']

    Parameters
    ----------
    vocab_file : str
        Vocabulary file containing list of words with one word per line
        which will be used to collect vectors
    embedding_name : str
        Embedding name picked up from the list of the pretrained aliases
        mentioned above
    """
    super().__init__(vocab_file, *args, **kwargs)

    self.type = "intersected"

    name = embedding_name.split(".")[0]
    dim = embedding_name.split(".")[2][:-1]
    middle = embedding_name.split(".")[1]

    class_name = EMBEDDING_NAME_CLASS_MAPPING[name]

    if not hasattr(vocab, class_name):
        raise RuntimeError(f"Unknown embedding type: {name}")

    params = [middle]

    if name == "glove":
        params.append(int(dim))

    vector_cache = get_mmf_cache_dir()

    # First test loading the vectors in master so that everybody doesn't
    # download it in case it doesn't exist
    if is_main():
        vocab.pretrained_aliases[embedding_name](cache=vector_cache)
    synchronize()

    embedding = getattr(vocab, class_name)(*params, cache=vector_cache)

    self.vectors = torch.empty(
        (self.get_size(), len(embedding.vectors[0])), dtype=torch.float
    )

    self.embedding_dim = len(embedding.vectors[0])

    for i in range(0, 4):
        self.vectors[i] = torch.ones_like(self.vectors[i]) * 0.1 * i

    for i in range(4, self.get_size()):
        word = self.itos[i]
        embedding_index = embedding.stoi.get(word, None)

        if embedding_index is None:
            self.vectors[i] = self.vectors[self.UNK_INDEX]
        else:
            self.vectors[i] = embedding.vectors[embedding_index]
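# Hedged usage sketch: the class name (IntersectedVocab) comes from the
# docstring reference above; the vocabulary path is hypothetical, and any alias
# from the list in the docstring should work.
#
#     vocab = IntersectedVocab("vocabs/my_vocab.txt", "glove.6B.300d")
#     assert vocab.vectors.shape == (vocab.get_size(), vocab.embedding_dim)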