def create_tokenizer(self):
    if self.isComputed():
        # Tokenizer files already exist for this dataset, so load them instead of retraining.
        logger.info("Tokenizer for this dataset has already been created")
        self.tokenizer = RobertaTokenizerFast.from_pretrained(
            f"{self.data_dir}", max_len=512)
        return
    logger.info(f"Training tokenizer on data in {self.data_dir}")
    self.train()
    # Persist the trained tokenizer's vocabulary and merge rules.
    azure_storage.upload(self.data_dir / "vocab.json")
    azure_storage.upload(self.data_dir / "merges.txt")
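# The call to self.train() above is assumed to produce the vocab.json and
# merges.txt files that are uploaded afterwards. A minimal sketch of such a
# training step, assuming the Hugging Face `tokenizers` library and plain-text
# files under self.data_dir; the vocab size, min frequency, and special tokens
# are illustrative defaults, not the project's actual settings.
def train(self):
    from tokenizers import ByteLevelBPETokenizer

    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(
        files=[str(p) for p in Path(self.data_dir).glob("*.txt")],
        vocab_size=52_000,
        min_frequency=2,
        special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
    )
    # save_model writes vocab.json and merges.txt into data_dir,
    # the two files uploaded by create_tokenizer().
    tokenizer.save_model(str(self.data_dir))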
def upload(self):
    # Upload every file under the model directory, recursively,
    # skipping subdirectories themselves.
    paths = [str(x) for x in Path(self.model_dir).glob("**/*") if x.is_file()]
    for file in paths:
        azure_storage.upload(file)
def upload(self):
    azure_storage.upload(self.data_dir / self.file_name)
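# azure_storage.upload() is a project helper whose implementation isn't shown
# here. A minimal sketch of what such a helper could look like, assuming the
# azure-storage-blob v12 SDK; the container name and environment variable are
# placeholders, not the project's actual configuration.
import os
from pathlib import Path

from azure.storage.blob import BlobServiceClient

def upload(path, container="training-artifacts"):
    # The connection string is assumed to come from the environment.
    service = BlobServiceClient.from_connection_string(
        os.environ["AZURE_STORAGE_CONNECTION_STRING"])
    blob = service.get_blob_client(container=container, blob=Path(path).name)
    with open(path, "rb") as f:
        blob.upload_blob(f, overwrite=True)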