Example #1
    def pre_init(self, hparams):
        teacher = T5ForConditionalGeneration.from_pretrained(hparams.teacher)
        n_layer = hparams.student_decoder_layers
        assert n_layer == hparams.student_encoder_layers  # TODO(SS): relax this
        d_layers_to_copy = get_layers_to_copy(n_layer,
                                              len(teacher.decoder.block))
        e_layers_to_copy: List = get_layers_to_copy(n_layer,
                                                    len(teacher.encoder.block))
        student_updates = {"num_layers": n_layer}
        hparams.d_layer_to_copy = d_layers_to_copy
        hparams.e_layer_to_copy = e_layers_to_copy
        kw = teacher.config.to_diff_dict()

        kw.update(student_updates)
        # Copy weights
        student_cfg = T5Config(**kw)
        student = T5ForConditionalGeneration(student_cfg)
        student, _ = init_student(student, teacher)
        self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams,
                             student, teacher)
        Path(hparams.output_dir).mkdir(exist_ok=True)
        task_specific_params = student.config.task_specific_params
        if task_specific_params is not None:
            student.config.update(task_specific_params.get(
                "summarization", {}))
        return d_layers_to_copy, student, student_cfg, teacher
Example #2
    def pre_init(self, hparams):
        raise NotImplementedError("T5 Distillation does not work yet")
        self.output_dir = Path(hparams.output_dir)
        self.output_dir.mkdir(exist_ok=True)
        teacher = T5ForConditionalGeneration.from_pretrained(hparams.teacher)
        n_layer = hparams.student_decoder_layers
        assert n_layer == hparams.student_encoder_layers  # TODO(SS): relax this constraint so that we can do 12-6.
        d_layers_to_copy = get_layers_to_copy(n_layer,
                                              len(teacher.decoder.block))
        e_layers_to_copy: List = get_layers_to_copy(n_layer,
                                                    len(teacher.encoder.block))
        student_updates = {"num_layers": n_layer}
        hparams.d_layer_to_copy = d_layers_to_copy
        hparams.e_layer_to_copy = e_layers_to_copy
        kw = teacher.config.to_diff_dict()

        kw.update(student_updates)
        # Copy weights
        student_cfg = T5Config(**kw)
        student = T5ForConditionalGeneration(student_cfg)
        student, _ = init_student(student, teacher)
        self.copy_to_student(d_layers_to_copy, e_layers_to_copy, hparams,
                             student, teacher)
        Path(hparams.output_dir).mkdir(exist_ok=True)
        task_specific_params = student.config.task_specific_params
        if task_specific_params is not None:
            student.config.update(task_specific_params.get(
                "summarization", {}))  # TODO: dont hardcode
        save_dir = self.output_dir.joinpath("student")
        save_dir.mkdir(exist_ok=True)

        student.save_pretrained(save_dir)
        hparams.model_name_or_path = str(save_dir)
        return student, student_cfg, teacher
Example #3
def convert_model(base_model, path, new_path):
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    print("loading weights...")
    load_tf_weights_in_t5(model, None, path)
    model.eval()
    print("saving HF weights...")
    model.save_pretrained(new_path)
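A minimal usage sketch for convert_model above; the checkpoint and output paths are hypothetical placeholders:

# Hypothetical invocation: build a T5 skeleton from the "t5-base" config,
# load TF checkpoint weights into it, and re-save in Hugging Face format.
convert_model(
    base_model="t5-base",
    path="/tmp/tf_checkpoint",        # TF checkpoint location (assumed)
    new_path="/tmp/hf_t5_converted",  # output directory for save_pretrained (assumed)
)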
Example #4
    def __init__(self, config):
        super().__init__(config)
        
        self.save_mem = config.save_mem
        # self.t5 = T5ForConditionalGeneration(config)
        if self.save_mem:
            self.dummy_tensor = torch.ones(1, dtype=torch.float32, requires_grad=True)
            self.t5_wrapped = ModuleWrapperIgnores2ndArg(T5ForConditionalGeneration(config))
        else:
            self.t5 = self.t5_wrapped = T5ForConditionalGeneration(config)
            
        
        #choose to use hidden states or softmaxed values for classification? currently softmaxed_values
        # self.classifier = Linear(int((config.max_seq_len/2)-1), 1)
 
        self.init_weights()
Example #5
 def create_t5_and_check_t5_generate_with_past_key_value_states(
     self,
     config,
     input_ids,
     decoder_input_ids,
     attention_mask,
     decoder_attention_mask,
     lm_labels,
 ):
     model = T5ForConditionalGeneration(config=config)
     model.to(torch_device)
     model.eval()
     torch.manual_seed(0)
     output_without_past_cache = model.generate(input_ids[:1],
                                                num_beams=2,
                                                max_length=5,
                                                do_sample=True,
                                                use_cache=False)
     torch.manual_seed(0)
     output_with_past_cache = model.generate(input_ids[:1],
                                             num_beams=2,
                                             max_length=5,
                                             do_sample=True)
     self.parent.assertTrue(
         torch.all(output_with_past_cache == output_without_past_cache))
Example #6
    def __init__(self, model: str = None):
        log.info(model)
        torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        log.info(torch_device)
        if model is None:
            model = "t5"
        self.modelName = model
        # path to all the files that will be used for inference
        self.path = f"./app/api/{model}/"
        self.model_path = self.path + "pytorch_model.bin"
        self.config_path = self.path + "config.json"

        # Select the correct model based on the passed model input. Defaults to t5.
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
            self.model.eval()
            self.model.load_state_dict(torch.load(self.model_path, map_location=torch_device))
        elif model == "google/pegasus-newsroom":
            self.config = PegasusConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = PegasusForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = PegasusTokenizer.from_pretrained(model)
        elif model == "facebook/bart-large-cnn":
            self.config = BartConfig.from_json_file(self.config_path)
            # self.model = PegasusForConditionalGeneration(self.config)
            # self.tokenizer = PegasusTokenizer.from_pretrained(self.path)
            self.model = BartForConditionalGeneration.from_pretrained(model).to(torch_device)
            self.tokenizer = BartTokenizer.from_pretrained(model)
        else:
            raise Exception("This model is not supported")

        self.text = str()
Example #7
 def build_model_and_check_forward_pass(self, **kwargs):
     tester = T5ModelTester(self, **kwargs)
     config, *inputs = tester.prepare_config_and_inputs()
     (
         input_ids,
         decoder_input_ids,
         attention_mask,
         decoder_attention_mask,
         lm_labels,
     ) = inputs
     model = T5ForConditionalGeneration(
         config=config).to(torch_device).eval()
     outputs = model(
         input_ids=input_ids,
         decoder_input_ids=decoder_input_ids,
         decoder_attention_mask=decoder_attention_mask,
         labels=lm_labels,
     )
     # outputs = model(*inputs)
     assert len(outputs) == 4
     assert outputs["logits"].size() == (tester.batch_size,
                                         tester.decoder_seq_length,
                                         tester.vocab_size)
     assert outputs["loss"].size() == ()
     return model
Example #8
 def create_and_check_t5_with_lm_head(
     self,
     config,
     input_ids,
     decoder_input_ids,
     attention_mask,
     decoder_attention_mask,
     lm_labels,
 ):
     model = T5ForConditionalGeneration(config=config)
     model.to(torch_device)
     model.eval()
     outputs = model(
         input_ids=input_ids,
         decoder_input_ids=decoder_input_ids,
         decoder_attention_mask=decoder_attention_mask,
         labels=lm_labels,
     )
     loss, prediction_scores, _, _ = outputs
     self.parent.assertEqual(len(outputs), 4)
     result = {
         "loss": loss,
         "prediction_scores": prediction_scores,
     }
     self.parent.assertListEqual(
         list(result["prediction_scores"].size()),
         [self.batch_size, self.decoder_seq_length, self.vocab_size])
     self.check_loss_output(result)
Example #9
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    # Initialise PyTorch model
    config = T5Config.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = T5ForConditionalGeneration(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_t5(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    model.save_pretrained(pytorch_dump_path)
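A minimal invocation sketch for the converter above, assuming hypothetical paths for the TF checkpoint, config file, and dump directory:

# Hypothetical paths; the function builds a T5 from the JSON config,
# loads the TF checkpoint, and saves the PyTorch model with save_pretrained.
convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path="/tmp/t5_tf_ckpt",
    config_file="/tmp/t5_config.json",
    pytorch_dump_path="/tmp/t5_pytorch",
)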
Example #10
 def create_and_check_with_lm_head(
     self, config, input_ids, decoder_input_ids, attention_mask, decoder_attention_mask, lm_labels,
 ):
     model = T5ForConditionalGeneration(config=config).to(torch_device).eval()
     outputs = model(
         input_ids=input_ids,
         decoder_input_ids=decoder_input_ids,
         decoder_attention_mask=decoder_attention_mask,
         labels=lm_labels,
     )
     self.parent.assertEqual(len(outputs), 4)
     self.parent.assertEqual(outputs["logits"].size(), (self.batch_size, self.decoder_seq_length, self.vocab_size))
     self.parent.assertEqual(outputs["loss"].size(), ())
Example #11
def main(
        model_path: str, corpus: Corpus = "kaggle", split_name: str = "valid",
        max_len: int = 128, batch_size: int = 32):
    if "mt5" in Path(model_path).stem:
        tokenizer = MT5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = MT5ForConditionalGeneration(
            MT5Config.from_pretrained(model_path)
        ).eval()
    else:
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = T5ForConditionalGeneration(
            T5Config.from_pretrained(model_path)
        ).eval()
    shrink_vocab(model_path, model)
    model.lm_head = torch.nn.Linear(model.lm_head.in_features, 3, bias=False)
    model.load_state_dict(torch.load(Path(model_path) / "pytorch_model.bin"))
    model = model.cuda()
    # model.load_state_dict(torch.load(model_path))
    context_tokens_1 = tokenizer.encode("mnli hypothesis:")[:-1]
    context_tokens_2 = tokenizer.encode("premise:")[:-1]
    collate_fn = partial(
        collate_batch, pad=model.config.decoder_start_token_id,
        decode_start_token=model.config.pad_token_id,
        max_len=max_len, is_classifier=True
    )
    dataset = XNLIDataset(
        corpus, split_name + ".jbl",
        context_tokens_1, context_tokens_2)
    data_loader = DataLoader(
        dataset, num_workers=1, shuffle=False, drop_last=False,
        batch_size=batch_size, collate_fn=collate_fn)
    preds, labels = [], []
    for input_batch, label_batch in tqdm(data_loader, ncols=100):
        for key, val in input_batch.items():
            input_batch[key] = val.cuda()
        outputs = model(**input_batch)
        preds_local = torch.argmax(outputs["logits"][:, 0, :].cpu(), dim=-1)
        preds.append(preds_local.numpy())
        labels.append(np.asarray([x[0] for x in label_batch["ids"].cpu().numpy()]))
    full_labels = np.concatenate(labels)
    full_preds = np.concatenate(preds)
    # print("Label mapping:")
    # for key in np.unique(full_labels):
    #     print(f"{key}: {tokenizer.decode([key])}")
    print("Labels:")
    print(pd.Series(full_labels).value_counts())
    print("Predictions:")
    print(pd.Series(full_preds).value_counts())
    print("Acc: %.2f%%" % (np.mean(full_labels == full_preds) * 100))
Example #12
def get_model(tokenizer_len=None):
  if args.mode == 'train' or args.mode == 'test_without_train':
    model = T5ForConditionalGeneration.from_pretrained(
        args.t5_model, cache_dir=args.cache_dir)
    if tokenizer_len is not None:
      model.resize_token_embeddings(tokenizer_len)
  elif args.mode == 'test' or args.mode == 'continue_train':
    model = T5ForConditionalGeneration(
        T5Config.from_json_file(output_config_file))
    model.load_state_dict(torch.load(output_model_file))
  else:
    raise NotImplementedError(
        f'No such mode called {args.mode}, error raised from get_model.')

  if torch.cuda.device_count() > 1:
    model = torch.nn.DataParallel(model)
  return model.to(device)
Example #13
def load_pretained_model_and_tokenizer(
    base_model: str,
    model_dict_path: str,
    gpu_device: str,
    eval=False,
):
    '''
    Load a T5 model pretrained on UnifiedQA
    base_model: base model name for T5
    model_dict_path: trained model checkpoint for unifiedQA
    '''
    tokenizer = T5Tokenizer.from_pretrained(base_model)
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))

    if eval:
        model = torch.load(model_dict_path, map_location=gpu_device)
    else:
        load_tf_weights_in_t5(model, None, model_dict_path)

    return tokenizer, model
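A hedged invocation sketch for the loader above; the checkpoint path and device string are assumptions:

# Hypothetical usage: build a t5-base skeleton and load UnifiedQA TF weights into it.
# With eval=True the function instead torch.load()s a serialized model onto gpu_device.
tokenizer, model = load_pretained_model_and_tokenizer(
    base_model="t5-base",
    model_dict_path="/path/to/unifiedqa_checkpoint",  # assumed path
    gpu_device="cuda:0",
    eval=False,
)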
Example #14
    def __init__(self, model: str = None, service: str = "summ"):
        if model is None:
            model = "t5"

        # path to all the files that will be used for inference
        self.path = f"./{service}/{model}/"
        self.model_path = self.path + "model.bin"
        self.config_path = self.path + "config.json"

        # Select the correct model based on the passed model input. Defaults to t5.
        if model == "t5":
            self.config = T5Config.from_json_file(self.config_path)
            self.model = T5ForConditionalGeneration(self.config)
            self.tokenizer = T5Tokenizer.from_pretrained(self.path)
        else:
            raise Exception("This model is not supported")

        self.model.eval()
        self.model.load_state_dict(
            torch.load(self.model_path, map_location=device))

        self.text = str()
Example #15
File: t5.py  Project: vinhng10/decepticon
    def __init__(self, hparams, batch_fn=None):
        """
        :param batch_fn: function to process batch
        """
        super(RaceModule, self).__init__(hparams, batch_fn)

        if self.hparams.pretrained_model in ["t5-base","t5-small"]:
            # Model:
            config = T5Config(decoder_start_token_id = self.hparams.padding_token)
            self.model = T5ForConditionalGeneration(config).from_pretrained(self.hparams.pretrained_model)
            # Tokenizer:
            self.tokenizer = AutoTokenizer.from_pretrained(self.hparams.pretrained_model)
            self.tokenizer.add_special_tokens({"additional_special_tokens": ["[CON]", "[QUE]", "[ANS]", "[DIS]"]})

            # Metrics:
            self.metrics = Metrics()
            try:
                self.model.resize_token_embeddings(self.hparams.tokenizer_len)
            except:
                self.model.resize_token_embeddings(32104)
        else:
            raise NotImplementedError
Example #16
 def __init__(self, tokenizer):
     super(T5Model, self).__init__()
     self.tokenizer = tokenizer
     config = T5Config.from_pretrained('t5-small')
     self.model = T5ForConditionalGeneration(config=config)
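A short usage sketch, assuming the T5Model wrapper above is defined in scope; note that only the t5-small config is used here, so the weights start randomly initialized:

# Hypothetical usage of the wrapper defined above.
from transformers import T5Tokenizer

tok = T5Tokenizer.from_pretrained("t5-small")
wrapper = T5Model(tok)                  # builds T5ForConditionalGeneration from config only
print(wrapper.model.num_parameters())   # count of (untrained) parameters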
Example #17
    def execute_inference(
        self,
        metadata: NetworkMetadata,
        network_fpaths: NetworkModels,
        inference_input: str,
        timing_profile: TimingProfile,
    ) -> NetworkResult:

        # Execute some tests
        tokenizer = T5Tokenizer.from_pretrained(metadata.variant)
        input_ids = tokenizer(inference_input, return_tensors="pt").input_ids

        # By default, huggingface model structure is one giant file.
        t5_torch_fpath = network_fpaths.torch[0].fpath
        config = T5Config(
            use_cache=metadata.other.kv_cache,
            num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[metadata.variant],
        )
        t5_model = T5ForConditionalGeneration(config).from_pretrained(
            t5_torch_fpath)

        t5_torch_encoder = T5EncoderTorchFile.TorchModule(t5_model.encoder)
        t5_torch_decoder = T5DecoderTorchFile.TorchModule(
            t5_model.decoder, t5_model.lm_head, t5_model.config)

        encoder_last_hidden_state, encoder_e2e_median_time = encoder_inference(
            t5_torch_encoder, input_ids, timing_profile)
        _, decoder_e2e_median_time = decoder_inference(
            t5_torch_decoder, input_ids, encoder_last_hidden_state,
            timing_profile)
        decoder_output_greedy, full_e2e_median_runtime = full_inference_greedy(
            t5_torch_encoder,
            t5_torch_decoder,
            input_ids,
            tokenizer,
            timing_profile,
            max_length=T5ModelTRTConfig.MAX_SEQUENCE_LENGTH[metadata.variant],
        )

        # Remove the padding and end tokens.
        semantic_outputs = tokenizer.convert_ids_to_tokens(
            decoder_output_greedy.tolist()[0])[1:-1]
        remove_underscore = "".join(
            [s.replace("\u2581", " ") for s in semantic_outputs])

        return NetworkResult(
            input=inference_input,
            output_tensor=encoder_last_hidden_state,
            semantic_output=remove_underscore.strip(),
            median_runtime=[
                NetworkRuntime(
                    name=T5ModelTRTConfig.NETWORK_DECODER_SEGMENT_NAME,
                    runtime=decoder_e2e_median_time,
                ),
                NetworkRuntime(
                    name=T5ModelTRTConfig.NETWORK_ENCODER_SEGMENT_NAME,
                    runtime=encoder_e2e_median_time,
                ),
                NetworkRuntime(
                    name=T5ModelTRTConfig.NETWORK_FULL_NAME,
                    runtime=full_e2e_median_runtime,
                ),
            ],
            models=network_fpaths,
        )
Example #18
    def generate_and_download_framework(
            self, metadata: NetworkMetadata,
            workspace: NNFolderWorkspace) -> NetworkModels:

        cache_variant = False
        if metadata.other.kv_cache:
            cache_variant = True

        trt_t5_config = self.config
        metadata_serialized = trt_t5_config.get_metadata_string(metadata)
        workspace_dir = workspace.get_path()

        pytorch_model_dir = os.path.join(workspace_dir, metadata_serialized)
        # We keep track of the generated torch location for cleanup later
        self.torch_t5_dir = pytorch_model_dir

        model = None
        tfm_config = T5Config(
            use_cache=cache_variant,
            num_layers=T5ModelTRTConfig.NUMBER_OF_LAYERS[metadata.variant],
        )
        if not os.path.exists(pytorch_model_dir):
            # Generate the pre-trained weights
            model = T5ForConditionalGeneration(tfm_config).from_pretrained(
                metadata.variant)
            model.save_pretrained(pytorch_model_dir)
            print("Pytorch Model saved to {}".format(pytorch_model_dir))
        else:
            print(
                "Frameworks file already exists, skipping generation and loading from file instead."
            )
            model = T5ForConditionalGeneration(tfm_config).from_pretrained(
                pytorch_model_dir)

        # These ONNX models can be converted using special encoder and decoder classes.
        root_onnx_model_name = "{}.onnx".format(metadata_serialized)
        root_onnx_model_fpath = os.path.join(os.getcwd(), workspace_dir,
                                             root_onnx_model_name)
        encoder_onnx_model_fpath = root_onnx_model_fpath + "-encoder.onnx"
        decoder_onnx_model_fpath = root_onnx_model_fpath + "-decoder-with-lm-head.onnx"

        t5_encoder = T5EncoderTorchFile(model, metadata)
        t5_decoder = T5DecoderTorchFile(model, metadata)
        self.onnx_t5_encoder = t5_encoder.as_onnx_model(
            encoder_onnx_model_fpath, force_overwrite=False)
        self.onnx_t5_decoder = t5_decoder.as_onnx_model(
            decoder_onnx_model_fpath, force_overwrite=False)

        onnx_models = [
            NetworkModel(
                name=T5ModelTRTConfig.NETWORK_DECODER_SEGMENT_NAME,
                fpath=self.onnx_t5_decoder.fpath,
            ),
            NetworkModel(
                name=T5ModelTRTConfig.NETWORK_ENCODER_SEGMENT_NAME,
                fpath=self.onnx_t5_encoder.fpath,
            ),
        ]
        torch_models = [
            NetworkModel(name=T5ModelTRTConfig.NETWORK_FULL_NAME,
                         fpath=pytorch_model_dir)
        ]

        return NetworkModels(torch=torch_models, onnx=onnx_models, trt=None)
Example #19
    def __init__(self,
                 hparams: argparse.Namespace,
                 num_labels=None,
                 mode="base",
                 config=None,
                 tokenizer=None,
                 model=None,
                 **config_kwargs):
        """Initialize a model, tokenizer and config."""
        super().__init__()
        # TODO: move to self.save_hyperparameters()
        # self.save_hyperparameters()
        # can also expand arguments into trainer signature for easier reading

        self.save_hyperparameters(hparams)
        self.step_count = 0
        self.output_dir = Path(self.hparams.output_dir)
        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        if config is None:
            self.config = AutoConfig.from_pretrained(
                self.hparams.config_name if self.hparams.config_name else
                self.hparams.model_name_or_path,
                **({
                    "num_labels": num_labels
                } if num_labels is not None else {}),
                cache_dir=cache_dir,
                **config_kwargs,
            )
        else:
            self.config: PretrainedConfig = config

        extra_model_params = ("encoder_layerdrop", "decoder_layerdrop",
                              "dropout", "attention_dropout")
        for p in extra_model_params:
            if getattr(self.hparams, p, None):
                assert hasattr(
                    self.config,
                    p), f"model config doesn't have a `{p}` attribute"
                setattr(self.config, p, getattr(self.hparams, p))

        if tokenizer is None:
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.hparams.tokenizer_name if self.hparams.tokenizer_name else
                self.hparams.model_name_or_path,
                # self.hparams.model_name_or_path,
                cache_dir=cache_dir,
            )
        else:
            self.tokenizer: PreTrainedTokenizer = tokenizer
        self.model_type = MODEL_MODES[mode]
        if model is None:
            try:
                self.model = self.model_type.from_pretrained(
                    self.hparams.model_name_or_path,
                    from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
                    config=self.config,
                    cache_dir=cache_dir,
                )
            except:
                self.model = T5ForConditionalGeneration(config=self.config)
        else:
            self.model = model
Example #20
    def __init__(
        self,
        model_name,
        args=None,
        tokenizer=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, T5Args):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = sweep_config_to_sweep_values(sweep_config)
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        if model_name is None:
            self.config = self.args.config
            self.model = T5ForConditionalGeneration(config=self.config)
        else:
            self.config = T5Config.from_pretrained(model_name,
                                                   **self.args.config)
            self.model = T5ForConditionalGeneration.from_pretrained(
                model_name, config=self.config)

        if isinstance(tokenizer, T5Tokenizer):
            self.tokenizer = tokenizer
        else:
            self.tokenizer = T5Tokenizer.from_pretrained(model_name,
                                                         truncate=True)
        self.model.resize_token_embeddings(len(self.tokenizer))

        if self.args.dynamic_quantize:
            self.model = torch.quantization.quantize_dynamic(self.model,
                                                             {torch.nn.Linear},
                                                             dtype=torch.qint8)

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_type = "T5"
        if model_name is None:
            self.args.model_name = "T5_from_scratch"
        else:
            self.args.model_name = model_name

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None
Example #21
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_t5 import load_tf_weights_in_t5
from flask import Flask, request, jsonify

app = Flask(__name__)

base_model = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))

load_tf_weights_in_t5(model, None, "/data/")
model.eval()

ret_dict = {
    'low air quality': 'LowAirQuality',
    'low humidity': 'LowHumidity',
    'low brightness': 'LowBrightness',
    'low noise level': 'LowNoise',
    'low security': 'LowSecurity',
    'low temperature': 'LowTemperature',
    'high air quality': 'HighAirQuality',
    'high humidity': 'HighHumidity',
    'high brightness': 'HighBrightness',
    'high noise level': 'HighNoise',
    'high security': 'HighSecurity',
    'high temperature': 'HighTemperature'
}


def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
Example #22
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

config = T5Config(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

tokenizer = T5TokenizerFast.from_pretrained(tokenizer_dir, max_len=512)

model = T5ForConditionalGeneration(config=config)
model.num_parameters()

train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/train_texts.txt",
    block_size=128,
)

test_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path=f"{data_dir}/valid_texts.txt",
    block_size=128,
)

data_collator = DataCollatorForLanguageModeling(
Example #23
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument.")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported, but you can do it from another script, save it,"
            "and load it from here, using --tokenizer_name")

    if model_args.model_name_or_path != "new":
        model = T5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path, )
    else:
        config = AutoConfig.from_pretrained("t5-small")
        model = T5ForConditionalGeneration(config=config)

    model.resize_token_embeddings(len(tokenizer))

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.max_len
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.max_len)

    # Get datasets

    train_dataset = get_dataset(
        data_args, tokenizer=tokenizer) if training_args.do_train else None
    eval_dataset = get_dataset(
        data_args, tokenizer=tokenizer,
        evaluate=True) if training_args.do_eval else None
    data_collator = DoNothingDataCollator()

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        model_path = (model_args.model_name_or_path
                      if model_args.model_name_or_path is not None
                      and os.path.isdir(model_args.model_name_or_path) else
                      None)
        # trainer.train(model_path=model_path)
        trainer.train()
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        model.eval()
        data_collator = DoNothingDataCollatorForGeneration()
        sampler = SequentialSampler(eval_dataset)
        data_loader = DataLoader(
            eval_dataset,
            sampler=sampler,
            batch_size=training_args.eval_batch_size,
            collate_fn=data_collator.collate_batch,
        )
        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results_lm.txt")
        writer = open(output_eval_file, "w")
        for inputs in tqdm(data_loader, "Prediction"):
            for k, v in inputs.items():
                inputs[k] = v.cuda()

            with torch.no_grad():
                outputs = model.generate(
                    input_ids=inputs['input_ids'],
                    attention_mask=inputs['attention_mask'],
                    max_length=12)
                dec = [tokenizer.decode(ids) for ids in outputs]

                for i in range(0, len(dec)):
                    writer.write(dec[i] + "\n")

    return results