Example 1
    def __init__(self, config, dataset):
        super(T5, self).__init__(config, dataset)

        self.max_source_length = dataset.max_source_length
        self.max_target_length = dataset.max_target_length

        self.pretrained_model_path = config['pretrained_model_path']
        self.tokenizer = T5Tokenizer.from_pretrained(
            self.pretrained_model_path, add_prefix_space=True)
        self.configuration = T5Config.from_pretrained(
            self.pretrained_model_path)

        self.decoder = T5ForConditionalGeneration.from_pretrained(
            self.pretrained_model_path, config=self.configuration)

        self.padding_token_idx = self.tokenizer.pad_token_id
        self.loss = nn.CrossEntropyLoss(ignore_index=self.padding_token_idx,
                                        reduction='none')
        if config['task_type'] == "summarization":
            self.t5_task_text = "summarize: "
        elif config['task_type'] == "translation":
            self.t5_task_text = "translate German to English: "
        else:
            raise NotImplementedError(
                "Only summarization and translation are supported.")
Example 2
def convert_model(base_model, path, new_path):
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))
    print("loading weights...")
    load_tf_weights_in_t5(model, None, path)
    model.eval()
    print("saving HF weights...")
    model.save_pretrained(new_path)
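A hedged usage sketch for the converter above; the checkpoint path and output directory are placeholders, not taken from the original:

# Hypothetical call: load TF weights into a fresh t5-base and save it in Hugging Face format.
convert_model("t5-base", "/path/to/tf_checkpoint/model.ckpt-1000000", "./hf-t5-base")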
Example 3
 def __init__(self,
              class_count: int,
              label_str: str,
              model_name_str: str = 't5-base'):
     config = T5Config.from_pretrained(model_name_str)
     tokenizer = T5Tokenizer.from_pretrained(model_name_str)
     super().__init__(class_count, label_str, config, tokenizer,
                      model_name_str)
Example 4
 def load_model(self):
   file = pathlib.Path('{}/pytorch_model.bin'.format(self.working_folder))
   if file.exists():
     self.model = T5ForConditionalGeneration.from_pretrained(self.working_folder)
   
   else:
     config = T5Config.from_pretrained(self.model_name)
     self.model = T5ForConditionalGeneration.from_pretrained(self.model_name, config=config)
     self.model.save_pretrained(self.working_folder)
Example 5
    def __init__(self, model_or_model_path, onnx_model_sessions):
        config = T5Config.from_pretrained(model_or_model_path)
        super().__init__(config)

        assert len(
            onnx_model_sessions) == 3, 'all three models should be given'

        encoder_sess, decoder_sess, decoder_sess_init = onnx_model_sessions

        self.encoder = T5Encoder(encoder_sess)
        self.decoder = T5Decoder(decoder_sess)
        self.decoder_init = T5DecoderInit(decoder_sess_init)
Example 6
    def __init__(self,
                 checkpoint='model.ckpt-1004000',
                 base_model='t5-base',
                 num_samples=3,
                 batch_size=4,
                 doc_attr="text",
                 append=False,
                 out_attr="querygen",
                 verbose=True):

        self.num_samples = num_samples
        self.doc_attr = doc_attr
        self.append = append
        self.out_attr = out_attr
        if append:
            assert out_attr == 'querygen', "append=True cannot be combined with a custom out_attr"
        self.verbose = verbose
        self.batch_size = batch_size
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        self.pattern = re.compile("^\\s*http\\S+")
        self.tokenizer = T5Tokenizer.from_pretrained(base_model)
        config = T5Config.from_pretrained(base_model)
        self.model = T5ForConditionalGeneration.from_pretrained(checkpoint,
                                                                from_tf=True,
                                                                config=config)
        self.model.to(self.device)
        self.model.eval()

        def _add_attr(df):
            iter = chunked(df.itertuples(), self.batch_size)
            if self.verbose:
                iter = pt.tqdm(iter, total=len(df) / self.batch_size, unit='d')
            output = []
            for batch_rows in iter:
                docs = [getattr(row, self.doc_attr) for row in batch_rows]
                gens = self._doc2query(docs)
                if self.append:
                    gens = [
                        f'{getattr(row, self.doc_attr)} {gen}'
                        for row, gen in zip(batch_rows, gens)
                    ]
                output.extend(gens)
            if self.append:
                df[self.doc_attr] = output  # replace doc content
            else:
                df[self.out_attr] = output  # add new column
            return df

        super().__init__(_add_attr)

        print("Doc2query using %s" % str(self.device))
Example 7
def main(
        model_path: str, corpus: Corpus = "kaggle", split_name: str = "valid",
        max_len: int = 128, batch_size: int = 32):
    if "mt5" in Path(model_path).stem:
        tokenizer = MT5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = MT5ForConditionalGeneration(
            MT5Config.from_pretrained(model_path)
        ).eval()
    else:
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = T5ForConditionalGeneration(
            T5Config.from_pretrained(model_path)
        ).eval()
    shrink_vocab(model_path, model)
    model.lm_head = torch.nn.Linear(model.lm_head.in_features, 3, bias=False)
    model.load_state_dict(torch.load(Path(model_path) / "pytorch_model.bin"))
    model = model.cuda()
    # model.load_state_dict(torch.load(model_path))
    context_tokens_1 = tokenizer.encode("mnli hypothesis:")[:-1]
    context_tokens_2 = tokenizer.encode("premise:")[:-1]
    collate_fn = partial(
        collate_batch, pad=model.config.decoder_start_token_id,
        decode_start_token=model.config.pad_token_id,
        max_len=max_len, is_classifier=True
    )
    dataset = XNLIDataset(
        corpus, split_name + ".jbl",
        context_tokens_1, context_tokens_2)
    data_loader = DataLoader(
        dataset, num_workers=1, shuffle=False, drop_last=False,
        batch_size=batch_size, collate_fn=collate_fn)
    preds, labels = [], []
    for input_batch, label_batch in tqdm(data_loader, ncols=100):
        for key, val in input_batch.items():
            input_batch[key] = val.cuda()
        outputs = model(**input_batch)
        preds_local = torch.argmax(outputs["logits"][:, 0, :].cpu(), dim=-1)
        preds.append(preds_local.numpy())
        labels.append(np.asarray([x[0] for x in label_batch["ids"].cpu().numpy()]))
    full_labels = np.concatenate(labels)
    full_preds = np.concatenate(preds)
    # print("Label mapping:")
    # for key in np.unique(full_labels):
    #     print(f"{key}: {tokenizer.decode([key])}")
    print("Labels:")
    print(pd.Series(full_labels).value_counts())
    print("Predictions:")
    print(pd.Series(full_preds).value_counts())
    print("Acc: %.2f%%" % (np.mean(full_labels == full_preds) * 100))
 def from_pretrained(clz, config, do_not_download_weights=False, **kwargs):
     cfg = T5Config.from_pretrained(config['reader_transformer_type'],
                                    cache_dir=config["transformers_cache"])
     cfg.attention_probs_dropout_prob = config["attention_dropout"]
     cfg.hidden_dropout_prob = config["hidden_dropout"]
     cfg.fusion_strategy = config["fusion_strategy"]
     cfg.custom_config = config
     if do_not_download_weights:
         return T5FusionInDecoder(config=cfg)
     return super(T5FusionInDecoder, clz).from_pretrained(
         config['reader_transformer_type'],
         config=cfg,
         cache_dir=config["transformers_cache"],
         **kwargs)
Example 9
    def init_from_base_t5_model(model_name_or_path='t5-base',
                                output_root='./'):
        os.makedirs(output_root, exist_ok=True)

        tokenizer = T5Tokenizer.from_pretrained(model_name_or_path)
        model_config = T5Config.from_pretrained(model_name_or_path)

        # torch.save(model.encoder.embed_tokens.state_dict(), EMBEDDINGS_OUTPUT_FILE)
        tokenizer.save_pretrained(output_root)

        model = T5Siamese(config=model_config)
        model.encoder_left = T5EncoderModel.from_pretrained(model_name_or_path)
        model.encoder_right = T5EncoderModel.from_pretrained(
            model_name_or_path)
        model.save_pretrained(output_root)
        model_config.save_pretrained(output_root)
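A hedged usage sketch, assuming init_from_base_t5_model is exposed as a static method of the T5Siamese class (the enclosing class definition is not shown above):

# Hypothetical call: writes the tokenizer, config, and both encoder weights to ./siamese-t5.
T5Siamese.init_from_base_t5_model(model_name_or_path='t5-base', output_root='./siamese-t5')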
Example 10
    def __init__(self,
                 hparams: argparse.Namespace,
                 num_labels=None,
                 **config_kwargs) -> 'T5QaModel':
        super().__init__()
        self.hparams = hparams
        cache_dir = self.hparams.cache_dir if self.hparams.cache_dir else None
        self.config = T5Config.from_pretrained(
            self.hparams.config_name
            if self.hparams.config_name else self.hparams.model_name_or_path,
            **({
                "num_labels": num_labels
            } if num_labels is not None else {}),
            cache_dir=cache_dir,
            **config_kwargs,
        )
        self.tokenizer = T5Tokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else
            self.hparams.model_name_or_path,
            cache_dir=cache_dir,
        )
        self.model = T5ForConditionalGeneration.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=self.config,
            cache_dir=cache_dir,
        )

        # fix for eos token id problem
        # see https://github.com/huggingface/transformers/issues/5142 for more info on the problem and workaround
        if self.tokenizer.eos_token_id == 1:
            self.tokenizer.add_special_tokens({'eos_token': '[EOS]'})
            self.model.resize_token_embeddings(len(self.tokenizer))

        self.dataset_kwargs: dict = dict(
            data_dir=self.hparams.input_dir,
            max_source_length=1024,
            max_target_length=56,
        )

        self.loss_names = ["loss"]
        self.metric_names = ROUGE_KEYS
        self.val_metric = "rouge2"
Example 11
    def load(self) -> T5ForConditionalGeneration:
        try:
            if not self.flush_cache:
                return self._fix_t5_model(
                    T5ForConditionalGeneration.from_pretrained(
                        str(self.model_cache_dir),
                        from_tf=True,
                        force_download=False))
        except (RuntimeError, OSError):
            logging.info('T5 model weights not in cache.')
        m = re.search(r'model_checkpoint_path: "(.+?)"', self.ckpt_prefix)
        assert m is not None, 'checkpoint file malformed'

        # Copy over checkpoint data
        ckpt_patt = re.compile(
            rf'^{m.group(1)}\.(data-\d+-of-\d+|index|meta)$')
        for name in file_io.list_directory(self.url):
            if not ckpt_patt.match(name):
                continue
            url = os.path.join(self.url, name)
            url_stat = file_io.stat(url)
            cache_file_path = self.model_cache_dir / ckpt_patt.sub(
                rf'{TRANSFO_PREFIX}.\1', name)
            try:
                cs = os.stat(str(cache_file_path))
                if cs.st_size == url_stat.length and cs.st_mtime_ns > url_stat.mtime_nsec and not self.flush_cache:
                    logging.info(f'Skipping {name}...')
                    continue
            except FileNotFoundError:
                pass
            logging.info(f'Caching {name}...')
            file_io.copy(url, str(cache_file_path), overwrite=True)

        # Transformers expects a model config.json
        config = T5Config.from_pretrained(self.model_type)
        with open(str(self.model_cache_dir / 'config.json'), 'w') as f:
            json.dump(config.__dict__, f, indent=4)
        return self._fix_t5_model(
            T5ForConditionalGeneration.from_pretrained(str(
                self.model_cache_dir),
                                                       from_tf=True,
                                                       force_download=False))
Example 12
def load_pretained_model_and_tokenizer(
    base_model: str,
    model_dict_path: str,
    gpu_device: str,
    eval=False,
):
    '''
    Load a pretrained T5 model fine-tuned on UnifiedQA.
    base_model: base model name for T5
    model_dict_path: trained model checkpoint for UnifiedQA
    '''
    tokenizer = T5Tokenizer.from_pretrained(base_model)
    model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))

    if eval:
        model = torch.load(model_dict_path, map_location=gpu_device)
    else:
        load_tf_weights_in_t5(model, None, model_dict_path)

    return tokenizer, model
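A hedged usage sketch; the checkpoint path and device string below are placeholders:

# Hypothetical call: build the model and load UnifiedQA weights from a TF checkpoint for further training.
tokenizer, model = load_pretained_model_and_tokenizer(
    base_model="t5-base",
    model_dict_path="/path/to/unifiedqa_checkpoint",
    gpu_device="cuda:0",
    eval=False,
)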
Example 13
def create_t2t_model(model_name_or_path,
                     args,
                     tokenizer=None,
                     from_pretrained=True):
    ## transformer encoder
    if from_pretrained:
        encoder = TFT5ForConditionalGeneration.from_pretrained(
            model_name_or_path)
        encoder_config = encoder.config
    else:
        encoder_config = T5Config.from_pretrained(args.model_select)
        if tokenizer != None:
            assert encoder_config.vocab_size == len(tokenizer)
            assert encoder_config.pad_token_id == tokenizer.pad_token_id
            assert encoder_config.eos_token_id == tokenizer.eos_token_id
            assert encoder_config.decoder_start_token_id == tokenizer.pad_token_id
        encoder = TFT5ForConditionalGeneration(encoder_config)
        # build the model with dummy_inputs
        encoder(encoder.dummy_inputs, training=False)

    if not os.path.isfile(os.path.join(args.output_path, "config.json")):
        encoder_config.save_pretrained(args.output_path)
    return encoder
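A hedged usage sketch; the args object below carries only the fields the function above reads (model_select and output_path) and is otherwise an assumption:

import os
from types import SimpleNamespace

# Hypothetical arguments for illustration.
args = SimpleNamespace(model_select="t5-small", output_path="./t2t_out")
os.makedirs(args.output_path, exist_ok=True)
encoder = create_t2t_model("t5-small", args, tokenizer=None, from_pretrained=True)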
Example 14
    def __init__(self, hparams):
        super().__init__()
        self.hparams = hparams

        config = T5Config.from_pretrained(
            self.hparams.config_name
            if self.hparams.config_name else self.hparams.model_name_or_path,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )
        self.tokenizer = T5Tokenizer.from_pretrained(
            self.hparams.tokenizer_name if self.hparams.tokenizer_name else
            self.hparams.model_name_or_path,
            do_lower_case=self.hparams.do_lower_case,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )
        self.model = T5ForConditionalGeneration.from_pretrained(
            self.hparams.model_name_or_path,
            from_tf=bool(".ckpt" in self.hparams.model_name_or_path),
            config=config,
            cache_dir=self.hparams.cache_dir
            if self.hparams.cache_dir else None,
        )
Example 15
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration
from transformers.modeling_t5 import load_tf_weights_in_t5
from flask import Flask, request, jsonify

app = Flask(__name__)

base_model = "t5-large"
tokenizer = T5Tokenizer.from_pretrained(base_model)
model = T5ForConditionalGeneration(T5Config.from_pretrained(base_model))

load_tf_weights_in_t5(model, None, "/data/")
model.eval()

ret_dict = {
    'low air quality': 'LowAirQuality',
    'low humidity': 'LowHumidity',
    'low brightness': 'LowBrightness',
    'low noise level': 'LowNoise',
    'low security': 'LowSecurity',
    'low temperature': 'LowTemperature',
    'high air quality': 'HighAirQuality',
    'high humidity': 'HighHumidity',
    'high brightness': 'HighBrightness',
    'high noise level': 'HighNoise',
    'high security': 'HighSecurity',
    'high temperature': 'HighTemperature'
}


def run_model(input_string, **generator_args):
    input_ids = tokenizer.encode(input_string, return_tensors="pt")
Example 16
MODEL_PATH = os.environ.get("MODEL_PATH", "/data/model.pth")
BASE_MODEL = os.environ.get("BASE_MODEL", "t5-base")
DECODING = os.environ.get("DECODING", "greedy")  # greedy, topk-N (e.g., topk-10)

cuda = torch.cuda.is_available()
if cuda:
    torch.cuda.set_device(0)  # single GPU
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

logger.info(f"question generation is set to run on {device}")

# init model
logger.info("question generation model is preparing...")
config = T5Config.from_pretrained(BASE_MODEL)
model = T5ForConditionalGeneration(config=config)
t = QGTokenizer(tokenizer=BASE_MODEL)
checkpoint = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(checkpoint["model_state_dict"])
model.eval()
if cuda:
    model.cuda()

logger.info(f"question generation model is ready")

app = Flask(__name__)


@app.route("/question", methods=["POST"])
def respond():
Example 17
def convert_t5x_checkpoint_to_flax(t5x_checkpoint_path, config_name,
                                   flax_dump_folder_path):
    config = T5Config.from_pretrained(config_name)
    flax_model = FlaxT5ForConditionalGeneration(config=config)
    t5x_model = checkpoints.load_t5x_checkpoint(t5x_checkpoint_path)

    split_mlp_wi = "wi_0" in t5x_model["target"]["encoder"]["layers_0"]["mlp"]

    # Encoder
    for layer_index in range(config.num_layers):
        layer_name = f"layers_{str(layer_index)}"

        # Self-Attention
        t5x_attention_key = t5x_model["target"]["encoder"][layer_name][
            "attention"]["key"]["kernel"]
        t5x_attention_out = t5x_model["target"]["encoder"][layer_name][
            "attention"]["out"]["kernel"]
        t5x_attention_query = t5x_model["target"]["encoder"][layer_name][
            "attention"]["query"]["kernel"]
        t5x_attention_value = t5x_model["target"]["encoder"][layer_name][
            "attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_attention_layer_norm = t5x_model["target"]["encoder"][layer_name][
            "pre_attention_layer_norm"]["scale"]

        if split_mlp_wi:
            t5x_mlp_wi_0 = t5x_model["target"]["encoder"][layer_name]["mlp"][
                "wi_0"]["kernel"]
            t5x_mlp_wi_1 = t5x_model["target"]["encoder"][layer_name]["mlp"][
                "wi_1"]["kernel"]
        else:
            t5x_mlp_wi = t5x_model["target"]["encoder"][layer_name]["mlp"][
                "wi"]["kernel"]

        t5x_mlp_wo = t5x_model["target"]["encoder"][layer_name]["mlp"]["wo"][
            "kernel"]

        # Layer Normalization
        t5x_mlp_layer_norm = t5x_model["target"]["encoder"][layer_name][
            "pre_mlp_layer_norm"]["scale"]

        # Assigning
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["k"]["kernel"] = t5x_attention_key
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["o"]["kernel"] = t5x_attention_out
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["q"]["kernel"] = t5x_attention_query
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["v"]["kernel"] = t5x_attention_value

        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["0"][
            "layer_norm"]["weight"] = t5x_attention_layer_norm

        if split_mlp_wi:
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"][
                "1"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"][
                "1"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1
        else:
            flax_model.params["encoder"]["block"][str(layer_index)]["layer"][
                "1"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi

        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"][
            "DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo
        flax_model.params["encoder"]["block"][str(layer_index)]["layer"]["1"][
            "layer_norm"]["weight"] = t5x_mlp_layer_norm

    # Only for layer 0:
    t5x_encoder_rel_embedding = t5x_model["target"]["encoder"]["relpos_bias"][
        "rel_embedding"].T
    flax_model.params["encoder"]["block"]["0"]["layer"]["0"]["SelfAttention"][
        "relative_attention_bias"]["embedding"] = t5x_encoder_rel_embedding

    # Assigning
    t5x_encoder_norm = t5x_model["target"]["encoder"]["encoder_norm"]["scale"]
    flax_model.params["encoder"]["final_layer_norm"][
        "weight"] = t5x_encoder_norm

    # Decoder
    for layer_index in range(config.num_decoder_layers):
        layer_name = f"layers_{str(layer_index)}"

        # Self-Attention
        t5x_attention_key = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["key"]["kernel"]
        t5x_attention_out = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["out"]["kernel"]
        t5x_attention_query = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["query"]["kernel"]
        t5x_attention_value = t5x_model["target"]["decoder"][layer_name][
            "self_attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_pre_attention_layer_norm = t5x_model["target"]["decoder"][
            layer_name]["pre_self_attention_layer_norm"]["scale"]

        # Encoder-Decoder-Attention
        t5x_enc_dec_attention_key = t5x_model["target"]["decoder"][layer_name][
            "encoder_decoder_attention"]["key"]["kernel"]
        t5x_enc_dec_attention_out = t5x_model["target"]["decoder"][layer_name][
            "encoder_decoder_attention"]["out"]["kernel"]
        t5x_enc_dec_attention_query = t5x_model["target"]["decoder"][
            layer_name]["encoder_decoder_attention"]["query"]["kernel"]
        t5x_enc_dec_attention_value = t5x_model["target"]["decoder"][
            layer_name]["encoder_decoder_attention"]["value"]["kernel"]

        # Layer Normalization
        t5x_cross_layer_norm = t5x_model["target"]["decoder"][layer_name][
            "pre_cross_attention_layer_norm"]["scale"]

        # MLP
        if split_mlp_wi:
            t5x_mlp_wi_0 = t5x_model["target"]["decoder"][layer_name]["mlp"][
                "wi_0"]["kernel"]
            t5x_mlp_wi_1 = t5x_model["target"]["decoder"][layer_name]["mlp"][
                "wi_1"]["kernel"]
        else:
            t5x_mlp_wi = t5x_model["target"]["decoder"][layer_name]["mlp"][
                "wi"]["kernel"]

        t5x_mlp_wo = t5x_model["target"]["decoder"][layer_name]["mlp"]["wo"][
            "kernel"]

        # Layer Normalization
        tx5_mlp_layer_norm = t5x_model["target"]["decoder"][layer_name][
            "pre_mlp_layer_norm"]["scale"]

        # Assigning
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["k"]["kernel"] = t5x_attention_key
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["o"]["kernel"] = t5x_attention_out
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["q"]["kernel"] = t5x_attention_query
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "SelfAttention"]["v"]["kernel"] = t5x_attention_value

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["0"][
            "layer_norm"]["weight"] = t5x_pre_attention_layer_norm

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["k"]["kernel"] = t5x_enc_dec_attention_key
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["o"]["kernel"] = t5x_enc_dec_attention_out
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["q"]["kernel"] = t5x_enc_dec_attention_query
        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "EncDecAttention"]["v"]["kernel"] = t5x_enc_dec_attention_value

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["1"][
            "layer_norm"]["weight"] = t5x_cross_layer_norm

        if split_mlp_wi:
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"][
                "2"]["DenseReluDense"]["wi_0"]["kernel"] = t5x_mlp_wi_0
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"][
                "2"]["DenseReluDense"]["wi_1"]["kernel"] = t5x_mlp_wi_1
        else:
            flax_model.params["decoder"]["block"][str(layer_index)]["layer"][
                "2"]["DenseReluDense"]["wi"]["kernel"] = t5x_mlp_wi

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"][
            "DenseReluDense"]["wo"]["kernel"] = t5x_mlp_wo

        flax_model.params["decoder"]["block"][str(layer_index)]["layer"]["2"][
            "layer_norm"]["weight"] = tx5_mlp_layer_norm

    # Decoder Normalization
    tx5_decoder_norm = t5x_model["target"]["decoder"]["decoder_norm"]["scale"]
    flax_model.params["decoder"]["final_layer_norm"][
        "weight"] = tx5_decoder_norm

    # Only for layer 0:
    t5x_decoder_rel_embedding = t5x_model["target"]["decoder"]["relpos_bias"][
        "rel_embedding"].T
    flax_model.params["decoder"]["block"]["0"]["layer"]["0"]["SelfAttention"][
        "relative_attention_bias"]["embedding"] = t5x_decoder_rel_embedding

    # Token Embeddings
    tx5_token_embeddings = t5x_model["target"]["token_embedder"]["embedding"]
    flax_model.params["shared"]["embedding"] = tx5_token_embeddings

    # LM Head (only in v1.1 checkpoints)
    if "logits_dense" in t5x_model["target"]["decoder"]:
        flax_model.params["lm_head"]["kernel"] = t5x_model["target"][
            "decoder"]["logits_dense"]["kernel"]

    flax_model.save_pretrained(flax_dump_folder_path)
    print("T5X Model was sucessfully converted!")
Example 18
def convert_model(args):
    if os.path.exists(args.decoder_onnx):
        print(f"skip convert_to_onnx since path existed: {args.decoder_onnx}")
    else:
        assert args.model_type == "gpt2", "please have onnx model ready for model type that is not gpt2"
        gpt2_to_onnx(args)

    # TODO: fix shape inference for T5. Currently symbolic shape inference on T5 is broken.
    enable_shape_inference = args.model_type == "gpt2"

    if enable_shape_inference:
        print(f"Run symbolic shape inference on {args.decoder_onnx}. The file will be overwritten.")
        shape_inference(args.decoder_onnx)

    global config
    if args.model_type == "gpt2":
        config = GPT2Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    else:
        config = T5Config.from_pretrained(args.model_name_or_path, cache_dir=args.cache_dir)
    print(config)

    eos_token_id = config.eos_token_id
    pad_token_id = config.eos_token_id
    vocab_size = config.vocab_size

    # if vocab_size is given in parameters use that.
    if args.vocab_size != -1:
        vocab_size = args.vocab_size

    model = onnx.load(args.decoder_onnx)
    model.graph.name = f"{args.model_type} decoder subgraph"

    if args.model_type == "gpt2":
        verify_gpt2_subgraph(model.graph, args.precision)
    else:
        verify_t5_decoder_subgraph(model.graph, args.precision)

    inputs = [
        "input_ids",
        "max_length",
        "min_length",
        "num_beams",
        "num_return_sequences",
        "temperature",
        "length_penalty",
        "repetition_penalty",
        "vocab_mask",
    ]
    if args.prefix_vocab_mask:
        inputs.append("prefix_vocab_mask")

    outputs = ["sequences"]
    if args.output_sequences_scores:
        outputs.append("sequences_scores")

    if args.output_token_scores:
        assert args.output_sequences_scores, "--output_token_scores requires --output_sequences_scores"
        outputs.append("scores")

    node = helper.make_node(
        "BeamSearch",
        inputs=inputs,
        outputs=outputs,
        name=f"BeamSearch_{args.model_type}",
    )
    node.domain = "com.microsoft"
    node.attribute.extend(
        [
            helper.make_attribute("eos_token_id", eos_token_id),
            helper.make_attribute("pad_token_id", pad_token_id),
            helper.make_attribute("no_repeat_ngram_size", args.no_repeat_ngram_size),
            helper.make_attribute("early_stopping", 1 if args.early_stopping else 0),
            helper.make_attribute("model_type", 0 if args.model_type == "gpt2" else 1),
            helper.make_attribute("decoder", model.graph),
        ]
    )

    if args.model_type == "t5":
        if enable_shape_inference:
            print(f"Run symbolic shape inference on {args.encoder_decoder_init_onnx}. The file will be overwritten.")
            shape_inference(args.encoder_decoder_init_onnx)
        init_model = onnx.load(args.encoder_decoder_init_onnx)
        init_model.graph.name = f"{args.model_type} encoder decoder init subgraph"
        verify_t5_encoder_decoder_init_subgraph(init_model.graph, args.precision)
        node.attribute.extend(
            [
                helper.make_attribute("encoder_decoder_init", init_model.graph),
            ]
        )

    from onnx import TensorProto

    # graph inputs
    input_ids = helper.make_tensor_value_info("input_ids", TensorProto.INT32, ["batch_size", "sequence_length"])
    max_length = helper.make_tensor_value_info("max_length", TensorProto.INT32, [1])
    min_length = helper.make_tensor_value_info("min_length", TensorProto.INT32, [1])
    num_beams = helper.make_tensor_value_info("num_beams", TensorProto.INT32, [1])
    num_return_sequences = helper.make_tensor_value_info("num_return_sequences", TensorProto.INT32, [1])
    temperature = helper.make_tensor_value_info("temperature", TensorProto.FLOAT, [1])
    length_penalty = helper.make_tensor_value_info("length_penalty", TensorProto.FLOAT, [1])
    repetition_penalty = helper.make_tensor_value_info("repetition_penalty", TensorProto.FLOAT, [1])
    vocab_mask = helper.make_tensor_value_info("vocab_mask", TensorProto.INT32, [vocab_size])

    graph_inputs = [
        input_ids,
        max_length,
        min_length,
        num_beams,
        num_return_sequences,
        temperature,
        length_penalty,
        repetition_penalty,
        vocab_mask,
    ]

    if args.prefix_vocab_mask:
        prefix_vocab_mask = helper.make_tensor_value_info(
            "prefix_vocab_mask", TensorProto.INT32, ["batch_size", vocab_size]
        )
        graph_inputs.append(prefix_vocab_mask)

    # graph outputs
    sequences = helper.make_tensor_value_info(
        "sequences",
        TensorProto.INT32,
        ["batch_size", "num_return_sequences", "max_length"],
    )

    sequences_scores = helper.make_tensor_value_info(
        "sequences_scores", TensorProto.FLOAT, ["batch_size", "num_return_sequences"]
    )

    scores = helper.make_tensor_value_info(
        "scores",
        TensorProto.FLOAT,
        ["max_length - sequence_length", "batch_size", "num_beams", vocab_size],
    )

    initializers = []

    graph_outputs = [sequences]

    if args.output_sequences_scores:
        graph_outputs.append(sequences_scores)

    if args.output_token_scores:
        graph_outputs.append(scores)

    new_graph = helper.make_graph(
        [node],
        f"{args.model_type}-beam-search",
        graph_inputs,
        graph_outputs,
        initializers,
    )

    # Create the model
    new_model = helper.make_model(
        new_graph,
        producer_name="onnxruntime.transformers",
        opset_imports=model.opset_import,
    )
    onnx.save(new_model, args.output)
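A hedged sketch (not part of the original script) of running the exported beam-search graph with onnxruntime; the output path, token ids, and vocabulary size are assumptions for illustration:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession("beam_search.onnx")  # hypothetical value of args.output
vocab_size = 32128  # t5-base config vocab size; adjust to the converted model
feeds = {
    "input_ids": np.array([[37, 423, 1]], dtype=np.int32),  # arbitrary token ids
    "max_length": np.array([32], dtype=np.int32),
    "min_length": np.array([1], dtype=np.int32),
    "num_beams": np.array([4], dtype=np.int32),
    "num_return_sequences": np.array([1], dtype=np.int32),
    "temperature": np.array([1.0], dtype=np.float32),
    "length_penalty": np.array([1.0], dtype=np.float32),
    "repetition_penalty": np.array([1.0], dtype=np.float32),
    "vocab_mask": np.ones(vocab_size, dtype=np.int32),  # 1 = token allowed
}
sequences = sess.run(["sequences"], feeds)[0]
print(sequences.shape)  # (batch_size, num_return_sequences, max_length)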
Example 19
from tqdm.notebook import tqdm
from transformers import T5Config, T5Tokenizer, T5ForConditionalGeneration

print("Downloading and unzipping model.", file=sys.stderr)
os.system(
    "wget -nc https://storage.googleapis.com/doctttttquery_git/t5-base.zip")
os.system("unzip -o t5-base.zip")

nltk.download('punkt')

# Define the target device. Use GPU if available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Instantiate and load the QG model to the GPU.
qg_tokenizer = T5Tokenizer.from_pretrained('t5-base')
qg_config = T5Config.from_pretrained('t5-base')
qg_model = T5ForConditionalGeneration.from_pretrained('model.ckpt-1004000',
                                                      from_tf=True,
                                                      config=qg_config)

qg_model.to(device)


def preprocess(document: str, span=10, stride=5) -> List[str]:
    """
    Define your preprocessing function.

    This function should take a corpus document and output a list of generation
    spans. This is required so we can match the expected sequence size of the
    generation model.
    """
Example 20
    def __init__(
        self,
        model_name,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        if args and "manual_seed" in args:
            random.seed(args["manual_seed"])
            np.random.seed(args["manual_seed"])
            torch.manual_seed(args["manual_seed"])
            if "n_gpu" in args and args["n_gpu"] > 0:
                torch.cuda.manual_seed_all(args["manual_seed"])

        self.args = {
            "dataset_class": None,
            "do_sample": False,
            "max_steps": -1,
            "evaluate_generated_text": False,
            "num_beams": 1,
            "max_length": 20,
            "repetition_penalty": 1.0,
            "length_penalty": 2.0,
            "early_stopping": True,
            "preprocess_inputs": True,
        }

        self.args.update(global_args)

        if args:
            self.args.update(args)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        self.config = T5Config.from_pretrained(model_name,
                                               **self.args["config"])

        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name)

        if not use_cuda:
            self.args["fp16"] = False

        self.args["model_name"] = model_name

        if self.args["wandb_project"] and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args["wandb_project"] = None
Example 21
    def _build_vocab(self, max_vocab_cnt):
        # build vocab
        if self.tokenizer_type.startswith('word'):
            self._build_vocab_manual(max_vocab_cnt)
        elif self.tokenizer_type.startswith('bert-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 30522  # fixed for pretrained BERT vocab (old version)
            config_pretrained = BertConfig.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

        elif self.tokenizer_type.startswith('xlnet-'):
            # self.vocab = self.tokenizer.vocab
            # self.rev_vocab = self.tokenizer.ids_to_tokens
            # self.pad_id = self.vocab["[PAD]"]
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000  # fixed for pretrained BERT vocab
            config_pretrained = XLNetConfig.from_pretrained(
                self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('x5-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000
            config_pretrained = T5Config.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}
            self.vocab = map_vocab
            self.rev_vocab = inv_map

        elif self.tokenizer_type.startswith('bart-'):
            self.pad_id = self.tokenizer.sp_model.piece_to_id("<pad>")
            # self.vocab_count = 32000  # fixed for pretrained BERT vocab
            config_pretrained = BartConfig.from_pretrained(self.tokenizer_type)
            self.vocab_count = config_pretrained.vocab_size

            map_vocab = {}
            for ind in range(self.vocab_count):
                map_vocab[ind] = self.tokenizer.sp_model.id_to_piece(ind)

            inv_map = {v: k for k, v in map_vocab.items()}

        return
Example 22
        print("starting to train")
        # train
        word_tokens_train, pos_tokens_train = tasks.pos('UD_English-EWT/en_ewt-ud-train.conllu')
        tokenizer = T5Tokenizer.from_pretrained("t5-small")

        ## i want to append pos: - do I include the pos token associated with it?
        if args.control:
            word_tokens_train, pos_tokens_train = tasks.make_control(tokenizer, word_tokens_train, pos_tokens_train, args.embsize)

        torch_ids_train, torch_masks_train, torch_token_starts, torch_labels_train = r.prepare_data(tokenizer, word_tokens_train, pos_tokens_train)

        # data for training
        split = int(0.75 * len(torch_ids_train))
        #dataset_train = Dataset(torch_ids_train[:split], torch_masks_train[:split], torch_labels_train[:split])
        #dataset_dev = Dataset(torch_ids_train[split:], torch_masks_train[split:], torch_labels_train[split:])
        config = T5Config.from_pretrained("t5-small", output_hidden_states=True, output_attentions=True)
        model = T5ForConditionalGeneration.from_pretrained("t5-small", config=config)
        model.to(device)
        #train(model, dataset_train, dataset_dev, torch_token_starts[split:], tokenizer)

        # 100 values test
        dataset_train = Dataset(torch_ids_train[:200], torch_masks_train[:200], torch_labels_train[:200])
        dataset_dev = Dataset(torch_ids_train[200:400], torch_masks_train[200:400], torch_labels_train[200:400])

        train(model, dataset_train, dataset_dev, torch_token_starts[200:400], tokenizer)

        print("done!")

    else:
        print("starting to evaluate")
        tokenizer = T5Tokenizer.from_pretrained("t5-small")
Example 23
        'input_ids', 'attention_mask', 'labels', 'decoder_attention_mask'
    ]
    encoded.set_format(type='torch', columns=columns)

    train_dataloader = torch.utils.data.DataLoader(encoded["train"],
                                                   collate_fn=collate_fn,
                                                   batch_size=args.batch_size)
    val_dataloader = torch.utils.data.DataLoader(encoded["validation"],
                                                 collate_fn=collate_fn,
                                                 batch_size=args.batch_size *
                                                 4)

    if args.from_pretrained:
        model = T5ForConditionalGeneration.from_pretrained(args.model_select)
    else:
        config = T5Config.from_pretrained(args.model_select)
        model = T5ForConditionalGeneration(config)

    no_decay = ["bias", "LayerNorm.weight"]
    params_decay = [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ]
    params_nodecay = [
        p for n, p in model.named_parameters()
        if any(nd in n for nd in no_decay)
    ]
    optim_groups = [
        {
            "params": params_decay,
            "weight_decay": 0.1
Example 24
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = T5Config.from_pretrained(
            model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
        )
    elif model_args.model_name_or_path:
        config = T5Config.from_pretrained(
            model_args.model_name_or_path, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
        )
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
Example 25
 def __init__(self, tokenizer):
     super(T5Model, self).__init__()
     self.tokenizer = tokenizer
     config = T5Config.from_pretrained('t5-small')
     self.model = T5ForConditionalGeneration(config=config)
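A hedged usage sketch; note that the wrapped model is built from the config alone, so its weights are randomly initialized until trained or loaded elsewhere:

from transformers import T5Tokenizer

# Hypothetical instantiation of the wrapper defined above.
tokenizer = T5Tokenizer.from_pretrained('t5-small')
wrapper = T5Model(tokenizer)
print(sum(p.numel() for p in wrapper.model.parameters()), "parameters")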
Example 26
    def __init__(
        self,
        model_name,
        args=None,
        use_cuda=True,
        cuda_device=-1,
        **kwargs,
    ):

        """
        Initializes a T5Model model.

        Args:
            model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
            args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
            use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
            cuda_device (optional): Specific GPU that should be used. Will use the first available GPU by default.
            **kwargs (optional): For providing proxies, force_download, resume_download, cache_dir and other options specific to the 'from_pretrained' implementation where this will be supplied.
        """  # noqa: ignore flake8"

        self.args = self._load_model_args(model_name)

        if isinstance(args, dict):
            self.args.update_from_dict(args)
        elif isinstance(args, T5Args):
            self.args = args

        if "sweep_config" in kwargs:
            sweep_config = kwargs.pop("sweep_config")
            sweep_values = {
                key: value["value"]
                for key, value in sweep_config.as_dict().items()
                if key != "_wandb"
            }
            self.args.update_from_dict(sweep_values)

        if self.args.manual_seed:
            random.seed(self.args.manual_seed)
            np.random.seed(self.args.manual_seed)
            torch.manual_seed(self.args.manual_seed)
            if self.args.n_gpu > 0:
                torch.cuda.manual_seed_all(self.args.manual_seed)

        if use_cuda:
            if torch.cuda.is_available():
                if cuda_device == -1:
                    self.device = torch.device("cuda")
                else:
                    self.device = torch.device(f"cuda:{cuda_device}")
            else:
                raise ValueError(
                    "'use_cuda' set to True when cuda is unavailable."
                    "Make sure CUDA is available or set `use_cuda=False`.")
        else:
            self.device = "cpu"

        self.results = {}

        self.config = T5Config.from_pretrained(model_name, **self.args.config)

        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name, config=self.config)

        self.tokenizer = T5Tokenizer.from_pretrained(model_name, truncate=True)

        if self.args.dynamic_quantize:
            self.model = torch.quantization.quantize_dynamic(self.model,
                                                             {torch.nn.Linear},
                                                             dtype=torch.qint8)

        if not use_cuda:
            self.args.fp16 = False

        self.args.model_type = "T5"
        self.args.model_name = model_name

        if self.args.wandb_project and not wandb_available:
            warnings.warn(
                "wandb_project specified but wandb is not available. Wandb disabled."
            )
            self.args.wandb_project = None
Example 27
# This is a very small notebook showing how to grab a pre-trained T5 model, fine-tune it, and export it to ONNX.
# A lot of this is inspired by huggingface.

from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config, AdamW
import torch
from onnxt5 import generate_onnx_representation, GenerativeT5
from onnxt5.api import get_sess
import tempfile

temp_dir = tempfile.gettempdir()

base_model = "t5-base"

# Setting up the model and tokenizer
config = T5Config.from_pretrained(base_model)
config.n_positions = 256  # You can change the properties of your model here
model = T5ForConditionalGeneration(config=config)

# Download vocab file
tokenizer = T5Tokenizer(config=config, vocab_file="test_sentencepiece.model")
model.train()

# Let's setup our optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [
        p for n, p in model.named_parameters()
        if not any(nd in n for nd in no_decay)
    ],
    'weight_decay':
Example 28
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    # Sending telemetry. Tracking the example usage helps us better allocate resources to maintain them. The
    # information sent is the one passed as arguments along with your Python/PyTorch versions.
    send_example_telemetry("run_t5_mlm",
                           model_args,
                           data_args,
                           framework="flax")

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty."
            "Use --overwrite_output_dir to overcome.")

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        level=logging.INFO,
        datefmt="[%X]",
    )

    # Log on each process the small summary:
    logger = logging.getLogger(__name__)

    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Handle the repository creation
    if training_args.push_to_hub:
        if training_args.hub_model_id is None:
            repo_name = get_full_repo_name(Path(
                training_args.output_dir).absolute().name,
                                           token=training_args.hub_token)
        else:
            repo_name = training_args.hub_model_id
        repo = Repository(training_args.output_dir, clone_from=repo_name)

    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
    # 'text' is found. You can easily tweak this behavior (see below).
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset(
            data_args.dataset_name,
            data_args.dataset_config_name,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    else:
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(
            extension,
            data_files=data_files,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )

        if "validation" not in datasets.keys():
            datasets["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
            datasets["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
                use_auth_token=True if model_args.use_auth_token else None,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Load pretrained model and tokenizer

    if model_args.tokenizer_name:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif model_args.model_name_or_path:
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_fast=model_args.use_fast_tokenizer,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        raise ValueError(
            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
        )

    if model_args.config_name:
        config = T5Config.from_pretrained(
            model_args.config_name,
            cache_dir=model_args.cache_dir,
            vocab_size=len(tokenizer),
            use_auth_token=True if model_args.use_auth_token else None,
        )
    elif model_args.model_name_or_path:
        config = T5Config.from_pretrained(
            model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning(
            "You are instantiating a new config instance from scratch.")

    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
        column_names = datasets["train"].column_names
    else:
        column_names = datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
    def tokenize_function(examples):
        return tokenizer(examples[text_column_name],
                         return_attention_mask=False)

    tokenized_datasets = datasets.map(
        tokenize_function,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        remove_columns=column_names,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
    # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
    # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
        inputs_length=max_seq_length,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
    )

    # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
    def group_texts(examples):
        # Concatenate all texts.
        concatenated_examples = {
            k: list(chain(*examples[k]))
            for k in examples.keys()
        }
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder; we could instead pad if the model supported it.
        # You can customize this part to your needs.
        if total_length >= expanded_inputs_length:
            total_length = (total_length //
                            expanded_inputs_length) * expanded_inputs_length
        # Split by chunks of max_len.
        result = {
            k: [
                t[i:i + expanded_inputs_length]
                for i in range(0, total_length, expanded_inputs_length)
            ]
            for k, t in concatenated_examples.items()
        }
        return result
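    # Illustrative toy example (values made up, not from the original script): with
    # expanded_inputs_length = 4, {"input_ids": [[1, 2, 3], [4, 5, 6, 7, 8]]} is grouped into
    # {"input_ids": [[1, 2, 3, 4], [5, 6, 7, 8]]}; a trailing remainder shorter than 4 would be dropped.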

    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
    # might be slower to preprocess.
    #
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
    tokenized_datasets = tokenized_datasets.map(
        group_texts,
        batched=True,
        num_proc=data_args.preprocessing_num_workers,
        load_from_cache_file=not data_args.overwrite_cache,
    )

    # Enable TensorBoard only on the main process
    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
        try:
            from flax.metrics.tensorboard import SummaryWriter

            summary_writer = SummaryWriter(
                log_dir=Path(training_args.output_dir))
        except ImportError as ie:
            has_tensorboard = False
            logger.warning(
                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
            )
    else:
        logger.warning(
            "Unable to display metrics through TensorBoard: either the package is not installed "
            "(run `pip install tensorboard` to enable it) or this is not the main process.")

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())
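    # One dropout PRNG key per local device; the pmapped train step below returns a fresh key
    # for each device so that dropout masks change from step to step.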

    if model_args.model_name_or_path:
        model = FlaxT5ForConditionalGeneration.from_pretrained(
            model_args.model_name_or_path,
            config=config,
            seed=training_args.seed,
            dtype=getattr(jnp, model_args.dtype),
            use_auth_token=True if model_args.use_auth_token else None,
        )
    else:
        config.vocab_size = len(tokenizer)
        model = FlaxT5ForConditionalGeneration(
            config,
            seed=training_args.seed,
            dtype=getattr(jnp, model_args.dtype),
        )

    # Data collator
    # This one will take care of randomly masking the tokens.
    data_collator = FlaxDataCollatorForT5MLM(
        tokenizer=tokenizer,
        noise_density=data_args.mlm_probability,
        mean_noise_span_length=data_args.mean_noise_span_length,
        input_length=max_seq_length,
        target_length=targets_length,
        pad_token_id=model.config.pad_token_id,
        decoder_start_token_id=model.config.decoder_start_token_id,
    )
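    # Roughly speaking, the collator takes the `expanded_inputs_length`-long chunks produced by
    # `group_texts`, samples noise spans, replaces each span with a sentinel token
    # (<extra_id_0>, <extra_id_1>, ...) and returns `input_ids` of length `max_seq_length`
    # together with `labels` of length `targets_length`.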

    # Store some constants
    num_epochs = int(training_args.num_train_epochs)
    train_batch_size = int(
        training_args.per_device_train_batch_size) * jax.device_count()
    per_device_eval_batch_size = int(training_args.per_device_eval_batch_size)
    eval_batch_size = per_device_eval_batch_size * jax.device_count()

    num_train_steps = len(
        tokenized_datasets["train"]) // train_batch_size * num_epochs

    num_of_hosts = jax.process_count()
    current_host_idx = jax.process_index()

    # Create learning rate schedule
    warmup_fn = optax.linear_schedule(
        init_value=0.0,
        end_value=training_args.learning_rate,
        transition_steps=training_args.warmup_steps)
    decay_fn = optax.linear_schedule(
        init_value=training_args.learning_rate,
        end_value=0,
        transition_steps=num_train_steps - training_args.warmup_steps,
    )
    linear_decay_lr_schedule_fn = optax.join_schedules(
        schedules=[warmup_fn, decay_fn],
        boundaries=[training_args.warmup_steps])
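    # Illustrative sketch of the joined schedule (example numbers, not defaults): with
    # warmup_steps = 1000, learning_rate = 5e-3 and num_train_steps = 10000, the rate ramps
    # linearly from 0 to 5e-3 over the first 1000 steps (e.g. 2.5e-3 at step 500) and then
    # decays linearly back to 0 over the remaining 9000 steps (e.g. 2.5e-3 at step 5500).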

    # We use Optax's "masking" functionality to not apply weight decay
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        # Find all LayerNorm parameters
        layer_norm_candidates = ["layernorm", "layer_norm", "ln"]
        layer_norm_named_params = set([
            layer[-2:] for layer_norm_name in layer_norm_candidates
            for layer in flat_params.keys()
            if layer_norm_name in "".join(layer).lower()
        ])
        flat_mask = {
            path: (path[-1] != "bias"
                   and path[-2:] not in layer_norm_named_params)
            for path in flat_params
        }
        return traverse_util.unflatten_dict(flat_mask)
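    # Illustrative example with made-up parameter paths: a kernel such as
    # ("encoder", ..., "SelfAttention", "q", "kernel") maps to True (weight decay applied),
    # while a path ending in "bias", or one whose last two components name a layer norm
    # parameter, maps to False (excluded from weight decay).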

    # Create the optimizer (Adafactor or AdamW)
    if training_args.adafactor:
        # We use the default parameters here to initialize Adafactor. For more details about the
        # parameters, please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
        optimizer = optax.adafactor(learning_rate=linear_decay_lr_schedule_fn)
    else:
        optimizer = optax.adamw(
            learning_rate=linear_decay_lr_schedule_fn,
            b1=training_args.adam_beta1,
            b2=training_args.adam_beta2,
            weight_decay=training_args.weight_decay,
            mask=decay_mask_fn,
        )

    # Setup train state
    state = train_state.TrainState.create(apply_fn=model.__call__,
                                          params=model.params,
                                          tx=optimizer)
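    # TrainState bundles the apply function, the current parameters and the optimizer state;
    # state.apply_gradients(...) in train_step updates the params and the Optax state in one call.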

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)

        def loss_fn(params):
            labels = batch.pop("labels")

            logits = state.apply_fn(**batch,
                                    params=params,
                                    dropout_rng=dropout_rng,
                                    train=True)[0]

            # compute loss
            loss = optax.softmax_cross_entropy(
                logits, onehot(labels, logits.shape[-1])).mean()

            return loss

        grad_fn = jax.value_and_grad(loss_fn)
        loss, grad = grad_fn(state.params)
        grad = jax.lax.pmean(grad, "batch")
        new_state = state.apply_gradients(grads=grad)

        metrics = jax.lax.pmean(
            {
                "loss": loss,
                "learning_rate": linear_decay_lr_schedule_fn(state.step)
            },
            axis_name="batch")

        return new_state, metrics, new_dropout_rng

    # Create parallel version of the train step
    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0, ))
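    # The axis name "batch" given to jax.pmap matches the axis_name used in jax.lax.pmean inside
    # train_step, so gradients and metrics are averaged across all devices; donate_argnums=(0,)
    # lets XLA reuse the buffers of the previous training state.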

    # Define eval fn
    def eval_step(params, batch):
        labels = batch.pop("labels")

        logits = model(**batch, params=params, train=False)[0]

        # compute loss
        loss = optax.softmax_cross_entropy(logits,
                                           onehot(labels, logits.shape[-1]))

        # compute accuracy
        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels)

        # summarize metrics
        metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()}
        metrics = jax.lax.pmean(metrics, axis_name="batch")

        return metrics

    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0, ))

    # Replicate the train state on each device
    state = jax_utils.replicate(state)

    train_time = 0
    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
        train_metrics = []

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)

        # Generate an epoch by shuffling sampling indices from the train dataset
        num_train_samples = len(tokenized_datasets["train"])
        # Avoid using jax.numpy here in case of TPU training
        train_samples_idx = np.random.permutation(np.arange(num_train_samples))
        train_batch_idx = generate_batch_splits(train_samples_idx,
                                                train_batch_size)

        # Gather the indexes for creating the batch and do a training step
        for step, batch_idx in enumerate(
                tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [
                tokenized_datasets["train"][int(idx)] for idx in batch_idx
            ]
            model_inputs = data_collator(samples)

            local_host_model_inputs = {
                key: np.split(model_inputs.data[key], num_of_hosts,
                              axis=0)[current_host_idx]
                for key, value in model_inputs.data.items()
            }
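            # Each host keeps only its own slice of the global batch; `shard` below then splits
            # that slice across the host's local devices.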

            # Model forward
            model_inputs = shard(local_host_model_inputs)
            state, train_metric, dropout_rngs = p_train_step(
                state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)

            cur_step = epoch * (num_train_samples // train_batch_size) + step

            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
                # Save metrics
                train_metric = jax_utils.unreplicate(train_metric)
                train_time += time.time() - train_start
                if has_tensorboard and jax.process_index() == 0:
                    write_train_metric(summary_writer, train_metrics,
                                       train_time, cur_step)

                epochs.write(
                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate:"
                    f" {train_metric['learning_rate'].mean()})")

                train_metrics = []

            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
                # ======================== Evaluating ==============================
                num_eval_samples = len(tokenized_datasets["validation"])
                # Avoid using jax.numpy here in case of TPU training
                eval_samples_idx = np.arange(num_eval_samples)
                eval_batch_idx = generate_batch_splits(eval_samples_idx,
                                                       eval_batch_size,
                                                       drop_last=False)

                eval_metrics = []
                for i, batch_idx in enumerate(
                        tqdm(eval_batch_idx, desc="Evaluating ...",
                             position=2)):
                    samples = [
                        tokenized_datasets["validation"][int(idx)]
                        for idx in batch_idx
                    ]
                    model_inputs = data_collator(samples)

                    # Model forward
                    metrics = pad_shard_unpad(p_eval_step, static_return=True)(
                        state.params,
                        model_inputs.data,
                        min_device_batch=per_device_eval_batch_size)
                    eval_metrics.append(metrics)

                # get eval metrics
                eval_metrics = get_metrics(eval_metrics)
                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

                # Update progress bar
                epochs.write(
                    f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
                )

                # Save metrics
                if has_tensorboard and jax.process_index() == 0:
                    write_eval_metric(summary_writer, eval_metrics, cur_step)

            if cur_step % training_args.save_steps == 0 and cur_step > 0:
                # Save a checkpoint every `save_steps` steps and push it to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(
                        jax.tree_map(lambda x: x[0], state.params))
                    model.save_pretrained(training_args.output_dir,
                                          params=params)
                    tokenizer.save_pretrained(training_args.output_dir)
                    if training_args.push_to_hub:
                        repo.push_to_hub(
                            commit_message=
                            f"Saving weights and logs of step {cur_step}",
                            blocking=False)

    # Eval after training
    if training_args.do_eval:
        num_eval_samples = len(tokenized_datasets["validation"])
        # Avoid using jax.numpy here in case of TPU training
        eval_samples_idx = np.arange(num_eval_samples)
        eval_batch_idx = generate_batch_splits(eval_samples_idx,
                                               eval_batch_size,
                                               drop_last=False)

        eval_metrics = []
        for i, batch_idx in enumerate(
                tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
            samples = [
                tokenized_datasets["validation"][int(idx)] for idx in batch_idx
            ]
            model_inputs = data_collator(samples)

            # Model forward
            metrics = pad_shard_unpad(p_eval_step, static_return=True)(
                state.params,
                model_inputs.data,
                min_device_batch=per_device_eval_batch_size)
            eval_metrics.append(metrics)

        # get eval metrics
        eval_metrics = get_metrics(eval_metrics)
        eval_metrics = jax.tree_map(lambda metric: jnp.mean(metric).item(),
                                    eval_metrics)

        if jax.process_index() == 0:
            eval_metrics = {
                f"eval_{metric_name}": value
                for metric_name, value in eval_metrics.items()
            }
            path = os.path.join(training_args.output_dir, "eval_results.json")
            with open(path, "w") as f:
                json.dump(eval_metrics, f, indent=4, sort_keys=True)
Exemplo n.º 29
0
# NOTE: this snippet is truncated at the top. The imports below and the enclosing function
# signature (named `matrix_norm` here) are reconstructed for readability; in particular,
# `LA` is assumed to be `numpy.linalg`.
import gc
from collections import defaultdict

import torch
from numpy import linalg as LA
from transformers import T5Config, T5ForConditionalGeneration


def matrix_norm(mat, n):
    # Return the n-norm of `mat` (as computed by numpy.linalg.norm) wrapped in a 0-d torch tensor.
    return torch.tensor(LA.norm(mat, n).item())


for dirname in [
        './models/11b/heads'
]:  # for your case, replace dirname with the path to your model directory
    seeds = [0, 1, 2, 3, 4]
    for seed in seeds:
        gc.collect()
        results_encoder = defaultdict(list)
        results_decoder = defaultdict(list)
        table_file_decoder = open(
            f'l1_decoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w')
        table_file_encoder = open(
            f'l1_encoder_{dirname.split("/")[-1]}-{seed}.tsv', 'w')
        config = T5Config.from_pretrained(f'{dirname}-{seed}')
        model = T5ForConditionalGeneration.from_pretrained(f'{dirname}-{seed}',
                                                           config=config)

        org_config = T5Config.from_pretrained(f'./models/11b')
        org_model = T5ForConditionalGeneration.from_pretrained(
            f'./models/11b', config=org_config)

        org_dict = org_model.state_dict()
        trained_dict = model.state_dict()

        for encoder_n in range(24):
            print("seed", seed, encoder_n)
            q_org = org_dict[
                f'encoder.block.{encoder_n}.layer.0.SelfAttention.q.weight']
            q_new = trained_dict[