Example #1
    def __init__(self,
                 n_labels,
                 hidden_size,
                 dropout=0.2,
                 label_ignore_idx=0,
                 max_seq_length=128,
                 batch_size=32,
                 head_init_range=0.04,
                 device='cuda',
                 vocab_size=320):
        super().__init__()
        self.n_labels = n_labels

        self.linear_1 = nn.Linear(hidden_size, hidden_size)
        self.classification_head = nn.Linear(hidden_size, n_labels)
        self.label_ignore_idx = label_ignore_idx
        self.tokenizer = ReformerTokenizer.from_pretrained(
            'google/reformer-crime-and-punishment')
        config = ReformerConfig(
            axial_pos_shape=[batch_size,
                             int(max_seq_length / batch_size)])
        self.model = ReformerModel(config)
        self.dropout = nn.Dropout(dropout)

        self.device = device

        # initializing classification head
        self.classification_head.weight.data.normal_(mean=0.0,
                                                     std=head_init_range)
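    # NOTE: the listing above only shows the constructor. A minimal forward pass,
    # sketched here as an assumption (it is not part of the original example),
    # could use the backbone, dropout and classification head defined above:
    def forward(self, input_ids, labels=None, attention_mask=None):
        # first element of the Reformer output tuple is the sequence of hidden states
        hidden_states = self.model(input_ids, attention_mask=attention_mask)[0]
        hidden_states = self.dropout(hidden_states)
        hidden_states = nn.functional.relu(self.linear_1(hidden_states))
        logits = self.classification_head(hidden_states)

        if labels is not None:
            # ignore padding positions when computing the token-classification loss
            loss_fct = nn.CrossEntropyLoss(ignore_index=self.label_ignore_idx)
            loss = loss_fct(logits.view(-1, self.n_labels), labels.view(-1))
            return loss, logits
        return logits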
def prepare_dataset(max_length):
    # get pretrained tokenizer
    tokenizer = ReformerTokenizer.from_pretrained(
        "patrickvonplaten/reformer-crime-and-punish")

    # define our map function to reduce the dataset to one sample
    def flatten_and_tokenize(batch):
        all_input_text = ["".join(batch["line"])]
        input_ids_dict = tokenizer.batch_encode_plus(
            all_input_text,
            pad_to_max_length=True,
            max_length=max_length,
        )

        # duplicate the data 8 times to have 8 examples in the dataset
        for key in input_ids_dict.keys():
            input_ids_dict[key] = [8 * [x] for x in input_ids_dict[key]][0]
        return input_ids_dict

    # load the dataset
    dataset = nlp.load("crime_and_punish", split="train")

    # reduce the dataset
    dataset = dataset.map(flatten_and_tokenize,
                          batched=True,
                          batch_size=-1,
                          remove_columns=["line"])

    # prepare dataset to be in torch format
    dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

    return dataset
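# Hypothetical usage of the helper above (not part of the original example):
# build the eight-sample dataset with an assumed sequence length of 2 ** 13
# tokens and wrap it in a standard PyTorch DataLoader.
from torch.utils.data import DataLoader

dataset = prepare_dataset(max_length=2 ** 13)
loader = DataLoader(dataset, batch_size=4)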
Example #3
    def test_pretrained_generate_use_cache_equality(self):
        model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
        tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
        model.eval()
        input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
        output_ids_with_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=True)
        output_ids_without_cache = model.generate(input_ids, max_length=130, num_hashes=8, use_cache=False)

        output_with_cache = tokenizer.decode(output_ids_with_cache[0])
        output_without_cache = tokenizer.decode(output_ids_without_cache[0])

        self.assertEqual(output_with_cache, output_without_cache)
    def test_pretrained_generate_crime_and_punish(self):
        model = ReformerModelWithLMHead.from_pretrained("google/reformer-crime-and-punishment").to(torch_device)
        tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")
        model.eval()

        input_ids = tokenizer.encode("A few months later", return_tensors="pt").to(torch_device)
        output_ids = model.generate(
            input_ids, max_length=50, num_beams=4, early_stopping=True, do_sample=False, num_hashes=8
        )
        output_text = tokenizer.decode(output_ids[0])
        self.assertEqual(
            output_text,
            "A few months later state expression in his ideas, at the first entrance. He was positively for an inst",
        )
    def test_tokenization_reformer(self):
        # Given
        self.base_tokenizer = ReformerTokenizer.from_pretrained(
            'google/reformer-crime-and-punishment',
            do_lower_case=False,
            cache_dir=self.test_dir)
        self.rust_tokenizer = PyReformerTokenizer(get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
            ['google/reformer-crime-and-punishment']),
                                                  do_lower_case=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust,
                  baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline[
                'input_ids'], f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n ' \
                              f'Sentence a: {self.examples[idx].text_a} \n' \
                              f'Sentence b: {self.examples[idx].text_b} \n' \
                              f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                              f'Rust: {rust.token_ids} \n' \
                              f'Python {baseline["input_ids"]}'
            assert (
                rust.special_tokens_mask == baseline['special_tokens_mask'])
    def test_full_tokenizer(self):
        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens),
            [285, 46, 10, 170, 382],
        )

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "9",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "é",
                ".",
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids,
            [
                8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46,
                72, 80, 6, 0, 4
            ],
        )

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "<unk>",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "<unk>",
                ".",
            ],
        )
    def setUp(self):
        super().setUp()

        tokenizer = ReformerTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
    def big_tokenizer(self):
        return ReformerTokenizer.from_pretrained(
            "google/reformer-crime-and-punishment")
Example #9
def main():
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))

    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses(
        )

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO
        if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    tokenizer = ReformerTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = ReformerForQuestionAnswering.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_examples = DeepThinkDataset(data_args.input_train_file)
    train_dataset = DTDataset(tokenizer, train_examples,
                              data_args.max_seq_length)
    eval_examples = DeepThinkDataset(data_args.input_eval_file)
    eval_dataset = DTDataset(tokenizer, eval_examples,
                             data_args.max_seq_length)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        data_collator=DummyDataCollator(),
        prediction_loss_only=True,
    )

    # Training
    if training_args.do_train:
        trainer.train(
            model_path=model_args.model_name_or_path
            if os.path.isdir(model_args.model_name_or_path) else None)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_master():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        output_eval_file = os.path.join(training_args.output_dir,
                                        "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(eval_output.keys()):
                logger.info("  %s = %s", key, str(eval_output[key]))
                writer.write("%s = %s\n" % (key, str(eval_output[key])))

        results.update(eval_output)

    return results
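# The original listing ends with main(); under the usual script convention it
# would be invoked through an entry-point guard like this (assumed, not shown
# in the source):
if __name__ == "__main__":
    main()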
Example #10
import torch
from transformers import ReformerTokenizer, ReformerModel

MODEL_MAX_LENGTH = 4608

tokenizer_config_path = "protein_reformer/spiece.model"
tokenizer = ReformerTokenizer(vocab_file=tokenizer_config_path,
                              do_lower_case=True,
                              model_max_length=MODEL_MAX_LENGTH)

model_checkpoint = 'output/checkpoint-6500/'
model = ReformerModel.from_pretrained(model_checkpoint)

sequence_file_path = "data/yeast/yeast.txt"
with open(sequence_file_path, "r") as f:
    sequence_txt = f.readlines()

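# tokenize each protein sequence and move the resulting ids to the GPU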
input_sequence_list = [
    tokenizer(sequence.strip(), truncation=True,
              return_tensors='pt')['input_ids'].cuda()
    for sequence in sequence_txt
]
model.cuda()
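# average the final hidden layer over the sequence dimension to get one vector per protein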
protein_vectors_list = [
    torch.mean(model(inp)[1][-1], dim=1) for inp in input_sequence_list
]
protein_vectors = torch.cat(protein_vectors_list, dim=0)

from sklearn.manifold import TSNE
# the original snippet is truncated here; the argument below is an assumed completion
protein_vectors_tsne = TSNE(n_components=2).fit_transform(
    protein_vectors.detach().cpu().numpy())
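# Hypothetical follow-up (not part of the original example): scatter-plot the
# 2-D t-SNE projection of the protein embeddings.
import matplotlib.pyplot as plt

plt.scatter(protein_vectors_tsne[:, 0], protein_vectors_tsne[:, 1], s=4)
plt.title("t-SNE of Reformer protein embeddings")
plt.savefig("protein_vectors_tsne.png")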
Example #11
# from transformers import pipeline
# nlp = pipeline("sentiment-analysis")
# result = nlp("I hate you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
# result = nlp("I love you")[0]
# print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

from transformers import ReformerTokenizer, ReformerModel
import torch
tokenizer = ReformerTokenizer.from_pretrained(
    'google/reformer-crime-and-punishment')
model = ReformerModel.from_pretrained('google/reformer-crime-and-punishment',
                                      return_dict=True)
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states)
Example #12
from transformers import ReformerConfig, ReformerModelWithLMHead, ReformerTokenizer, EncoderDecoderConfig, EncoderDecoderModel
from torch.utils.data import DataLoader, Dataset

NUM_BATCHES = None
BATCH_SIZE = 20
LEARNING_RATE = 0.001  # 1e-4
VALIDATE_EVERY = 10
SEQ_LEN = 4608

# spm.SentencePieceTrainer.Train("--input=./data/tokenizer_training/AAresiduals.txt \
#                                 --vocab_size=28 \
#                                 --model_prefix=sequence_tokenizer \
#                                 --model_type=char \
#                                 --character_coverage=1.0")
tokenizer = ReformerTokenizer(vocab_file="sequence_tokenizer.model",
                              do_lower_case=False,
                              model_max_length=SEQ_LEN)
tokenizer.max_model_input_sizes = SEQ_LEN

# def split_file(file,out1,out2,percentage=0.75,isShuffle=True,seed=42):
#     random.seed(seed)
#     with open(file, 'r',encoding="utf-8") as fin, open(out1, 'w') as foutBig, open(out2, 'w') as foutSmall:
#         nLines = sum(1 for line in fin)
#         fin.seek(0)

#         nTrain = int(nLines*percentage)
#         nValid = nLines - nTrain

#         i = 0
#         for line in fin:
#             r = random.random() if isShuffle else 0 # so that always evaluated to true when not isShuffle
Example #13
from transformers import ReformerConfig, ReformerTokenizer, ReformerModel
import sentencepiece as spm
import os


assert os.path.exists('protein_reformer/training_vocab.txt'), \
    f'build a lower-case amino acid txt file to train the tokenizer. content should be: {"ARNDCQEGHILKMFPSTWYVOUBZX".lower()}'
MODEL_MAX_LENGTH = 4608
spm.SentencePieceTrainer.Train(
    "--input=protein_reformer/training_vocab.txt --model_prefix=spiece --vocab_size=30 --pad_id=29 --character_coverage=1.0"
)
os.system("mv spiece.model spiece.vocab protein_reformer")
tokenizer = ReformerTokenizer(vocab_file="protein_reformer/spiece.model",
                              do_lower_case=True,
                              model_max_length=MODEL_MAX_LENGTH)
tokenizer.save_pretrained("protein_reformer")

configuration = ReformerConfig.from_pretrained(
    "google/reformer-crime-and-punishment")
configuration.axial_pos_shape = (64, 72)
configuration.max_position_embeddings = MODEL_MAX_LENGTH
configuration.vocab_size = tokenizer.vocab_size
configuration.pad_token_id = tokenizer.pad_token_id
# configuration.attn_layers = ["local","lsh","local","lsh"]
configuration.output_hidden_states = True
configuration.save_pretrained('protein_reformer/')
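# Hypothetical continuation (not part of the original example): reload the saved
# configuration and tokenizer to build a randomly initialised protein Reformer.
reloaded_config = ReformerConfig.from_pretrained("protein_reformer/")
reloaded_tokenizer = ReformerTokenizer.from_pretrained("protein_reformer/")
protein_model = ReformerModel(reloaded_config)
print(protein_model.config.axial_pos_shape, reloaded_tokenizer.vocab_size)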