def initialize(self, resources: Resources, configs: HParams):
    """Prepare this processor for use.

    Stores the shared resources, builds a BERT tokenizer from the
    vocabulary file named in *configs*, and instantiates a pre-trained
    BERT encoder on the best available device (GPU when present).
    """
    self.resource = resources

    # Tokenizer is loaded from the vocab file supplied via the configs.
    path_to_vocab = configs.vocab_file
    self.tokenizer = BERTTokenizer.load(path_to_vocab)

    # Prefer CUDA when available; otherwise fall back to the CPU.
    use_gpu = torch.cuda.is_available()
    self.device = torch.device("cuda" if use_gpu else "cpu")

    # Pre-trained encoder, moved onto the selected device.
    self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
    self.encoder.to(self.device)
# Maximum token sequence length BERT accepts per example.
max_seq_length = 512

# NOTE(review): the f-strings in the prints below carry no placeholders;
# plain string literals would do — left unchanged here.
print(f"Processing the IMDB reviews...")

# Load the IMDB train/dev splits through the registered processor class.
processor_class = get_processor_class("IMDB")
imdb_processor = processor_class("data/IMDB")
train_examples = imdb_processor.get_train_examples()
dev_examples = imdb_processor.get_dev_examples()

# Flatten the raw review text of both splits into a single list.
reviews = [
    example.text_a
    for dataset in [train_examples, dev_examples]
    for example in dataset
]

# create a BERT tokenizer
vocab_file = "data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
tokenizer = BERTTokenizer.load(vocab_file)

# BERT encoder, moved to GPU when one is available, else CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
encoder.to(device)

print(f"Encoding the text using BERT Tokenizer...")

# Record schema for the pickled output: fixed-length features, with the
# token-id and segment-id sequences padded/truncated to max_seq_length.
feature_original_types = {
    "id": ["int64", "FixedLenFeature"],
    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
    "text": ["str", "FixedLenFeature"]
}

# Write the encoded examples out to data/imdb.pkl; the writer body
# continues beyond this chunk of the file.
with RecordData.writer("data/imdb.pkl", feature_original_types) as writer: