Example #1
    def initialize(self, resources: Resources, configs: HParams):
        self.resource = resources
        # Load the BERT vocabulary referenced by the processor config.
        vocab_file = configs.vocab_file
        self.tokenizer = BERTTokenizer.load(vocab_file)

        # Run the encoder on GPU when one is available.
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
        self.encoder.to(self.device)
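
For context, a minimal standalone sketch (not part of the original snippet) of how a tokenizer and encoder initialized this way can encode one sentence; the vocabulary path, sample text, and sequence length are placeholders, while `encode_text` and the encoder call follow texar-pytorch's API:

import torch
from texar.torch.data import BERTTokenizer
from texar.torch.modules import BERTEncoder

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BERTTokenizer.load("vocab.txt")  # placeholder vocab path
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased").to(device)

# Tokenize one sentence into fixed-length ids, segment ids, and a mask.
input_ids, segment_ids, input_mask = tokenizer.encode_text(
    text_a="A movie worth watching.", max_seq_length=128)

ids = torch.tensor([input_ids], device=device)
segs = torch.tensor([segment_ids], device=device)
length = torch.tensor([sum(input_mask)], device=device)

# BERTEncoder returns per-token hidden states and a pooled sentence vector.
output, pooled = encoder(inputs=ids, sequence_length=length, segment_ids=segs)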
Example #2
import torch
from texar.torch.data import BERTTokenizer, RecordData
from texar.torch.modules import BERTEncoder

# Assumption: `get_processor_class` comes from the example's own data
# utilities; adjust the import path to match your project layout.
from utils.data_utils import get_processor_class

max_seq_length = 512

print(f"Processing the IMDB reviews...")
processor_class = get_processor_class("IMDB")
imdb_processor = processor_class("data/IMDB")
train_examples = imdb_processor.get_train_examples()
dev_examples = imdb_processor.get_dev_examples()
reviews = [
    example.text_a for dataset in [train_examples, dev_examples]
    for example in dataset
]

# Create a BERT tokenizer from the pretrained vocabulary file
vocab_file = "data/pretrained_models/uncased_L-12_H-768_A-12/vocab.txt"
tokenizer = BERTTokenizer.load(vocab_file)

# Build the BERT encoder and move it to the selected device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = BERTEncoder(pretrained_model_name="bert-base-uncased")
encoder.to(device)

print(f"Encoding the text using BERT Tokenizer...")
feature_original_types = {
    "id": ["int64", "FixedLenFeature"],
    "input_ids": ["int64", "FixedLenFeature", max_seq_length],
    "segment_ids": ["int64", "FixedLenFeature", max_seq_length],
    "text": ["str", "FixedLenFeature"]
}

with RecordData.writer("data/imdb.pkl", feature_original_types) as writer:
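    # Hedged sketch of the loop body (the original snippet ends at the
    # `with` line): tokenize each review into fixed-length ids and write
    # one record per review, matching the fields in `feature_original_types`.
    for idx, review in enumerate(reviews):
        input_ids, segment_ids, _input_mask = tokenizer.encode_text(
            text_a=review, max_seq_length=max_seq_length)
        writer.write({
            "id": idx,
            "input_ids": input_ids,
            "segment_ids": segment_ids,
            "text": review,
        })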