Example #1
    def __init__(self, tokenizer: AutoTokenizer, file_path: str, args):
        print(file_path)
        assert os.path.isfile(file_path)

        directory, filename = os.path.split(file_path)
        cached_features_file = os.path.join(
            directory, args.bert_model_type + "_cached_mlm_" + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.samples = torch.load(handle)
        else:
            print("Creating features from dataset file at %s", directory)

            # Get the faster tokenizer from tokenizers package
            tokenizer.save_vocabulary(vocab_path='.')
            fast_tokenizer = BertWordPieceTokenizer("vocab.txt",
                                                    lowercase=args.lowercase)
            fast_tokenizer.enable_truncation(tokenizer.max_len)
            fast_tokenizer.enable_padding(max_length=tokenizer.max_len,
                                          pad_token=tokenizer.pad_token)

            self.samples = []

            # Load data over here
            df = pd.read_json(file_path)
            print('SQUAD data: ')

            for _, row in tqdm(df.iterrows(), total=df.shape[0]):
                for paragraph in row['data']['paragraphs']:
                    context = paragraph['context']
                    for qa_pair in paragraph['qas']:
                        question = qa_pair['question']

                        batch = fast_tokenizer.encode(question, context)
                        self.samples.append({
                            'input_ids':
                            batch.ids,
                            'attention_mask':
                            batch.attention_mask
                        })

                        for encoding in batch.overflowing:
                            self.samples.append({
                                'input_ids':
                                encoding.ids,
                                'attention_mask':
                                encoding.attention_mask
                            })

            df = None

            print("Saving features into cached file: ", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                torch.save(self.samples,
                           handle,
                           pickle_protocol=pickle.HIGHEST_PROTOCOL)
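The constructor above is only part of a class; to make it usable you also need the imports it relies on and the usual Dataset protocol methods. A minimal sketch, assuming a hypothetical class name SquadMLMDataset and standard torch.utils.data.Dataset semantics (__len__/__getitem__ returning tensors):

import os
import pickle

import pandas as pd
import torch
from tokenizers import BertWordPieceTokenizer
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import AutoTokenizer


class SquadMLMDataset(Dataset):
    # The __init__ shown in Example #1 would go here unchanged.

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        sample = self.samples[idx]
        return {
            'input_ids': torch.tensor(sample['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(sample['attention_mask'],
                                           dtype=torch.long),
        }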
Example #2
    def __init__(self,
                 tokenizer: AutoTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        print(file_path)
        assert os.path.isfile(file_path)

        block_size = block_size - \
                     (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory, filename = os.path.split(file_path)
        model_type_string = args.model_type.replace('/', '-')
        cached_features_file = os.path.join(
            directory, model_type_string + "_cached_lm_" + str(block_size) +
            "_" + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file %s" % cached_features_file)
            self.examples = []
            with open(cached_features_file, "rb") as handle:
                # The cache is written below as several appended pickle dumps
                # (one per chunk), so keep unpickling until end of file.
                while True:
                    try:
                        self.examples.extend(pickle.load(handle))
                    except EOFError:
                        break
        else:
            print("Creating features from dataset file at %s", directory)

            tokenizer.save_vocabulary(save_directory='.')

            print('---' * 10)
            print("Saving features into cached file %s", cached_features_file)
            filesize = os.path.getsize(file_path)
            t = tqdm(total=filesize)

            read_chunk = 1024 * 1024 * 32  # Adjust according to available memory
            for text in read_in_chunks(file_path, chunk_size=read_chunk):
                # Reset per chunk: each chunk's examples are appended to the
                # cache file below, so they do not have to stay in memory.
                self.examples = []
                text_chunks = chunks(text, 300000)
                for chunk in text_chunks:
                    batch = tokenizer(chunk,
                                      truncation=True,
                                      padding='max_length',
                                      return_overflowing_tokens=True)

                    for ids in batch['input_ids']:
                        self.examples.append(ids)

                with open(cached_features_file, "ab") as handle:
                    pickle.dump(self.examples,
                                handle,
                                protocol=pickle.HIGHEST_PROTOCOL)
                t.update(read_chunk)
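The helpers used above, read_in_chunks and chunks, are not shown in the example. Their behaviour can be inferred from the call sites; a plausible minimal sketch (the exact signatures in the original project may differ) is:

def read_in_chunks(file_path, chunk_size=1024 * 1024):
    # Yield successive pieces of a large text file without loading it
    # all into memory at once.
    with open(file_path, encoding="utf-8") as f:
        while True:
            data = f.read(chunk_size)
            if not data:
                break
            yield data


def chunks(text, n):
    # Split a string into consecutive slices of at most n characters.
    for i in range(0, len(text), n):
        yield text[i:i + n]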
Example #3
    def __init__(self,
                 tokenizer: AutoTokenizer,
                 args,
                 file_path: str,
                 block_size=512):
        print(file_path)
        assert os.path.isfile(file_path)

        block_size = block_size - \
                     (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory, filename = os.path.split(file_path)
        model_type_string = args.model_type.replace('/', '-')
        cached_features_file = os.path.join(
            directory, model_type_string + "_cached_lm_" + str(block_size) +
            "_" + filename)

        if os.path.exists(cached_features_file):
            print("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            print("Creating features from dataset file at %s", directory)

            tokenizer.save_vocabulary(save_directory='.')

            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()

            text_chunks = list(chunks(text, 300000))

            for chunk in tqdm(text_chunks):
                batch = tokenizer(chunk,
                                  truncation=True,
                                  padding='max_length',
                                  return_overflowing_tokens=True)

                for ids in batch['input_ids']:
                    self.examples.append(ids)

            print("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples,
                            handle,
                            protocol=pickle.HIGHEST_PROTOCOL)
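As in Example #1, these constructors only populate self.examples, here a list of fixed-length token-id lists (padded to max_length). A hedged usage sketch, assuming a hypothetical wrapper class CachedTextDataset built around one of the constructors above:

import torch
from torch.utils.data import DataLoader, Dataset


class CachedTextDataset(Dataset):
    # One of the __init__ constructors above would go here.

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # Every entry was padded to the same length, so the default
        # collate_fn can stack batches directly.
        return torch.tensor(self.examples[idx], dtype=torch.long)


# Hypothetical usage (argument names follow the constructors above):
# dataset = CachedTextDataset(tokenizer, args, file_path="train.txt")
# loader = DataLoader(dataset, batch_size=8, shuffle=True)
# for batch in loader:
#     ...  # batch shape: (batch_size, max_length)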