def __init__(
    self,
    data_dir: str,
    tokenizer: PreTrainedTokenizer,
    labels: List[str],
    model_type: str,
    max_seq_length: Optional[int] = None,
    overwrite_cache=False,
    mode: Split = Split.train,
    local_rank=-1,
    to_predict: Optional[List[str]] = None,
):
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}".format(mode, tokenizer.__class__.__name__, str(max_seq_length)),
    )
    if not to_predict:
        examples = read_examples_from_file(data_dir, mode)
    else:
        # Build placeholder examples with dummy labels for the sentences to predict
        examples = [
            InputExample(i, sentence.split(), [labels[0] for word in sentence.split()])
            for i, sentence in enumerate(to_predict)
        ]
    # Make sure only one process builds the dataset; without this guard, concurrent
    # processes could corrupt the cached dataset during distributed training.
    with torch_distributed_zero_first(local_rank):
        # If a cached features file exists, load it instead of rebuilding
        if os.path.exists(cached_features_file) and not overwrite_cache:
            logger.info(f"Loading features from cached file {cached_features_file}")
            self.features = torch.load(cached_features_file)
        else:
            logger.info(f"Converting features from dataset file at {data_dir}")
            self.features = convert_examples_to_features(
                examples,
                labels,
                max_seq_length,
                tokenizer,
                # xlnet has a cls token at the end
                cls_token_at_end=bool(model_type in ["xlnet"]),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences, cf.
                # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(model_type in ["roberta"]),
                pad_on_left=bool(tokenizer.padding_side == "left"),
                pad_token=tokenizer.pad_token_id,
                pad_token_segment_id=tokenizer.pad_token_type_id,
                pad_token_label_id=self.pad_token_label_id,
            )
            if local_rank in [-1, 0]:
                logger.info(f"Saving features into cached file {cached_features_file}")
                torch.save(self.features, cached_features_file)
def __init__(
    self,
    data_dir: str,
    category_num: int,
    tokenizer: PreTrainedTokenizer,
    task: str,
    max_seq_length: Optional[int] = None,
    overwrite_cache=False,
    mode: Split = Split.train,
    local_rank=-1,
):
    processor = processors[task]()
    processor.set_labels(category_num)
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}_{}".format(
            mode.value,
            tokenizer.__class__.__name__,
            str(max_seq_length),
            task,
        ),
    )
    with torch_distributed_zero_first(local_rank):
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        if os.path.exists(cached_features_file) and not overwrite_cache:
            logger.info(f"Loading features from cached file {cached_features_file}")
            self.features = torch.load(cached_features_file)
        else:
            logger.info(f"Creating features from dataset file at {data_dir}")
            label_list = processor.get_labels()
            if mode == Split.dev:
                examples = processor.get_dev_examples(data_dir)
            elif mode == Split.test:
                examples = processor.get_test_examples(data_dir)
            else:
                examples = processor.get_train_examples(data_dir)
            logger.info("Training examples: %s", len(examples))
            # TODO clean up all this to leverage built-in features of tokenizers
            self.features = convert_examples_to_features(
                examples,
                label_list,
                max_seq_length,
                tokenizer,
                pad_on_left=bool(tokenizer.padding_side == "left"),
                pad_token=tokenizer.pad_token_id,
                pad_token_segment_id=tokenizer.pad_token_type_id,
            )
            if local_rank in [-1, 0]:
                logger.info("Saving features into cached file %s", cached_features_file)
                torch.save(self.features, cached_features_file)
def __init__(
    self,
    data_dir: str,
    tokenizer: PreTrainedTokenizer,
    labels: List[str],
    model_type: str,
    max_seq_length: Optional[int] = None,
    overwrite_cache=False,
    mode: Split = Split.train,
    local_rank=-1,
):
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_dir,
        "cached_{}_{}_{}".format(mode.value, tokenizer.__class__.__name__, str(max_seq_length)),
    )
    with torch_distributed_zero_first(local_rank):
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        if os.path.exists(cached_features_file) and not overwrite_cache:
            logger.info(f"Loading features from cached file {cached_features_file}")
            self.features = torch.load(cached_features_file)
        else:
            logger.info(f"Creating features from dataset file at {data_dir}")
            examples = read_examples_from_file(data_dir, mode)
            # TODO clean up all this to leverage built-in features of tokenizers
            self.features = convert_examples_to_features(
                examples,
                labels,
                max_seq_length,
                tokenizer,
                # xlnet has a cls token at the end
                cls_token_at_end=bool(model_type in ["xlnet"]),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences, cf.
                # github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(model_type in ["roberta"]),
                pad_on_left=bool(tokenizer.padding_side == "left"),
                pad_token=tokenizer.pad_token_id,
                pad_token_segment_id=tokenizer.pad_token_type_id,
                pad_token_label_id=self.pad_token_label_id,
            )
            if local_rank in [-1, 0]:
                logger.info(f"Saving features into cached file {cached_features_file}")
                torch.save(self.features, cached_features_file)
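Every snippet above wraps dataset construction in torch_distributed_zero_first. As a reference point, here is a minimal sketch of how such a context manager is commonly implemented, assuming torch.distributed has been initialized whenever local_rank != -1; the helper actually used by this code may differ in detail.

from contextlib import contextmanager

import torch


@contextmanager
def torch_distributed_zero_first(local_rank: int):
    # Non-master processes wait at a barrier while rank 0 (or the single
    # process when local_rank == -1) builds and caches the dataset.
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()
    yield
    # Rank 0 releases the waiting processes once the cache has been written,
    # so they load the features from disk instead of rebuilding them.
    if local_rank == 0:
        torch.distributed.barrier()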
def dataset(
    self,
    predict=False,
    limit_length: Optional[int] = None,
    evaluate=False,
    local_rank=-1,
) -> Dataset:
    with torch_distributed_zero_first(local_rank):
        # Skip the cache entirely in prediction mode; otherwise try to load a saved dataset first.
        dataset = self._data_store.load_dataset(limit_length, evaluate) if not predict else None
        if not dataset:
            features = self.__generate_features(limit_length, evaluate)
            dataset = TaskDataset(features)
            # Only the master process persists the freshly built dataset.
            if not predict and local_rank in [-1, 0]:
                self._data_store.save_dataset(dataset, evaluate)
        return dataset
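The TaskDataset wrapper used above is not shown. A minimal sketch of what it could look like, assuming features is simply a list of pre-built feature objects (the class body here is illustrative, not the actual implementation):

from typing import Any, List

from torch.utils.data import Dataset


class TaskDataset(Dataset):
    # Thin wrapper that serves pre-computed features by index.
    def __init__(self, features: List[Any]):
        self.features = features

    def __len__(self) -> int:
        return len(self.features)

    def __getitem__(self, i) -> Any:
        return self.features[i]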
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    file_path: str,
    block_size: int,
    ctrl_code_tok: int,
    overwrite_cache=False,
    local_rank=-1,
):
    assert os.path.isfile(file_path)
    # Subtract 1 so we can prepend the CTRL code
    block_size = block_size - tokenizer.num_special_tokens_to_add(pair=False) - 1
    directory, filename = os.path.split(file_path)
    cached_features_file = os.path.join(
        directory,
        "cached_lm_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            filename,
        ),
    )
    with torch_distributed_zero_first(local_rank):
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            logger.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start,
            )
        else:
            logger.info(f"Creating features from dataset file at {directory}")
            self.examples = []
            with open(file_path, encoding="utf-8") as f:
                text = f.read()
            tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
            # Truncate in blocks of block_size
            for i in trange(0, len(tokenized_text) - block_size + 1, block_size):
                self.examples.append(
                    tokenizer.build_inputs_with_special_tokens(
                        [ctrl_code_tok] + tokenized_text[i: i + block_size]
                    )
                )
            # Note that we are losing the last truncated example here for the sake of simplicity (no padding).
            # If your dataset is small, first you should look for a bigger one :-) and second you
            # can change this behavior by adding (model-specific) padding.
            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start,
            )
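To be consumable by a PyTorch DataLoader, a language-modeling dataset like the one above typically also exposes __len__ and __getitem__ over self.examples. A minimal sketch of the two accessors that would sit on the same class (they are not part of the snippet above):

import torch

# These methods would live on the same Dataset class as the __init__ above.
def __len__(self):
    return len(self.examples)

def __getitem__(self, i) -> torch.Tensor:
    # Each cached example is a list of token ids with special tokens already added.
    return torch.tensor(self.examples[i], dtype=torch.long)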
def __init__(
    self,
    tokenizer: PreTrainedTokenizer,
    split: str,
    file_path: str,
    block_size: int,
    overwrite_cache=False,
    local_rank=-1,
):
    # The block size is fixed to 4096 regardless of the value passed in
    block_size = 4096
    self.block_size = 4096
    directory = './processed_files'
    cached_features_file = os.path.join(
        directory,
        "cached_lm_{}_{}_{}".format(
            tokenizer.__class__.__name__,
            str(block_size),
            split,
        ),
    )
    with torch_distributed_zero_first(local_rank):
        if os.path.exists(cached_features_file) and not overwrite_cache:
            start = time.time()
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            # Keep only examples whose source and target are exactly block_size long
            self.examples = [
                ex for ex in self.examples
                if len(ex[0]) == self.block_size and len(ex[1]) == self.block_size
            ]
            logger.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start,
            )
        else:
            input_path = './multinews/train.txt.src'
            logger.info(f"Creating features from dataset file at {input_path}")
            encoder_func = select_words_to_mask_special_tokens_only_multiple_docs
            self.examples = []
            self.masking_samples = []
            corpus = read_in_train_set(input_path)
            # Split each Multi-News sample into its constituent articles
            ln = []
            for i in range(len(corpus)):
                sample = corpus[i].strip()
                articles = sample.split("story_separator_special_tag")
                ln.append(articles)
            tokenizer.add_tokens(['<doc-s>'], special_tokens=True)
            tokenizer.add_tokens(['</doc-s>'], special_tokens=True)
            stats = []
            # Keep sampling shuffled article clusters until 64 * 25 * 1000 examples are collected
            while len(stats) < 64 * 25 * 1000:
                for topic in ln:
                    if len(topic) > 2:
                        s = random.sample(topic, len(topic))
                        examp, st = encoder_func(s, tokenizer, block_size)
                        self.examples.append(examp)
                        stats.append(st)
            # Uncomment for creating data for the random baseline
            # while len(self.examples) < 64 * 25 * 1000:
            #     s = random.sample(ln, 10)
            #     curr_false_topic = []
            #     for topic in s:
            #         curr_false_topic.append(random.sample(topic, 1)[0])
            #     examp, st = encoder_func(curr_false_topic, tokenizer, block_size)
            #     self.examples.append(examp)
            #     stats.append(st)
            start = time.time()
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start,
            )
def __init__(
    self,
    args: DataArguments,
    tokenizer: PreTrainedTokenizer,
    processor: DataProcessor,
    limit_length: Optional[int] = None,
    evaluate=False,
    local_rank=-1,
):
    self.args = args
    # Load data features from cache or dataset file (no cache is used in prediction mode)
    cached_features_file = None
    if not args.predict:
        cached_features_file = os.path.join(
            processor.data_dir(),
            "cached_{}_{}_{}_{}".format(
                "dev" if evaluate else "train",
                tokenizer.__class__.__name__,
                str(args.max_seq_length),
                args.task_name,
            ),
        )
    with torch_distributed_zero_first(local_rank):
        # Make sure only the first process in distributed training processes the dataset,
        # and the others will use the cache.
        if not args.predict and os.path.exists(cached_features_file) and not args.overwrite_cache:
            start = time.time()
            self.features = torch.load(cached_features_file)
            log.info(
                "Loading features from cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start,
            )
        else:
            log.info("Creating features from dataset file")
            label_list = processor.get_labels()
            if (
                label_list
                and args.task_name in ["mnli", "mnli-mm"]
                and tokenizer.__class__ in (
                    RobertaTokenizer,
                    RobertaTokenizerFast,
                    XLMRobertaTokenizer,
                )
            ):
                # HACK: label indices are swapped in the RoBERTa pretrained model
                label_list[1], label_list[2] = label_list[2], label_list[1]
            examples = (
                processor.get_dev_examples() if evaluate else processor.get_train_examples()
            )
            if limit_length is not None:
                examples = examples[:limit_length]
            self.features = _glue_convert_examples_to_features(
                examples,
                tokenizer,
                processor,
                max_length=args.max_seq_length,
                label_list=label_list,
                output_mode=self.args.model_mode_for_data,
                progress_bar=args.progress_bar,
                evaluate=evaluate,
                num_print=self.args.show_feature_num,
            )
            if not args.predict and local_rank in [-1, 0]:
                log.info("Saving features into cached file %s", cached_features_file)
                start = time.time()
                # Saving seems to take a long time; worth investigating why and how to speed it up.
                torch.save(self.features, cached_features_file)
                log.info(
                    "Saved features into cached file %s [took %.3f s]",
                    cached_features_file,
                    time.time() - start,
                )