def load_and_cache_examples(args, tokenizer, output_examples=False):
    """Load SQuAD *training* features (from cache when possible) and build a TensorDataset.

    This variant always reads ``args.train_file`` and featurizes with
    ``is_training=True``; there is no ``evaluate`` mode.

    Args:
        args: namespace providing local_rank, train_file, model_name_or_path,
            max_seq_length, overwrite_cache, version_2_with_negative,
            doc_stride and max_query_length.
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.
        output_examples: when True, also return the raw examples and features;
            the feature cache is bypassed in that case so ``examples`` is
            always defined.

    Returns:
        ``TensorDataset`` or, when ``output_examples`` is True,
        ``(dataset, examples, features)``.
    """
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Load data features from cache or dataset file
    input_file = args.train_file
    # NOTE(review): the cache name is labelled 'dev' although this variant
    # always featurizes the training file — confirm it cannot collide with a
    # genuine dev cache written into the same directory.
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        # Only rank 0 (or single-process mode) writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    # Training-style dataset: gold start/end answer positions included.
    all_start_positions = torch.tensor([f.start_position for f in features],
                                       dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in features],
                                     dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_start_positions, all_end_positions,
                            all_cls_index, all_p_mask)
    if output_examples:
        return dataset, examples, features
    return dataset
def preprocess_data(self, path_to_stream, input_question):
    """Turn a raw document stream plus a question into model-ready inputs.

    Args:
        path_to_stream: document content to read SQuAD-style examples from.
        input_question: question text substituted into the examples.

    Returns:
        ``(examples, features, dataset)`` where ``dataset`` is an
        evaluation-style ``TensorDataset`` (no answer positions).
    """
    examples = read_squad_examples(input_stream=path_to_stream,
                                   is_training=False,
                                   version_2_with_negative=False,
                                   updated_question=input_question)
    features = convert_examples_to_features(examples=examples,
                                            tokenizer=self._tokenizer,
                                            max_seq_length=192,
                                            doc_stride=128,
                                            max_query_length=64,
                                            is_training=False)
    # Column-wise tensor construction; the TensorDataset column order is part
    # of the contract with downstream consumers.
    ids_t = torch.tensor([feat.input_ids for feat in features], dtype=torch.long)
    mask_t = torch.tensor([feat.input_mask for feat in features], dtype=torch.long)
    seg_t = torch.tensor([feat.segment_ids for feat in features], dtype=torch.long)
    cls_t = torch.tensor([feat.cls_index for feat in features], dtype=torch.long)
    pmask_t = torch.tensor([feat.p_mask for feat in features], dtype=torch.float)
    idx_t = torch.arange(ids_t.size(0), dtype=torch.long)
    dataset = TensorDataset(ids_t, mask_t, seg_t, idx_t, cls_t, pmask_t)
    return examples, features, dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features (cache-aware) and build a TensorDataset.

    Model-type-aware variant: special tokens and segment ids are chosen per
    ``args.model_type`` (BERT vs XLNet-style), and the training dataset also
    carries a per-feature ``answer_mask`` column.

    Returns:
        ``TensorDataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    # Cache name embeds the source file stem; non-BERT models get a suffix so
    # caches for different model types do not collide.
    if args.model_type == 'bert':
        cached_features_file = os.path.join(
            os.path.dirname(input_file), 'cached_fixbug_{}'.format(
                'dev' if evaluate else input_file.split('/')[1].split('.')[0]))
    else:
        cached_features_file = os.path.join(
            os.path.dirname(input_file), 'cached_fixbug_{}_{}'.format(
                'dev' if evaluate else input_file.split('/')[1].split('.')[0],
                args.model_type))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        # Hard-coded special tokens: '[CLS]'/'[SEP]'/pad 0 for BERT, RoBERTa-style
        # '<s>'/'</s>'/pad 1 otherwise; XLNet uses cls-at-end and its own
        # segment ids.
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            cls_token='[CLS]' if args.model_type == 'bert' else '<s>',
            sep_token='[SEP]' if args.model_type == 'bert' else '</s>',
            pad_token=0 if args.model_type == 'bert' else 1,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
            cls_token_at_end=True if args.model_type in ['xlnet'] else False,
            sequence_a_is_doc=True if args.model_type in ['xlnet'] else False)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        # Evaluation dataset: per-feature index so predictions can be mapped
        # back to examples.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        # Training dataset: gold spans plus an answer_mask column.
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
        all_answer_masks = torch.tensor([f.answer_mask for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_answer_masks, all_cls_index, all_p_mask)
    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load one dataset per input file ("meta" variant) and return a list of TensorDatasets.

    In training mode, ``args.train_file`` is a directory and every ``*.json``
    inside it becomes its own dataset; in evaluation mode only
    ``args.predict_file`` is used.

    NOTE(review): ``output_examples`` gates only the cache-bypass condition;
    the function always returns just the list of datasets, and only the
    examples of the *last* processed file survive the loop — confirm callers
    never rely on ``output_examples`` here.

    Returns:
        list of ``TensorDataset``, one per input file.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Load data features from cache or dataset file
    input_file = [args.predict_file] if evaluate else glob.glob(os.path.join(args.train_file, '*.json'))
    # Single cache file holds the list of per-file feature lists.
    cached_features_file = os.path.join(
        os.path.dirname(input_file[0]), 'cached_meta_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_list = torch.load(cached_features_file)
    else:
        features_list = []
        for infile in input_file:
            logger.info("Creating features from dataset file at %s", infile)
            examples = read_squad_examples(
                input_file=infile,
                is_training=not evaluate,
                version_2_with_negative=args.version_2_with_negative)
            # XLNet needs its own cls/pad segment ids and cls-at-end layout.
            features_list.append(convert_examples_to_features(
                examples=examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=not evaluate,
                cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
                pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0,
                cls_token_at_end=True if args.model_type in ['xlnet'] else False,
                sequence_a_is_doc=True if args.model_type in ['xlnet'] else False))
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features_list, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    def build_dataset(features):
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
        all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
        if evaluate:
            # Evaluation: index column maps predictions back to features.
            all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_example_index, all_cls_index, all_p_mask)
        else:
            # Training: gold start/end answer positions.
            all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
            all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                    all_start_positions, all_end_positions,
                                    all_cls_index, all_p_mask)
        return dataset

    datasets = [build_dataset(features) for features in features_list]
    return datasets
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Build (and optionally cache) a SQuAD ``TensorDataset``.

    Non-distributed variant: no barriers, every process does its own work.
    Features are loaded from a per-file cache unless ``output_examples`` is
    requested or ``args.overwrite_cache`` is set.

    Returns:
        ``TensorDataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    source = args.predict_file if evaluate else args.train_file
    model_tag = [part for part in args.model_name_or_path.split('/') if part][-1]
    cached_features_file = os.path.join(
        os.path.dirname(source),
        'cached_{}_{}_{}'.format('dev' if evaluate else 'train', model_tag,
                                 str(args.max_seq_length)))

    cache_usable = (os.path.exists(cached_features_file)
                    and not args.overwrite_cache
                    and not output_examples)
    if cache_usable:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", source)
        examples = read_squad_examples(
            input_file=source,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    # Assemble one tensor per feature column.
    def column(getter, dtype):
        return torch.tensor([getter(f) for f in features], dtype=dtype)

    input_ids = column(lambda f: f.input_ids, torch.long)
    input_mask = column(lambda f: f.input_mask, torch.long)
    segment_ids = column(lambda f: f.segment_ids, torch.long)
    cls_index = column(lambda f: f.cls_index, torch.long)
    p_mask = column(lambda f: f.p_mask, torch.float)

    if evaluate:
        # Evaluation: carry a feature index so results map back to examples.
        example_index = torch.arange(input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(input_ids, input_mask, segment_ids,
                                example_index, cls_index, p_mask)
    else:
        # Training: include the gold answer span positions.
        start_positions = column(lambda f: f.start_position, torch.long)
        end_positions = column(lambda f: f.end_position, torch.long)
        dataset = TensorDataset(input_ids, input_mask, segment_ids,
                                start_positions, end_positions,
                                cls_index, p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Featurize ``args.predict_file`` and build an evaluation TensorDataset.

    Caching is disabled in this variant: features are always rebuilt from the
    prediction file.

    Returns:
        ``(dataset, examples, features)`` — always all three, regardless of
        ``output_examples``.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # In distributed training only rank 0 preprocesses; other ranks wait.
        torch.distributed.barrier()

    input_file = args.predict_file
    # Computed for parity with the caching variants (and it surfaces a missing
    # args.save_dir early) even though no cache is read or written here.
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        'cached_{}_{}_{}'.format(
            'test',
            list(filter(None, args.save_dir.split('/'))).pop(),
            str(args.max_seq_length)))

    logger.info("Creating features from dataset file at %s", input_file)
    examples = read_squad_examples(
        input_file=input_file,
        is_training=not evaluate,
        version_2_with_negative=args.version_2_with_negative)
    features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate)

    # One tensor per feature column, evaluation layout.
    t_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    t_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    t_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    t_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    t_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    t_example_index = torch.arange(t_input_ids.size(0), dtype=torch.long)

    dataset = TensorDataset(t_input_ids, t_input_mask, t_segment_ids,
                            t_example_index, t_cls_index, t_p_mask)
    return dataset, examples, features
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Convert in-memory ``args.input_data`` into a ``TensorDataset`` (no caching).

    Returns:
        ``TensorDataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    examples = read_squad_examples(
        input_data=args.input_data,
        version_2_with_negative=args.version_2_with_negative)
    features = convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length)

    # Convert each feature column into a tensor.
    ids = torch.tensor([feat.input_ids for feat in features], dtype=torch.long)
    mask = torch.tensor([feat.input_mask for feat in features], dtype=torch.long)
    segments = torch.tensor([feat.segment_ids for feat in features], dtype=torch.long)
    cls_idx = torch.tensor([feat.cls_index for feat in features], dtype=torch.long)
    pmask = torch.tensor([feat.p_mask for feat in features], dtype=torch.float)

    if evaluate:
        # Evaluation layout: feature index instead of answer spans.
        ex_idx = torch.arange(ids.size(0), dtype=torch.long)
        dataset = TensorDataset(ids, mask, segments, ex_idx, cls_idx, pmask)
    else:
        # Training layout: gold start/end positions.
        starts = torch.tensor([feat.start_position for feat in features], dtype=torch.long)
        ends = torch.tensor([feat.end_position for feat in features], dtype=torch.long)
        dataset = TensorDataset(ids, mask, segments, starts, ends, cls_idx, pmask)

    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(input_file, tokenizer):
    """Read ``input_file`` (SQuAD-format JSON), featurize it, and save the features.

    Relies on module-level globals: ``model_name``, ``max_seq_length``,
    ``doc_stride`` and ``max_query_length``.

    Args:
        input_file: path to the SQuAD-format JSON file.
        tokenizer: tokenizer used to featurize the examples.

    Returns:
        ``(dataset, examples, features)`` — an evaluation-style
        ``TensorDataset`` plus the raw examples and features it was built from.
    """
    cached_features_file = 'Ecached_dev_{}_{}'.format(model_name, str(max_seq_length))
    # Fix: the original passed the value as a second positional argument to
    # print(), which printed the literal "%s"; interpolate it instead.
    print("Creating features from dataset file at %s" % input_file)
    examples = read_squad_examples(input_file=input_file,
                                   is_training=False,
                                   version_2_with_negative=False)
    features = convert_examples_to_features(examples=examples,
                                            tokenizer=tokenizer,
                                            max_seq_length=max_seq_length,
                                            doc_stride=doc_stride,
                                            max_query_length=max_query_length,
                                            is_training=False)
    print("Saving features into cached file %s" % cached_features_file)
    torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_example_index, all_cls_index, all_p_mask)
    return dataset, examples, features
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features into a TensorDataset, caching under data/pytorch_transformers_cache/.

    Supports optional down-sampling of examples via ``args.sample_size`` and
    an ``args.no_distributed_training`` switch that disables the distributed
    barriers. Cache *saving* is currently disabled (see TODO below), so the
    cache is read-only unless it was produced elsewhere.

    Returns:
        ``TensorDataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    if args.local_rank not in [-1, 0] and not evaluate and not args.no_distributed_training:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Load data features from cache or dataset file
    # ALON adding a different path for the feature cache...
    if not os.path.exists('data/'):
        os.mkdir('data/')
    if not os.path.exists('data/pytorch_transformers_cache'):
        os.mkdir('data/pytorch_transformers_cache')
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        'data/pytorch_transformers_cache/', 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        # ALON support sampling the input and the evaluation
        if args.sample_size != -1:
            examples = random.sample(examples, k=args.sample_size)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)
        # TODO features should be recalc when data changed
        #if args.local_rank in [-1, 0] or args.no_distributed_training:
        #    logger.info("Saving features into cached file %s", cached_features_file)
        #    torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate and not args.no_distributed_training:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        # Evaluation layout: per-feature index for mapping predictions back.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        # Training layout: gold start/end answer positions.
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)
    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features (cache-aware) into a minimal TensorDataset.

    Slim variant: the dataset carries only input_ids / input_mask /
    segment_ids (plus example index or answer spans) — no cls_index or p_mask
    columns. Note this variant's ``read_squad_examples`` takes
    ``do_lower_case`` rather than ``version_2_with_negative``.

    Returns:
        ``TensorDataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()
    examples = None
    features = None
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file),
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )
    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(input_file=input_file,
                                       is_training=not evaluate,
                                       do_lower_case=args.do_lower_case)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
        )
        # Only rank 0 (or single-process mode) writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    if evaluate:
        # Evaluation layout: feature index for mapping predictions back.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(
            all_input_ids,
            all_input_mask,
            all_segment_ids,
            all_example_index,
        )
    else:
        # Training layout: gold start/end answer positions.
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(
            all_input_ids,
            all_input_mask,
            all_segment_ids,
            all_start_positions,
            all_end_positions,
        )
    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """read_squad_data + convert_to_feature + convert_to_TensorDataset.

    Arguments:
        args: run configuration (paths, cache flags, featurization sizes).
        tokenizer: tokenizer forwarded to ``convert_examples_to_features``.

    Keyword Arguments:
        evaluate {bool} -- use ``predict_file``/eval layout instead of
            ``train_file``/train layout (default: {False})
        output_examples {bool} -- whether to also return ``examples`` and
            ``features`` alongside the dataset (default: {False})

    Returns:
        ``TensorDataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file
                      ) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)
        # Only rank 0 (or single-process mode) writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features],
                                 dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        # Evaluation layout: feature index for mapping predictions back.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        # Training layout: gold start/end answer positions.
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)
    if output_examples:
        return dataset, examples, features
    return dataset
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features into a TensorDataset, with optional sharded caching.

    When ``args.num_shards != 1`` the cache is split across per-shard files
    and this call loads all shards back into one feature list.

    Returns:
        ``TensorDataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    # Use the true distributed rank (not just local_rank) to decide who
    # preprocesses; -1 means single-process mode.
    global_rank = -1 if args.local_rank == -1 else torch.distributed.get_rank()
    if global_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Load data features from cache or dataset file
    input_file = args.predict_file if evaluate else args.train_file
    cache_filename = 'cached.split={}.mn={}.msl={}.mql={}'.format(
        '.'.join(input_file.split('/')[-1].split('.')[:-1]),
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        str(args.max_seq_length), str(args.max_query_length))
    if args.num_shards != 1:
        cache_filename += '.num_shards={}.shard_no={}'.format(args.num_shards, args.shard_no)
    cached_features_file = os.path.join(os.path.dirname(input_file), cache_filename)
    print('Cached features file {} exists: {}'.format(
        cached_features_file, str(os.path.exists(cached_features_file))))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        if args.num_shards == 1:
            logger.info("Loading features from cached file %s", cached_features_file)
            features = torch.load(cached_features_file)
        else:
            logger.info("Loading features from cached files %s", cached_features_file)
            features = []
            # NOTE(review): this swaps only the *last character* of the path
            # for the shard number, which is wrong whenever args.shard_no has
            # more than one digit — confirm shard counts stay below 10 or fix
            # the suffix substitution.
            for shard_no in tqdm(range(args.num_shards)):
                features += torch.load(cached_features_file[:-1] + str(shard_no))
        # Examples are only needed by the caller when output_examples is set.
        if output_examples:
            logger.info("Reading examples from file %s", input_file)
            examples = read_squad_examples(
                input_file=input_file,
                is_training=not evaluate,
                version_2_with_negative=args.version_2_with_negative)
    else:
        logger.info("Reading examples from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        logger.info("Creating features from dataset file at %s", input_file)
        # Special tokens come from the tokenizer; XLNet gets cls-at-end and
        # its own cls/pad segment ids.
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            num_shards=args.num_shards,
            shard_no=args.shard_no)
        if global_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)
    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        # Evaluation layout: feature index for mapping predictions back.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        # Training layout: gold start/end answer positions.
        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)
    if output_examples:
        return dataset, examples, features
    return dataset
def do_prediction(model_dir):
    """Run QA inference on the hard-coded SFU dev file and write prediction JSONs.

    Side effects: reads ``data/sfu.json``, loads a model and tokenizer from
    ``model_dir``, and writes predictions_sfu.json / nbest_predictions_sfu.json
    / null_odds_sfu.json into ``model_dir``.

    Args:
        model_dir: directory containing the fine-tuned model and tokenizer.
    """
    # 1. Load a trained model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = BertForQuestionAnswering.from_pretrained(model_dir)
    model.to(device)
    model.eval()
    # 2. Load and pre-process the test set
    dev_file = "data/sfu.json"
    predict_batch_size = 2
    max_seq_length = 384
    eval_examples = read_squad_examples(input_file=dev_file,
                                        is_training=False,
                                        version_2_with_negative=False)
    tokenizer = BertTokenizer.from_pretrained(model_dir)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=False)
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    # Feature index lets each logit pair be mapped back to its feature below.
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=predict_batch_size)
    # 3. Run inference on the test set
    all_results = []
    for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        with torch.no_grad():
            batch_start_logits, batch_end_logits = model(input_ids, input_mask, segment_ids)
        for i, example_index in enumerate(example_indices):
            start_logits = batch_start_logits[i].detach().cpu().tolist()
            end_logits = batch_end_logits[i].detach().cpu().tolist()
            eval_feature = eval_features[example_index.item()]
            unique_id = int(eval_feature.unique_id)
            all_results.append(RawResult(unique_id=unique_id,
                                         start_logits=start_logits,
                                         end_logits=end_logits))
    output_prediction_file = os.path.join(model_dir, "predictions_sfu.json")
    output_nbest_file = os.path.join(model_dir, "nbest_predictions_sfu.json")
    output_null_log_odds_file = os.path.join(model_dir, "null_odds_sfu.json")
    # NOTE(review): the returned predictions are assigned but never used or
    # returned; write_predictions is kept for its file-writing side effects.
    preds = write_predictions(eval_examples, eval_features, all_results, 20,
                              30, True, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              True, False, 0.0)
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features from the on-disk cache, or build and cache them.

    NOTE(review): another ``load_and_cache_examples`` definition appears later
    in this file and will shadow this one at import time — confirm which
    variant is actually intended to be live.

    Args:
        args: namespace providing train_file/predict_file, local_rank,
            model_name_or_path, model_type, max_seq_length, doc_stride,
            max_query_length, version_2_with_negative, overwrite_cache.
        tokenizer: tokenizer forwarded to feature conversion.
        evaluate: when True, read the predict file and build an eval dataset
            (no answer positions); when False, build the training dataset.
        output_examples: when True, also return raw examples and features
            (forces feature re-creation — the cache branch is skipped so that
            ``examples`` is defined).

    Returns:
        ``dataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training processes
        # the dataset; the others wait here and will use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file.
    input_file = args.predict_file if evaluate else args.train_file
    # Cache name encodes split, model name (last path component), and seq len.
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            cls_token_at_end=bool(args.model_type in ['xlnet']),  # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            sep_token=tokenizer.sep_token,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            # xlnet uses different segment ids for cls/pad tokens.
            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
            sep_token_extra=bool(args.model_type in ['roberta']),
            add_prefix_space=bool(args.model_type in ['roberta']))
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # First process releases the other ranks now that the cache exists.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        # Eval dataset carries example indices instead of answer positions so
        # predictions can be mapped back to examples.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        all_is_impossible = torch.tensor(
            [1.0 if f.is_impossible else 0.0 for f in features], dtype=torch.float)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_is_impossible, all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset
def main():
    """Entry point: train (optionally with knowledge distillation and
    quantization/binarization) or evaluate a BERT question-answering student
    model on SQuAD v1.1 / v2.0.

    Flow: parse args -> set up logging/device/seeds -> build teacher/student
    configs -> load (and pickle-cache) train features -> load eval features ->
    optionally load teacher and split/binarize the student checkpoint ->
    run the KDLearner in the mode selected by --kd_type.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--job_id", default='tmp', type=str, help='Jobid to save training logs')
    parser.add_argument("--data_dir", default=None, type=str, help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--teacher_model", default=None, type=str, help="The teacher model dir.")
    parser.add_argument("--student_model", default=None, type=str, help="The student model dir.")
    parser.add_argument("--output_dir", default='output', type=str, help="The output directory where the model predictions and checkpoints will be written.")

    # default params for SQuAD
    parser.add_argument('--version_2_with_negative', action='store_true')
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--doc_stride", default=128, type=int,
                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
    parser.add_argument("--n_best_size", default=20, type=int,
                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
                             "output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
                        help="If null_score - best_non_null is greater than the threshold predict null.")
    parser.add_argument("--batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument('--weight_decay', '--wd', default=1e-4, type=float, metavar='W', help='weight decay')
    parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.")
    parser.add_argument('--eval_step', type=int, default=200)
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--do_eval', default=0, type=int)

    # distillation params
    parser.add_argument('--aug_train', action='store_true', help="Whether using data augmentation or not")
    parser.add_argument('--kd_type', default='no_kd', choices=['no_kd', 'two_stage', 'logit_kd', 'joint_kd'],
                        help="choose one of the kd type")
    parser.add_argument('--distill_logit', action='store_true', help="Whether using distillation over logits or not")
    parser.add_argument('--distill_rep_attn', action='store_true', help="Whether using distillation over reps and attns or not")
    parser.add_argument('--temperature', type=float, default=1.)

    # quantization params
    parser.add_argument("--weight_bits", default=32, type=int, help="number of bits for weight")
    parser.add_argument("--weight_quant_method", default='twn', type=str, choices=['twn', 'bwn', 'uniform', 'laq'],
                        help="weight quantization methods, can be bwn, twn, laq")
    parser.add_argument("--input_bits", default=32, type=int, help="number of bits for activation")
    parser.add_argument("--input_quant_method", default='uniform', type=str, choices=['uniform', 'lsq'],
                        help="weight quantization methods, can be bwn, twn, or symmetric quantization for default")
    parser.add_argument('--learnable_scaling', action='store_true', default=True)
    parser.add_argument("--ACT2FN", default='gelu', type=str,
                        help='activation fn for ffn-mid. A8 uses uq + gelu; A4 uses lsq + relu.')

    # training config
    parser.add_argument('--sym_quant_ffn_attn', action='store_true',
                        help='whether use sym quant for attn score and ffn after act')  # default asym
    parser.add_argument('--sym_quant_qkvo', action='store_true', default=True,
                        help='whether use asym quant for Q/K/V and others.')  # default sym

    # layerwise quantization config
    parser.add_argument('--clip_init_file', default='threshold_std.pkl', help='files to restore init clip values.')
    parser.add_argument('--clip_init_val', default=2.5, type=float, help='init value of clip_vals, default to (-2.5, +2.5).')
    parser.add_argument('--clip_lr', default=1e-4, type=float, help='Use a seperate lr for clip_vals / stepsize')
    parser.add_argument('--clip_wd', default=0.0, type=float, help='weight decay for clip_vals / stepsize')

    # layerwise quantization config
    parser.add_argument('--embed_layerwise', default=False, type=lambda x: bool(int(x)))
    parser.add_argument('--weight_layerwise', default=True, type=lambda x: bool(int(x)))
    parser.add_argument('--input_layerwise', default=True, type=lambda x: bool(int(x)))

    ### spliting
    parser.add_argument('--split', action='store_true',
                        help='whether to conduct tws spliting. NOTE this is only for training binarybert')
    parser.add_argument('--is_binarybert', action='store_true', help='whether to use binarybert modelling.')

    args = parser.parse_args()

    log_dir = os.path.join(args.output_dir, 'record_%s.log' % args.job_id)
    init_logging(log_dir)
    print_args(vars(args))

    # Prepare devices
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
    logging.info("device: {} n_gpu: {}".format(device, n_gpu))

    # Prepare seed (python, numpy, torch CPU and all GPUs).
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.teacher_model, do_lower_case=True)
    config = BertConfig.from_pretrained(args.teacher_model)
    config.num_labels = 2

    # Student config = teacher config + quantization/binarization settings.
    student_config = copy.deepcopy(config)
    student_config.weight_bits = args.weight_bits
    student_config.input_bits = args.input_bits
    student_config.weight_quant_method = args.weight_quant_method
    student_config.input_quant_method = args.input_quant_method
    student_config.clip_init_val = args.clip_init_val
    student_config.learnable_scaling = args.learnable_scaling
    student_config.sym_quant_qkvo = args.sym_quant_qkvo
    student_config.sym_quant_ffn_attn = args.sym_quant_ffn_attn
    student_config.embed_layerwise = args.embed_layerwise
    student_config.weight_layerwise = args.weight_layerwise
    student_config.input_layerwise = args.input_layerwise
    student_config.hidden_act = args.ACT2FN

    logging.info("***** Training data *****")
    input_file = 'train-v2.0.json' if args.version_2_with_negative else 'train-v1.1.json'
    input_file = os.path.join(args.data_dir, input_file)
    # Pickle-cache converted training features next to the raw json.
    if os.path.exists(input_file + '.features.pkl'):
        logging.info(" loading from cache %s", input_file + '.features.pkl')
        train_features = pickle.load(open(input_file + '.features.pkl', 'rb'))
    else:
        _, train_examples = read_squad_examples(
            input_file=input_file, is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        pickle.dump(train_features, open(input_file + '.features.pkl', 'wb'))

    args.batch_size = args.batch_size // args.gradient_accumulation_steps
    # NOTE(review): batch_size was already divided by gradient_accumulation_steps
    # on the line above; dividing by it again here may undercount the total
    # number of optimization steps — confirm intent.
    num_train_optimization_steps = int(
        len(train_features) / args.batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    logging.info(" Num examples = %d", len(train_features))
    logging.info(" Num total steps = %d", num_train_optimization_steps)

    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                               all_start_positions, all_end_positions)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.batch_size)

    logging.info("***** Evaluation data *****")
    input_file = 'dev-v2.0.json' if args.version_2_with_negative else 'dev-v1.1.json'
    args.dev_file = os.path.join(args.data_dir, input_file)
    # NOTE(review): this call unpacks two values while the training call above
    # discards the first one — the helper appears to return (dataset, examples).
    dev_dataset, eval_examples = read_squad_examples(
        input_file=args.dev_file, is_training=False,
        version_2_with_negative=args.version_2_with_negative)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)
    logging.info(" Num examples = %d", len(eval_features))
    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)

    # Teacher is only needed for training (distillation), not for evaluation.
    # NOTE(review): when --do_eval is set, teacher_model is never assigned but
    # is referenced in the KDLearner call below — confirm this path is reachable.
    if not args.do_eval:
        from transformer.modeling_dynabert import BertForQuestionAnswering
        teacher_model = BertForQuestionAnswering.from_pretrained(args.teacher_model, config=config)
        teacher_model.to(device)
        if n_gpu > 1:
            teacher_model = torch.nn.DataParallel(teacher_model)

    if args.split:
        # rename the checkpoint to restore: ternary-weight-split (tws) turns a
        # quantized checkpoint into the initialization for a binary model.
        split_model_dir = os.path.join(args.output_dir, 'binary_model_init')
        if not os.path.exists(split_model_dir):
            os.mkdir(split_model_dir)
        # copy the json file, avoid over-writing
        source_model_dir = os.path.join(args.student_model, CONFIG_NAME)
        target_model_dir = os.path.join(split_model_dir, CONFIG_NAME)
        os.system('cp -v %s %s' % (source_model_dir, target_model_dir))
        # create the split model ckpt
        source_model_dir = os.path.join(args.student_model, WEIGHTS_NAME)
        target_model_dir = os.path.join(split_model_dir, WEIGHTS_NAME)
        target_model_dir = tws_split(source_model_dir, target_model_dir)
        args.student_model = split_model_dir  # over-write student_model dir
        print("transformed binary model stored at: {}".format(target_model_dir))

    # Pick the student modelling variant (binary vs quantized).
    if args.is_binarybert:
        from transformer.modeling_dynabert_binary import BertForQuestionAnswering
        student_model = BertForQuestionAnswering.from_pretrained(args.student_model, config=student_config)
    else:
        from transformer.modeling_dynabert_quant import BertForQuestionAnswering
        student_model = BertForQuestionAnswering.from_pretrained(args.student_model, config=student_config)
    student_model.to(device)
    if n_gpu > 1:
        student_model = torch.nn.DataParallel(student_model)

    learner = KDLearner(args, device, student_model, teacher_model, num_train_optimization_steps)

    if args.do_eval:
        """ evaluation """
        learner.eval(student_model, eval_dataloader, eval_features, eval_examples, dev_dataset)
        return 0

    """ perform training """
    if args.kd_type == 'joint_kd':
        # Distill logits and intermediate reps/attns simultaneously.
        learner.args.distill_logit = True
        learner.args.distill_rep_attn = True
        learner.build()
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)
    elif args.kd_type == 'logit_kd':
        # only perform the logits kd
        learner.args.distill_logit = True
        learner.args.distill_rep_attn = False
        learner.build(lr=args.learning_rate)
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)
    elif args.kd_type == 'two_stage':
        # stage 1: intermediate layer distillation
        learner.args.distill_logit = False
        learner.args.distill_rep_attn = True
        learner.build(lr=2.5 * args.learning_rate)
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)
        # stage 2: prediction layer distillation, resuming from stage-1 weights
        learner.student_model.load_state_dict(
            torch.load(os.path.join(learner.output_dir, WEIGHTS_NAME)))
        learner.args.distill_logit = True
        learner.args.distill_rep_attn = False
        learner.build(lr=args.learning_rate)  # prepare the optimizer again.
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)
    else:
        assert args.kd_type == 'no_kd'
        # NO kd training, vanilla cross entropy with hard label
        learner.build(lr=args.learning_rate)  # prepare the optimizer again.
        learner.train(train_dataloader, eval_dataloader, eval_features, eval_examples, dev_dataset)

    del learner
    return 0
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Build a TensorDataset of SQuAD features, optionally mixed with eBay data.

    NOTE(review): this definition shadows the earlier ``load_and_cache_examples``
    in this file — confirm this is the intended live variant.

    Leftover debug ``print`` calls were converted to logger calls and
    commented-out dead code was removed; the control flow and all cache/barrier
    semantics are unchanged.

    Args:
        args: namespace providing train_file/predict_file, ebay_file,
            local_rank, model_name_or_path, max_seq_length, doc_stride,
            max_query_length, version_2_with_negative, overwrite_cache.
        tokenizer: tokenizer forwarded to feature conversion.
        evaluate: when True, read the predict file and skip answer positions;
            eBay data is never mixed into the eval set.
        output_examples: when True, also return raw examples and features
            (forces feature re-creation so ``examples`` is defined).

    Returns:
        ``dataset`` or ``(dataset, examples, features)`` when
        ``output_examples`` is True.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Only the first process in distributed training builds the features;
        # the others wait here and read the cache afterwards.
        torch.distributed.barrier()

    # Load data features from cache or dataset file.
    input_file = args.predict_file if evaluate else args.train_file
    logger.info("input_file: %s", input_file)
    # Extra eBay training data is only mixed in for training, never for eval.
    ebay_input_file = None if evaluate else args.ebay_file
    cached_features_file = os.path.join(
        os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
            'dev' if evaluate else 'train',
            list(filter(None, args.model_name_or_path.split('/'))).pop(),
            str(args.max_seq_length)))
    logger.info("cached_features_file: %s", cached_features_file)
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", input_file)
        examples = read_squad_examples(
            input_file=input_file,
            is_training=not evaluate,
            version_2_with_negative=args.version_2_with_negative)
        if ebay_input_file:
            ebay_examples = read_ebay_examples(
                input_file=ebay_input_file,
                is_training=not evaluate,
                version_2_with_negative=args.version_2_with_negative)
            examples.extend(ebay_examples)
        logger.info("done reading examples")
        features = convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate)
        logger.info("done featurizing")
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # First process releases the other ranks now that the cache exists.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset.
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    if evaluate:
        # Eval dataset carries example indices instead of answer positions so
        # predictions can be mapped back to examples.
        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_example_index, all_cls_index, all_p_mask)
    else:
        all_start_positions = torch.tensor(
            [f.start_position for f in features], dtype=torch.long)
        all_end_positions = torch.tensor([f.end_position for f in features],
                                         dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_start_positions, all_end_positions,
                                all_cls_index, all_p_mask)

    if output_examples:
        return dataset, examples, features
    return dataset