def seq2vec(sentence):
    """Encode ``sentence`` with a pre-trained BERT model and return one vector.

    The sentence is written to a temp file (the feature-extraction helpers
    read their input from a file), tokenized, and run through BERT.  The
    value returned is the hidden state of the *last* token at the last entry
    of ``layer_indexes`` (layer ``-4``), each component rounded to 6 decimal
    places.  Its length equals the model hidden size (768 for the
    ``uncased_L-12_H-768_A-12`` checkpoint) regardless of sentence length.

    NOTE(review): returning only the final token's vector matches the
    original code's intent per its inline comment, but confirm this is the
    desired sentence representation (vs. the [CLS] token or a pooled mean).

    Args:
        sentence: plain-text sentence to encode.

    Returns:
        list[float]: the selected hidden-layer vector, or ``None`` if the
        tokenizer produced no features.
    """
    input_file = '/tmp/input.txt'
    vocab_file = '/home/yanai-lab/sugiya-y/space/research/bert/pretrain/uncased_L-12_H-768_A-12/vocab.txt'
    bert_config_file = '/home/yanai-lab/sugiya-y/space/research/bert/pretrain/uncased_L-12_H-768_A-12/bert_config.json'
    init_checkpoint = '/home/yanai-lab/sugiya-y/space/research/bert/pretrain/uncased_L-12_H-768_A-12/pytorch_model.bin'
    layer_spec = '-1,-2,-3,-4'  # renamed from `layers`, which was shadowed by a dict below
    max_seq_length = 128
    do_lower_case = True
    batch_size = 8
    local_rank = -1
    no_cuda = False

    # The downstream helpers read their input from a file, so write it out.
    with open(input_file, 'w') as writer:
        writer.write(sentence)

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1

    layer_indexes = [int(x) for x in layer_spec.split(",")]

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    examples = read_examples(input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=max_seq_length,
                                            tokenizer=tokenizer)

    model = BertModel(bert_config)
    if init_checkpoint is not None:
        model.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=batch_size)

    model.eval()
    sentence_vector = None
    with torch.no_grad():  # inference only; no autograd graph needed
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            all_encoder_layers, _ = model(input_ids, token_type_ids=None,
                                          attention_mask=input_mask)
            # Hoist the GPU->CPU transfer out of the per-token loop: one
            # copy per requested layer instead of tokens * layers copies.
            layer_arrays = {idx: all_encoder_layers[int(idx)].detach().cpu().numpy()
                            for idx in layer_indexes}
            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                for i, _token in enumerate(feature.tokens):
                    for layer_index in layer_indexes:
                        # Only the last iteration's values (final token,
                        # final layer index) survive to the return — this
                        # mirrors the original control flow exactly.
                        sentence_vector = [
                            round(x.item(), 6)
                            for x in layer_arrays[layer_index][b][i]
                        ]
    # A hidden-size-long list (768 for this checkpoint), independent of
    # the sentence length.
    return sentence_vector
def main():
    """CLI entry point: extract BERT hidden-layer features for every example
    in ``--input_file`` and write them to ``--output_file`` as one JSON
    object per line (token -> per-layer value lists).
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument(
        "--vocab_file", default=None, type=str, required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_config_file", default=None, type=str, required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--init_checkpoint", default=None, type=str, required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")

    # Other parameters
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece "
        "tokenization. Sequences longer than this will be truncated, and "
        "sequences shorter than this will be padded.")
    # NOTE(review): default=True combined with action='store_true' means this
    # flag can never be turned off from the CLI; kept as-is for backward
    # compatibility with existing invocations.
    parser.add_argument(
        "--do_lower_case", default=True, action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    # Bug fix: args.no_cuda is read below but the argument was never
    # declared, so every invocation crashed with AttributeError.
    parser.add_argument("--no_cuda", default=False, action='store_true',
                        help="Run on CPU even if a GPU is available.")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='nccl')
    # Bug fix: logger.info was called with bare positional args and no
    # %-placeholders, which the logging module rejects at format time.
    logger.info("device: %s n_gpu: %d distributed training: %r",
                device, n_gpu, bool(args.local_rank != -1))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    bert_config = BertConfig.from_json_file(args.bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)
    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer)

    model = BertModel(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank],
            output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        with torch.no_grad():  # inference only; no autograd graph needed
            for input_ids, input_mask, example_indices in eval_dataloader:
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                all_encoder_layers, _ = model(input_ids, token_type_ids=None,
                                              attention_mask=input_mask)
                # Hoist the GPU->CPU transfer out of the per-token loop: one
                # copy per requested layer instead of tokens * layers copies.
                layer_arrays = [
                    all_encoder_layers[int(idx)].detach().cpu().numpy()
                    for idx in layer_indexes
                ]
                for b, example_index in enumerate(example_indices):
                    feature = features[example_index.item()]
                    output_json = collections.OrderedDict()
                    output_json["linex_index"] = int(feature.unique_id)
                    all_out_features = []
                    for i, token in enumerate(feature.tokens):
                        all_layers = []
                        for j, layer_index in enumerate(layer_indexes):
                            layer_record = collections.OrderedDict()
                            layer_record["index"] = layer_index
                            layer_record["values"] = [
                                round(x.item(), 6)
                                for x in layer_arrays[j][b][i]
                            ]
                            all_layers.append(layer_record)
                        out_features = collections.OrderedDict()
                        out_features["token"] = token
                        out_features["layers"] = all_layers
                        all_out_features.append(out_features)
                    output_json["features"] = all_out_features
                    writer.write(json.dumps(output_json) + "\n")
def main():
    """CLI entry point: extract BERT features for the ATEC siamese dataset
    and save train/dev feature files (``train.npz`` / ``dev.npz``) into
    ``--output_dir``.

    NOTE(review): this module defines ``main`` twice; this later definition
    shadows the earlier one at import time — consider renaming one of them.
    """
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir")
    parser.add_argument(
        "--vocab_file", default=None, type=str, required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_config_file", default=None, type=str, required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--init_checkpoint", default=None, type=str, required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument('--gpu_id', default=0, type=int, help='')
    # Bug fix: type=bool is a trap — bool("False") is True, so any value
    # on the command line (even "False" or "0") enabled CPU mode.  A real
    # string-to-bool converter keeps the `--no_cuda X` syntax working.
    parser.add_argument('--no_cuda', default=False,
                        type=lambda s: s.lower() in ('true', '1', 'yes'),
                        help='Run on CPU even if a GPU is available.')

    # Other parameters
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument(
        "--max_seq_length", default=128, type=int,
        help="The maximum total input sequence length after WordPiece "
        "tokenization. Sequences longer than this will be truncated, and "
        "sequences shorter than this will be padded.")
    # NOTE(review): default=True combined with action='store_true' means this
    # flag can never be turned off from the CLI; kept as-is for backward
    # compatibility with existing invocations.
    parser.add_argument(
        "--do_lower_case", default=True, action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument("--batch_size", default=32, type=int,
                        help="Batch size for predictions.")
    args = parser.parse_args()

    if not args.no_cuda:
        device = torch.device("cuda", args.gpu_id)
        n_gpu = 1  # single explicit GPU; deliberately not device_count()
    else:
        device = torch.device('cpu')
        n_gpu = 0
    logger.info("device {} n_gpu {}".format(device, n_gpu))

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    # Bug fix: the cache file lives inside output_dir, but the directory was
    # only created *after* the cache write below — a fresh output_dir made
    # the pickle dump fail.  Create it up front instead.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Tokenize + featurize once, then cache the DataLoaders to disk so
    # subsequent runs skip preprocessing.
    cache_path = os.path.join(args.output_dir, 'tmp_data.pkl')
    if not os.path.exists(cache_path):
        tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)
        processor = AtecProcessor()
        label_list = processor.get_labels()

        train_examples = processor.get_train_examples(args.data_dir)
        train_features = convert_examples_to_siamese_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Dataset info *****")
        logger.info(" Num examples = %d", len(train_examples))
        logger.info(" Batch size = %d", args.batch_size)
        train_dataloader = convert_siamese_features_to_dataset(
            train_features, args.batch_size)

        dev_examples = processor.get_dev_examples(args.data_dir)
        dev_features = convert_examples_to_siamese_features(
            dev_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Dataset info *****")
        logger.info(" Num examples = %d", len(dev_examples))
        logger.info(" Batch size = %d", args.batch_size)
        dev_dataloader = convert_siamese_features_to_dataset(
            dev_features, args.batch_size)

        with open(cache_path, 'wb') as f:
            pickle.dump([train_dataloader, dev_dataloader], f)
    else:
        logger.info("load data from cache file: {}".format(cache_path))
        # NOTE(review): pickle.load on a cache file this script wrote itself;
        # never point cache_path at untrusted data.
        with open(cache_path, 'rb') as f:
            train_dataloader, dev_dataloader = pickle.load(f)

    model = BertModel(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)
    model.eval()

    logger.info('extract train features.')
    bert_feature(os.path.join(args.output_dir, "train.npz"), model,
                 train_dataloader, device)
    logger.info('extract dev features.')
    bert_feature(os.path.join(args.output_dir, "dev.npz"), model,
                 dev_dataloader, device)