def create_model(self):
    input_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
                                         self.vocab_size)

    input_mask = None
    if self.use_input_mask:
        input_mask = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
                                              vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = BertModelTest.ids_tensor([self.batch_size, self.seq_length],
                                                  self.type_vocab_size)

    config = modeling.BertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range)

    model = modeling.BertModel(config=config)

    all_encoder_layers, pooled_output = model(input_ids, token_type_ids, input_mask)

    outputs = {
        "sequence_output": all_encoder_layers[-1],
        "pooled_output": pooled_output,
        "all_encoder_layers": all_encoder_layers,
    }
    return outputs
def bert_train_fn():
    is_training = True
    hidden_size = 768
    num_labels = 10
    batch_size = 128
    max_seq_length = 512
    use_one_hot_embeddings = False
    bert_config = modeling.BertConfig(vocab_size=21128, hidden_size=hidden_size,
                                      num_hidden_layers=12, num_attention_heads=12,
                                      intermediate_size=3072)

    input_ids = tf.placeholder(tf.int32, [batch_size, max_seq_length], name="input_ids")
    input_mask = tf.placeholder(tf.int32, [batch_size, max_seq_length], name="input_mask")
    segment_ids = tf.placeholder(tf.int32, [batch_size, max_seq_length], name="segment_ids")
    label_ids = tf.placeholder(tf.float32, [batch_size, num_labels], name="label_ids")

    loss, per_example_loss, logits, probabilities, model = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, label_ids,
        num_labels, use_one_hot_embeddings)

    # 1. generate or load training/validation/test data, e.g. train: (X, y),
    #    where X is input_ids and y is labels.
    # 2. train the model by calling create_model and minimising the returned loss
    #    (a sketch of create_model follows this function).
    gpu_config = tf.ConfigProto()
    gpu_config.gpu_options.allow_growth = True
    sess = tf.Session(config=gpu_config)
    sess.run(tf.global_variables_initializer())
    for i in range(1000):
        # dummy inputs; replace with real training data
        input_ids_ = np.ones((batch_size, max_seq_length), dtype=np.int32)
        input_mask_ = np.ones((batch_size, max_seq_length), dtype=np.int32)
        segment_ids_ = np.ones((batch_size, max_seq_length), dtype=np.int32)
        label_ids_ = np.ones((batch_size, num_labels), dtype=np.float32)
        feed_dict = {input_ids: input_ids_, input_mask: input_mask_,
                     segment_ids: segment_ids_, label_ids: label_ids_}
        loss_ = sess.run([loss], feed_dict)
        print("loss:", loss_)
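# The function above calls `create_model` without defining it. Below is a minimal
# sketch of what it might look like, following the pattern of BERT's
# run_classifier.py: a classification layer on the pooled [CLS] output with a
# sigmoid/multi-label loss to match the float label_ids placeholder above.
# The variable names and the choice of loss are assumptions, not taken from the
# original code.
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids,
                 labels, num_labels, use_one_hot_embeddings):
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pooled = model.get_pooled_output()  # [batch_size, hidden_size]
    hidden_size = pooled.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    logits = tf.nn.bias_add(tf.matmul(pooled, output_weights, transpose_b=True),
                            output_bias)
    probabilities = tf.nn.sigmoid(logits)  # multi-label probabilities
    per_example_loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits), axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return loss, per_example_loss, logits, probabilities, model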
def bert_model(self):
    # Derive the input mask from the ids: positions with a non-zero (non-padding)
    # id count towards the real sequence length.
    real_len = tf.reduce_sum(
        tf.cast(tf.not_equal(tf.to_int32(0), self._input_ids), tf.int32), axis=1)
    input_mask = tf.cast(tf.sequence_mask(real_len, self._max_seq_length), tf.int32)
    base_model = modeling.BertModel(
        config=modeling.BertConfig(vocab_size=self._vocab_size),
        is_training=self._is_training,
        input_ids=self._input_ids,
        input_mask=input_mask,
        token_type_ids=tf.zeros_like(self._input_ids, tf.int32),
        use_one_hot_embeddings=False)
    output_layer = base_model.get_pooled_output()
    self._inference(output_layer)
    self._build_train_op()
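# `_inference` and `_build_train_op` are not defined in the snippet above. A
# minimal sketch of how such methods are often written is shown here: a dense
# softmax classifier over the pooled output plus a plain Adam training op.
# `self._num_classes`, `self._labels` and `self._learning_rate` are assumed
# attributes and not taken from the original class.
def _inference(self, output_layer):
    logits = tf.layers.dense(output_layer, self._num_classes, name="classifier")
    self._probabilities = tf.nn.softmax(logits, axis=-1)
    self._loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self._labels,
                                                       logits=logits))

def _build_train_op(self):
    self._train_op = tf.train.AdamOptimizer(self._learning_rate).minimize(self._loss)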
def create_model(self):
    input_ids = BertModelTest.ids_tensor(
        [self.batch_size, self.seq_length], self.vocab_size)
    dist_ids = BertModelTest.dist_tensor(
        [self.batch_size, self.seq_length])

    input_mask = None
    if self.use_input_mask:
        input_mask = BertModelTest.ids_tensor(
            [self.batch_size, self.seq_length], vocab_size=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = BertModelTest.ids_tensor(
            [self.batch_size, self.seq_length], self.type_vocab_size)

    config = modeling.BertConfig(
        vocab_size=self.vocab_size,
        hidden_size=self.hidden_size,
        num_hidden_layers=self.num_hidden_layers,
        num_attention_heads=self.num_attention_heads,
        intermediate_size=self.intermediate_size,
        hidden_act=self.hidden_act,
        hidden_dropout_prob=self.hidden_dropout_prob,
        attention_probs_dropout_prob=self.attention_probs_dropout_prob,
        max_position_embeddings=self.max_position_embeddings,
        type_vocab_size=self.type_vocab_size,
        initializer_range=self.initializer_range)

    model = modeling.BertModel(config=config,
                               is_training=self.is_training,
                               input_ids=input_ids,
                               position_ids=dist_ids,
                               input_mask=input_mask,
                               token_type_ids=token_type_ids,
                               scope=self.scope)

    outputs = {
        "embedding_output": model.get_embedding_output(),
        "sequence_output": model.get_sequence_output(),
        "pooled_output": model.get_pooled_output(),
        "all_encoder_layers": model.get_all_encoder_layers(),
    }
    return outputs
def create_model(self):
    input_ids = BertModelTest.ids_tensor(
        [self.batch_size, self.seq_length], self.s_vocab)

    input_mask = None
    if self.use_input_mask:
        input_mask = BertModelTest.ids_tensor(
            [self.batch_size, self.seq_length], s_vocab=2)

    token_type_ids = None
    if self.use_token_type_ids:
        token_type_ids = BertModelTest.ids_tensor(
            [self.batch_size, self.seq_length], self.n_typ)

    config = modeling.BertConfig(
        s_vocab=self.s_vocab,
        d_hidden=self.d_hidden,
        n_lays=self.n_lays,
        n_heads=self.n_heads,
        d_ff=self.d_ff,
        act=self.act,
        drop=self.drop,
        drop_attn=self.drop_attn,
        n_pos=self.n_pos,
        n_typ=self.n_typ,
        init_range=self.init_range,
    )

    model = modeling.BertModel(
        config=config,
        is_training=self.is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=token_type_ids,
        scope=self.scope,
    )

    outputs = {
        "embedding_output": model.get_embedding_output(),
        "sequence_output": model.get_sequence_output(),
        "pooled_output": model.get_pooled_output(),
        "all_encoder_layers": model.get_all_encoder_layers(),
    }
    return outputs
INPUT_FILE = "drop_0_test.pkl" RANDOM_SEED = 12345 MAX_PREDICTIONS_PER_SEQ = 20 MAX_SEQ_LENGTH = 128 DO_LOWER_CASE = True # LEARNING_RATE = 2e-5 # NUM_TRAIN_STEPS = 1 # NUM_WARMUP_STEPS = 10 # USE_TPU = False # BATCH_SIZE = 1 # load model bert_config = modeling.BertConfig(BERT_CONFIG_FILE) device = torch.device("cpu") model1 = modeling.BertForPreTraining(bert_config) # model2 = modeling.BertForPreTraining(bert_config) model1.load_state_dict(torch.load(INIT_CHECKPOINT_PT, map_location='cpu')) # model1.bert.from_pretrained(INIT_DIRECTORY) model1.to(device) print ('model loaded') #resolve features with open(INPUT_FILE, 'rb') as f: features = pickle.load(f) print ("%d total samples" % len(features))
def build_graph(opts, is_training=True):
    train_graph = tf.Graph()
    strategy = None

    if opts['use_popdist']:
        strategy = create_popdist_strategy()

    with train_graph.as_default(), ExitStack() as stack:
        if strategy:
            stack.enter_context(strategy.scope())

        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        # define placeholders
        placeholders = {
            'learning_rate': tf.placeholder(tf.float32, shape=[]),
            'loss_scaling': tf.placeholder(tf.float32, shape=[])
        }
        learning_rate = placeholders['learning_rate']
        loss_scaling = placeholders['loss_scaling']

        # define input; datasets must be defined outside the IPU device scope.
        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        # define output
        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        def bert_net():
            return build_network(train_iterator, outfeed_queue, bert_config, opts,
                                 learning_rate, loss_scaling, is_training)

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                train_iterator, outfeed_queue, bert_config, opts, learning_rate,
                loss_scaling, is_training)

        # get result from outfeed queue
        outfeed = outfeed_queue.dequeue()

        if strategy:
            # Take the mean of all the outputs across the distributed workers
            outfeed = [strategy.reduce(tf.distribute.ReduceOp.MEAN, v) for v in outfeed]

        if opts['distributed_worker_index'] == 0 or opts['log_all_workers']:
            log.print_trainable_variables(opts)

        model_and_optimiser_variables = tf.global_variables()
        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_checkpoint'] else model_variables)
        train_saver = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['save_optimiser_to_checkpoint'] else model_variables,
            max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # calculate the number of required IPUs
    num_ipus = (max(opts['device_mapping']) + 1) * opts['replicas']
    num_ipus = ipu_utils.next_power_of_two(num_ipus)

    ipu_config = ipu_utils.get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        minimum_remote_tensor_size=opts['min_remote_tensor_size'],
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        ipu_id=opts['select_ipu'])

    if opts['use_popdist']:
        ipu_config = popdist.tensorflow.set_ipu_config(ipu_config, opts['shards'],
                                                       configure_device=False)

    # Do not acquire a device, compile only.
if opts["compile_only"]: ipu_config.device_connection.version = "ipu2" ipu_config.device_connection.enable_remote_buffers = True # PRE_COMPILE allows for runing execuatables on graph without being online ipu_config.device_connection.type = DeviceConnectionType.PRE_COMPILE # Enforce using a exe cache dir, defaulting if not given if ("TF_POPLAR_FLAGS" in os.environ): if ("--executable_cache_path" not in os.environ["TF_POPLAR_FLAGS"]): print( "Warning: --executable_cache_path in TF_POPLAR_FLAGS " + "(for 'poprun --mpi_local_args') not set. Setting to default " + "path: ./tmp/tf_cache/") os.environ[ "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache" # Sometimes TF_POPLAR_FLAGS might not even exist else: print( "Warning: TF_POPLAR_FLAGS environment variable (for 'poprun " + "--mpi_local_args') not set. --executable_cache_path must be " + "defined when using --compile-only. Setting to default path: " + "./tmp/tf_cache/") os.environ[ "TF_POPLAR_FLAGS"] = "--executable_cache_path=/tmp/tf_cache" ipu_config.configure_ipu_system() train_sess = tf.Session(graph=train_graph) return GraphOps(train_graph, train_sess, train_init, [train], placeholders, train_iterator, outfeed, train_saver, restore, tvars)
flags.DEFINE_string("precision","fp32","precision fp32 or fp16") # batch and seq size that fit into a single GPU collected from https://github.com/ROCmSoftwarePlatform/BERT#out-of-memory-issues batch_size = FLAGS.batch seq_length = FLAGS.seq_length heads = FLAGS.heads layers = FLAGS.layers if FLAGS.precision == "fp32": # this is set to LARGE Bert model bert_config = modeling.BertConfig(attention_probs_dropout_prob= 0.1, hidden_act= "gelu", hidden_dropout_prob= 0.1, hidden_size = 1024, initializer_range = 0.02, intermediate_size = 4096, max_position_embeddings = 512, num_attention_heads = heads, num_hidden_layers = layers, type_vocab_size = 2, vocab_size = 30522, precision=tf.float32) else: bert_config = modeling.BertConfig(attention_probs_dropout_prob= 0.1, hidden_act= "gelu", hidden_dropout_prob= 0.1, hidden_size = 1024, initializer_range = 0.02, intermediate_size = 4096, max_position_embeddings = 512, num_attention_heads = heads, num_hidden_layers = layers,
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        bert_config = bert_ipu.BertConfig.from_dict(
            opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        placeholders = dict()
        learning_rate = None
        opts['version_2_with_negative'] = False

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        if not should_be_pipeline_when_inference(opts):
            def bert_net():
                return build_infer_network_without_pipeline(
                    train_iterator,
                    outfeed_queue,
                    iterations_per_step,
                    bert_config=bert_config,
                    opts=opts)
        else:
            def bert_net():
                return build_network(train_iterator, outfeed_queue,
                                     iterations_per_step, bert_config, opts,
                                     learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            embedded = opts["embedded_runtime"]

            if embedded and is_training:
                raise ValueError("embedded_runtime is only to be used for inference.")

            train = ipu.ipu_compiler.compile(bert_net, []) if not embedded else None

        exec_path = None
        compile_op = None
        poplar_exec_filepath = get_exec_path(
            opts['seq_length'], opts['micro_batch_size'],
            opts['device_mapping'], should_be_pipeline_when_inference(opts))
        exec_path = os.path.join(poplar_exec_filepath)
        compile_op = application_compile_op.experimental_application_compile_op(
            bert_net, output_path=exec_path, freeze_variables=True)

        outfeed = outfeed_queue.dequeue()

        restore = tf.train.Saver(var_list=tf.global_variables())

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

        """Calculate the number of required IPU"""
        num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
        # The number of acquired IPUs must be the power of 2.
if num_ipus & (num_ipus - 1) != 0: num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2))) ipu_config = get_config( fp_exceptions=opts["fp_exceptions"], enable_recomputation=opts["enable_recomputation"], disable_graph_outlining=False, num_required_ipus=num_ipus, enable_stochastic_rounding=opts['stochastic_rounding'], max_cross_replica_sum_buffer_size=opts[ 'max_cross_replica_sum_buffer_size'], max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'], scheduler_selection='CLUSTERING', compile_only=False, ipu_id=None, partials_type=opts["partials_type"], available_memory_proportion=opts['available_memory_proportion']) ipu_config.configure_ipu_system() train_sess = tf.Session(graph=train_graph) _ = train_sess.run(train_init, []) # ----------------- # Checkpoints restore and save init_checkpoint_path = opts['init_checkpoint'] logger.info(f"At the checkpoint location {init_checkpoint_path}") if init_checkpoint_path: logger.info("Loading checkpoint...") if os.path.isfile(init_checkpoint_path): init_checkpoint_path = os.path.splitext(init_checkpoint_path)[0] logger.info(f"checkpoint path: {init_checkpoint_path}") (assignment_map, initialized_variable_names ) = bert_ipu.get_assignment_map_from_checkpoint( tvars, init_checkpoint_path) for var in tvars: if var.name in initialized_variable_names: mark = "*" else: mark = " " logger.info("%-60s [%s]\t%s (%s)", var.name, mark, var.shape, var.dtype.name) reader = tf.train.NewCheckpointReader(init_checkpoint_path) load_vars = reader.get_variable_to_shape_map() saver_restore = tf.train.Saver(assignment_map) saver_restore.restore(train_sess, init_checkpoint_path) # ----------------- if compile_op is not None: logger.info( f"Compiling and saving Poplar executable to {poplar_exec_filepath}" ) _ = train_sess.run(compile_op, []) else: exec_path = None return GraphOps(train_graph, train_sess, train_init, [train], placeholders, train_iterator, outfeed, restore, tvars, exec_path), ipu_config
import tensorflow as tf
import modeling

input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])
input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])

flat_token_type_ids = tf.reshape(token_type_ids, [-1])
one_hot_token_type_ids = tf.one_hot(flat_token_type_ids, depth=2)

config = modeling.BertConfig(vocab_size=32000, hidden_size=512,
                             num_hidden_layers=8, num_attention_heads=8,
                             intermediate_size=1024, type_vocab_size=2)

model = modeling.BertModel(config=config, is_training=True,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    print(sess.run(one_hot_token_type_ids))
    print(sess.run(model.get_all_encoder_layers()))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default=None, type=str, required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--dataset_name", default="top300_kl", type=str, required=True,
                        help="The name of the dataset to run inference on (without extension, e.g. top300_kl)")
    parser.add_argument("--model_type", default="baseline_tfidf", type=str, required=True,
                        help="baseline, baseline_tfidf, ir-v0, ir-v1")
    parser.add_argument("--model_path", default=None, type=str, required=True,
                        help="Path to the model dir")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="Save path")

    ## Other parameters
    parser.add_argument("--bert_model", default="bert-base-multilingual-cased", type=str,
                        help="Default: bert-base-multilingual-cased. "
                             "BERT pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--model_file", default="pytorch_model.bin", type=str,
                        help="The model file (.bin); the default is pytorch_model.bin.\n"
                             "Set the name explicitly if a specific file is required.")
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--eval_batch_size", default=8, type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    processor = IRProcessor()

    label_list = processor.get_labels()
    num_labels = len(label_list)

    print("model:", args.model_type)
    if args.model_type == "baseline":  # load model (finetuned baseline on IR)
        tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False)
        config = BertConfig(os.path.join(args.model_path, "bert_config.json"))
        model = BertForPreTraining(config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "baseline_tfidf":  # load model (baseline_tfidf)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False,
                                                       do_tf_idf=True)
        TFIDFconfig = modeling.BertConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling.BertTFIDFForPreTraining(TFIDFconfig)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "ir-v0":  # load model (*-head)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False,
                                                       do_tf_idf=True)
        head_config = modeling_ir.BertForIRConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling_ir.BertForIRForPreTraining(head_config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))
    elif args.model_type == "ir-v1":  # load model (*-head)
        tokenizer = BertTFIDFTokenizer.from_pretrained(args.bert_model, do_lower_case=False,
                                                       do_tf_idf=True)
        head_config = modeling_ir_2.BertForIRConfig(os.path.join(args.model_path, "bert_config.json"))
        model = modeling_ir_2.BertForIRForPreTraining(head_config)
        model.load_state_dict(torch.load(os.path.join(args.model_path, args.model_file)))

    if args.fp16:
        model.half()
    model.to(device)

    tfidf_dict = pickle_load(os.path.join(args.data_dir, args.dataset_name + '_tfidf.pkl'))

    results_logit = dict()
    results_softmax = dict()

    eval_set, documents, queries = processor.make_eval_set(args.data_dir, args.dataset_name)
    logger.info("***** Running evaluation *****")
    logger.info("  Batch size = %d", args.eval_batch_size)
    for q_num, query in tqdm(enumerate(queries), total=len(queries), desc="Evaluating"):
        # for query in queries[0:1]:  # for testing
        logger.info(f"Current Query Num : {q_num}")
        eval_examples = processor._create_examples(eval_set, query, documents)
        # logger.info("  Num examples = %d", len(eval_examples))

        if args.model_type == "baseline":  # baseline or baseline_finetuned
            eval_features = convert_examples_to_features_for_vanilla(
                eval_examples, label_list, args.max_seq_length, tokenizer)

            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Query"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    _, logits = model(input_ids, segment_ids, input_mask)

                # loss_fct = CrossEntropyLoss()
                # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                # eval_loss += tmp_eval_loss.mean().item()
                # nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
        else:  # baseline_tfidf or *-head model
            eval_data = LazyDatasetClassifier(eval_examples, label_list,
                                              args.max_seq_length, tokenizer, tfidf_dict)
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                         batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []

            for batch in tqdm(eval_dataloader, desc="Query"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_weights, input_mask, segment_ids, label_ids = batch

                with torch.no_grad():
                    _, logits = model(input_ids, input_weights, segment_ids, input_mask)

                # loss_fct = CrossEntropyLoss()
                # tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                # eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)

        # eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]

        results_softmax[query] = []
        for i, pred in enumerate(softmax(preds)):  # using softmax
            pair = dict()
            pair["score"] = pred[1]
            pair["doc_id"] = list(documents.keys())[i]
            results_softmax[query].append(pair)
        results_softmax[query].sort(reverse=True, key=lambda x: x["score"])

        ranked_doc_list = []
        for doc in results_logit[query]:
            ranked_doc_list.append(doc["doc_id"])
        results_logit[query] = ranked_doc_list

        ranked_doc_list = []
        for doc in results_softmax[query]:
            ranked_doc_list.append(doc["doc_id"])
        results_softmax[query] = ranked_doc_list

    save_name2 = args.model_path.split('/')[0] + '_' + args.model_file.split('.')[0] \
                 + '_' + args.dataset_name + '_output.json'
    path2 = os.path.join(args.output_dir, save_name2)
    with open(path2, 'w', encoding="utf8") as f:
        json.dump(results_softmax, f, indent=4, sort_keys=True, ensure_ascii=False)
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype, shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        # building networks with pipeline
        if not should_be_pipeline_when_inference(opts):
            def bert_net():
                return build_infer_network_without_pipeline(
                    train_iterator,
                    outfeed_queue,
                    iterations_per_step,
                    bert_config=bert_config,
                    opts=opts)
        else:
            def bert_net():
                return build_network(train_iterator, outfeed_queue,
                                     iterations_per_step, bert_config, opts,
                                     learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    """calculate the number of required IPU"""
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be the power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))

    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        partials_type=opts["partials_type"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig(256)
    model_fn = model_fn_builder(bert_config=bert_config,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps)

    max_seq_length = FLAGS.max_seq_length
    max_predictions_per_seq = FLAGS.max_predictions_per_seq

    with tf.name_scope("input"):
        input_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        input_mask = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        segment_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_seq_length], dtype=tf.int32)
        masked_lm_positions = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq], dtype=tf.int32)
        masked_lm_ids = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq], dtype=tf.int32)
        masked_lm_weights = tf.placeholder(
            shape=[FLAGS.train_batch_size, max_predictions_per_seq], dtype=tf.float32)
        next_sentence_labels = tf.placeholder(
            shape=[FLAGS.train_batch_size, 1], dtype=tf.int32)

    features = {
        "input_ids": input_ids,
        "input_mask": input_mask,
        "segment_ids": segment_ids,
        "masked_lm_positions": masked_lm_positions,
        "masked_lm_ids": masked_lm_ids,
        "masked_lm_weights": masked_lm_weights,
        "next_sentence_labels": next_sentence_labels
    }

    train_op = model_fn(features, None, None, None)
    infer_shape_ops = add_infer_shape_ops()

    hooks = [
        # Horovod: BroadcastGlobalVariablesHook broadcasts initial variable states
        # from rank 0 to all other processes. This is necessary to ensure consistent
        # initialization of all workers when training is started with random weights
        # or restored from a checkpoint.
        # Horovod: adjust number of steps based on number of GPUs.
        tf.train.StopAtStepHook(last_step=205),
    ]

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    config.gpu_options.visible_device_list = str(0)

    training_batch_generator = train_input_generator(features)
    with tf.train.MonitoredTrainingSession(hooks=hooks, config=config) as mon_sess:
        mon_sess = TimelineSession(mon_sess, infer_shape_ops)
        while not mon_sess.should_stop():
            # Run a training step synchronously.
            feed_dict = next(training_batch_generator)
            mon_sess.run([train_op], feed_dict=feed_dict)
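# The `train_input_generator` used above is not defined in the snippet. A minimal
# sketch (an assumption, not the original implementation) is given below: it
# endlessly yields feed_dicts of dummy data shaped to match the placeholders in
# `features`. It assumes numpy is imported as np.
def train_input_generator(features):
    batch = FLAGS.train_batch_size
    seq = FLAGS.max_seq_length
    preds = FLAGS.max_predictions_per_seq
    while True:
        yield {
            features["input_ids"]: np.random.randint(0, 100, (batch, seq), dtype=np.int32),
            features["input_mask"]: np.ones((batch, seq), dtype=np.int32),
            features["segment_ids"]: np.zeros((batch, seq), dtype=np.int32),
            features["masked_lm_positions"]: np.zeros((batch, preds), dtype=np.int32),
            features["masked_lm_ids"]: np.zeros((batch, preds), dtype=np.int32),
            features["masked_lm_weights"]: np.ones((batch, preds), dtype=np.float32),
            features["next_sentence_labels"]: np.zeros((batch, 1), dtype=np.int32),
        }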
def test_config_to_json_string(self):
    config = modeling.BertConfig(s_vocab=99, d_hidden=37)
    obj = json.loads(config.to_json_string())
    self.assertEqual(obj["s_vocab"], 99)
    self.assertEqual(obj["d_hidden"], 37)
# input_ids = tf.constant(np.random.randint(1, 128, [2, 3]))
input_ids = tf.placeholder(shape=[2, 3], dtype=tf.int32, name='input_ids')
# input_mask = tf.constant([[1, 1, 1], [1, 1, 0]])
# input_mask = tf.constant(np.random.randint(0, 1, [2, 3]))
input_mask = tf.placeholder(shape=[2, 3], dtype=tf.int32, name='input_mask')
# token_type_ids = tf.constant([[0, 0, 1], [0, 2, 0]])
# token_type_ids = tf.constant(np.random.randint(0, 2, [2, 3]))
token_type_ids = tf.placeholder(shape=[2, 3], dtype=tf.int32, name='token_type_ids')

config = modeling.BertConfig(vocab_size=32000, hidden_size=768,
                             num_hidden_layers=8, num_attention_heads=6,
                             intermediate_size=1024)

model = modeling.BertModel(config=config, is_training=True,
                           input_ids=input_ids, input_mask=input_mask,
                           token_type_ids=token_type_ids)

label_embeddings = tf.get_variable(
    name="word_embeddings", shape=[768, 12],
    initializer=tf.truncated_normal_initializer(0.02))
pooled_output = model.get_pooled_output()
logits = tf.matmul(pooled_output, label_embeddings)
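# A minimal sketch (not part of the original snippet) of how the logits above
# could be evaluated: add a softmax, then feed concrete int32 arrays matching
# the [2, 3] placeholders. The example values are arbitrary.
probabilities = tf.nn.softmax(logits, axis=-1)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    feed = {
        input_ids: [[31, 51, 99], [15, 5, 0]],
        input_mask: [[1, 1, 1], [1, 1, 0]],
        token_type_ids: [[0, 0, 1], [0, 1, 0]],
    }
    print(sess.run(probabilities, feed_dict=feed))  # shape [2, 12]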
def test_config_to_json_string(self):
    config = modeling.BertConfig(vocab_size=99, hidden_size=37)
    obj = json.loads(config.to_json_string())
    self.assertEqual(obj["vocab_size"], 99)
    self.assertEqual(obj["hidden_size"], 37)
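# A companion sketch (not in the original tests) checking that a config
# serialised with to_json_string() can be rebuilt via BertConfig.from_dict(),
# which exists in the reference BERT modeling.py.
def test_config_json_round_trip(self):
    config = modeling.BertConfig(vocab_size=99, hidden_size=37)
    rebuilt = modeling.BertConfig.from_dict(json.loads(config.to_json_string()))
    self.assertEqual(rebuilt.vocab_size, 99)
    self.assertEqual(rebuilt.hidden_size, 37)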
def build_graph(opts, iterations_per_step=1, is_training=True):
    train_graph = tf.Graph()
    with train_graph.as_default():
        if opts["groupbert"]:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.GroupBertConfig(vocab_size=None))
        else:
            bert_config = bert_ipu.BertConfig.from_dict(
                opts, config=bert_ipu.BertConfig(vocab_size=None))
        bert_config.dtype = tf.float32 if opts["precision"] == '32' else tf.float16

        placeholders = dict()

        if is_training:
            placeholders['learning_rate'] = tf.placeholder(bert_config.dtype, shape=[])
            learning_rate = placeholders['learning_rate']
        else:
            learning_rate = None

        # Need to load the GLUE file here
        label_list = opts["pass_in"][1]
        bert_config.num_lables = len(label_list)
        if opts['do_training'] and opts['current_mode'] == 'train':
            input_file = os.path.join(opts["output_dir"],
                                      f"train_{opts['task_type']}.tf_record")
        elif opts['do_eval'] and opts['current_mode'] == 'eval':
            input_file = os.path.join(opts["output_dir"],
                                      f"eval_{opts['task_type']}.tf_record")
        elif opts['do_predict'] and opts['current_mode'] == 'predict':
            input_file = os.path.join(opts["output_dir"],
                                      f"predict_{opts['task_type']}.tf_record")
        else:
            raise NotImplementedError()

        opts['input_file'] = input_file
        opts['drop_remainder'] = True

        train_iterator = ipu_infeed_queue.IPUInfeedQueue(
            data_loader.load(opts, is_training=is_training))
        outfeed_queue = ipu_outfeed_queue.IPUOutfeedQueue()

        def bert_net():
            return build_network(train_iterator, outfeed_queue,
                                 iterations_per_step, bert_config, opts,
                                 learning_rate, is_training)

        with ipu_scope('/device:IPU:0'):
            train = ipu.ipu_compiler.compile(bert_net, [])

        outfeed = outfeed_queue.dequeue()

        log.print_trainable_variables(opts)

        restore = tf.train.Saver(var_list=tf.global_variables())
        train_saver = tf.train.Saver(max_to_keep=5)

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    """calculate the number of required IPU"""
    num_ipus = (max(opts['device_mapping']) + 1) * int(opts['replicas'])
    # The number of acquired IPUs must be the power of 2.
    if num_ipus & (num_ipus - 1) != 0:
        num_ipus = 2**int(math.ceil(math.log(num_ipus) / math.log(2)))

    ipu_config = get_config(
        fp_exceptions=opts["fp_exceptions"],
        enable_recomputation=opts["enable_recomputation"],
        disable_graph_outlining=False,
        num_required_ipus=num_ipus,
        enable_stochastic_rounding=opts['stochastic_rounding'],
        max_cross_replica_sum_buffer_size=opts['max_cross_replica_sum_buffer_size'],
        max_reduce_scatter_buffer_size=opts['max_reduce_scatter_buffer_size'],
        scheduler_selection='CLUSTERING',
        compile_only=False,
        ipu_id=None,
        available_memory_proportion=opts["available_memory_proportion"])

    ipu_config.configure_ipu_system()

    train_sess = tf.Session(graph=train_graph)

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, train_saver, restore, tvars)