def __init__(self, hp, voca_size, method, is_training=True):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = tf.placeholder(tf.int64, [None, seq_length])
    input_mask = tf.placeholder(tf.int64, [None, seq_length])
    segment_ids = tf.placeholder(tf.int64, [None, seq_length])
    label_ids = tf.placeholder(tf.int64, [None])
    if method in [0, 1, 3, 4, 5, 6]:
        self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
    elif method in [2]:
        self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    with tf.variable_scope("part1"):
        self.model1 = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)
    with tf.variable_scope("part2"):
        self.model2 = bert.BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=segment_ids,
            use_one_hot_embeddings=use_one_hot_embeddings)

    enc = tf.concat([
        self.model1.get_sequence_output(),
        self.model2.get_sequence_output()
    ], axis=2)

    pred, loss = task.predict(enc, label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
def __init__(self, hp, voca_size, method, is_training=True):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = tf.placeholder(tf.int64, [None, seq_length])
    input_mask = tf.placeholder(tf.int64, [None, seq_length])
    segment_ids = tf.placeholder(tf.int64, [None, seq_length])
    label_ids = tf.placeholder(tf.int64, [None])
    if method in [0, 1, 3, 4, 5, 6]:
        self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
    elif method in [METHOD_CROSSENT, METHOD_HINGE]:
        self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = bert_get_hidden.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss

    # Gradients of the logits w.r.t. each intermediate layer output and the
    # embedding output; each tf.gradients call returns a length-1 list.
    all_layer_grads = []
    all_layers = self.model.all_layer_outputs
    for i in range(len(all_layers)):
        grad = tf.gradients(self.logits, all_layers[i])
        all_layer_grads.append(grad)
    grad_emb = tf.gradients(self.logits, self.model.embedding_output)

    self.all_layer_grads = all_layer_grads
    self.grad_emb = grad_emb
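A minimal evaluation sketch for the gradients wired up above, assuming a live tf.Session; `sess`, `model_obj`, and the batch arrays are hypothetical names, not part of this class:

# Hypothetical usage: evaluate per-layer gradients for one batch.
# `model_obj` is an instance of the class above; `sess` is an open tf.Session;
# the *_b arrays are numpy batches prepared elsewhere (hypothetical).
feed = dict(zip(model_obj.x_list, [input_ids_b, input_mask_b, segment_ids_b]))
layer_grads, emb_grad = sess.run(
    [model_obj.all_layer_grads, model_obj.grad_emb], feed_dict=feed)
# Each entry of layer_grads is the length-1 list returned by tf.gradients,
# so layer_grads[i][0] has shape [batch, seq_length, hidden_size].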
def __init__(self, hp, voca_size, mode=1):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = tf.placeholder(tf.int64, [None, seq_length])
    input_mask = tf.placeholder(tf.int64, [None, seq_length])
    segment_ids = tf.placeholder(tf.int64, [None, seq_length])
    scores = tf.placeholder(tf.float32, [None])
    # self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
    self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = scores

    use_one_hot_embeddings = use_tpu
    is_training = True
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    if mode == 1:
        enc = self.model.get_pooled_output()
    else:
        enc = self.model.get_all_encoder_layers()
    self.enc = enc

    logits = tf.layers.dense(enc, 1, name="reg_dense")  # [None, 1]
    self.logits = logits

    # Pairwise hinge loss: rows are grouped into consecutive pairs and the
    # second item of each pair should outscore the first by margin hp.alpha.
    paired = tf.reshape(logits, [-1, 2])
    y_paired = tf.reshape(self.y, [-1, 2])
    raw_l = paired[:, 1] - paired[:, 0]
    losses = tf.maximum(hp.alpha - raw_l, 0)
    self.loss = tf.reduce_mean(losses)
    tf.summary.scalar('loss', self.loss)
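The hinge loss above reshapes the batch into consecutive pairs, so each even/odd row pair must be ordered (lower-scored, higher-scored) at feed time; a hedged sketch of assembling such a batch, where `encode` is a hypothetical tokenizer helper:

# Hypothetical pairwise batch assembly for the margin loss above.
# Row 2k is the item that should score lower, row 2k+1 the one that should
# score higher by at least hp.alpha.
def make_pair_batch(pairs, encode):
    input_ids, input_mask, segment_ids, scores = [], [], [], []
    for worse, better in pairs:
        for item, score in [(worse, 0.0), (better, 1.0)]:
            ids, mask, seg = encode(item)  # hypothetical: padded id/mask/segment arrays
            input_ids.append(ids)
            input_mask.append(mask)
            segment_ids.append(seg)
            scores.append(score)
    return input_ids, input_mask, segment_ids, scores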
def stance_cold(self):
    hp = hyperparams.HPColdStart()
    topic = "atheism"
    setting = shared_setting.TopicTweets2Stance(topic)
    model_dir = get_model_dir("stance_cold_{}".format(topic))

    task = Classification(3)
    model = Transformer(hp, setting.vocab_size, task)
    param = {
        'feature_columns': self.get_feature_column(),
        'n_classes': 3,
    }
    estimator = tf.estimator.Estimator(
        model_fn=model.model_fn,
        model_dir=model_dir,
        params=param,
        config=None)

    data_source = stance_detection.DataLoader(topic, hp.seq_max, setting.vocab_filename)

    def train_input_fn(features, labels, batch_size):
        f_dict = pd.DataFrame(data=features)
        dataset = tf.data.Dataset.from_tensor_slices((f_dict, labels))
        # Shuffle, repeat, and batch the examples.
        return dataset.shuffle(1000).repeat().batch(batch_size)

    def dev_input_fn(batch_size):
        features, labels = data_source.get_dev_data()
        f_dict = pd.DataFrame(data=features)
        dataset = tf.data.Dataset.from_tensor_slices((f_dict, labels))
        # Shuffle and batch the examples.
        return dataset.shuffle(1000).batch(batch_size)

    X, Y = data_source.get_train_data()
    num_epoch = 10
    batch_size = 32
    # Integer ceiling division; plain "/" would pass a float max_steps to the estimator.
    step_per_epoch = (len(Y) - 1) // batch_size + 1
    tf.logging.info("Logging Test")
    tf.logging.info("num epoch %d", num_epoch)
    estimator.train(lambda: train_input_fn(X, Y, batch_size),
                    max_steps=num_epoch * step_per_epoch)
    print(estimator.evaluate(lambda: dev_input_fn(batch_size)))
def __init__(self, hp, voca_size, is_training=True):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(2)

    input_ids = tf.placeholder(tf.int64, [None, seq_length])
    input_mask = tf.placeholder(tf.int64, [None, seq_length])
    segment_ids = tf.placeholder(tf.int64, [None, seq_length])
    label_ids = tf.placeholder(tf.int64, [None])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('acc', self.acc)
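A minimal training-step sketch for this classifier; the optimizer choice, session handling, and batch iterator are assumptions made here for illustration, not part of the class:

# Hypothetical training loop; `model_obj` is an instance of the class above
# and `batches` yields (input_ids, input_mask, segment_ids, labels) numpy arrays.
train_op = tf.train.AdamOptimizer(1e-5).minimize(model_obj.loss)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for input_ids_b, input_mask_b, segment_ids_b, labels_b in batches:
        feed = dict(zip(model_obj.x_list,
                        [input_ids_b, input_mask_b, segment_ids_b]))
        feed[model_obj.y] = labels_b
        _, loss_val, acc_val = sess.run(
            [train_op, model_obj.loss, model_obj.acc], feed_dict=feed)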
def train_classification(self, data_loader):
    hp = HP()
    tpu_cluster_resolver = None
    if FLAGS.use_tpu:
        model_dir = FLAGS.model_dir
        hp.batch_size = FLAGS.batch_size
        data_dir = FLAGS.data_dir
        input_pattern = os.path.join(data_dir, "Thus.train_*")
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(FLAGS.tpu)
        init_checkpoint = FLAGS.init_checkpoint
    else:
        model_dir = get_model_dir("causal")
        input_pattern = os.path.join(cpath.data_path, "causal", "Thus.train_*")
        init_checkpoint = os.path.join(cpath.model_path, "runs", FLAGS.init_checkpoint)

    vocab_size = 30522
    task = Classification(3)
    model = transformer_est.TransformerEst(hp, vocab_size, task,
                                           FLAGS.use_tpu, init_checkpoint)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=None,
        model_dir=model_dir,
        save_checkpoints_steps=1000,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=1000,
            num_shards=8,
            per_host_input_for_training=is_per_host))

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model.model_fn,
        config=run_config,
        train_batch_size=hp.batch_size,
        eval_batch_size=hp.batch_size)

    input_files = tf.gfile.Glob(input_pattern)
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    train_files = data_loader.get_train
    eval_files = input_files[:1]
    tf.enable_eager_execution()

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", hp.batch_size)
    train_input_fn = input_fn_builder(
        input_files=train_files,
        max_seq_length=hp.seq_max,
        is_training=True)
    # `sloppy` mode means that the interleaving is not exact. This adds
    # even more randomness to the training pipeline.

    class _LoggerHook(tf.train.SessionRunHook):
        def __init__(self, log_frequency):
            self.log_frequency = log_frequency

        def begin(self):
            self._step = -1
            self._start_time = time.time()

        def before_run(self, run_context):
            self._step += 1
            return tf.train.SessionRunArgs(task.loss)  # Asks for loss value.

        def after_run(self, run_context, run_values):
            if self._step % self.log_frequency == 0:
                current_time = time.time()
                duration = current_time - self._start_time
                self._start_time = current_time
                loss_value = run_values.results
                examples_per_sec = self.log_frequency * 16 / duration
                sec_per_batch = float(duration / self.log_frequency)
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; %.3f '
                              'sec/batch)')
                print(format_str % (datetime.now(), self._step, loss_value,
                                    examples_per_sec, sec_per_batch))

    hook = _LoggerHook(100)
    estimator.train(input_fn=train_input_fn,
                    hooks=[hook],
                    max_steps=FLAGS.train_steps)

    eval_input_fn = input_fn_builder(
        input_files=eval_files,
        max_seq_length=hp.seq_max,
        is_training=False)

    result = estimator.evaluate(
        input_fn=eval_input_fn,
        steps=20,
    )

    tf.logging.info("***** Eval results *****")
    for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
def __init__(self, hp, voca_size, num_class_list, is_training=True):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False

    input_ids = tf.placeholder(tf.int64, [None, seq_length], name="input_ids")
    input_mask = tf.placeholder(tf.int64, [None, seq_length], name="input_mask")
    segment_ids = tf.placeholder(tf.int64, [None, seq_length], name="segment_ids")

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y1 = tf.placeholder(tf.int64, [None], name="y1")
    self.y2 = tf.placeholder(tf.int64, [None], name="y2")
    self.y = [self.y1, self.y2]

    summary1 = {}
    summary2 = {}
    self.summary_list = [summary1, summary2]

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    task = Classification(num_class_list[0])
    pred, loss = task.predict(self.model.get_sequence_output(), self.y1, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
    summary1['loss1'] = tf.summary.scalar('loss', self.loss)
    summary1['acc1'] = tf.summary.scalar('acc', self.acc)

    with tf.variable_scope("cls2"):
        task2 = Classification(num_class_list[1])
        pred, loss = task2.predict(self.model.get_sequence_output(), self.y2, True)
        self.logits2 = task2.logits
        self.sout2 = tf.nn.softmax(self.logits2)
        self.pred2 = pred
        self.loss2 = loss
        self.acc2 = task2.acc
        summary2['loss2'] = tf.summary.scalar('loss2', self.loss2)
        summary2['acc2'] = tf.summary.scalar('acc2', self.acc2)

    self.logit_list = [self.logits, self.logits2]
    self.loss_list = [self.loss, self.loss2]
    self.pred_list = [self.pred, self.pred2]
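One plausible way to drive the two heads is to alternate their losses across batches; a hedged sketch under that assumption (the separate train ops, Adam, `sess`, and the batch variables are all hypothetical, not part of the class):

# Hypothetical alternating optimization over the two classification heads.
optimizer = tf.train.AdamOptimizer(1e-5)
train_op1 = optimizer.minimize(model_obj.loss_list[0])
train_op2 = optimizer.minimize(model_obj.loss_list[1])

# One batch from task 1, then one from task 2 (x_batch*, y_batch* hypothetical).
feed1 = dict(zip(model_obj.x_list, x_batch1))
feed1[model_obj.y1] = y_batch1
sess.run(train_op1, feed_dict=feed1)

feed2 = dict(zip(model_obj.x_list, x_batch2))
feed2[model_obj.y2] = y_batch2
sess.run(train_op2, feed_dict=feed2)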
def __init__(self, hp, voca_size, is_training):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = tf.placeholder(tf.int64, [None, seq_length])
    input_mask = tf.placeholder(tf.int64, [None, seq_length])
    segment_ids = tf.placeholder(tf.int64, [None, seq_length])
    label_ids = tf.placeholder(tf.int64, [None])
    # self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
    self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids
    self.encoded_embedding_in = tf.placeholder(
        tf.float32, [None, seq_length, hp.hidden_units])
    self.attention_mask_in = tf.placeholder(
        tf.float32, [None, seq_length, seq_length])

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertEmbeddingInOut(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings,
        embeddding_as_input=(self.encoded_embedding_in, self.attention_mask_in),
    )
    self.encoded_embedding_out = self.model.embedding_output
    self.attention_mask_out = self.model.attention_mask

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('acc', self.acc)

    cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
    cl = tf.reshape(cl, [-1, seq_length])
    #cl = tf.nn.sigmoid(cl)
    #cl = tf.contrib.layers.layer_norm(cl)
    self.conf_logits = cl
    #self.pkc = self.conf_logits * self.rf_mask
    #rl_loss_list = tf.reduce_sum(self.pkc, axis=1)
    rl_loss_list = tf.reduce_sum(
        self.conf_logits * tf.cast(self.rf_mask, tf.float32), axis=1)
    num_tagged = tf.nn.relu(self.conf_logits + 1)
    self.verbose_loss = tf.reduce_mean(tf.reduce_sum(num_tagged, axis=1))
    self.rl_loss = tf.reduce_mean(rl_loss_list)
def __init__(self, hp, voca_size, method, is_training=True):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)

    input_ids = placeholder(tf.int64, [None, seq_length])
    input_mask = placeholder(tf.int64, [None, seq_length])
    segment_ids = placeholder(tf.int64, [None, seq_length])
    label_ids = placeholder(tf.int64, [None])
    if method in [0, 1, 3, 4, 5, 6]:
        self.rf_mask = placeholder(tf.float32, [None, seq_length])
    elif method in [METHOD_CROSSENT, METHOD_HINGE]:
        self.rf_mask = placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
    tf.summary.scalar('loss', self.loss)
    tf.summary.scalar('acc', self.acc)

    if method == 0:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        cl = tf.nn.sigmoid(cl)
        # cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        # self.pkc = self.conf_logits * self.rf_mask
        # rl_loss_list = tf.reduce_sum(self.pkc, axis=1)
        rl_loss_list = tf.reduce_sum(
            self.conf_logits * tf.cast(self.rf_mask, tf.float32), axis=1)
        self.rl_loss = tf.reduce_mean(rl_loss_list)
    elif method == 1:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        #rl_loss_list = tf_module.cossim(cl, self.rf_mask)
        #self.pkc = self.conf_logits * self.rf_mask
        rl_loss_list = tf.reduce_sum(self.conf_logits * self.rf_mask, axis=1)
        self.rl_loss = tf.reduce_mean(rl_loss_list)
    elif method == METHOD_CROSSENT:
        cl = tf.layers.dense(self.model.get_sequence_output(), 2, name="aux_conflict")
        probs = tf.nn.softmax(cl)
        losses = tf.losses.softmax_cross_entropy(
            onehot_labels=tf.one_hot(self.rf_mask, 2), logits=cl)
        self.conf_logits = probs[:, :, 1] - 0.5
        self.rl_loss = tf.reduce_mean(losses)
    elif method == 3:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        self.bias = tf.Variable(0.0)
        self.conf_logits = (cl + self.bias)
        rl_loss_list = tf.nn.relu(1 - self.conf_logits * self.rf_mask)
        rl_loss_list = tf.reduce_mean(rl_loss_list, axis=1)
        self.rl_loss = tf.reduce_mean(rl_loss_list)
        labels = tf.greater(self.rf_mask, 0)
        hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
        self.hinge_loss = tf.reduce_sum(hinge_losses)
    elif method == 4:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        labels = tf.greater(self.rf_mask, 0)
        hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
        self.rl_loss = hinge_losses
    elif method == 5:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        #cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        self.labels = tf.cast(tf.greater(self.rf_mask, 0), tf.float32)
        self.rl_loss = tf.reduce_mean(
            tf_module.correlation_coefficient_loss(cl, -self.rf_mask))
    elif method == 6:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        #cl = tf.layers.dense(cl1, 1, name="aux_conflict2")
        cl = tf.reshape(cl, [-1, seq_length])
        #cl = tf.nn.sigmoid(cl)
        #cl = tf.contrib.layers.layer_norm(cl)
        self.conf_logits = cl
        #rl_loss_list = tf.reduce_sum(self.conf_logits * self.rf_mask , axis=1)
        self.rl_loss = tf.reduce_mean(
            tf_module.correlation_coefficient_loss(cl, -self.rf_mask))
    elif method == METHOD_HINGE:
        cl = tf.layers.dense(self.model.get_sequence_output(), 1, name="aux_conflict")
        cl = tf.reshape(cl, [-1, seq_length])
        self.conf_logits = cl
        labels = tf.greater(self.rf_mask, 0)
        hinge_losses = tf.losses.hinge_loss(labels, self.conf_logits)
        self.rl_loss = tf.reduce_sum(hinge_losses)

    self.conf_softmax = tf.nn.softmax(self.conf_logits, axis=-1)
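The auxiliary rl_loss produced by each method branch is presumably combined with the main classification loss during training; a hedged sketch of one such combination (the weight, optimizer, `sess`, `model_obj`, and batch names are assumptions, not part of this class):

# Hypothetical joint objective: main NLI loss plus a weighted conflict loss.
aux_weight = 0.5  # assumed weighting, not specified by the class above
total_loss = model_obj.loss + aux_weight * model_obj.rl_loss
train_op = tf.train.AdamOptimizer(1e-5).minimize(total_loss)

feed = dict(zip(model_obj.x_list, [input_ids_b, input_mask_b, segment_ids_b]))
feed[model_obj.y] = labels_b
feed[model_obj.rf_mask] = rf_mask_b  # token-level mask matching the chosen method
sess.run(train_op, feed_dict=feed)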
def __init__(self, hp, voca_size, method, is_training=True):
    config = bert.BertConfig(
        vocab_size=voca_size,
        hidden_size=hp.hidden_units,
        num_hidden_layers=hp.num_blocks,
        num_attention_heads=hp.num_heads,
        intermediate_size=hp.intermediate_size,
        type_vocab_size=hp.type_vocab_size,
    )

    seq_length = hp.seq_max
    use_tpu = False
    task = Classification(data_generator.NLI.nli_info.num_classes)
    task2_num_classes = 3

    input_ids = tf.placeholder(tf.int64, [None, seq_length])
    input_mask = tf.placeholder(tf.int64, [None, seq_length])
    segment_ids = tf.placeholder(tf.int64, [None, seq_length])
    label_ids = tf.placeholder(tf.int64, [None])
    if method in [0, 1, 3, 4, 5, 6]:
        self.rf_mask = tf.placeholder(tf.float32, [None, seq_length])
    elif method in [2]:
        self.rf_mask = tf.placeholder(tf.int32, [None, seq_length])

    self.x_list = [input_ids, input_mask, segment_ids]
    self.y = label_ids
    self.y1 = tf.placeholder(tf.int64, [None], name="y1")
    self.y2 = tf.placeholder(tf.int64, [None], name="y2")
    self.f_loc1 = tf.placeholder(tf.int64, [None], name="f_loc1")
    self.f_loc2 = tf.placeholder(tf.int64, [None], name="f_loc2")

    use_one_hot_embeddings = use_tpu
    self.model = bert.BertModel(
        config=config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids,
        use_one_hot_embeddings=use_one_hot_embeddings)

    pred, loss = task.predict(self.model.get_sequence_output(), label_ids, True)
    self.logits = task.logits
    self.sout = tf.nn.softmax(self.logits)
    self.pred = pred
    self.loss = loss
    self.acc = task.acc
    #tf.summary.scalar('loss', self.loss)
    #tf.summary.scalar('acc', self.acc)

    enc = self.model.get_sequence_output()  # [Batch, Seq_len, hidden_dim]
    logits_raw = tf.layers.dense(enc, 3)  # [Batch, seq_len, 3]

    def select(logits, f_loc):
        # One-hot mask over the selected position: [Batch, seq_len, 1]
        mask = tf.reshape(tf.one_hot(f_loc, seq_length), [-1, seq_length, 1])
        t = tf.reduce_sum(logits * mask, axis=1)
        return t

    logits1 = select(logits_raw, self.f_loc1)  # [Batch, 3]
    logits2 = select(logits_raw, self.f_loc2)  # [Batch, 3]
    self.logits1 = logits1
    self.logits2 = logits2

    label1 = tf.one_hot(self.y1, task2_num_classes)  # [Batch, num_class]
    label2 = tf.one_hot(self.y2, task2_num_classes)

    losses1_arr = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits1, labels=label1)
    losses2_arr = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits2, labels=label2)

    self.loss_paired = tf.reduce_mean(losses1_arr)  #+ tf.reduce_mean(losses2_arr)