Example #1
def train_text_cnn(data, num_attr):
  lr = FLAGS.lr
  batch_size = FLAGS.batch_size

  train_sents, train_y, test_sents, test_y = data
  train_x, train_m, test_x, test_m = preprocess_raw_data(
    train_sents, test_sents)

  inputs = tf.placeholder(tf.int64, (None, None), name="inputs")
  masks = tf.placeholder(tf.int32, (None, None), name="masks")

  labels = tf.placeholder(tf.int64, (None,), name="labels")
  training = tf.placeholder(tf.bool, name='training')

  text_cnn = TextCNN(vocab_size=50001, emb_dim=100, num_filter=128,
                     init_word_emb=None)
  classifier = build_model(num_attr, FLAGS.hidden_size)

  model_fn = lambda x, m, t: classifier(text_cnn.forward(x, m, t), t)

  logits = model_fn(inputs, masks, training)
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=logits)
  loss = tf.reduce_mean(loss)
  opt_loss = loss
  accuracies, top5_accuracies, predictions = acc_metrics(logits, labels,
                                                         num_attr)
  eval_fetches = [loss, accuracies, top5_accuracies]

  t_vars = tf.trainable_variables()
  post_ops = [tf.assign(v, v * (1 - FLAGS.wd)) for v in t_vars if
              'kernel' in v.name]

  optimizer = tf.train.AdamOptimizer(lr)
  grads_and_vars = optimizer.compute_gradients(opt_loss, t_vars)
  train_ops = optimizer.apply_gradients(
    grads_and_vars, global_step=tf.train.get_or_create_global_step())

  with tf.control_dependencies([train_ops]):
    train_ops = tf.group(*post_ops)

  log('Train attack model...')
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())

    def train_fn(batch_idx):
      feed = {inputs: train_x[batch_idx], masks: train_m[batch_idx],
              labels: train_y[batch_idx], training: True}
      err, _ = sess.run([loss, train_ops], feed_dict=feed)
      return err

    def eval_fn(batch_idx):
      feed = {inputs: test_x[batch_idx], masks: test_m[batch_idx],
              labels: test_y[batch_idx], training: False}
      return sess.run(eval_fetches, feed_dict=feed)

    n_train, n_test = len(train_y), len(test_y)
    train_loops(FLAGS.epochs, n_train, n_test, train_fn, eval_fn, batch_size)
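
The acc_metrics helper used above is not shown in this snippet. A minimal sketch, assuming it returns per-batch counts of correct top-1 and top-5 predictions (which is consistent with how train_loops later divides the accumulated values by n_test rather than by the number of batches):

import tensorflow as tf

def acc_metrics(logits, labels, num_attr):
  # assumed helper: per-batch *counts* of correct top-1 / top-5 predictions
  predictions = tf.argmax(logits, axis=-1)
  correct = tf.cast(tf.equal(predictions, labels), tf.float32)
  in_top5 = tf.cast(tf.nn.in_top_k(logits, labels, k=min(5, num_attr)),
                    tf.float32)
  return tf.reduce_sum(correct), tf.reduce_sum(in_top5), predictions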
Example #2
def train_loops(epochs, n_train, n_test, train_fn, eval_fn, batch_size,
                n_unlabeled=0, interleave_batch=False):
  if n_unlabeled:
    include_last = not interleave_batch
    u_batch_size = FLAGS.u_batch_size if include_last else batch_size
    unlabeled_data_sampler = inf_batch_iterator(n_unlabeled, u_batch_size)
  else:
    include_last = True
    unlabeled_data_sampler = None

  for epoch in range(epochs):
    train_iterations = 0
    train_loss = 0
    train_u_loss = 0

    for batch_idx in iterate_minibatches_indices(n_train, batch_size, True,
                                                 include_last=include_last):
      if unlabeled_data_sampler is None:
        err = train_fn(batch_idx)
      else:
        batch_u_idx = next(unlabeled_data_sampler)
        err, err_u = train_fn(batch_idx, batch_u_idx)
        train_u_loss += err_u

      train_loss += err
      train_iterations += 1

    test_loss = 0
    test_acc = 0
    test_top5_acc = 0
    test_iterations = 0
    for batch_idx in iterate_minibatches_indices(n_test, 512, False):
      err, acc, top5_acc = eval_fn(batch_idx)
      test_acc += acc
      test_top5_acc += top5_acc
      test_loss += err
      test_iterations += 1

    if (epoch + 1) % 10 == 0:
      log("Epoch: {}, train loss: {:.4f}, train l2u loss {:.4f}, "
          "test loss={:.4f}, test acc={:.2f}%, test top5 acc={:.2f}%".format(
            epoch + 1, train_loss / train_iterations,
            train_u_loss / train_iterations,
            test_loss / test_iterations,
            test_acc / n_test * 100,
            test_top5_acc / n_test * 100))
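
Both loops rely on the index generators iterate_minibatches_indices and inf_batch_iterator, whose definitions are not included here. A minimal sketch consistent with how they are called (positional shuffle flag, optional include_last, endless iteration over the unlabeled set), offered as an assumption:

import numpy as np

def iterate_minibatches_indices(n, batch_size, shuffle=False,
                                include_last=True):
  # yields index arrays covering range(n) in chunks of batch_size
  indices = np.arange(n)
  if shuffle:
    np.random.shuffle(indices)
  for start in range(0, n, batch_size):
    batch = indices[start:start + batch_size]
    if len(batch) < batch_size and not include_last:
      return
    yield batch

def inf_batch_iterator(n, batch_size):
  # endlessly re-shuffles and yields index batches (used for unlabeled data)
  while True:
    for batch in iterate_minibatches_indices(n, batch_size, shuffle=True,
                                             include_last=False):
      yield batch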
Example #3
def main(_):
  split_word = FLAGS.model_name in {'textcnn', 'quickthought', 'transformer'}
  if FLAGS.data_name == 'bookcorpus':
    train_sents, train_authors, test_sents, test_authors,\
        unlabeled_sents, unlabeled_authors = bookcorpus_author_data(
          train_size=FLAGS.train_size, test_size=FLAGS.test_size,
          unlabeled_size=FLAGS.unlabeled_size, split_by_book=True,
          split_word=split_word, top_attr=FLAGS.top_attr, min_len=10)
  elif FLAGS.data_name == 'reddit':
    train_sents, train_authors, test_sents, test_authors,\
        unlabeled_sents, unlabeled_authors = reddit_author_data(
          train_size=FLAGS.train_size, test_size=FLAGS.test_size,
          unlabeled_size=FLAGS.unlabeled_size, split_word=split_word,
          top_attr=FLAGS.top_attr)
  else:
    raise ValueError(FLAGS.data_name)

  author_to_ids = get_attrs_to_ids(train_authors)

  train_y = np.asarray([author_to_ids[author] for author in train_authors],
                       dtype=np.int64)
  test_y = np.asarray([author_to_ids[author] for author in test_authors],
                      dtype=np.int64)
  num_attr = len(author_to_ids)

  log('{} training, {} testing'.format(len(train_y), len(test_y)))
  test_label_count = Counter(test_y)
  log('Majority baseline: {:.4f}% out of {} authors'.format(
      test_label_count.most_common(1)[0][1] / len(test_y) * 100,
      len(test_label_count)))

  data = train_sents, train_y, test_sents, test_y

  if FLAGS.model_name == 'textcnn':
    train_text_cnn(data, num_attr)
  elif FLAGS.model_name == 'charcnn':
    train_text_char_cnn(data, num_attr)
  else:
    train_embedding_classifier(data, unlabeled_sents, num_attr)
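
get_attrs_to_ids is only called here, not defined; a plausible sketch (the id ordering is an assumption) maps each distinct author to a contiguous integer id:

def get_attrs_to_ids(attrs):
  # assumed behavior: contiguous ids assigned in order of first appearance
  ids = {}
  for attr in attrs:
    if attr not in ids:
      ids[attr] = len(ids)
  return ids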
Example #4
def optimization_inversion():
    _, _, x, y = load_inversion_data()
    y = sents_to_labels(y)

    max_iters = FLAGS.max_iters
    batch_size = FLAGS.batch_size
    seq_len = FLAGS.seq_len

    embed_module = "https://tfhub.dev/google/universal-sentence-encoder-lite/2"
    embed = hub.Module(embed_module)

    sp = spm.SentencePieceProcessor()
    sp.Load(SPM_MODEL_PATH)

    input_placeholder = tf.sparse_placeholder(tf.int64,
                                              shape=[batch_size, None],
                                              name='sparse_placeholder')

    # dummy call to setup the graph
    embed(inputs=dict(values=input_placeholder.values,
                      indices=input_placeholder.indices,
                      dense_shape=input_placeholder.dense_shape))

    emb_lookup = LAYER_NAMES[0]
    start_vars = set(v.name for v in tf.global_variables())

    word_emb = tf.global_variables()[0]

    logit_inputs = tf.get_variable(name='logit_inputs',
                                   shape=(batch_size, seq_len, 8002),
                                   initializer=tf.random_normal_initializer(
                                       -0.1, 0.1))

    permute_inputs = tf.get_variable(name='permute_inputs',
                                     shape=(batch_size, seq_len, seq_len),
                                     initializer=tf.random_normal_initializer(
                                         -0.1, 0.1))
    permute_matrix = sinkhorn(permute_inputs / FLAGS.temp, 10)

    prob_inputs = tf.nn.softmax(logit_inputs / FLAGS.temp, axis=-1)
    preds = tf.argmax(prob_inputs, axis=-1)

    emb_inputs = tf.matmul(prob_inputs, word_emb, name='new_embedding_lookup')
    emb_inputs = tf.matmul(permute_matrix, emb_inputs)

    if FLAGS.low_layer_idx == 0:
        encoded = mean_pool(emb_inputs)
    else:
        replace_graph(emb_lookup, emb_inputs)
        encoded = get_fetch_by_layer(FLAGS.low_layer_idx)

    targets = tf.placeholder(tf.float32,
                             name='target',
                             shape=(batch_size, encoded.shape.as_list()[-1]))

    loss = get_similarity_metric(encoded, targets, FLAGS.metric, rtn_loss=True)
    loss = tf.reduce_sum(loss)

    optimizer = tf.train.AdamOptimizer(FLAGS.lr)
    grads_and_vars = optimizer.compute_gradients(
        loss, [logit_inputs, permute_inputs])
    train_ops = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())

    end_vars = tf.global_variables()
    new_vars = [v for v in end_vars if v.name not in start_vars]
    batch_init_ops = tf.variables_initializer(new_vars)

    total_it = len(x) // batch_size

    dummy_inputs = prepare_dummpy_sparse(batch_size, seq_len)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        def invert_one_batch(batch_targets):
            sess.run(batch_init_ops)
            feed_dict = {
                targets: batch_targets,
                'sparse_placeholder/values:0': dummy_inputs[0],
                'sparse_placeholder/indices:0': dummy_inputs[1],
                'sparse_placeholder/shape:0': dummy_inputs[2]
            }
            prev = 1e6
            for i in range(max_iters):
                curr, _ = sess.run([loss, train_ops], feed_dict)
                # stop if no progress
                if (i + 1) % (max_iters // 10) == 0 and curr > prev:
                    break
                prev = curr
            return sess.run([preds, loss], feed_dict)

        start_time = time.time()
        it = 0.0
        all_tp, all_fp, all_fn, all_err = 0.0, 0.0, 0.0, 0.0

        for batch_idx in iterate_minibatches_indices(len(x), batch_size, False,
                                                     False):
            y_pred, err = invert_one_batch(x[batch_idx])
            tp, fp, fn = tp_fp_fn_metrics_np(y_pred, y[batch_idx])

            it += 1.0
            all_err += err
            all_tp += tp
            all_fp += fp
            all_fn += fn

            all_pre = all_tp / (all_tp + all_fp + 1e-7)
            all_rec = all_tp / (all_tp + all_fn + 1e-7)
            all_f1 = 2 * all_pre * all_rec / (all_pre + all_rec + 1e-7)

            if it % FLAGS.print_every == 0:
                it_time = (time.time() - start_time) / it
                log("Iter {:.2f}%, err={}, pre={:.2f}%, rec={:.2f}%, f1={:.2f}%,"
                    " {:.2f} sec/it".format(it / total_it * 100, all_err / it,
                                            all_pre * 100, all_rec * 100,
                                            all_f1 * 100, it_time))

        all_pre = all_tp / (all_tp + all_fp + 1e-7)
        all_rec = all_tp / (all_tp + all_fn + 1e-7)
        all_f1 = 2 * all_pre * all_rec / (all_pre + all_rec + 1e-7)
        log("Final err={}, pre={:.2f}%, rec={:.2f}%, f1={:.2f}%".format(
            all_err / it, all_pre * 100, all_rec * 100, all_f1 * 100))
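
The sinkhorn call above turns the learned permute_inputs logits into an approximately doubly-stochastic soft permutation matrix. Its definition is not shown, so the following Gumbel-Sinkhorn-style sketch, which alternates row and column normalization in log space, is an assumption:

import tensorflow as tf

def sinkhorn(log_alpha, n_iters=10):
    # log_alpha: (batch, seq_len, seq_len) unnormalized log-scores
    for _ in range(n_iters):
        log_alpha -= tf.reduce_logsumexp(log_alpha, axis=2, keepdims=True)  # rows
        log_alpha -= tf.reduce_logsumexp(log_alpha, axis=1, keepdims=True)  # cols
    return tf.exp(log_alpha)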
Example #5
def load_inversion_data():
    module = hub.Module(
        "https://tfhub.dev/google/universal-sentence-encoder-lite/2")

    sp = spm.SentencePieceProcessor()
    sp.Load(SPM_MODEL_PATH)

    input_placeholder = tf.sparse_placeholder(tf.int64,
                                              shape=[None, None],
                                              name='sparse_placeholder')
    module(inputs=dict(values=input_placeholder.values,
                       indices=input_placeholder.indices,
                       dense_shape=input_placeholder.dense_shape))

    learn_mapping = FLAGS.high_layer_idx != FLAGS.low_layer_idx
    if learn_mapping:
        outputs = [
            get_fetch_by_layer(FLAGS.low_layer_idx),
            get_fetch_by_layer(FLAGS.high_layer_idx)
        ]
    else:
        outputs = get_fetch_by_layer(FLAGS.low_layer_idx)

    train_sents, _, test_sents, _, _, _ = load_bookcorpus_author(
        train_size=FLAGS.train_size,
        test_size=FLAGS.test_size,
        unlabeled_size=0,
        split_by_book=True,
        split_word=False,
        top_attr=800,
        remove_punct=False)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    def encode_sents(sents):
        y = [
            np.asarray(sp.EncodeAsIds(x)[:FLAGS.max_seq_length])
            for x in sents
        ]
        y = np.asarray(y)
        n_data = len(sents)
        pbar = tqdm.tqdm(total=n_data)
        embs_low, embs_high = [], []
        for b_idx in iterate_minibatches_indices(n_data, 512):
            values, indices, dense_shape = sents_to_sparse(y[b_idx])
            emb = sess.run(outputs,
                           feed_dict={
                               input_placeholder.values: values,
                               input_placeholder.indices: indices,
                               input_placeholder.dense_shape: dense_shape
                           })
            if learn_mapping:
                embs_low.append(emb[0])
                embs_high.append(emb[1])
            else:
                embs_low.append(emb)
            pbar.update(len(b_idx))

        pbar.close()
        if learn_mapping:
            return [np.vstack(embs_low), np.vstack(embs_high)], y
        else:
            return np.vstack(embs_low), y

    train_x, train_y = encode_sents(train_sents)
    test_x, test_y = encode_sents(test_sents)
    tf.keras.backend.clear_session()

    if learn_mapping:
        log('Training high to low mapping...')
        if FLAGS.mapper == 'linear':
            mapping = linear_mapping(train_x[1], train_x[0])
        elif FLAGS.mapper == 'mlp':
            mapping = mlp_mapping(train_x[1],
                                  train_x[0],
                                  epochs=10,
                                  activation=tf.tanh)
        elif FLAGS.mapper == 'gan':
            mapping = gan_mapping(train_x[1],
                                  train_x[0],
                                  disc_iters=5,
                                  batch_size=64,
                                  gamma=1.0,
                                  epoch=100,
                                  activation=tf.tanh)
        else:
            raise ValueError(FLAGS.mapper)
        test_x = mapping(test_x[1])

    return train_x, train_y, test_x, test_y
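
linear_mapping (and its mlp/gan variants) are defined elsewhere in the project. A least-squares sketch that matches how the result is used, namely as a callable applied to the high-layer test embeddings, could be:

import numpy as np

def linear_mapping(source, target):
    # fit W by least squares so that source @ W approximates target,
    # then return the learned map as a callable
    w, _, _, _ = np.linalg.lstsq(source, target, rcond=None)
    return lambda x: np.dot(x, w)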
Example #6
def train_embedding_classifier(data, unlabeled_data, num_attr):
  batch_size = FLAGS.batch_size
  interleave_batch = FLAGS.interleave

  train_sents, train_y, test_sents, test_y = data
  train_embs, test_embs, unlabeled_embs = encode_sentences(
    train_sents, test_sents, unlabeled_data)

  semi_supervised = len(unlabeled_embs) > 0
  n_train, n_test = len(train_y), len(test_y)

  encoder_dim = train_embs.shape[1]
  inputs = tf.placeholder(tf.float32, (None, encoder_dim), name='inputs')
  unlabeled_inputs = tf.placeholder(tf.float32, (None, encoder_dim),
                                    name="u_inputs")
  labels = tf.placeholder(tf.int64, (None,), name='labels')

  training = tf.placeholder(tf.bool, name='training')
  model_fn = build_model(num_attr, FLAGS.hidden_size)

  def augment_unlabeled(u):
    u = tf.nn.dropout(u, rate=0.25)
    u = add_gaussian_noise(u, gamma=0.1)
    u = batch_interpolation(u, alpha=0.9, random=True)
    return u

  if not semi_supervised:
    logits = model_fn(inputs, training)
    accuracies, top5_accuracies, _ = acc_metrics(logits, labels, num_attr)
    loss_xe = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                             logits=logits)
    loss_xe = tf.reduce_mean(loss_xe)
    loss_l2u = tf.constant(0.)
    loss = loss_xe
    eval_loss = loss_xe
  elif FLAGS.algo == 'mixmatch':
    augment = MixMode(FLAGS.mixmode)
    us = []
    logits_us = []
    for _ in range(FLAGS.k):
      u = augment_unlabeled(unlabeled_inputs)
      logits_u = model_fn(u, training)
      logits_us.append(logits_u)
      us.append(u)

    guess = guess_label(logits_us, temp=FLAGS.temp)
    lu = tf.stop_gradient(guess)
    lx = tf.one_hot(labels, num_attr)

    xu, labels_xu = augment([inputs] + us, [lx] + [lu] * FLAGS.k,
                            [FLAGS.beta, FLAGS.beta])
    labels_x, labels_us = labels_xu[0], tf.concat(labels_xu[1:], 0)

    if interleave_batch:
      xu = interleave(xu, batch_size)

    logits_x = model_fn(xu[0], training)
    logits_us = []
    for u in xu[1:]:
      logits_u = model_fn(u, training)
      logits_us.append(logits_u)

    logits_xu = [logits_x] + logits_us
    if interleave_batch:
      logits_xu = interleave(logits_xu, batch_size)

    logits_x = logits_xu[0]
    loss_xe = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_x,
                                                         logits=logits_x)
    loss_xe = tf.reduce_mean(loss_xe)

    logits_us = tf.concat(logits_xu[1:], 0)
    loss_l2u = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_us,
                                                          logits=logits_us)
    # loss_l2u = tf.square(labels_us - tf.nn.softmax(logits_us))
    loss_l2u = tf.reduce_mean(loss_l2u)
    global_step = tf.train.get_or_create_global_step()
    w_match = tf.clip_by_value(
      tf.cast(global_step, tf.float32) /
      (FLAGS.epochs * (int(n_train // batch_size) + 1)), 0, 1)

    loss = FLAGS.lambda_u * w_match * loss_l2u + loss_xe
    test_logits = model_fn(inputs, training)
    test_loss_xe = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=test_logits)
    accuracies, top5_accuracies, _ = acc_metrics(test_logits, labels, num_attr)
    eval_loss = tf.reduce_mean(test_loss_xe)
  elif FLAGS.algo == 'uda':
    model_fn = build_ae_model(num_attr, 256, encoder_dim)
    us = []
    logits_us = []
    for _ in range(FLAGS.k):
      u = augment_unlabeled(unlabeled_inputs)
      logits_u = model_fn(u, training)[0]
      logits_us.append(logits_u)
      us.append(u)

    labels_u = guess_label(logits_us, temp=FLAGS.temp)
    labels_us = tf.concat([labels_u] * FLAGS.k, 0)

    logits_x, recon_x = model_fn(inputs, training)

    logits_us = []
    recon_us = []
    for u in us:
      logits_u, recon_u = model_fn(u, training)
      logits_us.append(logits_u)
      recon_us.append(recon_u)

    recon_loss = tf.reduce_mean(
      tf.reduce_sum(tf.square(inputs - recon_x), axis=-1))
    recon_us = tf.concat(recon_us, 0)
    us = tf.concat(us, 0)
    recon_loss += tf.reduce_mean(
      tf.reduce_sum(tf.square(us - recon_us), axis=-1))

    loss_xe = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                             logits=logits_x)
    loss_xe = tf.reduce_mean(loss_xe)

    logits_us = tf.concat(logits_us, 0)
    loss_l2u = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels_us,
                                                          logits=logits_us)
    # loss_l2u = tf.square(labels_us - tf.nn.softmax(logits_us))
    loss_l2u = tf.reduce_mean(loss_l2u)
    global_step = tf.train.get_or_create_global_step()
    w_match = tf.clip_by_value(
      tf.cast(global_step, tf.float32) /
      (FLAGS.epochs * (int(n_train // batch_size) + 1)), 0, 1)

    loss = FLAGS.lambda_u * w_match * loss_l2u + loss_xe + recon_loss
    test_logits = model_fn(inputs, training)[0]
    test_loss_xe = tf.nn.sparse_softmax_cross_entropy_with_logits(
      labels=labels, logits=test_logits)
    accuracies, top5_accuracies, _ = acc_metrics(test_logits, labels, num_attr)
    eval_loss = tf.reduce_mean(test_loss_xe)
  else:
    raise ValueError(FLAGS.algo)

  eval_fetches = [eval_loss, accuracies, top5_accuracies]
  t_vars = tf.trainable_variables()
  post_ops = [tf.assign(v, v * (1 - FLAGS.wd)) for v in t_vars if
              'kernel' in v.name]

  optimizer = tf.train.AdamOptimizer(FLAGS.lr)
  grads_and_vars = optimizer.compute_gradients(loss, t_vars)
  train_ops = optimizer.apply_gradients(
      grads_and_vars, global_step=tf.train.get_or_create_global_step())
  with tf.control_dependencies([train_ops]):
    train_ops = tf.group(*post_ops)

  log('Train attack model...')
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())

    def train_fn(*batch_idx):
      if len(batch_idx) == 1:
        batch_idx = batch_idx[0]
        feed = {inputs: train_embs[batch_idx], labels: train_y[batch_idx],
                training: True}
      else:
        feed = {inputs: train_embs[batch_idx[0]], labels: train_y[batch_idx[0]],
                unlabeled_inputs: unlabeled_embs[batch_idx[1]], training: True}
      err_xe, err_l2u, _ = sess.run([loss_xe, loss_l2u, train_ops],
                                    feed_dict=feed)
      if semi_supervised:
        return err_xe, err_l2u
      return err_xe

    def eval_fn(batch_idx):
      feed = {inputs: test_embs[batch_idx],
              labels: test_y[batch_idx], training: False}
      return sess.run(eval_fetches, feed_dict=feed)

    train_loops(FLAGS.epochs, n_train, n_test, train_fn, eval_fn, batch_size,
                len(unlabeled_embs), interleave_batch)
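
guess_label above follows the MixMatch recipe. Since its definition is omitted, here is a hedged sketch that averages the classifier's predictions over the K augmented views of the unlabeled batch and sharpens them with the temperature temp:

import tensorflow as tf

def guess_label(logits_list, temp=0.5):
  # average softmax over K views of the same unlabeled batch, then sharpen
  p = tf.add_n([tf.nn.softmax(l, axis=-1) for l in logits_list])
  p = p / float(len(logits_list))
  p = tf.pow(p, 1.0 / temp)
  return p / tf.reduce_sum(p, axis=-1, keepdims=True)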
Example #7
def encode_sentences(train_sents, test_sents, unlabeled_sents):
  query_size = 2048
  vocab = bookcorpus_vocab(0, rebuild=False)
  local_models = {'quickthought', 'transformer'}

  log('Encoding sentences...')
  if FLAGS.model_name in local_models:
    ckpt_name = get_model_ckpt_name(FLAGS.model_name,
                                    epoch=FLAGS.encoder_epoch, batch_size=800,
                                    gamma=FLAGS.gamma, attr='author')

    model_path = os.path.join(FLAGS.model_dir, ckpt_name)
    config = get_model_config(FLAGS.model_name)
    vocab, init_word_emb = expand_vocabulary(model_path, vocab)
    vocab_size = len(vocab) + 1

    model = QuickThoughtModel(vocab_size, config['emb_dim'],
                              config['encoder_dim'], 1, init_word_emb=None,
                              cell_type=config['cell_type'], train=False)

    inputs = tf.placeholder(tf.int64, (None, None), name='inputs')
    masks = tf.placeholder(tf.int32, (None, None), name='masks')
    encode_emb = tf.nn.embedding_lookup(model.word_in_emb, inputs)
    encoded = model.encode(encode_emb, masks, model.in_cells, model.proj_in)
    if FLAGS.norm:
      encoded = tf.nn.l2_normalize(encoded, axis=-1)
    # model_vars = tf.trainable_variables()
    model_vars = {v.name[:-2]: v
                  for v in tf.trainable_variables()
                  if not v.name.startswith('emb')}

    saver = tf.train.Saver(model_vars)
    sess = tf.Session()
    emb_plhdr = tf.placeholder(tf.float32,
                               shape=(vocab_size, config['emb_dim']))
    sess.run(model.word_in_emb.assign(emb_plhdr),
             {emb_plhdr: init_word_emb})

    print('Loading weight from {}'.format(model_path))
    saver.restore(sess, os.path.join(model_path, 'model.ckpt'))
    encoder_fn = lambda s: sess.run(encoded, feed_dict={inputs: s[0],
                                                        masks: s[1]})
  elif FLAGS.model_name == 'skipthought':
    from models.skip_thoughts import encoder_manager
    from models.skip_thoughts import configuration
    model_dir = os.path.join(NFS_DIR, 'models/skip/')
    vocab_file = os.path.join(model_dir, 'vocab.txt')
    embedding_file = os.path.join('./skip_thoughts/', 'embeddings.npy')
    ckpt_path = os.path.join(model_dir, 'model.ckpt-500008')
    encoder = encoder_manager.EncoderManager()
    encoder.load_model(configuration.model_config(bidirectional_encoder=True,
                                                  shuffle_input_data=False),
                       vocabulary_file=vocab_file,
                       embedding_matrix_file=embedding_file,
                       checkpoint_path=ckpt_path)
    encoder_fn = lambda s: encoder.encode(s, batch_size=query_size,
                                          use_norm=False)
    sess = encoder.sessions[0]
  elif FLAGS.model_name == 'use':
    embed_module = 'https://tfhub.dev/google/' \
                   'universal-sentence-encoder-large/3'
    embed = hub.Module(embed_module, trainable=False)
    inputs = tf.placeholder(tf.string, shape=(None,))
    encoded = embed(inputs)
    sess = tf.Session()
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    encoder_fn = lambda s: sess.run(encoded, feed_dict={inputs: s})
  elif FLAGS.model_name == 'elmo':
    query_size = 512
    embed_module = 'https://tfhub.dev/google/elmo/2'
    embed = hub.Module(embed_module, trainable=False)
    inputs = tf.placeholder(tf.string, shape=(None,))
    encoded = embed(inputs, signature='default', as_dict=True)['default']
    sess = tf.Session()
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
    encoder_fn = lambda s: sess.run(encoded, feed_dict={inputs: s})
  elif FLAGS.model_name == 'infersent':
    from models.infersent.models import InferSent
    import torch
    model_version = 2
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0,
                    'version': model_version}
    encoder = InferSent(params_model)
    encoder.load_state_dict(torch.load("./infersent/infersent%s.pkl" %
                                       model_version))
    encoder.cuda()
    encoder.set_w2v_path('./infersent/crawl-300d-2M.vec')
    encoder.build_vocab_k_words(K=100000)
    encoder_fn = lambda s: encoder.encode(s, tokenize=False)
  else:
    raise ValueError(FLAGS.model_name)

  def encode_sents(s, n):
    embs = []
    pbar = tqdm.tqdm(total=n)
    tuple_inputs = isinstance(s, tuple)
    for batch_idx in iterate_minibatches_indices(n, query_size, False):
      if tuple_inputs:
        batch_embs = encoder_fn((s[0][batch_idx], s[1][batch_idx]))
      else:
        batch_embs = encoder_fn(s[batch_idx])
      embs.append(batch_embs)
      pbar.update(len(batch_embs))
    pbar.close()
    return np.vstack(embs)

  n_train, n_test, n_unlabeled = len(train_sents), len(test_sents),\
                                 len(unlabeled_sents)

  unlabeled_embs = []
  if FLAGS.model_name in local_models:
    rtn = preprocess_raw_data(train_sents, test_sents, unlabeled_sents,
                              vocab=vocab)
    train_embs = encode_sents((rtn[0], rtn[1]), n_train)
    test_embs = encode_sents((rtn[2], rtn[3]), n_test)
    if n_unlabeled:
      unlabeled_embs = encode_sents((rtn[4], rtn[5]), n_unlabeled)
  else:
    train_embs = encode_sents(train_sents, n_train)
    test_embs = encode_sents(test_sents, n_test)
    if n_unlabeled:
      unlabeled_embs = encode_sents(unlabeled_sents, n_unlabeled)

  tf.keras.backend.clear_session()
  log('Encoded train {}, test {}'.format(train_embs.shape, test_embs.shape))
  return train_embs, test_embs, unlabeled_embs
Example #8
def trained_metric_attack():
    freq_min = FLAGS.freq_min

    # load data part
    train_filenames, test_filenames = split_bookcorpus(0)
    member_embeds, nonmember_embeds = load_book_embedding(
        train_filenames, test_filenames, freq_min)

    membership_labels = np.concatenate(
        [np.ones(len(member_embeds)),
         np.zeros(len(nonmember_embeds))])

    all_embeds = member_embeds + nonmember_embeds
    train_indices, test_indices, _ = membership_split(
        (all_embeds, membership_labels))

    def indices_to_data(indices):
        embeds, labels = [], []
        for idx in indices:
            embeds.append(all_embeds[idx])
            labels.append(membership_labels[idx])

        return embeds, labels

    train_embeds, train_labels = indices_to_data(train_indices)
    test_embeds, test_labels = indices_to_data(test_indices)

    train_y = []
    for emb, label in zip(train_embeds, train_labels):
        train_y.append(np.ones(len(emb)) * label)

    train_y = np.concatenate(train_y).astype(np.float32)
    train_x = np.vstack(train_embeds)

    # define attack model
    config = get_model_config(FLAGS.model_name)
    encoder_dim = config["encoder_dim"]

    optimizer = tf.train.AdamOptimizer(1e-4)

    inputs_a = tf.placeholder(tf.float32, (None, encoder_dim), name="inputs_a")
    inputs_b = tf.placeholder(tf.float32, (None, encoder_dim), name="inputs_b")
    labels = tf.placeholder(tf.float32, (None, ), name="labels")
    training = tf.placeholder(tf.bool, name="training")

    if FLAGS.model == 'deepset':
        model = DeepSetModel(encoder_dim // 2)
    elif FLAGS.model == 'bilinear':
        model = BilinearMetricModel(encoder_dim)
    elif FLAGS.model == 'linear':
        model = LinearMetricModel(encoder_dim // 2)
    else:
        raise ValueError(FLAGS.model)

    logits = model.forward(inputs_a, inputs_b, training=training)
    learned_sim = logits

    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                   logits=logits)
    loss = tf.reduce_mean(loss)

    if FLAGS.metric == 'dot':
        sim = tf.reduce_sum(tf.multiply(inputs_a, inputs_b), axis=1)
    elif FLAGS.metric == 'cosine':
        sim = tf.reduce_sum(tf.multiply(tf.nn.l2_normalize(inputs_a, axis=-1),
                                        tf.nn.l2_normalize(inputs_b, axis=-1)),
                            axis=1)
    elif FLAGS.metric == 'l2':
        sim = -tf.reduce_sum(tf.square(inputs_a - inputs_b), axis=1)
    else:
        raise ValueError(FLAGS.metric)

    t_vars = tf.trainable_variables()
    post_ops = [
        tf.assign(v, v * (1 - FLAGS.wd)) for v in t_vars if 'kernel' in v.name
    ]

    grads_and_vars = optimizer.compute_gradients(loss, t_vars)
    train_ops = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())

    with tf.control_dependencies([train_ops]):
        train_ops = tf.group(*post_ops)

    inputs = [inputs_a, inputs_b]
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        def split_metrics(ms, ls):
            member_ms, nonmember_ms = [], []
            for m, l in zip(ms, ls):
                if l == 1:
                    member_ms.append(m)
                else:
                    nonmember_ms.append(m)

            return member_ms, nonmember_ms

        def weighted_average(x):
            return np.mean(x)

        def calculate_adversarial_advantage(fetch):
            test_metrics = collect_scores(inputs, test_embeds, sess, fetch,
                                          training)
            test_member_ms, test_nonmember_ms = split_metrics(
                test_metrics, test_labels)

            compute_adversarial_advantage(np.concatenate(test_member_ms),
                                          np.concatenate(test_nonmember_ms))

            if FLAGS.book_level:
                compute_adversarial_advantage(
                    [weighted_average(m) for m in test_member_ms],
                    [weighted_average(m) for m in test_nonmember_ms])

        calculate_adversarial_advantage(sim)
        print('Training attack model with {} embs...'.format(len(train_y)))
        for epoch in range(10):
            iterations = 0
            train_loss = 0

            for batch_idx in iterate_minibatches_indices(
                    len(train_y), batch_size=FLAGS.batch_size, shuffle=True):
                feed = {
                    inputs_a: train_x[batch_idx][:, :encoder_dim],
                    inputs_b: train_x[batch_idx][:, encoder_dim:],
                    labels: train_y[batch_idx],
                    training: True
                }
                err, _ = sess.run([loss, train_ops], feed_dict=feed)
                train_loss += err
                iterations += 1

            log("\nEpoch: {}, Loss: {:.4f}".format(epoch,
                                                   train_loss / iterations))
            calculate_adversarial_advantage(learned_sim)
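
compute_adversarial_advantage and collect_scores are defined elsewhere in the project. As a rough, assumed sketch, the advantage can be reported as the best threshold gap between member and non-member similarity scores:

import numpy as np

def compute_adversarial_advantage(member_scores, nonmember_scores):
    # assumed metric: max over thresholds of (TPR - FPR) when predicting
    # "member" for every score >= threshold
    member_scores = np.asarray(member_scores)
    nonmember_scores = np.asarray(nonmember_scores)
    thresholds = np.unique(np.concatenate([member_scores, nonmember_scores]))
    advantage = max(
        np.mean(member_scores >= t) - np.mean(nonmember_scores >= t)
        for t in thresholds)
    print('Adversarial advantage: {:.4f}'.format(advantage))
    return advantage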
Example #9
def optimization_inversion():
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file)
  cls_id = tokenizer.vocab['[CLS]']
  sep_id = tokenizer.vocab['[SEP]']
  mask_id = tokenizer.vocab['[MASK]']

  _, _, x, y = load_inversion_data()
  y = y[0]
  filters = [cls_id, sep_id, mask_id]
  y = filter_labels(y, filters)

  batch_size = FLAGS.batch_size
  seq_len = FLAGS.seq_len
  max_iters = FLAGS.max_iters

  albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file)
  input_ids = tf.ones((batch_size, seq_len + 2), tf.int32)
  input_mask = tf.ones_like(input_ids, tf.int32)
  input_type_ids = tf.zeros_like(input_ids, tf.int32)

  model = modeling.AlbertModel(
      config=albert_config,
      is_training=False,
      input_ids=input_ids,
      input_mask=input_mask,
      token_type_ids=input_type_ids,
      use_one_hot_embeddings=False)

  word_emb = model.output_embedding_table

  albert_vars = tf.trainable_variables()
  (assignment_map,
   _) = modeling.get_assignment_map_from_checkpoint(albert_vars,
                                                    FLAGS.init_checkpoint)
  tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map)

  batch_cls_ids = tf.ones((batch_size, 1), tf.int32) * cls_id
  batch_sep_ids = tf.ones((batch_size, 1), tf.int32) * sep_id
  cls_emb = tf.nn.embedding_lookup(word_emb, batch_cls_ids)
  sep_emb = tf.nn.embedding_lookup(word_emb, batch_sep_ids)

  prob_mask = np.zeros((albert_config.vocab_size,), np.float32)
  prob_mask[filters] = -1e9
  prob_mask = tf.constant(prob_mask, dtype=np.float32)

  logit_inputs = tf.get_variable(
      name='inputs',
      shape=(batch_size, seq_len, albert_config.vocab_size),
      initializer=tf.random_uniform_initializer(-0.1, 0.1))

  t_vars = [logit_inputs]
  t_var_names = {logit_inputs.name}

  logit_inputs += prob_mask
  prob_inputs = tf.nn.softmax(logit_inputs / FLAGS.temp, axis=-1)

  emb_inputs = tf.matmul(prob_inputs, word_emb)
  emb_inputs = tf.concat([cls_emb, emb_inputs, sep_emb], axis=1)

  if FLAGS.low_layer_idx == 0:
    encoded = mean_pool(emb_inputs, input_mask)
  else:
    encoded = encode(emb_inputs, input_mask, input_type_ids, albert_config)
  targets = tf.placeholder(
        tf.float32, shape=(batch_size, encoded.shape.as_list()[-1]))

  loss = get_similarity_metric(encoded, targets, FLAGS.metric, rtn_loss=True)
  loss = tf.reduce_sum(loss)

  optimizer = tf.train.AdamOptimizer(FLAGS.lr)

  start_vars = set(v.name for v in tf.global_variables()
                   if v.name not in t_var_names)
  grads_and_vars = optimizer.compute_gradients(loss, t_vars)
  train_ops = optimizer.apply_gradients(
      grads_and_vars, global_step=tf.train.get_or_create_global_step())

  end_vars = tf.global_variables()
  new_vars = [v for v in end_vars if v.name not in start_vars]

  preds = tf.argmax(prob_inputs, axis=-1)
  batch_init_ops = tf.variables_initializer(new_vars)

  total_it = len(x) // batch_size
  with tf.Session() as sess:
    sess.run([tf.global_variables_initializer(), tf.tables_initializer()])

    def invert_one_batch(batch_targets):
      sess.run(batch_init_ops)
      feed_dict = {targets: batch_targets}
      prev = 1e6
      for i in range(max_iters):
        curr, _ = sess.run([loss, train_ops], feed_dict)
        # stop if no progress
        if (i + 1) % (max_iters // 10) == 0 and curr > prev:
          break
        prev = curr
      return sess.run([preds, loss], feed_dict)

    start_time = time.time()
    it = 0.0
    all_tp, all_fp, all_fn, all_err = 0.0, 0.0, 0.0, 0.0

    for batch_idx in iterate_minibatches_indices(len(x), batch_size,
                                                 False, False):
      y_pred, err = invert_one_batch(x[batch_idx])
      tp, fp, fn = tp_fp_fn_metrics_np(y_pred, y[batch_idx])
      # for yp, yt in zip(y_pred, y[batch_idx]):
      #   print(' '.join(set(tokenizer.convert_ids_to_tokens(yp))))
      #   print(' '.join(set(tokenizer.convert_ids_to_tokens(yt))))

      it += 1.0
      all_err += err
      all_tp += tp
      all_fp += fp
      all_fn += fn

      all_pre = all_tp / (all_tp + all_fp + 1e-7)
      all_rec = all_tp / (all_tp + all_fn + 1e-7)
      all_f1 = 2 * all_pre * all_rec / (all_pre + all_rec + 1e-7)

      if it % FLAGS.print_every == 0:
        it_time = (time.time() - start_time) / it
        log("Iter {:.2f}%, err={}, pre={:.2f}%, rec={:.2f}%, f1={:.2f}%,"
            " {:.2f} sec/it".format(it / total_it * 100, all_err / it,
                                    all_pre * 100, all_rec * 100,
                                    all_f1 * 100, it_time))

    all_pre = all_tp / (all_tp + all_fp + 1e-7)
    all_rec = all_tp / (all_tp + all_fn + 1e-7)
    all_f1 = 2 * all_pre * all_rec / (all_pre + all_rec + 1e-7)
    log("Final err={}, pre={:.2f}%, rec={:.2f}%, f1={:.2f}%".format(
      all_err / it, all_pre * 100, all_rec * 100, all_f1 * 100))
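
mean_pool is used both with and without a mask in these examples; its definition is not shown, so the following masked-average sketch is an assumption:

import tensorflow as tf

def mean_pool(token_embs, mask=None):
  # average token embeddings over the sequence axis, ignoring padded
  # positions when a 0/1 mask is provided
  if mask is None:
    return tf.reduce_mean(token_embs, axis=1)
  mask = tf.cast(tf.expand_dims(mask, axis=-1), token_embs.dtype)
  summed = tf.reduce_sum(token_embs * mask, axis=1)
  counts = tf.maximum(tf.reduce_sum(mask, axis=1), 1.0)
  return summed / counts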
Example #10
def load_inversion_data():
    vocab = build_vocabulary(rebuild=False)

    if FLAGS.data_name == 'bookcorpus':
        train_sents, _, test_sents, _, _, _ = load_bookcorpus_author(
            train_size=FLAGS.train_size,
            test_size=FLAGS.test_size,
            unlabeled_size=0,
            split_by_book=True,
            split_word=True,
            top_attr=800)
    elif FLAGS.data_name == 'reddit':
        train_sents, _, test_sents, _, _, _ = reddit_author_data(
            train_size=FLAGS.train_size,
            test_size=1,
            unlabeled_size=0,
            split_word=True,
            top_attr=0)
    else:
        raise ValueError(FLAGS.data_name)

    if FLAGS.cross_domain:
        train_sents = load_cross_domain_data(800000, split_word=True)
        log('Loaded {} cross domain sentences'.format(len(train_sents)))

    ckpt_name = get_model_ckpt_name(FLAGS.model_name,
                                    epoch=FLAGS.epoch,
                                    batch_size=FLAGS.batch_size,
                                    gamma=FLAGS.gamma,
                                    num_layer=3,
                                    attr=FLAGS.attr)
    model_path = os.path.join(FLAGS.model_dir, ckpt_name, 'model.ckpt')
    config = get_model_config(FLAGS.model_name)

    train_data, test_data = encode_sentences(
        vocab,
        model_path,
        config,
        train_sents,
        test_sents,
        low_layer_idx=FLAGS.low_layer_idx,
        high_layer_idx=FLAGS.high_layer_idx)
    # clear session data for later optimization or learning
    tf.keras.backend.clear_session()

    train_x, train_y, train_m = train_data
    test_x, test_y, test_m = test_data

    if FLAGS.low_layer_idx != FLAGS.high_layer_idx:
        log('Training high to low mapping...')
        if FLAGS.mapper == 'linear':
            mapping = linear_mapping(train_x[1], train_x[0])
        elif FLAGS.mapper == 'mlp':
            mapping = mlp_mapping(train_x[1],
                                  train_x[0],
                                  epochs=50,
                                  activation=tf.nn.relu)
        elif FLAGS.mapper == 'gan':
            mapping = gan_mapping(train_x[1],
                                  train_x[0],
                                  disc_iters=5,
                                  batch_size=64,
                                  gamma=1.0,
                                  epoch=100,
                                  activation=tf.tanh)
        else:
            raise ValueError(FLAGS.mapper)
        test_x = mapping(test_x[1])
        train_x = train_x[0]

    log('Loaded {} embeddings for inversion with shape {}'.format(
        test_x.shape[0], test_x.shape[1]))

    data = (train_x, test_x, train_y, test_y, train_m, test_m)
    return data
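
The actual mlp_mapping (and gan_mapping) used above are more involved; this Keras-based regression sketch only illustrates the high-to-low mapping step and is an assumption about the helper's interface:

import tensorflow as tf

def mlp_mapping(source, target, epochs=50, activation=tf.nn.relu,
                batch_size=128):
    # small regression network mapping high-layer embeddings to low-layer
    # ones; returns a callable, matching how `mapping` is used above
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(target.shape[1], activation=activation,
                              input_shape=(source.shape[1],)),
        tf.keras.layers.Dense(target.shape[1])
    ])
    model.compile(optimizer='adam', loss='mse')
    model.fit(source, target, epochs=epochs, batch_size=batch_size, verbose=0)
    return lambda x: model.predict(x, batch_size=512)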
Example #11
def optimization_invert(data,
                        lr=1e-3,
                        attack_batch_size=8,
                        seq_len=5,
                        max_iters=1000):
    # use softmax to select words
    _, x, _, y = data[:4]
    y = sents_to_labels(y)

    config = get_model_config(FLAGS.model_name)
    num_words = config['vocab_size']
    model = QuickThoughtModel(num_words,
                              config['emb_dim'],
                              config['encoder_dim'],
                              1,
                              init_word_emb=None,
                              cell_type=config['cell_type'],
                              num_layer=config['num_layer'],
                              train=False)
    word_emb = model.word_in_emb
    targets = tf.placeholder(tf.float32, shape=(attack_batch_size, x.shape[1]))

    log('Inverting {} words from {} embeddings'.format(num_words, len(x)))

    if FLAGS.permute:
        # modeling the top k words then permute the order
        logit_inputs = tf.get_variable(
            name='inputs',
            shape=(attack_batch_size, seq_len, num_words - 1),
            initializer=tf.random_uniform_initializer(-0.1, 0.1))
        t_vars = [logit_inputs]

        prob_inputs = continuous_topk_v2(logit_inputs, seq_len, FLAGS.temp)
        pad_inputs = tf.zeros((attack_batch_size, seq_len, 1))
        prob_inputs = tf.concat([pad_inputs, prob_inputs], axis=2)
        emb_inputs = tf.matmul(prob_inputs, word_emb)

        permute_inputs = tf.get_variable(
            name='permute_inputs',
            shape=(attack_batch_size, seq_len, seq_len),
            initializer=tf.truncated_normal_initializer(0, 0.1))
        t_vars.append(permute_inputs)

        permute_matrix = sinkhorn(permute_inputs / FLAGS.temp, 20)
        emb_inputs = tf.matmul(permute_matrix, emb_inputs)
    else:
        logit_inputs = tf.get_variable(
            name='inputs',
            shape=(attack_batch_size, seq_len, num_words - 1),
            initializer=tf.random_uniform_initializer(-0.1, 0.1))
        t_vars = [logit_inputs]

        pad_inputs = tf.ones(
            (attack_batch_size, seq_len, 1), tf.float32) * (-1e9)
        logit_inputs = tf.concat([pad_inputs, logit_inputs], axis=2)
        prob_inputs = tf.nn.softmax(logit_inputs / FLAGS.temp, axis=-1)
        emb_inputs = tf.matmul(prob_inputs, word_emb)

    preds = tf.argmax(prob_inputs, axis=-1)
    t_var_names = set([v.name for v in t_vars])

    masks = tf.ones(shape=(attack_batch_size, seq_len), dtype=tf.int32)
    all_layers = model.encode(emb_inputs,
                              masks,
                              model.in_cells,
                              model.proj_in,
                              return_all_layers=True)
    encoded = all_layers[FLAGS.low_layer_idx]

    loss = get_similarity_metric(encoded, targets, FLAGS.metric, rtn_loss=True)
    loss = tf.reduce_sum(loss)

    if FLAGS.alpha > 0.:
        # encourage the words to be different
        diff = tf.expand_dims(prob_inputs, 2) - tf.expand_dims(prob_inputs, 1)
        reg = tf.reduce_mean(-tf.exp(tf.reduce_sum(diff**2, axis=-1)), [1, 2])
        loss += FLAGS.alpha * tf.reduce_sum(reg)

    optimizer = tf.train.AdamOptimizer(lr)
    model_vars = [
        v for v in tf.global_variables() if v.name not in t_var_names
    ]
    saver = tf.train.Saver(model_vars)
    start_vars = set(v.name for v in model_vars)

    grads_and_vars = optimizer.compute_gradients(loss, t_vars)
    train_ops = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())
    end_vars = tf.global_variables()
    new_vars = [v for v in end_vars if v.name not in start_vars]

    batch_init_ops = tf.variables_initializer(new_vars)

    total_it = len(x) // attack_batch_size
    with tf.Session() as sess:
        ckpt_name = get_model_ckpt_name(FLAGS.model_name,
                                        epoch=FLAGS.epoch,
                                        batch_size=FLAGS.batch_size,
                                        num_layer=3,
                                        gamma=FLAGS.gamma,
                                        attr=FLAGS.attr)
        ckpt_path = os.path.join(FLAGS.model_dir, ckpt_name, 'model.ckpt')
        log('Restoring model from {}'.format(ckpt_path))
        saver.restore(sess, ckpt_path)

        def invert_one_batch(batch_targets):
            sess.run(batch_init_ops)
            feed_dict = {targets: batch_targets}
            prev = 1e6
            for i in range(max_iters):
                curr, _ = sess.run([loss, train_ops], feed_dict)
                # stop if no progress
                if (i + 1) % (max_iters // 10) == 0 and curr > prev:
                    break
                prev = curr
            return sess.run([preds, loss], feed_dict)

        it = 0.0
        all_tp, all_fp, all_fn, all_err = 0.0, 0.0, 0.0, 0.0

        start_time = time.time()

        # vocab = build_vocabulary(exp_id=0, rebuild=False)
        # inv_vocab = dict((v, k) for k, v in vocab.items())

        for batch_idx in iterate_minibatches_indices(len(x), attack_batch_size,
                                                     False, False):
            y_pred, err = invert_one_batch(x[batch_idx])
            tp, fp, fn = tp_fp_fn_metrics_np(y_pred, y[batch_idx])
            # for yy, pp in zip(y[batch_idx], y_pred):
            #   matched = np.intersect1d(np.unique(yy), np.unique(pp))
            #   if len(matched) >= 0.75 * len(yy):
            #     print(' '.join([inv_vocab[w] for w in yy]))
            #     print(' '.join([inv_vocab[w] for w in np.unique(pp)]))

            it += 1.0
            all_err += err
            all_tp += tp
            all_fp += fp
            all_fn += fn

            all_pre = all_tp / (all_tp + all_fp + 1e-7)
            all_rec = all_tp / (all_tp + all_fn + 1e-7)
            all_f1 = 2 * all_pre * all_rec / (all_pre + all_rec + 1e-7)

            if it % FLAGS.print_every == 0:
                it_time = (time.time() - start_time) / it
                log('Iter {:.2f}%, err={}, pre={:.2f}%, rec={:.2f}%, f1={:.2f}%,'
                    ' {:.2f} sec/it'.format(it / total_it * 100, all_err / it,
                                            all_pre * 100, all_rec * 100,
                                            all_f1 * 100, it_time))

        all_pre = all_tp / (all_tp + all_fp + 1e-7)
        all_rec = all_tp / (all_tp + all_fn + 1e-7)
        all_f1 = 2 * all_pre * all_rec / (all_pre + all_rec + 1e-7)
        log('Final err={}, pre={:.2f}%, rec={:.2f}%, f1={:.2f}%'.format(
            all_err / it, all_pre * 100, all_rec * 100, all_f1 * 100))
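
tp_fp_fn_metrics_np aggregates word-recovery counts across a batch; since it is not defined here, the following set-based counting is an assumed approximation of its behavior:

import numpy as np

def tp_fp_fn_metrics_np(y_preds, y_trues):
    # a predicted word id counts as a true positive if it occurs in the
    # reference sentence; missed reference ids count as false negatives
    tp = fp = fn = 0.0
    for pred, true in zip(y_preds, y_trues):
        pred, true = set(np.ravel(pred)), set(np.ravel(true))
        tp += len(pred & true)
        fp += len(pred - true)
        fn += len(true - pred)
    return tp, fp, fn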
Example #12
def learning_invert(data, batch_size):
    train_x, test_x, train_y, test_y, train_m, test_m = data

    config = get_model_config(FLAGS.model_name)
    num_words = config['vocab_size']

    if FLAGS.model != 'rnn':
        train_y, test_y = sents_to_labels(train_y), sents_to_labels(test_y)

    label_freq = count_label_freq(train_y, num_words)
    log('Imbalance ratio: {}'.format(np.max(label_freq) / np.min(label_freq)))

    label_margin = tf.constant(np.reciprocal(label_freq**0.25),
                               dtype=tf.float32)
    C = FLAGS.C

    log('Build attack model for {} words...'.format(num_words))

    encoder_dim = train_x.shape[1]
    inputs = tf.placeholder(tf.float32, (None, encoder_dim), name="inputs")
    labels = tf.placeholder(tf.float32, (None, num_words), name="labels")
    masks = None
    training = tf.placeholder(tf.bool, name='training')

    if FLAGS.model == 'multiset':
        if num_words == 50001:
            init_word_emb = load_initialized_word_emb()
            emb_dim = init_word_emb.shape[1]
        else:
            init_word_emb = None
            emb_dim = 512

        model = MultiSetInversionModel(emb_dim,
                                       num_words,
                                       FLAGS.seq_len,
                                       init_word_emb,
                                       C=C,
                                       label_margin=label_margin)
        preds, loss = model.forward(inputs, labels, training)
        true_pos, false_pos, false_neg = tp_fp_fn_metrics(labels, preds)
        eval_fetch = [loss, true_pos, false_pos, false_neg]
    elif FLAGS.model == 'rnn':
        labels = tf.placeholder(tf.int64, (None, None), name="labels")
        masks = tf.placeholder(tf.int32, (None, None), name="masks")

        init_word_emb = load_initialized_word_emb(glove_only=True)
        model = RecurrentInversionModel(init_word_emb.shape[1],
                                        num_words,
                                        FLAGS.seq_len,
                                        init_word_emb,
                                        beam_size=5,
                                        C=C,
                                        label_margin=label_margin)
        preds, loss = model.forward(inputs, labels, masks, training)
        eval_fetch = [loss, preds]
    elif FLAGS.model == 'multilabel':
        model = MultiLabelInversionModel(num_words,
                                         C=C,
                                         label_margin=label_margin)
        preds, loss = model.forward(inputs, labels, training)
        true_pos, false_pos, false_neg = tp_fp_fn_metrics(labels, preds)
        eval_fetch = [loss, true_pos, false_pos, false_neg]
    else:
        raise ValueError(FLAGS.model)

    t_vars = tf.trainable_variables()
    wd = FLAGS.wd
    post_ops = [
        tf.assign(v, v * (1 - wd)) for v in t_vars if 'kernel' in v.name
    ]

    optimizer = tf.train.AdamOptimizer(FLAGS.lr)
    grads_and_vars = optimizer.compute_gradients(
        loss + tf.losses.get_regularization_loss(), t_vars)
    train_ops = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())

    with tf.control_dependencies([train_ops]):
        train_ops = tf.group(*post_ops)

    log('Train attack model with {} data...'.format(len(train_x)))
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(30):
            train_iterations = 0
            train_loss = 0

            for batch_idx in iterate_minibatches_indices(
                    len(train_y), batch_size, True):
                if masks is None:
                    one_hot_labels = np.zeros((len(batch_idx), num_words),
                                              dtype=np.float32)
                    for i, idx in enumerate(batch_idx):
                        one_hot_labels[i][train_y[idx]] = 1
                    feed = {
                        inputs: train_x[batch_idx],
                        labels: one_hot_labels,
                        training: True
                    }
                else:
                    feed = {
                        inputs: train_x[batch_idx],
                        labels: train_y[batch_idx],
                        masks: train_m[batch_idx],
                        training: True
                    }

                err, _ = sess.run([loss, train_ops], feed_dict=feed)
                train_loss += err
                train_iterations += 1

            test_iterations = 0
            test_loss = 0
            test_tp, test_fp, test_fn = 0, 0, 0

            for batch_idx in iterate_minibatches_indices(len(test_y),
                                                         batch_size=512,
                                                         shuffle=False):
                if masks is None:
                    one_hot_labels = np.zeros((len(batch_idx), num_words),
                                              dtype=np.float32)
                    for i, idx in enumerate(batch_idx):
                        one_hot_labels[i][test_y[idx]] = 1
                    feed = {
                        inputs: test_x[batch_idx],
                        labels: one_hot_labels,
                        training: False
                    }
                else:
                    feed = {
                        inputs: test_x[batch_idx],
                        labels: test_y[batch_idx],
                        masks: test_m[batch_idx],
                        training: False
                    }

                fetch = sess.run(eval_fetch, feed_dict=feed)
                if len(fetch) == 2:
                    err, pred = fetch
                    tp, fp, fn = tp_fp_fn_metrics_np(pred, test_y[batch_idx])
                else:
                    err, tp, fp, fn = fetch

                # for yy, pp in zip(test_y[batch_idx], pred):
                #   matched = np.intersect1d(np.unique(yy), np.unique(pp))
                #   if len(matched) >= 0.8 * len(yy):
                #     print(' '.join([inv_vocab[w] for w in yy]))
                #     print(' '.join([inv_vocab[w] for w in np.unique(pp)]))

                test_iterations += 1
                test_loss += err
                test_tp += tp
                test_fp += fp
                test_fn += fn

            precision = test_tp / (test_tp + test_fp) * 100
            recall = test_tp / (test_tp + test_fn) * 100
            f1 = 2 * precision * recall / (precision + recall)

            log("Epoch: {}, train loss: {:.4f}, test loss: {:.4f}, "
                "pre: {:.2f}%, rec: {:.2f}%, f1: {:.2f}%".format(
                    epoch, train_loss / train_iterations,
                    test_loss / test_iterations, precision, recall, f1))
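
count_label_freq drives the class-balanced margin above. Its definition is omitted, so this counting sketch (whether duplicate ids within a sentence are counted once or several times is an assumption) is illustrative only:

import numpy as np

def count_label_freq(labels, num_classes):
    # occurrences of each word id as a target label across the training set
    freq = np.zeros(num_classes, dtype=np.float64)
    for y in labels:
        for w in np.ravel(y):
            freq[int(w)] += 1
    return np.maximum(freq, 1.0)  # guard the reciprocal / imbalance ratio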
Example #13
def main(_):
    epochs = FLAGS.epochs
    gamma = FLAGS.gamma
    batch_size = FLAGS.batch_size

    sents, sent_masks, authors, vocab = \
        load_bookcorpus_sentences(load_author=True)
    num_author = len(np.unique(authors))

    init_word_emb = load_initialized_word_emb()

    vocab_size = len(vocab) + 1

    log("training with {} sents and {} vocabs and {} authors".format(
        sents.shape, vocab_size, num_author))

    if init_word_emb is not None and init_word_emb.shape[1] != FLAGS.emb_dim:
        offset = FLAGS.emb_dim - init_word_emb.shape[1]
        if offset > 0:
            random_emb = np.random.uniform(
                -0.1, 0.1, (vocab_size, offset)).astype(np.float32)
            init_word_emb = np.hstack([init_word_emb, random_emb])
        else:
            init_word_emb = init_word_emb[:, :FLAGS.emb_dim]

    model = QuickThoughtModel(vocab_size,
                              FLAGS.emb_dim,
                              FLAGS.encoder_dim,
                              FLAGS.context_size,
                              cell_type=FLAGS.cell_type,
                              num_layer=FLAGS.num_layer,
                              init_word_emb=init_word_emb,
                              train=True,
                              drop_p=0.15)

    global_step = tf.train.get_or_create_global_step()
    lr = get_lr(global_step) if FLAGS.cell_type == 'TRANS' else FLAGS.lr
    optimizer = tf.train.AdamOptimizer(lr)

    i_inputs = tf.placeholder(tf.int64, (None, None), name="i_inputs")
    i_masks = tf.placeholder(tf.int32, (None, None), name="i_masks")
    p_inputs = tf.placeholder(tf.int64, (None, None), name="p_inputs")
    p_masks = tf.placeholder(tf.int32, (None, None), name="p_masks")
    r_inputs = tf.placeholder(tf.int64, (None, None), name="r_inputs")
    r_masks = tf.placeholder(tf.int32, (None, None), name="r_masks")

    output_tensors = [(p_inputs, p_masks), (r_inputs, r_masks)]
    accs, loss = model.forward_triplet((i_inputs, i_masks), output_tensors,
                                       batch_size)

    thought_vector = model.thought_vector
    thought_vector = flip_gradient(thought_vector, gamma)

    if FLAGS.attr == 'author':
        labels = tf.placeholder(tf.int64, (None, ), name="labels")
        adv_model = build_model(num_author, FLAGS.encoder_dim // 2)
        adv_logits = adv_model(thought_vector, tf.constant(True,
                                                           dtype=tf.bool))
        adv_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=labels, logits=adv_logits)
        adv_loss = tf.reduce_mean(adv_loss)
        adv_acc = tf.reduce_mean(
            tf.cast(tf.equal(labels, tf.argmax(adv_logits, axis=-1)),
                    tf.float32))
    elif FLAGS.attr == 'word':
        labels = tf.placeholder(tf.float32, (None, None), name='labels')
        adv_model = build_model(vocab_size, FLAGS.encoder_dim // 2)
        adv_logits = adv_model(thought_vector, tf.constant(True,
                                                           dtype=tf.bool))
        adv_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=labels[:, 1:], logits=adv_logits[:, 1:])
        adv_loss = tf.reduce_mean(tf.reduce_sum(adv_loss, axis=-1))
        adv_predictions = tf.round(tf.nn.sigmoid(adv_logits))
        tp, fp, fn = tp_fp_fn_metrics(labels[:, 1:], adv_predictions[:, 1:])
        pre = tp / (tp + fp)
        rec = tp / (tp + fn)
        adv_acc = 2 * (pre * rec) / (pre + rec)
    else:
        raise ValueError(FLAGS.attr)

    accs.append(adv_acc)
    opt_loss = loss + gamma * adv_loss

    t_vars = tf.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(opt_loss, t_vars)
    grads, variables = zip(*grads_and_vars)
    grads, _ = tf.clip_by_global_norm(grads, 10.0)
    grads_and_vars = zip(grads, variables)
    train_ops = optimizer.apply_gradients(grads_and_vars,
                                          global_step=global_step)

    iterations = epochs * len(sents) // batch_size
    print_every = FLAGS.print_every
    saver = tf.train.Saver(max_to_keep=epochs)

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        iteration = 0
        train_loss = 0
        train_adv_loss = 0
        fw_accs = 0
        bw_accs = 0
        adv_accs = 0

        sess.run(tf.global_variables_initializer())
        for e in range(epochs):
            start = time.time()
            for batch in iterate_triplet_minibatches(sents, sent_masks,
                                                     authors, batch_size):
                xx, xp, xr, y = batch
                if FLAGS.attr == 'word':
                    b = len(y)
                    y = np.zeros((b, vocab_size), dtype=np.float32)
                    # multi-hot word labels: mark every word id in the input sentence
                    for i in range(b):
                        y[i][xx[0][i]] = 1.0

                feed = {
                    i_inputs: xx[0],
                    i_masks: xx[1],
                    p_inputs: xp[0],
                    p_masks: xp[1],
                    r_inputs: xr[0],
                    r_masks: xr[1],
                    labels: y
                }

                fetch = sess.run([train_ops, loss, adv_loss] + accs,
                                 feed_dict=feed)
                train_loss += fetch[1]
                train_adv_loss += fetch[2]
                fw_accs += fetch[3]
                bw_accs += fetch[4]
                adv_accs += fetch[5]
                iteration += 1
                if iteration % print_every == 0:
                    end = time.time()
                    log("Iteration: {:.4f}%, Loss: {:.4f}, Adv Loss:{:.4f},"
                        " FW Acc:{:.2f}%, BW Acc:{:.2f}%, Adv Perf: {:.2f}%,"
                        " {:.4f} sec/batch".format(
                            iteration / iterations * 100,
                            train_loss / print_every,
                            train_adv_loss / print_every,
                            fw_accs / print_every * 100,
                            bw_accs / print_every * 100,
                            adv_accs / print_every * 100,
                            (end - start) / print_every))

                    train_loss = 0
                    train_adv_loss = 0
                    fw_accs = 0
                    bw_accs = 0
                    adv_accs = 0
                    start = time.time()

            model_type = FLAGS.cell_type

            if model_type == 'TRANS':
                model_type += 'l{}'.format(FLAGS.num_layer)

            model_name = 'bookcorpus_e{}_{}_b{}_{}_adv{}'.format(
                e, model_type, batch_size, FLAGS.attr, gamma)
            save_path = os.path.join(FLAGS.save_dir, model_name)

            if not os.path.exists(save_path):
                os.makedirs(save_path)

            saver.save(sess, os.path.join(save_path, "model.ckpt"))
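The adversarial branch above relies on flip_gradient, which is defined elsewhere. A minimal sketch of such a gradient reversal layer, assuming it acts as the identity on the forward pass and negates (and scales) the gradient on the backward pass, as in DANN-style adversarial training:

import tensorflow as tf

def flip_gradient_sketch(x, scale=1.0):
  # Identity on the forward pass; the gradient is multiplied by -scale on the
  # backward pass, so the encoder is updated to hurt the adversarial head.
  @tf.custom_gradient
  def _flip(t):
    def grad(dy):
      return -scale * dy
    return tf.identity(t), grad
  return _flip(x)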
Example #14
0
def encode_parsed_sentences(config, model_path, *data, **kwargs):
    high_layer_idx = kwargs.get('high_layer_idx', -1)
    low_layer_idx = kwargs.get('low_layer_idx', -1)
    query_size = kwargs.get('query_size', 2048)

    log('Encoding sentences on the fly...')
    model = QuickThoughtModel(config['vocab_size'],
                              config['emb_dim'],
                              config['encoder_dim'],
                              1,
                              init_word_emb=None,
                              cell_type=config['cell_type'],
                              num_layer=config['num_layer'],
                              train=False)

    inputs = tf.placeholder(tf.int64, (None, None), name='inputs')
    masks = tf.placeholder(tf.int32, (None, None), name='masks')

    encode_emb = tf.nn.embedding_lookup(model.word_in_emb, inputs)
    all_layers = model.encode(encode_emb,
                              masks,
                              model.in_cells,
                              model.proj_in,
                              return_all_layers=True)

    learn_mapping = high_layer_idx != low_layer_idx
    if high_layer_idx == low_layer_idx:
        encoded = all_layers[high_layer_idx]
    else:
        encoded = (all_layers[low_layer_idx], all_layers[high_layer_idx])

    model_vars = tf.trainable_variables()
    saver = tf.train.Saver(model_vars)
    sess = tf.Session()

    saver.restore(sess, model_path)
    encoder_fn = lambda s: sess.run(encoded, {inputs: s[0], masks: s[1]})

    def encode_sents(s, n):
        embs_low, embs_high = [], []
        pbar = tqdm.tqdm(total=n)
        for batch_idx in iterate_minibatches_indices(n, query_size, False):
            emb = encoder_fn((s[0][batch_idx], s[1][batch_idx]))
            if learn_mapping:
                embs_low.append(emb[0])
                embs_high.append(emb[1])
                n_batch = len(emb[0])
            else:
                embs_low.append(emb)
                n_batch = len(emb)
            pbar.update(n_batch)

        pbar.close()
        if learn_mapping:
            return np.vstack(embs_low), np.vstack(embs_high)
        else:
            return np.vstack(embs_low)

    rtn_data = []
    for y, m in data:
        n_sent = len(y)
        x = encode_sents((y, m.astype(np.int32)), n_sent)
        rtn_data.append(x)

    return rtn_data
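iterate_minibatches_indices is used throughout these examples but never shown. A minimal sketch, assuming it yields index arrays over range(n) in chunks of batch_size, optionally shuffled, optionally dropping the final partial batch:

import numpy as np

def iterate_minibatches_indices_sketch(n, batch_size, shuffle=False,
                                       include_last=True):
  # Yields index arrays covering range(n) in chunks of batch_size; the last
  # (smaller) chunk is skipped when include_last is False.
  indices = np.arange(n)
  if shuffle:
    np.random.shuffle(indices)
  end = n if include_last else n - n % batch_size
  for start in range(0, end, batch_size):
    yield indices[start:start + batch_size]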
Example #15
0
def gan_mapping(x,
                y,
                lr=1e-4,
                lmbda=10.,
                gamma=0.,
                beta1=0.5,
                activation=tf.nn.relu,
                epoch=30,
                disc_iters=10,
                batch_size=128):
    n_data, x_dim = x.shape
    y_dim = y.shape[1]
    model = WGANGP(x_dim,
                   y_dim,
                   lr=lr,
                   lmbda=lmbda,
                   gamma=gamma,
                   beta1=beta1,
                   activation=activation)

    gen_sampler = inf_batch_iterator(n_data, batch_size)
    disc_sampler = inf_batch_iterator(n_data, batch_size)
    num_batch_per_epoch = (n_data // batch_size +
                           int(n_data % batch_size != 0))

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    for e in range(epoch):
        train_d_loss = []
        train_g_loss = []
        train_l2_loss = []
        for _ in range(num_batch_per_epoch):
            # train disc first
            for _ in range(disc_iters):
                disc_idx = next(disc_sampler)
                disc_x, disc_y = x[disc_idx], y[disc_idx]
                d_err = model.train_disc_one_batch(sess, disc_x, disc_y)
                train_d_loss.append(d_err)

            gen_idx = next(gen_sampler)
            gen_x, gen_y = x[gen_idx], y[gen_idx]
            g_err, l2_err = model.train_gen_one_batch(sess, gen_x, gen_y)
            train_g_loss.append(g_err)
            train_l2_loss.append(l2_err)

        train_d_loss = np.mean(train_d_loss)
        train_g_loss = np.mean(train_g_loss)
        train_l2_loss = np.mean(train_l2_loss)
        log('Epoch: {}, disc loss: {:.4f}, gen loss: {:.4f},'
            ' l2 loss: {:.4f}'.format(e + 1, train_d_loss, train_g_loss,
                                      train_l2_loss))

    def mapping(z):
        mapped = []
        for idx in iterate_minibatches_indices(len(z), batch_size=2048):
            batch_mapped = model.generate(sess, z[idx])
            mapped.append(batch_mapped)

        tf.keras.backend.clear_session()
        return np.vstack(mapped)

    return mapping
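WGANGP and inf_batch_iterator are defined elsewhere; the returned mapping closure is intended to be applied once to held-out embeddings, since it clears the Keras session after mapping (Example #17 below uses it to map high-layer to low-layer embeddings). A hypothetical call with random stand-in data, where the shapes are purely illustrative:

import numpy as np

rng = np.random.RandomState(0)
src = rng.randn(4096, 768).astype(np.float32)   # e.g. high-layer embeddings
tgt = rng.randn(4096, 312).astype(np.float32)   # e.g. low-layer embeddings
mapping = gan_mapping(src, tgt, disc_iters=5, batch_size=64, gamma=1.0, epoch=10)
mapped = mapping(src)  # mapped to the target dimensionality (assuming generate maps x -> y)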
Example #16
0
def main(_):
    assert FLAGS.dpsgd
    exp_id = FLAGS.exp_id
    num_gpu = FLAGS.num_gpu
    train_words, unigrams, word_sample_int = preprocess_texts(exp_id)
    n_vocab = len(unigrams)
    n_sampled = FLAGS.n_sampled
    n_embedding = FLAGS.hidden_size
    init_width = 0.5 / n_embedding
    epochs = FLAGS.epochs
    window_size = 5
    batch_size = FLAGS.batch_size
    learning_rate = FLAGS.learning_rate
    delta = 1 / len(train_words)

    cumtable = tf.constant(np.cumsum(unigrams))
    inputs = tf.placeholder(tf.int64, [None], name='inputs')
    labels = tf.placeholder(tf.int64, [None, 1], name='labels')
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding),
                                              -init_width, init_width),
                            name="emb")
    sm_w_t = embedding
    sm_b = tf.Variable(tf.zeros(n_vocab), name="sm_b")

    curr_words = tf.Variable(0, trainable=False)
    update_curr_words = curr_words.assign_add(batch_size)
    lr = learning_rate * tf.maximum(
        0.0001,
        1.0 - tf.cast(curr_words, tf.float32) / len(train_words) / epochs)
    num_microbatches = FLAGS.microbatches

    if FLAGS.dpsgd:
        optimizer = SparseDPAdamGaussianOptimizer(
            l2_norm_clip=FLAGS.l2_norm_clip,
            noise_multiplier=FLAGS.noise_multiplier,
            num_microbatches=num_microbatches
            if num_microbatches > 0 else None,
            learning_rate=lr)
    else:
        optimizer = tf.train.AdamOptimizer(lr)

    t_vars = tf.trainable_variables()

    def model(x, y):
        nb = tf.shape(x)[0]
        example_emb = tf.nn.embedding_lookup(embedding, x)

        # Negative sampling.
        random_ints = tf.random.uniform((n_sampled * nb, ),
                                        maxval=cumtable[-1],
                                        dtype=tf.int64)
        sampled_ids = tf.searchsorted(cumtable, random_ints, out_type=tf.int64)

        y_vec = tf.squeeze(y)
        true_w = tf.nn.embedding_lookup(sm_w_t, y_vec)
        true_b = tf.nn.embedding_lookup(sm_b, y_vec)
        true_logits = tf.reduce_sum(tf.multiply(example_emb, true_w),
                                    1) + true_b

        sampled_w = tf.nn.embedding_lookup(sm_w_t, sampled_ids)
        sampled_b = tf.nn.embedding_lookup(sm_b, sampled_ids)

        sampled_w_mat = tf.reshape(sampled_w, [nb, n_sampled, n_embedding])
        sampled_b_vec = tf.reshape(sampled_b, [nb, n_sampled])
        example_emb_mat = tf.reshape(example_emb, [nb, n_embedding, 1])

        sampled_logits = tf.squeeze(tf.matmul(sampled_w_mat,
                                              example_emb_mat)) + sampled_b_vec

        # Calculate the loss using negative sampling
        true_xent = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.ones_like(true_logits), logits=true_logits)
        sampled_xent = tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.zeros_like(sampled_logits), logits=sampled_logits)

        sampled_mask = 1 - tf.cast(
            tf.equal(y, tf.reshape(sampled_ids, [nb, n_sampled])), tf.float32)
        vector_loss = true_xent + tf.reduce_sum(sampled_xent * sampled_mask,
                                                axis=1)
        scalar_loss = tf.reduce_mean(vector_loss)

        if FLAGS.dpsgd:
            grads = optimizer.compute_gradients(
                vector_loss, t_vars, colocate_gradients_with_ops=num_gpu > 1)
        else:
            grads = optimizer.compute_gradients(
                scalar_loss, t_vars, colocate_gradients_with_ops=num_gpu > 1)

        return grads, scalar_loss

    if num_gpu > 1:
        tower_grads, scalar_loss = make_parallel(model,
                                                 optimizer,
                                                 num_gpu,
                                                 x=inputs,
                                                 y=labels)
        train_ops = rigid_op_sequence(tower_grads)
    else:
        grads_and_vars, scalar_loss = model(inputs, labels)
        train_ops = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver()
    iterations = epochs * len(train_words) // batch_size
    print_every = FLAGS.print_every

    with tf.Session() as sess:
        iteration = 0
        train_loss = 0
        sess.run(tf.global_variables_initializer())

        for e in range(1, epochs + 1):
            start = time.time()
            for x, y in get_batches(train_words, batch_size, word_sample_int,
                                    window_size):
                b = len(x)
                if num_microbatches > 0:
                    offset = b - b % num_microbatches
                    x, y = x[:offset], y[:offset]

                feed = {inputs: x, labels: np.array(y)[:, None]}
                err, _, _ = sess.run(
                    [scalar_loss, train_ops, update_curr_words],
                    feed_dict=feed)

                train_loss += err
                iteration += 1

                if iteration % print_every == 0:
                    end = time.time()
                    log("Iteration: {:.4f}%, Loss: {:.4f}, {:.4f} sec/batch".
                        format(iteration / iterations * 100,
                               train_loss / print_every,
                               (end - start) / print_every))
                    train_loss = 0
                    start = time.time()
                    if FLAGS.dpsgd:
                        eps = compute_epsilon(iteration, len(train_words))
                        log('The current epsilon is: {:.2f} for delta={}'.
                            format(eps, delta))

            model_name = 'tfw2v_{}'.format(exp_id)

            if FLAGS.dpsgd:
                model_name += 'e{}_n{}_l{}_mb{}'.format(
                    e, FLAGS.noise_multiplier, FLAGS.l2_norm_clip,
                    num_microbatches)
            eps = compute_epsilon(iteration, len(train_words))

            save_path = os.path.join(FLAGS.save_dir, model_name)
            if not os.path.exists(save_path):
                os.makedirs(save_path)

            saver.save(sess, os.path.join(save_path, "model.ckpt"))

            if FLAGS.dpsgd:
                with open(os.path.join(save_path, 'eps{:.2f}'.format(eps)),
                          'w'):
                    pass
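compute_epsilon is not shown in this example. A minimal sketch using the RDP accountant from tensorflow_privacy (the module path and order grid may differ between library versions; the sampling rate batch_size / n_words and delta = 1 / n_words follow the training loop above):

from tensorflow_privacy.privacy.analysis.rdp_accountant import (
    compute_rdp, get_privacy_spent)

def compute_epsilon_sketch(steps, n_words, batch_size, noise_multiplier):
  # Returns the DP epsilon spent after `steps` iterations of DP-SGD.
  if noise_multiplier == 0.0:
    return float('inf')
  orders = [1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64))
  q = batch_size / n_words  # Poisson subsampling probability
  rdp = compute_rdp(q=q, noise_multiplier=noise_multiplier,
                    steps=steps, orders=orders)
  eps, _, _ = get_privacy_spent(orders, rdp, target_delta=1.0 / n_words)
  return eps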
Example #17
0
def load_inversion_data():
  albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file)
  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file
  )

  train_sents, _, test_sents, _, _, _ = load_bookcorpus_author(
      train_size=FLAGS.train_size, test_size=FLAGS.test_size,
      unlabeled_size=0, split_by_book=True, split_word=False,
      top_attr=800)

  if FLAGS.cross_domain:
    train_sents = load_cross_domain_data(800000, split_word=False)

  def sents_to_examples(sents):
    examples = read_examples(sents, tokenization.convert_to_unicode)
    return convert_examples_to_features(examples=examples,
                                        seq_length=FLAGS.max_seq_length,
                                        tokenizer=tokenizer)

  input_ids, input_mask, input_type_ids, outputs = model_fn_builder(
      albert_config=albert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      use_one_hot_embeddings=False)

  sess = tf.Session()
  sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
  learn_mapping = FLAGS.high_layer_idx != FLAGS.low_layer_idx

  def encode_example(features):
    n_data = len(features[0])
    embs_low, embs_high = [], []
    pbar = tqdm.tqdm(total=n_data)
    for b_idx in iterate_minibatches_indices(n_data, 128):
      emb = sess.run(outputs, feed_dict={input_ids: features[0][b_idx],
                                         input_mask: features[1][b_idx],
                                         input_type_ids: features[2][b_idx]})
      if learn_mapping:
        embs_low.append(emb[0])
        embs_high.append(emb[1])
        n_batch = len(emb[0])
      else:
        embs_low.append(emb)
        n_batch = len(emb)
      pbar.update(n_batch)
    pbar.close()

    if learn_mapping:
      return np.vstack(embs_low), np.vstack(embs_high)
    else:
      return np.vstack(embs_low)

  train_features = sents_to_examples(train_sents)
  train_x = encode_example(train_features)

  test_features = sents_to_examples(test_sents)
  test_x = encode_example(test_features)
  tf.keras.backend.clear_session()

  if learn_mapping:
    log('Training high to low mapping...')
    if FLAGS.mapper == 'linear':
      mapping = linear_mapping(train_x[1], train_x[0])
    elif FLAGS.mapper == 'mlp':
      mapping = mlp_mapping(train_x[1], train_x[0], epochs=50,
                            activation=modeling.gelu)
    elif FLAGS.mapper == 'gan':
      mapping = gan_mapping(train_x[1], train_x[0], disc_iters=5,
                            batch_size=64, gamma=1.0, epoch=100,
                            activation=tf.tanh)
    else:
      raise ValueError(FLAGS.mapper)
    test_x = mapping(test_x[1])

  return train_x, train_features, test_x, test_features
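linear_mapping, mlp_mapping, and gan_mapping (Example #15) all learn a map from the high-layer to the low-layer embeddings. A minimal sketch of the linear variant, assuming a plain least-squares fit:

import numpy as np

def linear_mapping_sketch(x_high, x_low):
  # Fit W minimizing ||x_high @ W - x_low||^2 and return the mapping closure.
  w, _, _, _ = np.linalg.lstsq(x_high, x_low, rcond=None)
  return lambda z: z @ w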
Example #18
0
def train_text_char_cnn(data, num_attr):
  lr = FLAGS.lr
  batch_size = FLAGS.batch_size

  train_sents, train_y, test_sents, test_y = data
  num_chars = 129
  max_char_len = 400

  def sents_to_chars(sents):
    max_len = 0
    chars = np.ones((len(sents), max_char_len), dtype=np.int64) * num_chars
    for i, sent in enumerate(sents):
      sent_chars = [ord(c) for c in str(sent)]
      max_len = max(len(sent_chars), max_len)
      if len(sent_chars) > max_char_len:
        sent_chars = sent_chars[:max_char_len]
      chars[i, :len(sent_chars)] = sent_chars
    return chars

  train_x = sents_to_chars(train_sents)
  test_x = sents_to_chars(test_sents)

  inputs = tf.placeholder(tf.int64, (None, max_char_len), name="inputs")
  labels = tf.placeholder(tf.int64, (None,), name="labels")
  training = tf.placeholder(tf.bool, name='training')

  text_cnn = TextCharCNN(num_chars, hidden_size=512, num_filter=128)
  classifier = build_model(num_attr, FLAGS.hidden_size)

  model_fn = lambda x, t: classifier(text_cnn.forward(x, t), t)

  logits = model_fn(inputs, training)
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                        logits=logits)
  loss = tf.reduce_mean(loss)
  opt_loss = loss
  accuracies, top5_accuracies, predictions = acc_metrics(logits, labels,
                                                         num_attr)
  eval_fetches = [loss, accuracies, top5_accuracies]

  t_vars = tf.trainable_variables()
  post_ops = [tf.assign(v, v * (1 - FLAGS.wd)) for v in t_vars if
              'kernel' in v.name]

  optimizer = tf.train.AdamOptimizer(lr)
  grads_and_vars = optimizer.compute_gradients(opt_loss, t_vars)
  train_ops = optimizer.apply_gradients(
    grads_and_vars, global_step=tf.train.get_or_create_global_step())

  with tf.control_dependencies([train_ops]):
    train_ops = tf.group(*post_ops)

  log('Train attack model...')
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())

    def train_fn(batch_idx):
      feed = {inputs: train_x[batch_idx], labels: train_y[batch_idx],
              training: True}
      err, _ = sess.run([loss, train_ops], feed_dict=feed)
      return err

    def eval_fn(batch_idx):
      feed = {inputs: test_x[batch_idx], labels: test_y[batch_idx],
              training: False}
      return sess.run(eval_fetches,  feed_dict=feed)

    n_train, n_test = len(train_y), len(test_y)
    train_loops(FLAGS.epochs, n_train, n_test, train_fn, eval_fn, batch_size)
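acc_metrics is shared by the attribute-attack classifiers in these examples; a minimal sketch, assuming it returns top-1 accuracy, top-5 accuracy, and the argmax predictions:

import tensorflow as tf

def acc_metrics_sketch(logits, labels, num_attr):
  predictions = tf.argmax(logits, axis=-1)
  accuracies = tf.reduce_mean(
      tf.cast(tf.equal(predictions, labels), tf.float32))
  # top-5 only makes sense when there are more than 5 classes
  k = min(5, num_attr)
  top5_accuracies = tf.reduce_mean(
      tf.cast(tf.nn.in_top_k(logits, labels, k=k), tf.float32))
  return accuracies, top5_accuracies, predictions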
Example #19
0
def learning_inversion():
  assert FLAGS.low_layer_idx == FLAGS.high_layer_idx == -1

  albert_config = modeling.AlbertConfig.from_json_file(FLAGS.albert_config_file)
  num_words = albert_config.vocab_size

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case,
      spm_model_file=FLAGS.spm_model_file)

  cls_id = tokenizer.vocab['[CLS]']
  sep_id = tokenizer.vocab['[SEP]']
  mask_id = tokenizer.vocab['[MASK]']

  train_x, train_y, test_x, test_y = load_inversion_data()
  filters = [cls_id, sep_id, mask_id, 0]
  train_y = filter_labels(train_y[0], filters)
  test_y = filter_labels(test_y[0], filters)

  label_freq = count_label_freq(train_y, num_words)
  log('Imbalance ratio: {}'.format(np.max(label_freq) / np.min(label_freq)))

  label_margin = tf.constant(np.reciprocal(label_freq ** 0.25),
                             dtype=tf.float32)
  C = FLAGS.C

  log('Build attack model for {} words...'.format(num_words))

  encoder_dim = train_x.shape[1]
  inputs = tf.placeholder(tf.float32, (None, encoder_dim), name="inputs")
  labels = tf.placeholder(tf.float32, (None, num_words), name="labels")
  training = tf.placeholder(tf.bool, name='training')

  if FLAGS.model == 'multiset':
    emb_dim = 512
    model = MultiSetInversionModel(emb_dim, num_words, FLAGS.seq_len, None,
                                   C=C, label_margin=label_margin)
  elif FLAGS.model == 'multilabel':
    model = MultiLabelInversionModel(num_words, C=C, label_margin=label_margin)
  else:
    raise ValueError(FLAGS.model)

  preds, loss = model.forward(inputs, labels, training)
  true_pos, false_pos, false_neg = tp_fp_fn_metrics(labels, preds)
  eval_fetch = [loss, true_pos, false_pos, false_neg]

  t_vars = tf.trainable_variables()
  wd = FLAGS.wd
  post_ops = [tf.assign(v, v * (1 - wd)) for v in t_vars if 'kernel' in v.name]

  optimizer = tf.train.AdamOptimizer(FLAGS.lr)
  grads_and_vars = optimizer.compute_gradients(
    loss + tf.losses.get_regularization_loss(), t_vars)
  train_ops = optimizer.apply_gradients(
    grads_and_vars, global_step=tf.train.get_or_create_global_step())

  with tf.control_dependencies([train_ops]):
    train_ops = tf.group(*post_ops)

  log('Train attack model with {} data...'.format(len(train_x)))
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch in range(30):
      train_iterations = 0
      train_loss = 0

      for batch_idx in iterate_minibatches_indices(len(train_y),
                                                   FLAGS.batch_size, True):
        one_hot_labels = np.zeros((len(batch_idx), num_words),
                                  dtype=np.float32)
        for i, idx in enumerate(batch_idx):
          one_hot_labels[i][train_y[idx]] = 1
        feed = {inputs: train_x[batch_idx], labels: one_hot_labels,
                training: True}
        err, _ = sess.run([loss, train_ops], feed_dict=feed)
        train_loss += err
        train_iterations += 1

      test_iterations = 0
      test_loss = 0
      test_tp, test_fp, test_fn = 0, 0, 0

      for batch_idx in iterate_minibatches_indices(len(test_y), batch_size=512,
                                                   shuffle=False):
        one_hot_labels = np.zeros((len(batch_idx), num_words),
                                  dtype=np.float32)
        for i, idx in enumerate(batch_idx):
          one_hot_labels[i][test_y[idx]] = 1
        feed = {inputs: test_x[batch_idx], labels: one_hot_labels,
                training: False}

        fetch = sess.run(eval_fetch, feed_dict=feed)
        err, tp, fp, fn = fetch

        test_iterations += 1
        test_loss += err
        test_tp += tp
        test_fp += fp
        test_fn += fn

      precision = test_tp / (test_tp + test_fp) * 100
      recall = test_tp / (test_tp + test_fn) * 100
      f1 = 2 * precision * recall / (precision + recall)

      log("Epoch: {}, train loss: {:.4f}, test loss: {:.4f}, "
          "pre: {:.2f}%, rec: {:.2f}%, f1: {:.2f}%".format(
            epoch, train_loss / train_iterations,
            test_loss / test_iterations,
            precision, recall, f1))
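tp_fp_fn_metrics is used both here and in the adversarial QuickThought training above; a minimal sketch, assuming it returns micro-averaged true-positive, false-positive, and false-negative counts over a binary label/prediction matrix:

import tensorflow as tf

def tp_fp_fn_metrics_sketch(labels, predictions):
  labels = tf.cast(labels, tf.float32)
  predictions = tf.cast(predictions, tf.float32)
  tp = tf.reduce_sum(labels * predictions)          # predicted 1, truly 1
  fp = tf.reduce_sum((1.0 - labels) * predictions)  # predicted 1, truly 0
  fn = tf.reduce_sum(labels * (1.0 - predictions))  # predicted 0, truly 1
  return tp, fp, fn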
Example #20
0
def main(_):
  epochs = FLAGS.epochs
  batch_size = FLAGS.batch_size

  sents, sent_masks, vocab = load_bookcorpus_sentences()

  vocab_size = len(vocab) + 1
  log('training with {} sents and {} vocabs'.format(sents.shape, vocab_size))

  init_word_emb = load_initialized_word_emb()
  if init_word_emb.shape[1] < FLAGS.emb_dim:
    offset = FLAGS.emb_dim - init_word_emb.shape[1]
    random_emb = np.random.uniform(-0.1, 0.1, (vocab_size, offset))
    init_word_emb = np.hstack([init_word_emb, random_emb.astype(np.float32)])
  init_word_emb = init_word_emb[:, :FLAGS.emb_dim]

  model = QuickThoughtModel(vocab_size, FLAGS.emb_dim,
                            FLAGS.encoder_dim, FLAGS.context_size,
                            cell_type=FLAGS.cell_type,
                            num_layer=FLAGS.num_layer,
                            init_word_emb=init_word_emb,
                            drop_p=FLAGS.drop_p, train=True)

  global_step = tf.train.get_or_create_global_step()
  lr = get_lr(global_step) if FLAGS.cell_type == 'TRANS' else FLAGS.lr
  optimizer = tf.train.AdamOptimizer(lr)

  # use negative examples from shuffled sentences
  i_inputs = tf.placeholder(tf.int64, (None, None), name='i_inputs')
  i_masks = tf.placeholder(tf.int32, (None, None), name='i_masks')
  p_inputs = tf.placeholder(tf.int64, (None, None), name='p_inputs')
  p_masks = tf.placeholder(tf.int32, (None, None), name='p_masks')
  r_inputs = tf.placeholder(tf.int64, (None, None), name='r_inputs')
  r_masks = tf.placeholder(tf.int32, (None, None), name='r_masks')
  output_tensors = [(p_inputs, p_masks), (r_inputs, r_masks)]
  accs, loss = model.forward_triplet((i_inputs, i_masks),
                                     output_tensors, batch_size)

  t_vars = tf.trainable_variables()
  grads_and_vars = optimizer.compute_gradients(loss, t_vars)
  grads, variables = zip(*grads_and_vars)
  grads, _ = tf.clip_by_global_norm(grads, 10.0)
  grads_and_vars = zip(grads, variables)
  train_ops = optimizer.apply_gradients(grads_and_vars, global_step=global_step)

  iterations = epochs * len(sents) // batch_size
  print_every = FLAGS.print_every
  saver = tf.train.Saver(max_to_keep=epochs)

  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
    iteration = 0
    train_loss = 0
    fw_accs = 0
    bw_accs = 0

    sess.run(tf.global_variables_initializer())
    for e in range(epochs):
      start = time.time()
      for batch in iterate_triplet_minibatches(sents, sent_masks, batch_size):
        xx, xp, xr = batch
        feed = {i_inputs: xx[0], i_masks: xx[1],
                p_inputs: xp[0], p_masks: xp[1],
                r_inputs: xr[0], r_masks: xr[1]}

        fetch = sess.run([loss, train_ops] + list(accs), feed_dict=feed)
        train_loss += fetch[0]
        fw_accs += fetch[-2] if len(fetch) == 4 else fetch[-1]
        bw_accs += fetch[-1]

        iteration += 1
        if iteration % print_every == 0:
          end = time.time()
          log('Iteration: {:.4f}%, Loss: {:.4f}, FW Acc:{:.2f}%, '
              'BW Acc:{:.2f}%, {:.4f} sec/batch'.format(
                  iteration / iterations * 100, train_loss / print_every,
                  fw_accs / print_every * 100, bw_accs / print_every * 100,
                  (end - start) / print_every))

          train_loss = 0
          fw_accs = 0
          bw_accs = 0
          start = time.time()

      model_type = FLAGS.cell_type

      if model_type == 'TRANS':
        model_type += 'l{}'.format(FLAGS.num_layer)

      model_name = 'bookcorpus_e{}_{}_b{}'.format(e, model_type, batch_size)
      save_path = os.path.join(FLAGS.save_dir, model_name)
      if not os.path.exists(save_path):
        os.makedirs(save_path)

      saver.save(sess, os.path.join(save_path, 'model.ckpt'))
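get_lr is only consulted for the 'TRANS' cell type and is not shown in these examples. A minimal sketch, assuming the usual inverse-square-root warmup schedule from the Transformer paper (d_model and warmup_steps are illustrative defaults):

import tensorflow as tf

def get_lr_sketch(global_step, d_model=512, warmup_steps=4000):
  # Linear warmup followed by inverse-square-root decay.
  step = tf.cast(global_step, tf.float32) + 1.0
  return (d_model ** -0.5) * tf.minimum(step ** -0.5,
                                        step * warmup_steps ** -1.5)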