Example #1
def get_scores():
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)

    tokenization.validate_case_matches_checkpoint(LOWER_CASE, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                           do_lower_case=LOWER_CASE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

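    # Convert the raw input sentences into masked-LM features and flatten them
    # into the id/mask/position arrays that are fed to the graph below.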
    inputs = read_examples(INPUT_FILE)
    features, all_tokens = convert_examples_to_features(
        inputs, SEQ_LEN, tokenizer)

    input_ids, input_mask, segment_ids, masked_lm_positions, masked_lm_ids = \
        features_to_vectors(features)
    print(masked_lm_ids)

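    # Build the model in a fresh default graph. Note that this example only runs
    # the variable initializer and does not restore weights from the checkpoint.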
    tf.reset_default_graph()
    sess = tf.Session()
    model = Model(bert_config)
    sess.run(tf.global_variables_initializer())

    losses = sess.run(model.masked_lm_example_loss,
                      feed_dict={
                          model.input_ids: input_ids,
                          model.input_mask: input_mask,
                          model.token_type: segment_ids,
                          model.masked_lm_positions: masked_lm_positions,
                          model.masked_lm_ids: masked_lm_ids
                      })

    parse_result(losses, all_tokens)
Example #2
def validate_flags_or_throw(bert_config):
    """Validate the input FLAGS or throw an exception."""
    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if FLAGS.do_train:
        if not FLAGS.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if FLAGS.do_predict:
        if not FLAGS.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified.")

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if FLAGS.max_seq_length <= FLAGS.max_query_length + 3:
        raise ValueError(
            "The max_seq_length (%d) must be greater than max_query_length "
            "(%d) + 3" % (FLAGS.max_seq_length, FLAGS.max_query_length))
Example #3
    def load_models(self):
        """
        Load and initialize saved BERT model.

        :return: Tensorflow Estimator with saved weights.
        """
        bert_config = modeling.BertConfig.from_json_file(self.config)

        tokenization.validate_case_matches_checkpoint(True, self.init_ckpt)
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab, do_lower_case=True)

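        # Inference-only Estimator setup: no summaries, no periodic checkpointing,
        # and a zero learning rate / zero train steps since the model is only restored.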
        run_config = tf.estimator.RunConfig(
            model_dir=self.out_dir,
            save_summary_steps=0,
            keep_checkpoint_max=1,
            save_checkpoints_steps=0)
        model_fn = ut.model_fn_builder(
            bert_config=bert_config,
            num_labels=self.n_classes,
            init_checkpoint=self.init_ckpt,
            learning_rate=0,
            num_train_steps=0,
            num_warmup_steps=0,
            use_tpu=False,
            use_one_hot_embeddings=False)
        estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config,
            params={"batch_size": self.batch_size})

        return estimator
Example #4
def chinese_tokenizer():

    BERT_INIT_CHKPNT = './chinese_L-12_H-768_A-12/bert_model.ckpt'
    BERT_VOCAB = './chinese_L-12_H-768_A-12/vocab.txt'

    tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                           do_lower_case=True)
    return tokenizer
Example #5
def create_tokenizer_from_hub_module(hp):
    """
    create tokenizer
    :return:
    """
    tokenization.validate_case_matches_checkpoint(True, hp.BERT_INIT_CHKPNT)

    return tokenization.FullTokenizer(vocab_file=hp.BERT_VOCAB,
                                      do_lower_case=True)
Example #6
    def __init__(self):
        tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
        self.tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB, do_lower_case=True)
        self.max_seq_length = MAX_SEQ_LENGTH
        self.train_data_path = train_data_path
        self.train_tf_record_path = train_tf_record_path
        self.eval_tf_record_path = eval_tf_record_path
        self.train_val_ratio = TRAIN_VAL_RATIO
        self.train_examples = None
        self.eval_examples = None
Example #7
    def __init__(self,
                 model,
                 bert_config_file,
                 vocab_file,
                 init_checkpoint,
                 batch_size=32,
                 max_seq_length=128,
                 do_lower_case=False,
                 finetune_embedding=False,
                 split_args=False,
                 is_training=False,
                 truncation_mode="normal",
                 padding_action='normal',
                 scope=None):
        self.model = model
        self.is_mask_attentional_model = self.model.startswith("mask")
        self.bert_config_file = bert_config_file
        self.vocab_file = vocab_file
        self.init_checkpoint = init_checkpoint
        self.batch_size = batch_size
        self.max_seq_length = max_seq_length
        self.max_arg_length = int(max_seq_length / 2)
        self.do_lower_case = do_lower_case
        self.split_args = split_args
        self.finetune_embedding = finetune_embedding
        self.truncation_mode = truncation_mode
        self.padding_action = padding_action

        # Word-Piece tokenizer
        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=vocab_file, do_lower_case=do_lower_case)

        # load bert
        tokenization.validate_case_matches_checkpoint(self.do_lower_case,
                                                      self.init_checkpoint)
        self.bert_config = copy.deepcopy(
            modeling.BertConfig.from_json_file(self.bert_config_file))

        self.is_training = is_training
        if not self.is_training:
            self.bert_config.hidden_dropout_prob = 0.0
            self.bert_config.attention_probs_dropout_prob = 0.0

        self._embedding_table = None
        self._vocab = tokenization.load_vocab(self.vocab_file)

        # max_position_embeddings==512
        if self.max_seq_length > self.bert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the BERT model "
                "was only trained up to sequence length %d" %
                (self.max_seq_length,
                 self.bert_config.max_position_embeddings))

        self.build()
Example #8
    def read_config(self):
        #tf.logging.set_verbosity(tf.logging.INFO)

        processors = {
            "multilabel": MultilabelClassfier,
        }

        tokenization.validate_case_matches_checkpoint(self.do_lower_case,
                                                      self.init_checkpoint)

        #if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        #    raise ValueError(
        #        "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

        self.bert_config = modeling.BertConfig.from_json_file(
            self.bert_config_file)

        if self.max_seq_length > self.bert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the BERT model "
                "was only trained up to sequence length %d" %
                (self.max_seq_length,
                 self.bert_config.max_position_embeddings))

        tf.gfile.MakeDirs(self.output_dir)

        task_name = self.task_name.lower()

        if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

        self.processor = processors[task_name]()

        self.label_list = self.processor.get_labels(self.data_dir)

        self.tokenizer = tokenization.FullTokenizer(
            vocab_file=self.vocab_file, do_lower_case=self.do_lower_case)

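        # Build a TPU-compatible RunConfig; the cluster resolver stays None unless
        # both use_tpu and a TPU name are configured.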
        tpu_cluster_resolver = None
        if self.use_tpu and self.tpu_name:
            tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
                self.tpu_name, zone=self.tpu_zone, project=self.gcp_project)

        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        self.run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=self.master,
            model_dir=self.output_dir,
            save_checkpoints_steps=self.save_checkpoints_steps,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=self.iterations_per_loop,
                num_shards=self.num_tpu_cores,
                per_host_input_for_training=is_per_host))
Example #9
def get_scores():
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    tokenization.validate_case_matches_checkpoint(LOWER_CASE, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                           do_lower_case=LOWER_CASE)
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)

    # for v in tf.train.list_variables(BERT_INIT_CHKPNT):
    #     print(v)

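    # Build the graph and restore the pre-trained BERT weights via an assignment
    # map; init_from_checkpoint rewires the initializers, so the global variable
    # initializer below picks up the checkpoint values.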
    tf.reset_default_graph()
    sess = tf.Session()
    model = Model(bert_config)
    tvars = tf.trainable_variables()
    print(len(tvars))
    (assignment_map,
     initialized_variable_names) = modeling.get_assignment_map_from_checkpoint(
         tvars, BERT_INIT_CHKPNT)
    tf.train.init_from_checkpoint(BERT_INIT_CHKPNT, assignment_map)
    sess.run(tf.global_variables_initializer())

    inputs = read_examples(INPUT_FILE)
    # inputs = [inputs[0]]

    not_masked_ids, arrays = \
        get_all_tokens(inputs, tokenizer, SEQ_LEN)
    print(arrays[0])
    preds, logits, inp = sess.run(
        [tf.nn.softmax(model.logits), model.logits, model.input_ids],
        feed_dict={
            model.input_ids: arrays[0],
            model.input_mask: arrays[1],
            model.token_type: arrays[2]
        })

    print("input:", inp)
    print("logits:", logits)
    print("softmax:", preds)
    print(preds.shape)
    first_index = 0
    sent_probs = []
    for ids in not_masked_ids:
        print(ids)
        sent_preds = preds[first_index:first_index + len(ids), :, :]
        word_probs = [sent_preds[i, i + 1, x] for i, x in enumerate(ids)]
        print(word_probs)
        sent_prob = np.prod(word_probs)
        sent_probs.append(sent_prob)
        first_index += len(ids)

    print(list(zip(inputs, sent_probs)))
    probs = np.array(sent_probs) / sum(sent_probs)
    print(list(zip(inputs, probs)))
Example #10
def create_estimator():

    tf.logging.set_verbosity(tf.logging.INFO)

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    label_list = get_labels()

    # tokenizer = tokenization.FullTokenizer(
    #     vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

    run_config = tf.contrib.tpu.RunConfig(
        # cluster=tpu_cluster_resolver,
        # master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        # tpu_config=tf.contrib.tpu.TPUConfig(
        #     iterations_per_loop=FLAGS.iterations_per_loop,
        #     num_shards=FLAGS.num_tpu_cores,
        #     per_host_input_for_training=is_per_host))
    )

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.init_checkpoint,
        # learning_rate=FLAGS.learning_rate,
        # num_train_steps=num_train_steps,
        # num_warmup_steps=num_warmup_steps,
        # use_tpu=FLAGS.use_tpu,
        # use_one_hot_embeddings=FLAGS.use_tpu
    )

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        # eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)
    return estimator
Example #11
    def __init__(self, bert_path, output_path):
        self.BERT_VOCAB = os.path.join(bert_path, 'vocab.txt')
        self.BERT_INIT_CHKPNT = os.path.join(bert_path, 'bert_model.ckpt')
        self.BERT_CONFIG = os.path.join(bert_path, 'bert_config.json')

        tokenization.validate_case_matches_checkpoint(True,
                                                      self.BERT_INIT_CHKPNT)
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.BERT_VOCAB,
                                                    do_lower_case=True)

        self.ID = 'guid'
        self.DATA_COLUMN = 'txt'
        self.LABEL_COLUMNS = [
            'Safety', 'CleanlinessView', 'Information', 'Service', 'Comfort',
            'PersonnelCard', 'Additional'
        ]

        self.MAX_SEQ_LENGTH = 128

        # Compute train and warmup steps from batch size
        # These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
        self.BATCH_SIZE = 32
        self.LEARNING_RATE = 2e-5
        self.NUM_TRAIN_EPOCHS = 1

        # Warmup is a period of time where the learning rate is small and
        # gradually increases; it usually helps training.
        self.WARMUP_PROPORTION = 0.1

        # Model configs
        self.SAVE_CHECKPOINTS_STEPS = 1000
        self.SAVE_SUMMARY_STEPS = 500

        self.run_config = tf.estimator.RunConfig(
            model_dir=output_path,
            save_summary_steps=self.SAVE_SUMMARY_STEPS,
            keep_checkpoint_max=1,
            save_checkpoints_steps=self.SAVE_CHECKPOINTS_STEPS)

        self.train_file = os.path.join(output_path, "train.tf_record")
        if not os.path.exists(self.train_file):
            open(self.train_file, 'w', encoding='utf8').close()

        self.eval_file = os.path.join(output_path, "eval.tf_record")
        if not os.path.exists(self.eval_file):
            open(self.eval_file, 'w', encoding='utf8').close()

        self.output_eval_file = os.path.join(output_path, "eval_results.txt")
Example #12
    def create_mini_bert_weights(model_dir=None):
        model_dir = (model_dir if model_dir is not None
                     else tempfile.TemporaryDirectory().name)
        os.makedirs(model_dir, exist_ok=True)

        from bert.loader import StockBertConfig

        bert_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        bert_config = StockBertConfig(
            attention_probs_dropout_prob=0.1,
            hidden_act="gelu",
            hidden_dropout_prob=0.1,
            hidden_size=8,
            initializer_range=0.02,
            intermediate_size=32,
            max_position_embeddings=32,
            num_attention_heads=2,
            num_hidden_layers=2,
            type_vocab_size=2,
            vocab_size=len(string.ascii_lowercase) * 2 + len(bert_tokens),
        )

        print("creating mini BERT at:", model_dir)

        bert_config_file = os.path.join(model_dir, "bert_config.json")
        bert_vocab_file = os.path.join(model_dir, "vocab.txt")

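        # Write the minimal config and vocabulary: lowercase letters, the special
        # tokens, and the corresponding "##" word pieces.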
        with open(bert_config_file, "w") as f:
            f.write(bert_config.to_json_string())
        with open(bert_vocab_file, "w") as f:
            f.write("\n".join(list(string.ascii_lowercase) + bert_tokens))
            # Separate the plain tokens from the "##" word pieces with a newline
            # so the last special token and the first word piece don't share a line.
            f.write("\n")
            f.write("\n".join(
                ["##" + tok for tok in list(string.ascii_lowercase)]))

        with tf.Graph().as_default():
            _ = MiniBertFactory.create_stock_bert_graph(bert_config_file, 16)
            saver = tf.compat.v1.train.Saver(max_to_keep=1,
                                             save_relative_paths=True)

            with tf.compat.v1.Session() as sess:
                sess.run(tf.compat.v1.global_variables_initializer())
                ckpt_path = os.path.join(model_dir, "bert_model.ckpt")
                save_path = saver.save(sess, ckpt_path, write_meta_graph=True)
                print("saving to:", save_path)

        validate_case_matches_checkpoint(True, save_path)

        return save_path
Example #13
def bert(validate=True):
    """
    Load BERT similarity model.

    Parameters
    ----------
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    SIMILARITY_BERT : malaya._models._tensorflow_model.SIAMESE_BERT class
    """
    if not isinstance(validate, bool):
        raise ValueError('validate must be a boolean')
    try:
        from bert import tokenization
    except:
        raise Exception(
            'bert-tensorflow not installed. Please install it using `pip3 install bert-tensorflow` and try again.'
        )
    if validate:
        check_file(PATH_SIMILARITY['bert'], S3_PATH_SIMILARITY['bert'])
    else:
        if not check_available(PATH_SIMILARITY['bert']):
            raise Exception(
                'toxic/bert is not available, please `validate = True`')

    tokenization.validate_case_matches_checkpoint(True, '')
    tokenizer = tokenization.FullTokenizer(
        vocab_file=PATH_SIMILARITY['bert']['vocab'], do_lower_case=True)
    try:
        g = load_graph(PATH_SIMILARITY['bert']['model'])
    except:
        raise Exception(
            "model corrupted due to some reasons, please run malaya.clear_cache('similarity/bert') and try again"
        )

    return SIAMESE_BERT(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
        logits=g.get_tensor_by_name('import/logits:0'),
        sess=generate_session(graph=g),
        tokenizer=tokenizer,
        maxlen=100,
        label=['not similar', 'similar'],
    )
Example #14
    def build_model(self):
        # Placeholders for input, output

        BERT_VOCAB = '../chinese_L-12_H-768_A-12/vocab.txt'
        BERT_INIT_CHKPNT = '../chinese_L-12_H-768_A-12/bert_model.ckpt'
        BERT_CONFIG = '../chinese_L-12_H-768_A-12/bert_config.json'
        tokenization.validate_case_matches_checkpoint(True, '')
        bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
        tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                               do_lower_case=True)

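        # The config loaded from disk is replaced below by a small custom
        # configuration sized to this model's layer dimensions.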
        bert_config = modeling.BertConfig(
            vocab_size=self.vocab_size,
            hidden_size=self.size_layer,
            num_hidden_layers=self.num_layers,
            num_attention_heads=self.size_layer // 4,
            intermediate_size=self.size_layer * 2,
        )

        self.input_ids = tf.placeholder(tf.int32, [None, self.seq_len])
        self.input_mask = tf.placeholder(tf.int32, [None, self.seq_len])
        self.segment_ids = tf.placeholder(tf.int32, [None, self.seq_len])
        self.label_ids = tf.placeholder(tf.int32, [None])
        self.is_training = tf.placeholder(tf.bool)

        use_one_hot_embeddings = False
        self.loss, self.logits, probabilities, model, self.accuracy = create_model(
            bert_config,
            self.is_training,
            self.input_ids,
            self.input_mask,
            self.segment_ids,
            self.label_ids,
            self.num_classes,
            use_one_hot_embeddings,
        )
        global_step = tf.Variable(0, trainable=False, name='Global_Step')
        self.optimizer = tf.contrib.layers.optimize_loss(
            self.loss,
            global_step=global_step,
            learning_rate=self.learning_rate,
            optimizer='Adam',
            clip_gradients=3.0,
        )
        tf.summary.scalar("loss", self.loss)
        self.summary_op = tf.summary.merge_all()
        self.saver = tf.train.Saver(tf.global_variables())
Example #15
    def get_processor(self, task_name="pico"):

        processors = {"pico": PICOProcessor}
        tokenization.validate_case_matches_checkpoint(
            config.do_lower_case, config.init_checkpoint_dependency)
        bert_config = modeling.BertConfig.from_json_file(
            config.bert_config_file)

        if config.max_seq_length > bert_config.max_position_embeddings:
            raise ValueError(
                "Cannot use sequence length %d because the BERT model "
                "was only trained up to sequence length %d" %
                (config.max_seq_length, bert_config.max_position_embeddings))

        if task_name not in processors:
            raise ValueError("Task not found: %s" % (task_name))

        processor = processors[task_name]()
        return processor
Example #16
    def get_estimator(self, processor):
        tokenization.validate_case_matches_checkpoint(
            config.do_lower_case, config.init_checkpoint_dependency)
        bert_config = modeling.BertConfig.from_json_file(
            config.bert_config_file)
        tokenizer = tokenization.FullTokenizer(
            vocab_file=config.vocab_file, do_lower_case=config.do_lower_case)
        tpu_cluster_resolver = None

        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            master=None,
            model_dir=config.bluebert_dependency_dir,
            save_checkpoints_steps=1000,
            tpu_config=tf.contrib.tpu.TPUConfig(
                iterations_per_loop=1000,
                num_shards=8,
                per_host_input_for_training=is_per_host))

        train_examples = None
        num_train_steps = None
        num_warmup_steps = None
        label_list = processor.get_labels()
        model_fn = model_fn_builder(
            bert_config=bert_config,
            num_labels=len(label_list),
            init_checkpoint=config.init_checkpoint_dependency,
            learning_rate=config.learning_rate,
            num_train_steps=num_train_steps,
            num_warmup_steps=num_warmup_steps,
            use_tpu=False,
            use_one_hot_embeddings=False)

        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.pred_batch_size,
            eval_batch_size=config.pred_batch_size,
            predict_batch_size=config.pred_batch_size)
        return estimator
Example #17
def bert(path, s3_path, class_name, label, validate=True):
    try:
        from bert import tokenization
    except:
        raise Exception(
            'bert-tensorflow not installed. Please install it using `pip3 install bert-tensorflow` and try again.'
        )
    if validate:
        check_file(path['bert'], s3_path['bert'])
    else:
        if not check_available(path['bert']):
            raise Exception(
                '%s/bert is not available, please `validate = True`' %
                (class_name))

    tokenization.validate_case_matches_checkpoint(False, '')
    tokenizer = tokenization.FullTokenizer(vocab_file=path['bert']['vocab'],
                                           do_lower_case=False)
    try:
        g = load_graph(path['bert']['model'])
    except:
        raise Exception(
            "model corrupted due to some reasons, please run malaya.clear_cache('%s/bert') and try again"
            % (class_name))

    if len(label) > 2:
        selected_class = MULTICLASS_BERT
    else:
        selected_class = BINARY_BERT

    return selected_class(
        X=g.get_tensor_by_name('import/Placeholder:0'),
        segment_ids=g.get_tensor_by_name('import/Placeholder_1:0'),
        input_masks=g.get_tensor_by_name('import/Placeholder_2:0'),
        logits=g.get_tensor_by_name('import/logits:0'),
        sess=generate_session(graph=g),
        tokenizer=tokenizer,
        maxlen=100,
        label=label,
    )
Example #18
def test_main():
    ID = 'id'
    DATA_COLUMN = 'content'
    LABEL_COLUMNS = ['environment', 'price_level', 'traffic', 'food']
    num_labels = len(LABEL_COLUMNS)
    use_one_hot_embeddings = False
    MAX_SEQ_LENGTH = 128
    BATCH_SIZE = 4

    os.chdir(r'E:\Toxic_BERT_multi_task')

    # Load the tokenizer and model
    BERT_VOCAB = 'chinese_L-12_H-768_A-12/vocab.txt'  # model vocabulary
    BERT_INIT_CHKPNT = 'output/model.ckpt'  # pre-trained model weights
    BERT_CONFIG = 'chinese_L-12_H-768_A-12/bert_config.json'  # BERT model architecture

    # Check that the lower-casing setting matches the checkpoint
    tokenization.validate_case_matches_checkpoint(True, BERT_INIT_CHKPNT)
    tokenizer = tokenization.FullTokenizer(vocab_file=BERT_VOCAB,
                                           do_lower_case=True)
    tokenizer.tokenize('查看中文分词效果。')

    # test = pd.read_csv('reforcement_test.csv')
    # x_test = test[:100][['id', 'content']] #testing a small sample
    # x_test = x_test.reset_index(drop=True)

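    # Build the model_fn and Estimator, restoring weights from BERT_INIT_CHKPNT.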
    bert_config = modeling.BertConfig.from_json_file(BERT_CONFIG)
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(LABEL_COLUMNS),
                                init_checkpoint=BERT_INIT_CHKPNT,
                                use_one_hot_embeddings=False)
    estimator = tf.estimator.Estimator(model_fn,
                                       params={"batch_size": BATCH_SIZE})

    return estimator, tokenizer
Example #19
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.io.gfile.makedirs(FLAGS.output_dir)
    tf.logging.info("***** FLAGS *****")
    writer = tf.io.gfile.GFile(
        f"{FLAGS.output_dir}/{FLAGS.al_query_strategy}_flags.txt", "w+")
    for key, val in FLAGS.__flags.items():
        tf.logging.info("  %s = %s", key, str(val.value))
        writer.write("%s = %s\n" % (key, str(val.value)))
    writer.close()

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)
    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)
    is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2

    # Active learning procedure
    results = active_learning_procedure(
        FLAGS.al_query_strategy, tokenizer, bert_config, FLAGS.data_dir,
        FLAGS.output_dir, FLAGS.finetune_module, tpu_cluster_resolver,
        is_per_host, FLAGS.max_seq_length, FLAGS.use_tpu,
        FLAGS.predict_batch_size, "train", FLAGS.n_queries, FLAGS.n_instances,
        FLAGS.sample_size, FLAGS.num_init_train_epochs,
        FLAGS.num_query_train_epochs, FLAGS.retrain_all,
        FLAGS.convert_tsv_to_tfrecord)
Example #20
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    if FLAGS.input_file_processor == "run_classifier":
        processors = {
            "sst-2": rc.SST2Processor,
            "mnli": rc.MnliProcessor,
        }
    elif FLAGS.input_file_processor == "run_classifier_distillation":
        processors = {
            "sst-2": rc.SST2ProcessorDistillation,
            "mnli": rc.MNLIProcessorDistillation,
        }
    else:
        raise ValueError("Invalid --input_file_processor flag value")

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    task_name = FLAGS.task_name.lower()
    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    input_ids_placeholder = tf.placeholder(dtype=tf.int32,
                                           shape=[None, FLAGS.max_seq_length])

    bert_input_mask_placeholder = tf.placeholder(
        dtype=tf.int32, shape=[None, FLAGS.max_seq_length])

    token_type_ids_placeholder = tf.placeholder(
        dtype=tf.int32, shape=[None, FLAGS.max_seq_length])

    prob_vector_placeholder = tf.placeholder(dtype=tf.float32,
                                             shape=[None, num_labels])

    one_hot_input_ids = tf.one_hot(input_ids_placeholder,
                                   depth=bert_config.vocab_size)

    input_tensor, _ = em_util.run_one_hot_embeddings(
        one_hot_input_ids=one_hot_input_ids, config=bert_config)

    flex_input_obj, per_eg_obj, probs = em_util.model_fn(
        input_tensor=input_tensor,
        bert_input_mask=bert_input_mask_placeholder,
        token_type_ids=token_type_ids_placeholder,
        bert_config=bert_config,
        num_labels=num_labels,
        obj_type=FLAGS.obj_type,
        prob_vector=prob_vector_placeholder)

    if FLAGS.obj_type.startswith("min"):
        final_obj = -1 * flex_input_obj
    elif FLAGS.obj_type.startswith("max"):
        final_obj = flex_input_obj

    # Calculate the gradient of the final loss function with respect to
    # the one-hot input space
    grad_obj_one_hot = tf.gradients(ys=final_obj, xs=one_hot_input_ids)[0]

    # gradients with respect to position in one hot input space with 1s in it
    # this is one term in the directional derivative of HotFlip,
    # Eq1 in https://arxiv.org/pdf/1712.06751.pdf
    #
    # grad_obj_one_hot.shape = [batch_size, seq_length, vocab_size]
    # input_ids_placeholder.shape = [batch_size, seq_length]
    # original_token_gradients.shape = [batch_size, seq_length]
    original_token_gradients = tf.gather(params=grad_obj_one_hot,
                                         indices=tf.expand_dims(
                                             input_ids_placeholder, -1),
                                         batch_dims=2)
    original_token_gradients = tf.tile(original_token_gradients,
                                       multiples=[1, 1, FLAGS.beam_size])

    # These are the gradients / indices whose one-hot position has the largest
    # gradient magnitude; this performs part of the max calculation in Eq10 of
    # https://arxiv.org/pdf/1712.06751.pdf
    biggest_gradients, biggest_indices = tf.nn.top_k(input=grad_obj_one_hot,
                                                     k=FLAGS.beam_size)

    # Eq10 of https://arxiv.org/pdf/1712.06751.pdf
    grad_difference = biggest_gradients - original_token_gradients

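    # Map the trainable variables to the pre-trained checkpoint and restore them
    # before creating the session.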
    tvars = tf.trainable_variables()

    assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
        tvars, FLAGS.init_checkpoint)

    tf.logging.info("Variables mapped = %d / %d", len(assignment_map),
                    len(tvars))

    tf.train.init_from_checkpoint(FLAGS.init_checkpoint, assignment_map)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    if FLAGS.input_file:
        custom_examples = processor.get_custom_examples(FLAGS.input_file)
        custom_templates = [
            em_util.input_to_template(x, label_list) for x in custom_examples
        ]
    else:
        prob_vector = [float(x) for x in FLAGS.prob_vector.split(",")]
        custom_templates = [(FLAGS.input_template, prob_vector)]

    num_input_sequences = custom_templates[0][0].count("[SEP]")

    if FLAGS.flipping_mode == "beam_search":
        FLAGS.batch_size = 1

    detok_partial = functools.partial(em_util.detokenize, tokenizer=tokenizer)

    # Since input files will often be quite large, this flag allows processing
    # only a slice of the input file
    if FLAGS.input_file_range:
        start_index, end_index = FLAGS.input_file_range.split("-")
        if start_index == "start":
            start_index = 0
        if end_index == "end":
            end_index = len(custom_templates)
        start_index, end_index = int(start_index), int(end_index)
    else:
        start_index = 0
        end_index = len(custom_templates)

    tf.logging.info("Processing examples in range %d, %d", start_index,
                    end_index)

    all_elements = []

    too_long = 0

    for ip_num, (ip_template, prob_vector) in enumerate(
            custom_templates[start_index:end_index]):
        # Parse the input template into a list of IDs and the corresponding mask.
        # Different segments in template are separated by " <piece> "
        # Each segment is associated with a word piece (or [EMPTY] to get flex
        # inputs) and a frequency. (which is separated by "<freq>"). * can be used
        # to choose a frequency till the end of the string
        #
        # Here is an example 2-sequence template for tasks like MNLI to optimize
        # 20 vectors, (10 for each sequence)
        # [CLS]<freq>1 <piece> [EMPTY]<freq>10 <piece> [SEP]<freq>1 <piece> \
        # [EMPTY]<freq>10 <piece> [SEP]<freq>1 <piece> [PAD]<freq>*
        (input_ids, input_mask, bert_input_mask,
         token_type_ids) = em_util.template_to_ids(
             template=ip_template,
             config=bert_config,
             tokenizer=tokenizer,
             max_seq_length=FLAGS.max_seq_length)

        if len(input_ids) > FLAGS.max_seq_length:
            # truncate them!
            input_ids = input_ids[:FLAGS.max_seq_length]
            input_mask = input_mask[:FLAGS.max_seq_length]
            bert_input_mask = bert_input_mask[:FLAGS.max_seq_length]
            token_type_ids = token_type_ids[:FLAGS.max_seq_length]
            too_long += 1

        all_elements.append({
            "input_ids": input_ids,
            "original_input_ids": [ii for ii in input_ids],
            "ip_num": start_index + ip_num,
            "score": 0.0,
            "bert_input_mask": bert_input_mask,
            "input_mask": input_mask,
            "token_type_ids": token_type_ids,
            "prob_vector": prob_vector,
            "stopped": False,
            "steps_taken": 0
        })

    tf.logging.info("%d / %d were too long and hence truncated.", too_long,
                    len(all_elements))

    iteration_number = 0
    consistent_output_sequences = []

    while all_elements and iteration_number < 10:

        steps_taken = []
        output_sequences = []
        failures = []
        zero_step_instances = 0

        iteration_number += 1
        tf.logging.info("Starting iteration number %d", iteration_number)
        tf.logging.info("Pending items = %d / %d", len(all_elements),
                        len(custom_templates[start_index:end_index]))

        batch_elements = []
        for ip_num, input_object in enumerate(all_elements):
            batch_elements.append(input_object)
            # wait until the input has populated up to the batch size
            if (len(batch_elements) < FLAGS.batch_size
                    and ip_num < len(all_elements) - 1):
                continue

            # optimize a part of the flex_input (depending on the template)
            for step_num in range(FLAGS.total_steps):
                feed_dict = {
                    input_ids_placeholder:
                    np.array([x["input_ids"] for x in batch_elements]),
                    bert_input_mask_placeholder:
                    np.array([x["bert_input_mask"] for x in batch_elements]),
                    token_type_ids_placeholder:
                    np.array([x["token_type_ids"] for x in batch_elements]),
                    prob_vector_placeholder:
                    np.array([x["prob_vector"] for x in batch_elements])
                }

                if FLAGS.flipping_mode == "random":
                    # Avoiding the gradient computation when the flipping mode is random
                    peo, pr = sess.run([per_eg_obj, probs],
                                       feed_dict=feed_dict)
                else:
                    peo, gd, bi, pr = sess.run(
                        [per_eg_obj, grad_difference, biggest_indices, probs],
                        feed_dict=feed_dict)

                if FLAGS.print_flips:
                    output_log = "\n" + "\n".join([
                        "Objective = %.4f, Score = %.4f, Element %d = %s" %
                        (obj, elem["score"], kk,
                         detok_partial(elem["input_ids"]))
                        for kk, (obj,
                                 elem) in enumerate(zip(peo, batch_elements))
                    ])
                    tf.logging.info("Step = %d %s\n", step_num, output_log)

                should_stop = evaluate_stopping(
                    stopping_criteria=FLAGS.stopping_criteria,
                    obj_prob_vector=np.array(
                        [x["prob_vector"] for x in batch_elements]),
                    curr_prob_vector=pr,
                    per_example_objective=peo)

                for elem, stop_bool in zip(batch_elements, should_stop):
                    if stop_bool and (not elem["stopped"]):
                        if step_num == 0:
                            # don't actually stop the perturbation since we want a new input
                            zero_step_instances += 1
                        else:
                            elem["stopped"] = True
                            elem["steps_taken"] = step_num

                if np.all([elem["stopped"] for elem in batch_elements]):
                    steps_taken.extend(
                        [elem["steps_taken"] for elem in batch_elements])
                    output_sequences.extend([elem for elem in batch_elements])
                    batch_elements = []
                    break

                if step_num == FLAGS.total_steps - 1:
                    failures.extend([
                        elem for elem in batch_elements if not elem["stopped"]
                    ])
                    steps_taken.extend([
                        elem["steps_taken"] for elem in batch_elements
                        if elem["stopped"]
                    ])
                    output_sequences.extend(
                        [elem for elem in batch_elements if elem["stopped"]])
                    batch_elements = []
                    break

                # Flip a token / word-piece either systematically or randomly
                # For instances where hotflip was not successful, do some random
                # perturbations before doing hotflip
                if (FLAGS.flipping_mode == "random" or
                    (iteration_number > 1 and step_num < iteration_number)):
                    for element in batch_elements:
                        # don't perturb elements which have stopped
                        if element["stopped"]:
                            continue

                        random_seq_index = np.random.choice([
                            ii
                            for ii, mask_id in enumerate(element["input_mask"])
                            if mask_id > 0.5
                        ])

                        random_token_id = np.random.randint(
                            len(tokenizer.vocab))
                        while (tokenizer.inv_vocab[random_token_id][0] == "["
                               and tokenizer.inv_vocab[random_token_id][-1]
                               == "]"):
                            random_token_id = np.random.randint(
                                len(tokenizer.vocab))

                        element["input_ids"][
                            random_seq_index] = random_token_id

                elif FLAGS.flipping_mode == "greedy":
                    batch_elements = greedy_updates(
                        old_elements=batch_elements,
                        grad_difference=gd,
                        biggest_indices=bi,
                        max_seq_length=FLAGS.max_seq_length)

                elif FLAGS.flipping_mode == "beam_search":
                    # only supported with a batch size of 1!
                    batch_elements = beam_search(
                        old_beams=batch_elements,
                        grad_difference=gd,
                        biggest_indices=bi,
                        beam_size=FLAGS.beam_size,
                        accumulate_scores=FLAGS.accumulate_scores,
                        max_seq_length=FLAGS.max_seq_length)

                else:
                    raise ValueError("Invalid --flipping_mode flag value")

            tf.logging.info("steps = %.4f (%d failed, %d non-zero, %d zero)",
                            np.mean([float(x) for x in steps_taken if x > 0]),
                            len(failures),
                            len([x for x in steps_taken if x > 0]),
                            zero_step_instances)

        # Measure consistency of the final dataset: run a forward pass through the
        # entire final dataset and verify it satisfies the original objective.
        # If the code runs correctly, total_inconsistent will be 0.
        tf.logging.info("Measuring consistency of final dataset")

        total_inconsistent = 0
        total_lossy = 0

        for i in range(0, len(output_sequences), FLAGS.batch_size):
            batch_elements = output_sequences[i:i + FLAGS.batch_size]
            feed_dict = {
                input_ids_placeholder:
                np.array([x["input_ids"] for x in batch_elements]),
                bert_input_mask_placeholder:
                np.array([x["bert_input_mask"] for x in batch_elements]),
                token_type_ids_placeholder:
                np.array([x["token_type_ids"] for x in batch_elements]),
                prob_vector_placeholder:
                np.array([x["prob_vector"] for x in batch_elements])
            }
            peo, pr = sess.run([per_eg_obj, probs], feed_dict=feed_dict)
            consistency_flags = evaluate_stopping(
                stopping_criteria=FLAGS.stopping_criteria,
                obj_prob_vector=np.array(
                    [x["prob_vector"] for x in batch_elements]),
                curr_prob_vector=pr,
                per_example_objective=peo)
            total_inconsistent += len(batch_elements) - np.sum(
                consistency_flags)

            # Next, apply a lossy perturbation to the input (conversion to a string)
            # This is often lossy since it eliminates impossible sequences and
            # incorrect tokenizations. We check how many consistencies still hold true
            all_detok_strings = [
                em_util.ids_to_strings(elem["input_ids"], tokenizer)
                for elem in batch_elements
            ]

            all_ip_examples = []
            if num_input_sequences == 1:
                for ds, be in zip(all_detok_strings, batch_elements):
                    prob_vector_labels = be["prob_vector"].tolist()
                    all_ip_examples.append(
                        rc.InputExample(text_a=ds[0],
                                        text_b=None,
                                        label=prob_vector_labels,
                                        guid=None))
            else:
                for ds, be in zip(all_detok_strings, batch_elements):
                    prob_vector_labels = be["prob_vector"].tolist()
                    all_ip_examples.append(
                        rc.InputExample(text_a=ds[0],
                                        text_b=ds[1],
                                        label=prob_vector_labels,
                                        guid=None))

            all_templates = [
                em_util.input_to_template(aie, label_list)
                for aie in all_ip_examples
            ]
            all_new_elements = []
            for ip_template, prob_vector in all_templates:
                (input_ids, input_mask, bert_input_mask,
                 token_type_ids) = em_util.template_to_ids(
                     template=ip_template,
                     config=bert_config,
                     tokenizer=tokenizer,
                     max_seq_length=FLAGS.max_seq_length)

                if len(input_ids) > FLAGS.max_seq_length:
                    input_ids = input_ids[:FLAGS.max_seq_length]
                    input_mask = input_mask[:FLAGS.max_seq_length]
                    bert_input_mask = bert_input_mask[:FLAGS.max_seq_length]
                    token_type_ids = token_type_ids[:FLAGS.max_seq_length]

                all_new_elements.append({
                    "input_ids": input_ids,
                    "input_mask": input_mask,
                    "bert_input_mask": bert_input_mask,
                    "token_type_ids": token_type_ids,
                    "prob_vector": prob_vector
                })
            feed_dict = {
                input_ids_placeholder:
                np.array([x["input_ids"] for x in all_new_elements]),
                bert_input_mask_placeholder:
                np.array([x["bert_input_mask"] for x in all_new_elements]),
                token_type_ids_placeholder:
                np.array([x["token_type_ids"] for x in all_new_elements]),
                prob_vector_placeholder:
                np.array([x["prob_vector"] for x in all_new_elements])
            }
            peo, pr = sess.run([per_eg_obj, probs], feed_dict=feed_dict)
            lossy_consistency_flags = evaluate_stopping(
                stopping_criteria=FLAGS.stopping_criteria,
                obj_prob_vector=np.array(
                    [x["prob_vector"] for x in all_new_elements]),
                curr_prob_vector=pr,
                per_example_objective=peo)

            total_lossy += len(all_new_elements) - np.sum(
                lossy_consistency_flags)

            net_consistency_flags = np.logical_and(consistency_flags,
                                                   lossy_consistency_flags)

            for elem, ncf in zip(batch_elements, net_consistency_flags):
                if ncf:
                    consistent_output_sequences.append(elem)
                else:
                    failures.append(elem)

        tf.logging.info("Total inconsistent found = %d / %d",
                        total_inconsistent, len(output_sequences))
        tf.logging.info("Total lossy inconsistent found = %d / %d",
                        total_lossy, len(output_sequences))
        tf.logging.info("Total consistent outputs so far = %d / %d",
                        len(consistent_output_sequences),
                        len(custom_templates[start_index:end_index]))

        # Getting ready for next iteration of processing
        if iteration_number < 10:
            for elem in failures:
                elem["input_ids"] = [x for x in elem["original_input_ids"]]
                elem["stopped"] = False
                elem["steps_taken"] = 0
                elem["score"] = 0.0
            all_elements = failures

    tf.logging.info("Giving up on %d instances!", len(failures))
    for elem in failures:
        consistent_output_sequences.append(elem)

    if FLAGS.output_file:
        final_output = []
        for op_num, elem in enumerate(consistent_output_sequences):
            detok_strings = em_util.ids_to_strings(elem["input_ids"],
                                                   tokenizer)

            if num_input_sequences == 1:
                final_output.append("%d\t%d\t%s" %
                                    (op_num, elem["ip_num"], detok_strings[0]))
            elif num_input_sequences == 2:
                final_output.append("%d\t%d\t%s\t%s" %
                                    (op_num, elem["ip_num"], detok_strings[0],
                                     detok_strings[1]))

        if num_input_sequences == 1:
            header = "index\toriginal_index\tsentence"
        elif num_input_sequences == 2:
            header = "index\toriginal_index\tsentence1\tsentence2"

        final_output = [header] + final_output

        with tf.gfile.Open(FLAGS.output_file, "w") as f:
            f.write("\n".join(final_output) + "\n")

    return
Example #21
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    if not FLAGS.do_train and not FLAGS.do_eval:
        raise ValueError("At least one of `do_train`, `do_eval` must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
    run_config = contrib_tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=contrib_tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

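    # Derive the training schedule (total steps and warmup steps) from the dataset
    # size, batch size, number of epochs, and warmup proportion.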
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        num_train_steps = int(
            FLAGS.train_data_size / FLAGS.train_batch_size) * FLAGS.epochs
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = contrib_tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        if not tf.gfile.Exists(FLAGS.train_file):
            tf.logging.info(
                "DANITER:File doesn't exist, creating tfrecord data")
            examples = model_builder.load_hellaswag(FLAGS.train_raw_data)
            tf.logging.info("DANITER:Read raw data as json")
            model_builder.file_based_convert_examples_for_bilinear(
                examples, 512, tokenizer, FLAGS.train_file, do_copa=True)
        train_input_fn = file_based_input_fn_builder(
            input_file=FLAGS.train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, steps=num_train_steps)

    if FLAGS.do_eval:
        # This tells the estimator to run through the entire set.
        if FLAGS.eval_data_size < 0:
            eval_steps = None
        else:
            eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        if not tf.gfile.Exists(FLAGS.eval_file):
            examples = model_builder.load_hellaswag(FLAGS.eval_raw_data)
            model_builder.file_based_convert_examples_for_bilinear(
                examples, 512, tokenizer, FLAGS.eval_file, do_copa=True)
        eval_input_fn = file_based_input_fn_builder(
            input_file=FLAGS.eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        def _find_valid_cands(curr_step):
            filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
            candidates = []
            for filename in filenames:
                if filename.endswith(".index"):
                    ckpt_name = filename[:-6]
                    idx = ckpt_name.split("-")[-1]
                    if idx != "best" and int(idx) > curr_step:
                        candidates.append(filename)
            return candidates

        tf.logging.info("Evaling all models in output dir")
        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        checkpoint_path = os.path.join(FLAGS.output_dir, "model.ckpt-best")
        key_name = "eval_accuracy"
        tf.logging.info("Checkpoint path " + checkpoint_path)
        if tf.gfile.Exists(checkpoint_path + ".index"):
            tf.logging.info("Found a best model... not good")
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=checkpoint_path)
            best_perf = result[key_name]
            global_step = result["global_step"]
        else:
            tf.logging.info("Setting global step to -1")
            global_step = -1
            best_perf = -1
            checkpoint_path = None
        tf.logging.info("Openning writer " + output_eval_file)
        writer = tf.gfile.GFile(output_eval_file, "w")

        steps_and_files = {}
        filenames = tf.gfile.ListDirectory(FLAGS.output_dir)
        tf.logging.info("Models found " + "\n".join(filenames))
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.output_dir, ckpt_name)
                if cur_filename.split("-")[-1] == "best":
                    continue
                gstep = int(cur_filename.split("-")[-1])
                if gstep not in steps_and_files:
                    tf.logging.info(
                        "Add {} to eval list.".format(cur_filename))
                    steps_and_files[gstep] = cur_filename
        tf.logging.info("found {} files.".format(len(steps_and_files)))
        # steps_and_files = sorted(steps_and_files, key=lambda x: x[0])
        if not steps_and_files:
            tf.logging.info(
                "found 0 file, global step: {}. Sleeping.".format(global_step))
        else:
            for ele in sorted(steps_and_files.items()):
                step, checkpoint_path = ele
                if global_step >= step:
                    if len(_find_valid_cands(step)) > 1:
                        for ext in ["meta", "data-00000-of-00001", "index"]:
                            src_ckpt = checkpoint_path + ".{}".format(ext)
                            tf.logging.info("removing {}".format(src_ckpt))
                            # Why should we remove checkpoints?
                            # tf.gfile.Remove(src_ckpt)
                    tf.logging.info("Skipping candidate for some reason")
                    continue
                result = estimator.evaluate(input_fn=eval_input_fn,
                                            steps=eval_steps,
                                            checkpoint_path=checkpoint_path)
                global_step = result["global_step"]
                tf.logging.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    tf.logging.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))
                writer.write("best = {}\n".format(best_perf))

                if len(_find_valid_cands(global_step)) > 1:
                    for ext in ["meta", "data-00000-of-00001", "index"]:
                        src_ckpt = checkpoint_path + ".{}".format(ext)
                        tf.logging.info("removing {}".format(src_ckpt))
                        # tf.gfile.Remove(src_ckpt)
                writer.write("=" * 50 + "\n")
        writer.close()
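
The loop above compares each checkpoint against an existing `model.ckpt-best` checkpoint but never writes one. Below is a minimal sketch of how the best checkpoint could be promoted after each evaluation; the `_copy_ckpt` helper and its placement inside the evaluation loop are assumptions, not part of the original script.

def _copy_ckpt(src_prefix, dst_prefix):
    # Hypothetical helper: copy the three files that make up a TF 1.x checkpoint.
    for ext in ["meta", "data-00000-of-00001", "index"]:
        tf.gfile.Copy(src_prefix + "." + ext, dst_prefix + "." + ext, overwrite=True)

# Inside the evaluation loop, after `result = estimator.evaluate(...)`:
if result[key_name] > best_perf:
    best_perf = result[key_name]
    _copy_ckpt(checkpoint_path, os.path.join(FLAGS.output_dir, "model.ckpt-best"))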
Example #22
0
def main(_):
  os.environ["TF_CPP_MIN_LOG_LEVEL"] = "0"

  # Load emotion categories
  with open(FLAGS.emotion_file, "r") as f:
    all_emotions = f.read().splitlines()
    if FLAGS.add_neutral:
      all_emotions = all_emotions + ["neutral"]
    idx2emotion = {i: e for i, e in enumerate(all_emotions)}
  num_labels = len(all_emotions)
  print("%d labels" % num_labels)
  print("Multilabel: %r" % FLAGS.multilabel)

  sentiment = FLAGS.sentiment
  entailment = FLAGS.entailment
  correlation = FLAGS.correlation

  # Create emotion distance matrix
  # If the regularization parameter is set to 0, don't load matrix.
  print("Getting distance matrix...")
  empty_rels = [[0] * num_labels for _ in range(num_labels)]  # independent rows (avoid aliasing)
  if sentiment == 0:
    sent_rels = empty_rels
  else:
    sent_rels = get_sent_rels(all_emotions)
  sent_groups = get_sentiment_groups(all_emotions)
  print(sent_rels)
  if entailment == 0:
    entailment_rels = empty_rels
    intensity_groups = empty_rels
  else:
    entailment_rels = get_entailment_rels(all_emotions)
    intensity_groups = get_intensity_groups(all_emotions)
  print(entailment_rels)
  if correlation == 0:
    corr_rels = empty_rels
  else:
    corr_rels = get_correlations(all_emotions)
  print(corr_rels)

  tf.logging.set_verbosity(tf.logging.INFO)

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_predict:
    raise ValueError("At least one of `do_train` or `do_predict' must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  processor = DataProcessor(num_labels, FLAGS.data_dir)  # set up preprocessor

  tokenizer = tokenization.FullTokenizer(
      vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case)

  run_config = tf.estimator.RunConfig(
      model_dir=FLAGS.output_dir,
      save_summary_steps=FLAGS.save_summary_steps,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      keep_checkpoint_max=FLAGS.keep_checkpoint_max)

  train_examples = None
  num_train_steps = None
  num_warmup_steps = None

  if FLAGS.do_train:
    train_examples = processor.get_examples("train", FLAGS.train_fname)
    eval_examples = processor.get_examples("dev", FLAGS.dev_fname)
    num_eval_examples = len(eval_examples)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    params = {
        "num_labels": num_labels,
        "learning_rate": FLAGS.learning_rate,
        "num_train_epochs": FLAGS.num_train_epochs,
        "warmup_proportion": FLAGS.warmup_proportion,
        "sentiment": FLAGS.sentiment,
        "entailment": FLAGS.entailment,
        "correlations": FLAGS.correlation,
        "batch_size": FLAGS.train_batch_size,
        "num_train_examples": len(train_examples),
        "num_eval_examples": num_eval_examples,
        "data_dir": FLAGS.data_dir,
        "output_dir": FLAGS.output_dir,
        "train_fname": FLAGS.train_fname,
        "dev_fname": FLAGS.dev_fname,
        "test_fname": FLAGS.test_fname
    }
    with open(os.path.join(FLAGS.output_dir, "config.json"), "w") as f:
      json.dump(params, f)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      num_labels=num_labels,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      multilabel=FLAGS.multilabel,
      sent_rels=sent_rels,
      sentiment=sentiment,
      entailment_rels=entailment_rels,
      entailment=entailment,
      corr_rels=corr_rels,
      correlation=correlation,
      idx2emotion=idx2emotion,
      sentiment_groups=sent_groups,
      intensity_groups=intensity_groups)

  estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": FLAGS.train_batch_size})

  if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
    file_based_convert_examples_to_features(eval_examples, FLAGS.max_seq_length,
                                            tokenizer, eval_file)

    tf.logging.info("***** Running training and evaluation *****")
    tf.logging.info("  Num train examples = %d", len(train_examples))
    tf.logging.info("  Num eval examples = %d", num_eval_examples)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num training steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True,
        num_labels=num_labels)
    train_spec = tf.estimator.TrainSpec(
        input_fn=train_input_fn, max_steps=num_train_steps)
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False,
        num_labels=num_labels)
    eval_spec = tf.estimator.EvalSpec(
        input_fn=eval_input_fn,
        steps=FLAGS.eval_steps,
        start_delay_secs=0,
        throttle_secs=1000)

    tf.estimator.train_and_evaluate(
        estimator, train_spec=train_spec, eval_spec=eval_spec)

  if FLAGS.calculate_metrics:

    # Setting the parameter to "dev" ensures that we get labels for the examples
    eval_examples = processor.get_examples("dev", FLAGS.test_fname)

    tf.logging.info("***** Running evaluation *****")
    tf.logging.info("  Num eval examples = %d", len(eval_examples))
    eval_file = os.path.join(FLAGS.output_dir, FLAGS.test_fname + ".tf_record")
    file_based_convert_examples_to_features(eval_examples, FLAGS.max_seq_length,
                                            tokenizer, eval_file)
    eval_input_fn = file_based_input_fn_builder(
        input_file=eval_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False,
        num_labels=num_labels)

    result = estimator.evaluate(input_fn=eval_input_fn, steps=None)
    output_eval_file = os.path.join(FLAGS.output_dir,
                                    FLAGS.test_fname + ".eval_results.txt")
    with tf.gfile.GFile(output_eval_file, "w") as writer:
      tf.logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        tf.logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

  if FLAGS.do_predict:
    # The input dataset must already be converted to this example format.
    predict_examples = processor.get_examples("test", FLAGS.test_fname)
    num_actual_predict_examples = len(predict_examples)

    predict_file = os.path.join(FLAGS.output_dir,
                                FLAGS.test_fname + ".tf_record")
    file_based_convert_examples_to_features(predict_examples,
                                            FLAGS.max_seq_length, tokenizer,
                                            predict_file)

    tf.logging.info("***** Running prediction*****")
    tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                    len(predict_examples), num_actual_predict_examples,
                    len(predict_examples) - num_actual_predict_examples)
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)

    predict_input_fn = file_based_input_fn_builder(
        input_file=predict_file,
        seq_length=FLAGS.max_seq_length,
        is_training=False,
        drop_remainder=False,
        num_labels=num_labels)

    # predict_input_fn streams the serialized examples in predict_file to the estimator.
    result = estimator.predict(input_fn=predict_input_fn)

    output_predict_file = os.path.join(FLAGS.output_dir,
                                       FLAGS.test_fname + ".predictions.tsv")
    output_labels = os.path.join(FLAGS.output_dir,
                                 FLAGS.test_fname + ".label_predictions.tsv")
    logits_file = open('logits_file_vanilla.txt', 'w')
    #prediction_file = open(FLAGS.test_fname+'_predictions.csv', 'w')
    preds_file = open(FLAGS.test_fname+'_k_bal_numb_predictions_bin.csv', 'w')
    with tf.gfile.GFile(output_predict_file, "w") as writer:
      with tf.gfile.GFile(output_labels, "w") as writer2:
        writer.write("\t".join(all_emotions) + "\n")
        writer2.write("\t".join([
            "text", "emotion_1", "prob_1", "emotion_2", "prob_2", "emotion_3",
            "prob_3"
        ]) + "\n")
        tf.logging.info("***** Predict results *****")
        num_written_lines = 0
        # Read the raw test TSV so predictions can be aligned with the original rows.
        df_file = pd.read_csv(os.path.join('data', FLAGS.test_fname), sep='\t', header=None)
        dict_store = dict()
        ctr = 0
        for (i, prediction) in enumerate(result):
          ctr += 1
          if i < 5:
            print(i, prediction["output_layer"], type(prediction["output_layer"]), file=logits_file)
          #dict_store[df_file.iloc[i, 2]] = prediction["output_layer"]
          dict_store[i] = prediction["output_layer"]
          probabilities = prediction["probabilities"]
          if i >= num_actual_predict_examples:
            break
          output_line = "\t".join(
              str(class_probability)
              for class_probability in probabilities) + "\n"
          sorted_idx = np.argsort(-probabilities)
          top_3_emotion = [idx2emotion[idx] for idx in sorted_idx[:3]]
          top_3_prob = [probabilities[idx] for idx in sorted_idx[:3]]
          pred_line = []
          for emotion, prob in zip(top_3_emotion, top_3_prob):
            if prob >= FLAGS.pred_cutoff:
              pred_line.extend([emotion, "%.4f" % prob])
            else:
              pred_line.extend(["", ""])
          writer.write(output_line)
          writer2.write(predict_examples[i].text + "\t" + "\t".join(pred_line) +
                        "\n")
          num_written_lines += 1
          #print(str(df_file.iloc[i,2])+","+str(top_3_emotion), file=prediction_file)
          print(str(df_file.iloc[i,2])+","+str(sorted_idx[0]), file=preds_file)
    logits_file.close()
    preds_file.close()
    assert num_written_lines == num_actual_predict_examples
    # Dump the raw output layers into a pickle file
    print(len(dict_store))
    print(ctr)
    with open(FLAGS.test_fname + '_k_bal_bin.pickle', 'wb') as handle:
      pickle.dump(dict_store, handle, protocol=pickle.HIGHEST_PROTOCOL)
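
For reference, a minimal sketch of reading back the pickle that the block above writes; the file name assumes `test_fname` was "test.tsv", and the stored values are whatever `prediction["output_layer"]` contained.

import pickle

with open("test.tsv_k_bal_bin.pickle", "rb") as handle:
    stored = pickle.load(handle)
print(len(stored), "entries")
first = stored[0]
print(type(first), getattr(first, "shape", None))  # inspect what the model emitted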
Example #23
0
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                FLAGS.init_checkpoint)

  if not FLAGS.do_train and not FLAGS.do_eval:
    raise ValueError("At least one of `do_train`, `do_eval` must be True.")

  bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

  if FLAGS.max_seq_length > bert_config.max_position_embeddings:
    raise ValueError(
        "Cannot use sequence length %d because the BERT model "
        "was only trained up to sequence length %d" %
        (FLAGS.max_seq_length, bert_config.max_position_embeddings))

  tf.gfile.MakeDirs(FLAGS.output_dir)

  tpu_cluster_resolver = None
  if FLAGS.use_tpu and FLAGS.tpu_name:
    tpu_cluster_resolver = contrib_cluster_resolver.TPUClusterResolver(
        FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

  is_per_host = contrib_tpu.InputPipelineConfig.PER_HOST_V2
  run_config = contrib_tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      master=FLAGS.master,
      model_dir=FLAGS.output_dir,
      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
      tpu_config=contrib_tpu.TPUConfig(
          iterations_per_loop=FLAGS.iterations_per_loop,
          num_shards=FLAGS.num_tpu_cores,
          per_host_input_for_training=is_per_host))

  num_train_steps = None
  num_warmup_steps = None
  if FLAGS.do_train:
    num_train_steps = int(FLAGS.train_data_size / FLAGS.train_batch_size)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

  model_fn = model_fn_builder(
      bert_config=bert_config,
      init_checkpoint=FLAGS.init_checkpoint,
      learning_rate=FLAGS.learning_rate,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps,
      use_tpu=FLAGS.use_tpu,
      use_one_hot_embeddings=FLAGS.use_tpu,
      num_choices=FLAGS.num_choices,
      add_masking=FLAGS.include_mlm)

  # If TPU is not available, this will fall back to normal Estimator on CPU
  # or GPU.
  estimator = contrib_tpu.TPUEstimator(
      use_tpu=FLAGS.use_tpu,
      model_fn=model_fn,
      config=run_config,
      train_batch_size=FLAGS.train_batch_size,
      eval_batch_size=FLAGS.eval_batch_size,
      predict_batch_size=FLAGS.predict_batch_size)

  if FLAGS.do_train:
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.train_file,
        is_training=True,
        drop_remainder=True,
        add_masking=FLAGS.include_mlm)
    estimator.train(input_fn=train_input_fn, steps=num_train_steps)

  if FLAGS.do_eval:
    # This tells the estimator to run through the entire set.
    if FLAGS.eval_data_size < 0:
      eval_steps = None
    else:
      eval_steps = int(FLAGS.eval_data_size / FLAGS.eval_batch_size)

    eval_drop_remainder = True if FLAGS.use_tpu else False
    # Note that we are masking inputs for eval as well as training and this will
    # decrease eval performance
    eval_input_fn = file_based_input_fn_builder(
        input_file=FLAGS.eval_file,
        is_training=False,
        drop_remainder=eval_drop_remainder,
        add_masking=FLAGS.include_mlm)

    # checkpoints_iterator blocks until a new checkpoint appears.
    for ckpt in contrib_training.checkpoints_iterator(estimator.model_dir):
      try:
        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)
        tf.logging.info("********** Eval results:*******\n")
        for key in sorted(result.keys()):
          tf.logging.info("%s = %s" % (key, str(result[key])))
      except tf.errors.NotFoundError:
        tf.logging.error("Checkpoint path '%s' no longer exists.", ckpt)
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "ske_2019": SKE_2019_Sequence_labeling_Processor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    token_label_list = processor.get_token_labels()
    predicate_label_list = processor.get_predicate_labels()

    num_token_labels = len(token_label_list)
    num_predicate_labels = len(predicate_label_list)

    token_label_id2label = {}
    for (i, label) in enumerate(token_label_list):
        token_label_id2label[i] = label
    predicate_label_id2label = {}
    for (i, label) in enumerate(predicate_label_list):
        predicate_label_id2label[i] = label

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_token_labels=num_token_labels,
                                num_predicate_labels=num_predicate_labels,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples,
                                                token_label_list,
                                                predicate_label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())
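            # For example, with 1,003 eval examples and eval_batch_size = 8, the loop
            # above appends 5 PaddingInputExample instances so that 1,008 % 8 == 0.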

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples,
                                                token_label_list,
                                                predicate_label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples,
                                                token_label_list,
                                                predicate_label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        result = estimator.predict(input_fn=predict_input_fn)
        token_label_output_predict_file = os.path.join(
            FLAGS.output_dir, "token_label_predictions.txt")
        predicate_output_predict_file = os.path.join(FLAGS.output_dir,
                                                     "predicate_predict.txt")
        predicate_output_probabilities_file = os.path.join(
            FLAGS.output_dir, "predicate_probabilities.txt")
        with open(token_label_output_predict_file, "w",
                  encoding='utf-8') as token_label_writer:
            with open(predicate_output_predict_file, "w",
                      encoding='utf-8') as predicate_predict_writer:
                with open(predicate_output_probabilities_file,
                          "w",
                          encoding='utf-8') as predicate_probabilities_writer:
                    num_written_lines = 0
                    tf.logging.info(
                        "***** token_label predict and predicate labeling results *****"
                    )
                    for (i, prediction) in enumerate(result):
                        token_label_prediction = prediction[
                            "token_label_predictions"]
                        predicate_probabilities = prediction[
                            "predicate_probabilities"]
                        predicate_prediction = prediction[
                            "predicate_prediction"]
                        if i >= num_actual_predict_examples:
                            break
                        token_label_output_line = " ".join(
                            token_label_id2label[id]
                            for id in token_label_prediction) + "\n"
                        token_label_writer.write(token_label_output_line)
                        predicate_predict_line = predicate_label_id2label[
                            predicate_prediction]
                        predicate_predict_writer.write(predicate_predict_line +
                                                       "\n")
                        predicate_probabilities_line = " ".join(
                            str(sigmoid_logit) for sigmoid_logit in
                            predicate_probabilities) + "\n"
                        predicate_probabilities_writer.write(
                            predicate_probabilities_line)
                        num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
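
As a usage note, here is a minimal sketch of reading the three prediction files back into aligned rows; the file names come from the block above, but the helper itself is not part of the original script.

import os

def read_predictions(output_dir):
    names = ["token_label_predictions.txt", "predicate_predict.txt",
             "predicate_probabilities.txt"]
    columns = []
    for name in names:
        with open(os.path.join(output_dir, name), encoding="utf-8") as f:
            columns.append([line.rstrip("\n") for line in f])
    # One row per predicted example: token labels, predicate label, sigmoid scores.
    for token_line, predicate, probs in zip(*columns):
        yield token_line.split(" "), predicate, [float(p) for p in probs.split(" ")]

# Example: for tokens, predicate, probs in read_predictions(FLAGS.output_dir): ...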
def finetune(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    # set parameters
    temp_dir = os.path.join(FLAGS.output_dir, 'temp')
    output_model_dir = os.path.join(FLAGS.output_dir, 'model')
    pretrained_models_dir = os.path.join(FLAGS.output_dir, 'pretrained_models',
                                         FLAGS.pretrained_model_folder)

    assert FLAGS.pretrained_model_folder is not None or (
        FLAGS.bert_config_file is not None and
        FLAGS.vocab_file is not None and
        FLAGS.pretrained_model_checkpoint is not None), \
        "Either the `pretrained_model_folder` has to be specified, or all three of the following " \
        "parameters: `bert_config_file`, `vocab_file`, and `pretrained_model_checkpoint`."

    if FLAGS.vocab_file is None:
        FLAGS.vocab_file = os.path.join(pretrained_models_dir, 'vocab.txt')
    if FLAGS.bert_config_file is None:
        FLAGS.bert_config_file = os.path.join(pretrained_models_dir,
                                              'bert_config.json')
    if FLAGS.pretrained_model_checkpoint is None:
        FLAGS.pretrained_model_checkpoint = os.path.join(
            pretrained_models_dir, 'bert_model.ckpt')

    # Validate the pre-trained model
    tokenization.validate_case_matches_checkpoint(
        FLAGS.do_lower_case, FLAGS.pretrained_model_checkpoint)

    # Load the BERT config
    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    # Validate the max_seq_length parameter
    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    # Create the temp output directory (if required)
    tf.gfile.MakeDirs(temp_dir)

    # Initialize the CustomDataProcessor and read the label list from the training data
    processor = CustomDataProcessor(str(FLAGS.data_type))
    label_list = processor.get_labels(FLAGS.train_data)

    # save the labels.txt file (create the model output directory first)
    tf.gfile.MakeDirs(output_model_dir)
    with open(os.path.join(output_model_dir, 'labels.txt'), 'w') as f:
        for label in label_list:
            f.write(str(label) + '\n')

    # Initialize the tokenizer
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    # Specify the `run_config` parameters:
    # (We don't support TPU in this version of the training script, but let's leave the original code in place.)
    tpu_cluster_resolver = None
    use_tpu = False
    master = None
    num_tpu_cores = 8
    # if use_tpu and tpu_name:
    #     tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
    #         tpu_name, zone=tpu_zone, project=gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=master,
        model_dir=temp_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=1,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=num_tpu_cores,
            per_host_input_for_training=is_per_host))

    # Finetuning section.
    train_examples = processor.get_train_examples(FLAGS.train_data)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(label_list),
        init_checkpoint=FLAGS.pretrained_model_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        use_tpu=use_tpu,
        use_one_hot_embeddings=use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size)

    train_file = os.path.join(temp_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    tf.logging.info("***** Running training *****")
    params = f"  Num examples = {len(train_examples)}\n" \
        f"  Batch size = {FLAGS.train_batch_size}\n" \
        f"  Num steps = {num_train_steps}\n" \
        f"  Epochs = {FLAGS.num_train_epochs}\n" \
        f"  Learning rate = {FLAGS.learning_rate}\n" \
        f"  warmup_proportion = {FLAGS.warmup_proportion}\n" \
        f"  max_seq_length = {FLAGS.max_seq_length}\n" \
        f"  do_lower_case = {FLAGS.do_lower_case}"
    tf.logging.info(params)
    print(params)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    latest_model = estimator.latest_checkpoint()

    # export to savedmodel
    tf.logging.info('exporting the model to savedmodel')
    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=latest_model,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=None,
                                num_warmup_steps=None,
                                use_tpu=False,
                                use_one_hot_embeddings=False)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    predict_batch_size = FLAGS.train_batch_size
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        predict_batch_size=predict_batch_size)

    estimator._export_to_tpu = False
    tf.logging.info(f'LATEST MODEL: {latest_model}')
    saved_model_path = estimator.export_savedmodel(
        output_model_dir, serving_input_fn,
        checkpoint_path=latest_model).decode("utf-8")
    # add the vocab.txt file as well
    shutil.move(os.path.join(pretrained_models_dir, 'vocab.txt'),
                os.path.join(os.path.dirname(saved_model_path), 'vocab.txt'))

    # clean up the temp folder
    shutil.rmtree(temp_dir, ignore_errors=True)

    # move the model files to the parent directory (to meet the WML convention)
    for filename in os.listdir(saved_model_path):
        shutil.move(os.path.join(saved_model_path, filename),
                    os.path.join(os.path.dirname(saved_model_path), filename))
    shutil.rmtree(saved_model_path, ignore_errors=True)

    # update the saved model path as well
    saved_model_path = os.path.dirname(saved_model_path)
    tf.logging.info(
        f'the saved model can be found in this directory: {saved_model_path}')
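
A minimal sketch of loading the exported SavedModel for inference follows; the feature names and the "examples" feed key assume a standard tf.Example-based `serving_input_fn`, which is not shown above, so treat them as assumptions.

import tensorflow as tf
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model(saved_model_path)

# Build one serialized tf.train.Example; the feature names and lengths must match
# what serving_input_fn expects (128 is only a placeholder for max_seq_length).
example = tf.train.Example(features=tf.train.Features(feature={
    "input_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=[0] * 128)),
    "input_mask": tf.train.Feature(int64_list=tf.train.Int64List(value=[0] * 128)),
    "segment_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=[0] * 128)),
    "label_ids": tf.train.Feature(int64_list=tf.train.Int64List(value=[0])),
}))
outputs = predict_fn({"examples": [example.SerializeToString()]})
print(outputs)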
Example #26
0
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    np.random.seed(FLAGS.random_seed)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model was only trained up to sequence length %d"
            % (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)
    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    data_dir = FLAGS.data_dir
    task_name = FLAGS.task_name.lower()
    processor = NluProcessor(data_dir, task_name)
    token_label_list = processor.get_token_labels()
    sent_label_list = processor.get_sent_labels()

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples()
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(bert_config=bert_config,
                                token_label_list=token_label_list,
                                sent_label_list=sent_label_list,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        export_to_tpu=FLAGS.use_tpu,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        tf.logging.info("***** Run training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)

        train_features = convert_examples_to_features(
            examples=train_examples,
            token_label_list=token_label_list,
            sent_label_list=sent_label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)

        train_input_fn = input_fn_builder(features=train_features,
                                          seq_length=FLAGS.max_seq_length,
                                          is_training=True,
                                          drop_remainder=True)

        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples()
        tf.logging.info("***** Run evaluation *****")
        tf.logging.info("  Num examples = %d", len(eval_examples))
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        eval_features = convert_examples_to_features(
            examples=eval_examples,
            token_label_list=token_label_list,
            sent_label_list=sent_label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)

        eval_input_fn = input_fn_builder(features=eval_features,
                                         seq_length=FLAGS.max_seq_length,
                                         is_training=False,
                                         drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn)

        token_precision = result["token_precision"]
        token_recall = result["token_recall"]
        token_f1_score = 2.0 * token_precision * token_recall / (
            token_precision + token_recall)

        sent_accuracy = result["sent_accuracy"]

        tf.logging.info("***** Evaluation result *****")
        tf.logging.info("  Precision (token-level) = %s", str(token_precision))
        tf.logging.info("  Recall (token-level) = %s", str(token_recall))
        tf.logging.info("  F1 score (token-level) = %s", str(token_f1_score))
        tf.logging.info("  Accuracy (sent-level) = %s", str(sent_accuracy))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples()
        tf.logging.info("***** Run prediction *****")
        tf.logging.info("  Num examples = %d", len(predict_examples))
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_features = convert_examples_to_features(
            examples=predict_examples,
            token_label_list=token_label_list,
            sent_label_list=sent_label_list,
            max_seq_length=FLAGS.max_seq_length,
            tokenizer=tokenizer)

        predict_input_fn = input_fn_builder(features=predict_features,
                                            seq_length=FLAGS.max_seq_length,
                                            is_training=False,
                                            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)

        predicts = [{
            "input_ids": feature.input_ids,
            "input_masks": feature.input_masks,
            "token_label_ids": feature.token_label_ids,
            "sent_label_id": feature.sent_label_id,
            "token_predict_ids": predict["token_predict"].tolist(),
            "sent_predict_id": predict["sent_predict"].tolist()
        } for feature, predict in zip(predict_features, result)]

        decoded_predicts = decode_predicts(predicts=predicts,
                                           token_label_list=token_label_list,
                                           sent_label_list=sent_label_list,
                                           max_seq_length=FLAGS.max_seq_length,
                                           tokenizer=tokenizer)

        predict_tag = FLAGS.predict_tag if FLAGS.predict_tag else str(
            time.time())
        output_path = os.path.join(FLAGS.output_dir,
                                   "predict.{0}.json".format(predict_tag))
        write_to_json(decoded_predicts, output_path)

    if FLAGS.do_export:
        tf.logging.info("***** Running exporting *****")
        tf.gfile.MakeDirs(FLAGS.export_dir)
        estimator.export_savedmodel(FLAGS.export_dir,
                                    serving_input_fn,
                                    as_text=False)
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "mrpc": data_cls_helper.MrpcProcessor,
        "snli": data_cls_helper.SnliProcessor,
        "sick": data_cls_helper.SickProcessor,
        "cola": data_cls_helper.ColaProcessor,
        "cr": data_cls_helper.CrProcessor,
        "mr": data_cls_helper.MrProcessor,
        "subj": data_cls_helper.SubjProcessor,
        "sst5": data_cls_helper.Sst5Processor,
        "sst2": data_cls_helper.Sst2Processor,
        "trec": data_cls_helper.TrecProcessor
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model was only trained up to sequence "
            "length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    if not os.path.exists(FLAGS.output_dir):
        tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % task_name)

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    run_config = tf.contrib.tpu.RunConfig(
        cluster=None,
        master=None,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=8,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.
            PER_HOST_V2))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.batch_size * FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_one_hot_embeddings=False)

    # If TPU is not available, this will fall back to normal Estimator on CPU or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=False,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=FLAGS.batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        batch_tokens, batch_labels = file_based_convert_examples_to_features(
            predict_examples, label_list, FLAGS.max_seq_length, tokenizer,
            predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.batch_size)

        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=False)

        result = estimator.predict(input_fn=predict_input_fn)

        output_predict_file = os.path.join(FLAGS.output_dir,
                                           "test_results.tsv")
        total_examples, correct_predicts = 0, 0
        with tf.gfile.GFile(output_predict_file, mode="w") as writer:
            num_written_lines = 0
            tf.logging.info("***** Predict results *****")
            for i, (tokens, label, prediction) in enumerate(
                    zip(batch_tokens, batch_labels, result)):
                probabilities = prediction["probabilities"]
                predict_label = prediction["predictions"]
                if i >= num_actual_predict_examples:
                    break
                total_examples += 1
                if predict_label == label:
                    correct_predicts += 1
                sentence = " ".join(tokens)
                class_probabilities = "\t".join(
                    str(class_probability)
                    for class_probability in probabilities)
                output_line = "\t".join([
                    sentence, class_probabilities,
                    str(label),
                    str(predict_label)
                ]) + "\n"
                writer.write(output_line)
                num_written_lines += 1
        assert num_written_lines == num_actual_predict_examples
        acc = float(correct_predicts) / float(total_examples)
        print("Test accuracy: {}".format(acc))
Example #28
0
def generate_embeddings(args):
    """Generates a set of word embeddings from the final four BERT
    layers.

    Parameters
    ----------
    args : Namespace
        Parsed arguments from argparse, containing all of the input arguments
    """
    time_start = time()

    print(tf.__version__)
    print("Num GPUs Available: ",
          len(tf.config.experimental.list_physical_devices("GPU")))

    # data
    df = pd.read_pickle(args.dataframe_path)
    topics = list(df)[5:]

    xtrain, xtest, ytrain, ytest = train_test_split(df["clean_text"],
                                                    df.iloc[:, 5:],
                                                    test_size=0.2,
                                                    random_state=42)
    xtrain, xdev, ytrain, ydev = train_test_split(xtrain,
                                                  ytrain,
                                                  test_size=0.25,
                                                  random_state=42)
    print(xtrain.shape, xtest.shape, ytrain.shape, ytest.shape)
    print(xtrain.shape, xdev.shape, ytrain.shape, ydev.shape)

    df_train = pd.concat([xtrain, ytrain], axis=1, ignore_index=True)
    df_dev = pd.concat([xdev, ydev], axis=1, ignore_index=True)
    df_test = pd.concat([xtest, ytest], axis=1, ignore_index=True)
    print(f"train shape: {df_train.shape}")
    print(f"val shape: {df_dev.shape}")
    print(f"test shape: {df_test.shape}")

    if args.stage == "train":
        examples = create_examples(df_train)
    if args.stage == "dev":
        examples = create_examples(df_dev)
    if args.stage == "test":
        examples = create_examples(df_test)

    input_fn = create_input_fn_from_examples(examples, args.stage,
                                             args.base_working_path,
                                             args.max_seq_length, len(topics))

    # model init
    bert_vocab = args.base_path + "/bert_vocab.txt"
    bert_init_chckpnt = args.base_path + "/bert_model.ckpt"
    bert_config = args.base_path + "/config.json"

    tokenization.validate_case_matches_checkpoint(True, bert_init_chckpnt)
    tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab,
                                           do_lower_case=True)

    output_dir = args.base_working_path + "/output"
    run_config = tf.estimator.RunConfig(
        model_dir=output_dir,
        save_summary_steps=args.save_summary_steps,
        keep_checkpoint_max=1,
        save_checkpoints_steps=args.save_checkpoint_steps,
    )

    bert_config = modeling.BertConfig.from_json_file(bert_config)
    model_fn = model_fn_builder(
        bert_config=bert_config,
        num_labels=len(topics),
        init_checkpoint=bert_init_chckpnt,
        learning_rate=args.learning_rate,
        num_train_steps=-1,
        num_warmup_steps=-1,
        use_tpu=False,
        use_one_hot_embeddings=False,
        layer_indexes=args.layer_indices,
    )

    estimator = tf.estimator.Estimator(model_fn=model_fn,
                                       config=run_config,
                                       params={"batch_size": args.batch_size})

    print("=" * 50)
    print(f"Beginning predict")
    print("=" * 50)

    # inference
    embeddings_list = np.empty(
        [len(examples), args.max_seq_length, args.embedding_size])
    label_ids_list = np.empty([len(examples), examples[0].labels.shape[0]])

    generate_bert_embeddings(
        args.mode,
        input_fn,
        embeddings_list,
        label_ids_list,
        estimator,
        tokenizer,
        args.layer_indices,
    )

    print("=" * 50)
    print(f"Embedding list size: {len(embeddings_list)}")
    print(f"Labels list size: {len(label_ids_list)}")
    print(
        f"Embedding size: {len(embeddings_list[0][0])}, {len(embeddings_list[0])}"
    )
    print(f"Saving...")

    dump_path = f"bert_{args.stage}_{args.mode}.npy"
    np.save(dump_path, embeddings_list)

    dump_path_labels = f"bert_{args.stage}_{args.mode}_labels.npy"
    np.save(dump_path_labels, label_ids_list)

    print("DONE")
    print("=" * 50)
    print(
        f"Finished generating {args.stage} BERT token level embeddings",
        f"in {time()-time_start} seconds.\nPath: {dump_path}",
    )
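
A minimal sketch of loading the arrays that generate_embeddings() saves; the placeholder values for `stage` and `mode` are assumptions, since they come from the parsed arguments.

import numpy as np

stage, mode = "train", "mean"  # placeholders for args.stage / args.mode
embeddings = np.load(f"bert_{stage}_{mode}.npy")       # (num_examples, max_seq_length, embedding_size)
labels = np.load(f"bert_{stage}_{mode}_labels.npy")    # (num_examples, num_topics)
print(embeddings.shape, labels.shape)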
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    processor = ccfKeyProcessor()

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
        raise ValueError(
            "At least one of `do_train`, `do_eval` or `do_predict' must be True."
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)
    tf.gfile.MakeDirs(FLAGS.model_dir)

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.model_dir,
        keep_checkpoint_max=10000,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = processor.get_train_examples(FLAGS.train_data_dir)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    if FLAGS.do_train:
        train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_file)
        tf.logging.info("***** Running training *****")
        tf.logging.info("  Num examples = %d", len(train_examples))
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=train_file,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.dev_data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
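            # Example: 1,003 eval examples with eval_batch_size 8 -> append 5
            # PaddingInputExamples so that 1,008 is divisible by 8.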
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        file_based_convert_examples_to_features(eval_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.model_dir)
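        # Every checkpoint leaves a "<ckpt_name>-<global_step>.index" file behind;
        # strip the ".index" suffix and parse the global step off the end of the name.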
        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.model_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        result_list = list()
        for global_step, filename in steps_and_files:
            result = estimator.evaluate(input_fn=eval_input_fn,
                                        steps=eval_steps,
                                        checkpoint_path=filename)
            result_list.append([global_step, result])

        for step, result in result_list:

            tf.logging.info("\n\n------ step ------" + str(step))
            pre, rec, f1 = get_metrics(result["cf"], 3)
            tf.logging.info("eval_precision: {}".format(pre))
            tf.logging.info("eval_recall: {}".format(rec))
            tf.logging.info("eval_f1: {}".format(f1))
            tf.logging.info("eval_accuracy: {}".format(
                result["eval_accuracy"]))
            tf.logging.info("eval_loss: {}".format(result["eval_loss"]))
            tf.logging.info("-------------------------\n\n")

    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.test_data_dir)
        num_actual_predict_examples = len(predict_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on.
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        file_based_convert_examples_to_features(predict_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)
        """
        change
        """
        # Filter out all checkpoints in the directory
        steps_and_files = []
        filenames = tf.gfile.ListDirectory(FLAGS.model_dir)

        for filename in filenames:
            if filename.endswith(".index"):
                ckpt_name = filename[:-6]
                cur_filename = os.path.join(FLAGS.model_dir, ckpt_name)
                global_step = int(cur_filename.split("-")[-1])
                tf.logging.info("Add {} to eval list.".format(cur_filename))
                steps_and_files.append([global_step, cur_filename])
        steps_and_files = sorted(steps_and_files, key=lambda x: x[0])

        # Decide whether to evaluate all ckpts
        if not FLAGS.eval_all_ckpt:
            steps_and_files = steps_and_files[-1:]

        for global_step, filename in steps_and_files:

            tf.logging.info("------ global_step ------" + str(global_step))

            # ret = estimator.evaluate(
            #     input_fn=eval_input_fn,
            #     steps=eval_steps,
            #     checkpoint_path=filename)

            result = estimator.predict(input_fn=predict_input_fn,
                                       checkpoint_path=filename)

            output_predict_file = os.path.join(
                FLAGS.output_dir,
                str(global_step) + "_test_results.tsv")
            with tf.gfile.GFile(output_predict_file, "w") as writer:
                num_written_lines = 0
                tf.logging.info("***** Predict results *****")
                for (i, prediction) in enumerate(result):
                    probabilities = prediction["probabilities"]
                    if i >= num_actual_predict_examples:
                        break
                    output_line = "\t".join(
                        str(class_probability)
                        for class_probability in probabilities) + "\n"
                    writer.write(output_line)
                    num_written_lines += 1
            assert num_written_lines == num_actual_predict_examples
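
Each checkpoint evaluated above ends up with its own <global_step>_test_results.tsv, one tab-separated row of class probabilities per actual example. A minimal sketch of turning one of those files back into hard class predictions (the file name is a placeholder; substitute a real global step):

import numpy as np

# Hypothetical file name: one of the <global_step>_test_results.tsv files written above.
probs = np.loadtxt("10000_test_results.tsv", delimiter="\t", ndmin=2)

pred_labels = probs.argmax(axis=1)  # index of the most probable class per example
print(probs.shape, pred_labels[:10])
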
def main(_):
    record_dir = os.path.join(FLAGS.data_dir,
                              "trainrecords" + str(FLAGS.max_seq_length))
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "gap": GAProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict and not FLAGS.pre_train:
        raise ValueError(
            "One of `pre_train`, `do_train`, `do_eval` or `do_predict' must be True."
        )

    if FLAGS.do_train and FLAGS.pre_train:
        raise ValueError(
            "Cannot `pre_train` and `do_train` in a single pass. First do `pre_train`"
        )

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None

    if FLAGS.do_train:
        num_train_steps = int(FLAGS.epoch_size / FLAGS.train_batch_size *
                              FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    ##############################################################################
    #                                  PRE TRAIN                                 #
    ##############################################################################
    if FLAGS.pre_train:
        tsv_dir = os.path.join(FLAGS.data_dir, "trainQ")
        tf.gfile.MakeDirs(record_dir)
        in_file = os.path.basename(FLAGS.train_data_path)
        train_examples = processor.get_train_examples(FLAGS.train_data_path)
        train_file = in_file + '.tf_record'
        train_path = os.path.join(record_dir, train_file)
        file_based_convert_examples_to_features(train_examples, label_list,
                                                FLAGS.max_seq_length,
                                                tokenizer, train_path)
        '''
    files = tf.gfile.ListDirectory(tsv_dir)
    for in_file in files:
      in_path = os.path.join(tsv_dir, in_file)
      train_examples = processor.get_train_examples(in_path)
      train_file = in_file + '.tf_record'
      train_path = os.path.join(record_dir, train_file)
      file_based_convert_examples_to_features(
          train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_path)
    '''

    ##############################################################################
    #                                  DO TRAIN                                  #
    ##############################################################################
    if FLAGS.do_train:
        tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
        tf.logging.info("  Num steps = %d", num_train_steps)
        train_input_fn = file_based_input_fn_builder(
            input_file=record_dir,
            seq_length=FLAGS.max_seq_length,
            is_training=True,
            drop_remainder=True)
        estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    ##############################################################################
    #                                  DO EVAL                                   #
    ##############################################################################
    if FLAGS.do_eval:
        eval_examples = processor.get_dev_examples(FLAGS.data_dir)
        num_actual_eval_examples = len(eval_examples)
        if FLAGS.use_tpu:
            # TPU requires a fixed batch size for all batches, therefore the number
            # of examples must be a multiple of the batch size, or else examples
            # will get dropped. So we pad with fake examples which are ignored
            # later on. These do NOT count towards the metric (all tf.metrics
            # support a per-instance weight, and these get a weight of 0.0).
            while len(eval_examples) % FLAGS.eval_batch_size != 0:
                eval_examples.append(PaddingInputExample())

        eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record")
        if not tf.gfile.Exists(eval_file):
            file_based_convert_examples_to_features(eval_examples, label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, eval_file)

        tf.logging.info("***** Running evaluation *****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(eval_examples), num_actual_eval_examples,
                        len(eval_examples) - num_actual_eval_examples)
        tf.logging.info("  Batch size = %d", FLAGS.eval_batch_size)

        # This tells the estimator to run through the entire set.
        eval_steps = None
        # However, if running eval on the TPU, you will need to specify the
        # number of steps.
        if FLAGS.use_tpu:
            assert len(eval_examples) % FLAGS.eval_batch_size == 0
            eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size)

        eval_drop_remainder = True if FLAGS.use_tpu else False
        eval_input_fn = file_based_input_fn_builder(
            input_file=eval_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=eval_drop_remainder)

        result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps)

        output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt")
        with tf.gfile.GFile(output_eval_file, "w") as writer:
            tf.logging.info("***** Eval results *****")
            for key in sorted(result.keys()):
                tf.logging.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    ##############################################################################
    #                                 DO PREDICT                                 #
    ##############################################################################
    if FLAGS.do_predict:
        predict_examples = processor.get_test_examples(FLAGS.data_dir)
        num_actual_predict_examples = len(predict_examples)
        print("***************************************" +
              str(num_actual_predict_examples))
        if FLAGS.use_tpu:
            while len(predict_examples) % FLAGS.predict_batch_size != 0:
                predict_examples.append(PaddingInputExample())

        predict_file = os.path.join(FLAGS.output_dir, "predict.tf_record")
        if not tf.gfile.Exists(predict_file):
            file_based_convert_examples_to_features(predict_examples,
                                                    label_list,
                                                    FLAGS.max_seq_length,
                                                    tokenizer, predict_file)

        tf.logging.info("***** Running prediction*****")
        tf.logging.info("  Num examples = %d (%d actual, %d padding)",
                        len(predict_examples), num_actual_predict_examples,
                        len(predict_examples) - num_actual_predict_examples)
        tf.logging.info("  Batch size = %d", FLAGS.predict_batch_size)

        predict_drop_remainder = False  # True if FLAGS.use_tpu else False
        predict_input_fn = file_based_input_fn_builder(
            input_file=predict_file,
            seq_length=FLAGS.max_seq_length,
            is_training=False,
            drop_remainder=predict_drop_remainder)

        print("**************** MAX_SEQ_LENGTH " + str(FLAGS.max_seq_length))

        result = estimator.predict(input_fn=predict_input_fn)

        print("PREDICT_EXAMPLES", len(predict_examples))
        print("INPUT_FN", predict_input_fn)
        print("RESULT", result)
        print("LABEL LIST", label_list)

        # My own way of keeping probabilities away from the extremes 0 and 1.
        # Standard clipping might be better, but this is what I used for the Kaggle competition.
        def smooth(prob):
            return (1.0 - FLAGS.smoothing) * prob + FLAGS.smoothing / 3.0

        guids = []
        # This is a hack: if the order gets shuffled, the ids will not match the
        # predictions, so predict must NOT be parallelised!
        # Tried to do this with feature_forwarding, but without success so far.
        for example in predict_examples:
            guids.append(example.guid)

        output_predict_file = os.path.join(FLAGS.output_dir, FLAGS.output_file)
        print("***** PREDICT FILE " + output_predict_file)

        with tf.gfile.GFile(output_predict_file, "w") as writer:
            tf.logging.info("***** Predict results *****")
            writer.write("ID,A,B,NEITHER\n")

            for i, prediction in enumerate(result):
                print("***** Predict results ***** " + str(i), end="\r")
                guid = guids[i]
                out = prediction['probabilities'].tolist()
                output_line = guid + ',' + ",".join(
                    str(smooth(prob)) for prob in out) + "\n"
                writer.write(output_line)

            print()
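
The smooth() transform above shrinks each probability toward the uniform value 1/3, so a confidently wrong prediction cannot blow up a multi-class log loss. A small sketch of that effect, with a made-up smoothing value of 0.1 and an invented probability vector:

import numpy as np

def smooth(prob, smoothing=0.1):
    # Same transform as above: pull the probability toward 1/3.
    return (1.0 - smoothing) * prob + smoothing / 3.0

# Hypothetical, confidently wrong prediction: the true class is index 2.
p = np.array([0.98, 0.01, 0.01])
true_idx = 2

raw_loss = -np.log(p[true_idx])               # ~4.6
smoothed_loss = -np.log(smooth(p)[true_idx])  # ~3.2, the penalty is capped
print(raw_loss, smoothed_loss)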