def __init__(self, path, training=False, max_seq_length=512):
     self.max_seq_length = max_seq_length
     self.graph = tf.Graph()
     with self.graph.as_default():
         self.input_ids = tf.compat.v1.placeholder(
             tf.int32, shape=(None, self.max_seq_length))
         self.input_mask = tf.compat.v1.placeholder(
             tf.int32, shape=(None, self.max_seq_length))
         self.segment_ids = tf.compat.v1.placeholder(
             tf.int32, shape=(None, self.max_seq_length))
         self.bert_config = BertConfig.from_json_file(path +
                                                      '/bert_config.json')
         self.bert_module = BertModel(config=self.bert_config,
                                      is_training=training,
                                      input_ids=self.input_ids,
                                      input_mask=self.input_mask,
                                      token_type_ids=self.segment_ids,
                                      use_one_hot_embeddings=False)
         assignment_map, initialized_variable_names = get_assignment_map_from_checkpoint(
             tf.compat.v1.trainable_variables(), path + '/bert_model.ckpt')
         tf.compat.v1.train.init_from_checkpoint(path + '/bert_model.ckpt',
                                                 assignment_map)
         self.sess = tf.compat.v1.Session()
         self.sess.run(
             tf.group(tf.compat.v1.global_variables_initializer(),
                      tf.compat.v1.tables_initializer()))
         self.bert_outputs = {
             'sequence_output': self.bert_module.get_sequence_output(),
             'pooled_output': self.bert_module.get_pooled_output(),
         }
         self.tok = tokenization.FullTokenizer(vocab_file=path +
                                               '/vocab.txt',
                                               do_lower_case=True)
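A minimal usage sketch for the loader above. The names here are assumptions: the surrounding class is called BertEncoder for illustration, and `path` is taken to point at a standard pre-trained BERT directory.

import numpy as np

encoder = BertEncoder('uncased_L-12_H-768_A-12')  # hypothetical class name
tokens = ['[CLS]'] + encoder.tok.tokenize('hello world') + ['[SEP]']
ids = encoder.tok.convert_tokens_to_ids(tokens)
pad = [0] * (encoder.max_seq_length - len(ids))
feed = {
    encoder.input_ids: np.array([ids + pad]),
    encoder.input_mask: np.array([[1] * len(ids) + pad]),
    encoder.segment_ids: np.array([[0] * encoder.max_seq_length]),
}
outputs = encoder.sess.run(encoder.bert_outputs, feed_dict=feed)
print(outputs['pooled_output'].shape)  # (1, hidden_size)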
Example #2
    def _build_bert_model(self):
        # load pre-trained model config
        bert_config_file = self.bert_model_dir + "bert_config.json"
        bert_config = BertConfig.from_json_file(bert_config_file)

        # TPU plumbing required by the TPUEstimator API; no TPU is actually
        # used in this case, so these settings can be overlooked
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8,
                per_host_input_for_training=is_per_host)
        )

        # then build the BERT model from the pre-trained checkpoint
        checkpoint_file = self.bert_model_dir + 'bert_model.ckpt'
        
        model_fn = model_fn_builder(
            bert_config=bert_config,
            # bert_model.ckpt is actually three files on disk, but is
            # referenced by this single prefix
            init_checkpoint=checkpoint_file,
            layer_indexes=self.layer_indexes,
            use_tpu=False,
            # the extract_features script recommends setting this to True when
            # using a TPU, where one-hot embeddings are reportedly much faster
            use_one_hot_embeddings=False
        )
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            predict_batch_size=32
        )

        return estimator
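Hypothetical usage of the estimator this returns, assuming `extractor` is an instance of the surrounding class and `features_input_fn` is an extract_features.py-style input_fn yielding input_ids / input_mask / input_type_ids batches:

estimator = extractor._build_bert_model()
for result in estimator.predict(input_fn=features_input_fn,
                                yield_single_examples=True):
    # model_fn_builder emits one 'layer_output_<i>' entry per requested layer
    print(result['unique_id'], result['layer_output_0'].shape)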
Example #3
    def predict(self, inputs, **kwargs):
        """Predicts the resulting tensors.

        Args:
          inputs: A dictionary of input tensors keyed by names.

        Returns:
          predictions: A dictionary of prediction tensors keyed by name.
        """
        is_training = self._is_training
        options = self._model_proto

        token_to_id_layer = token_to_id.TokenToIdLayer(
            options.bert_vocab_file, options.bert_unk_token_id)
        bert_config = BertConfig.from_json_file(options.bert_config_file)
        slim_fc_scope = hyperparams.build_hyperparams(options.fc_hyperparams,
                                                      is_training)()

        # Prediction.
        answer_logits = self._predict_logits(
            inputs[self._field_answer_choices],
            inputs[self._field_answer_choices_len], token_to_id_layer,
            bert_config, slim_fc_scope, options.dropout_keep_prob, is_training)

        # Restore from checkpoint.
        assignment_map, _ = get_assignment_map_from_checkpoint(
            tf.global_variables(), options.bert_checkpoint_file)
        tf.compat.v1.train.init_from_checkpoint(options.bert_checkpoint_file,
                                                assignment_map)

        return {
            FIELD_ANSWER_PREDICTION: answer_logits,
        }
Example #4
    def __init__(self):
        bert_pretrained_dir = args.pretrain_models_path + args.bert_model_name
        self.do_lower_case = args.bert_model_name.startswith('uncased')
        self.vocab_file = os.path.join(bert_pretrained_dir, 'vocab.txt')
        self.config_file = os.path.join(bert_pretrained_dir,
                                        'bert_config.json')
        self.tokenizer = FullTokenizer(vocab_file=self.vocab_file,
                                       do_lower_case=self.do_lower_case)

        self.input_id = tf.placeholder(tf.int64, [None, None], 'input_ids')
        self.input_mask = tf.placeholder(tf.int64, [None, None], 'input_mask')
        self.segment_ids = tf.placeholder(tf.int64, [None, None],
                                          'segment_ids')

        bert_config = BertConfig.from_json_file(self.config_file)
        model = BertModel(config=bert_config,
                          is_training=False,
                          input_ids=self.input_id,
                          input_mask=self.input_mask,
                          token_type_ids=self.segment_ids,
                          use_one_hot_embeddings=True,
                          scope='bert')
        self.output_layer = model.get_sequence_output()
        self.embedding_layer = model.get_embedding_output()

        saver = tf.train.Saver()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.session = tf.Session(config=config)
        saver.restore(self.session, bert_pretrained_dir + '/bert_model.ckpt')
Example #5
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        # load the .bin checkpoint, i.e. the pre-trained parameters
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #6
def load_bert_rc_model(config_path, weight_path, device=None):
    config = BertConfig(config_path)
    model = BertForQuestionAnswering(config)
    model.load_state_dict(torch.load(weight_path, map_location=device))
    if device is not None:
        return model.to(device)
    return model.cpu()
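Hypothetical usage, assuming paths to a fine-tuned question-answering config and weight file:

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = load_bert_rc_model('bert_config.json', 'qa_model.bin', device=device)
model.eval()  # disable dropout for inference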
Example #7
 def __init__(self, train_corpus_fname, test_corpus_fname, vocab_fname,
              pretrain_model_fname, bertconfig_fname, model_save_path,
              max_seq_length=128, warmup_proportion=0.1,
              batch_size=32, learning_rate=2e-5, num_labels=2):
     # Load a corpus.
     super().__init__(train_corpus_fname=train_corpus_fname,
                      tokenized_train_corpus_fname=train_corpus_fname + ".bert-tokenized",
                      test_corpus_fname=test_corpus_fname, batch_size=batch_size,
                      tokenized_test_corpus_fname=test_corpus_fname + ".bert-tokenized",
                      model_name="bert", vocab_fname=vocab_fname, model_save_path=model_save_path)
     # configurations
     config = BertConfig.from_json_file(bertconfig_fname)
     self.pretrain_model_fname = pretrain_model_fname
     self.max_seq_length = max_seq_length
     self.batch_size = batch_size
     self.learning_rate = learning_rate
     self.num_labels = num_labels  # e.g. 2: positive, negative
     self.PAD_INDEX = 0
     self.CLS_TOKEN = "[CLS]"
     self.SEP_TOKEN = "[SEP]"
     self.num_train_steps = (int((len(self.train_data) - 1) / self.batch_size) + 1) * self.num_epochs
     self.num_warmup_steps = int(self.num_train_steps * warmup_proportion)
     self.eval_every = int(self.num_train_steps / self.num_epochs)  # evaluate once per epoch
     self.training = tf.placeholder(tf.bool)
     # build train graph
     (self.input_ids, self.input_mask, self.segment_ids, self.label_ids,
      self.logits, self.loss) = make_bert_graph(config, max_seq_length,
                                                self.dropout_keep_prob_rate,
                                                num_labels, tune=True)
Example #8
    def __init__(self, model_folder, max_length=256, lowercase=True):

        # 1. Create tokenizer
        self.max_length = max_length
        vocab_file = os.path.join(model_folder, 'vocab.txt')
        self.tokenizer = FullTokenizer(vocab_file, do_lower_case=lowercase)

        # 2. Read Config
        config_file = os.path.join(model_folder, 'bert_config.json')
        self.config = BertConfig.from_json_file(config_file)

        # 3. Create Model
        self.session = tf.Session()
        self.token_ids_op = tf.placeholder(tf.int32,
                                           shape=(None, max_length),
                                           name='token_ids')
        self.model = BertModel(config=self.config,
                               is_training=False,
                               input_ids=self.token_ids_op,
                               use_one_hot_embeddings=False)

        # 4. Restore Trained Model
        self.saver = tf.train.Saver()
        ckpt_file = os.path.join(model_folder, 'bert_model.ckpt')
        # RCS ckpt_file = os.path.join(model_folder, 'model.ckpt-1000000')
        self.saver.restore(self.session, ckpt_file)

        hidden_layers = self.config.num_hidden_layers
        self.embeddings_op = tf.get_default_graph().get_tensor_by_name(
            "bert/encoder/Reshape_{}:0".format(hidden_layers + 1))
Example #9
    def _bert_model(self, input_ids, input_tag_features, input_masks):
        """Creates the Bert model.

        Args:
          input_ids: A [batch, max_seq_len] int tensor.
          input_tag_features: A float tensor of per-token tag features.
          input_masks: A [batch, max_seq_len] int tensor.
        """
        is_training = self._is_training
        options = self._model_proto

        bert_config = BertConfig.from_json_file(options.bert_config_file)
        bert_model = BertModel(bert_config,
                               is_training,
                               input_ids=input_ids,
                               input_mask=input_masks,
                               use_tag_embeddings=True,
                               tag_features=input_tag_features)

        # Restore from checkpoint.
        assignment_map, _ = get_assignment_map_from_checkpoint(
            tf.global_variables(), options.bert_checkpoint_file)
        if 'global_step' in assignment_map:
            assignment_map.pop('global_step')
        tf.compat.v1.train.init_from_checkpoint(options.bert_checkpoint_file,
                                                assignment_map)
        return bert_model.get_pooled_output()
Example #10
    def predict(self, inputs, **kwargs):
        """Predicts the resulting tensors.

        Args:
          inputs: A dictionary of input tensors keyed by names.

        Returns:
          predictions: A dictionary of prediction tensors keyed by name.
        """
        is_training = self._is_training
        options = self._model_proto

        (answer_choices, answer_choices_len,
         answer_label) = (inputs[InputFields.answer_choices_with_question],
                          inputs[InputFields.answer_choices_with_question_len],
                          inputs[InputFields.answer_label])

        # Create model layers.
        token_to_id_layer = token_to_id.TokenToIdLayer(
            options.bert_vocab_file, options.bert_unk_token_id)

        # Convert tokens into token ids.
        batch_size = answer_choices.shape[0]

        answer_choices_token_ids = token_to_id_layer(answer_choices)
        answer_choices_token_ids_reshaped = tf.reshape(
            answer_choices_token_ids, [batch_size * NUM_CHOICES, -1])

        answer_choices_mask = tf.sequence_mask(
            answer_choices_len, maxlen=tf.shape(answer_choices)[-1])
        answer_choices_mask_reshaped = tf.reshape(
            answer_choices_mask, [batch_size * NUM_CHOICES, -1])

        # Bert prediction.
        bert_config = BertConfig.from_json_file(options.bert_config_file)
        bert_model = BertModel(bert_config,
                               is_training,
                               input_ids=answer_choices_token_ids_reshaped,
                               input_mask=answer_choices_mask_reshaped)

        answer_choices_cls_feature_reshaped = bert_model.get_pooled_output()
        answer_choices_cls_feature = tf.reshape(
            answer_choices_cls_feature_reshaped, [batch_size, NUM_CHOICES, -1])

        assignment_map, _ = get_assignment_map_from_checkpoint(
            tf.global_variables(), options.bert_checkpoint_file)

        tf.compat.v1.train.init_from_checkpoint(options.bert_checkpoint_file,
                                                assignment_map)

        # Classification layer.
        output = tf.compat.v1.layers.dense(answer_choices_cls_feature,
                                           units=1,
                                           activation=None)
        output = tf.squeeze(output, axis=-1)

        return {FIELD_ANSWER_PREDICTION: output}
Example #11
def convert(args):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(args.bert_config_file)
    model = BertModel(config)

    # Load weights from TF model
    path = args.tf_checkpoint_path
    print("Converting TensorFlow checkpoint from {}".format(path))

    init_vars = tf.train.list_variables(path)
    names = []
    arrays = []
    for name, shape in init_vars:
        print("Loading {} with shape {}".format(name, shape))
        array = tf.train.load_variable(path, name)
        print("Numpy array shape {}".format(array.shape))
        names.append(name)
        arrays.append(array)

    for name, array in zip(names, arrays):
        name = name[5:]  # strip the leading "bert/" (five characters)
        print("Loading {}".format(name))
        name = name.split('/')
        # "cls/predictions" and "cls/seq_relationship" lose their first five
        # characters the same way; these pre-training heads are skipped
        if name[0] in ['redictions', 'eq_relationship']:
            print("Skipping")
            continue
        pointer = model
        for m_name in name:
            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                l = re.split(r'_(\d+)', m_name)
            else:
                l = [m_name]
            if l[0] == 'kernel':
                pointer = getattr(pointer, 'weight')
            else:
                # 'l_step' is 'global_step' after the five-character strip
                if l[0] != 'l_step':
                    # the `name` default makes missing attributes fall through
                    # to the AttributeError handler below, skipping the variable
                    pointer = getattr(pointer, l[0], name)
                else:
                    print(l[0])
            if len(l) >= 2:
                num = int(l[1])
                pointer = pointer[num]
        if m_name[-11:] == '_embeddings':
            pointer = getattr(pointer, 'weight')
        elif m_name == 'kernel':
            array = np.transpose(array)
        try:
            assert pointer.shape == array.shape
        except AssertionError as e:
            e.args += (pointer.shape, array.shape)
            raise
        except AttributeError:
            continue
        pointer.data = torch.from_numpy(array)

    # Save pytorch-model
    torch.save(model.state_dict(), args.pytorch_dump_path)
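A hypothetical invocation, assuming an argparse-style namespace with the three fields this function reads:

from argparse import Namespace

convert(Namespace(
    bert_config_file='uncased_L-12_H-768_A-12/bert_config.json',
    tf_checkpoint_path='uncased_L-12_H-768_A-12/bert_model.ckpt',
    pytorch_dump_path='pytorch_model.bin'))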
Example #12
    def build(self, data_iter, bert_config_file):
        # get the inputs
        with tf.variable_scope('inputs'):
            input_map = data_iter.get_next()
            usrid, prdid, input_x, input_y, doc_len = \
                (input_map['usr'], input_map['prd'],
                 input_map['content'], input_map['rating'],
                 input_map['doc_len'])

            input_x = tf.reshape(input_x, [-1, self.max_sen_len])
            sen_len = tf.count_nonzero(input_x, axis=-1)
            doc_len = doc_len // self.max_sen_len

            input_x = tf.cast(input_x, tf.int32)
            self.usr = lookup(self.embeddings['usr_emb'], usrid, name='cur_usr_embedding')
            self.prd = lookup(self.embeddings['prd_emb'], prdid, name='cur_prd_embedding')
            input_x = tf.reshape(input_x, [-1, self.max_sen_len])
            input_mask = tf.sequence_mask(sen_len, self.max_sen_len)
            input_mask = tf.cast(input_mask, tf.int32)

        bert_config = BertConfig.from_json_file(bert_config_file)
        bert = BertModel(bert_config, is_training=False,
                         input_ids=input_x, input_mask=input_mask,
                         token_type_ids=None,
                         use_one_hot_embeddings=False)
        # input_x = bert.get_sequence_output()
        input_x = bert.get_embedding_output()

        # build the process of model
        d_hat = self.nsc(input_x, self.max_sen_len, self.max_doc_len // self.max_sen_len,
                         sen_len, doc_len)
        prediction = tf.argmax(d_hat, 1, name='prediction')

        with tf.variable_scope("loss"):
            sce = tf.nn.softmax_cross_entropy_with_logits_v2
            self.loss = sce(logits=d_hat, labels=tf.one_hot(input_y, self.cls_cnt))

            regularizer = tf.zeros(1)
            params = tf.trainable_variables()
            for param in params:
                if param not in self.embeddings.values():
                    regularizer += tf.nn.l2_loss(param)
            self.loss = tf.reduce_sum(self.loss) + self.l2_rate * regularizer

        with tf.variable_scope("metrics"):
            correct_prediction = tf.equal(prediction, input_y)
            mse = tf.reduce_sum(tf.square(prediction - input_y), name="mse")
            correct_num = tf.reduce_sum(tf.cast(correct_prediction, dtype=tf.int32), name="correct_num")
            accuracy = tf.reduce_sum(tf.cast(correct_prediction, "float"), name="accuracy")

        return self.loss, mse, correct_num, accuracy
Example #13
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file,
                                     pytorch_dump_path):
    # Initialise PyTorch model
    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
    load_tf_weights_in_bert(model, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(model.state_dict(), pytorch_dump_path)
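An example call, assuming a standard pre-trained checkpoint directory (paths are placeholders):

convert_tf_checkpoint_to_pytorch(
    'uncased_L-12_H-768_A-12/bert_model.ckpt',
    'uncased_L-12_H-768_A-12/bert_config.json',
    'uncased_L-12_H-768_A-12/pytorch_model.bin')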
Example #14
def get_bert(path_bert):
    bert_config_file = path_bert + 'bert_config_uncased_L-12_H-768_A-12.json'
    vocab_file = path_bert + 'vocab_uncased_L-12_H-768_A-12.txt'
    init_checkpoint = path_bert + 'pytorch_model_uncased_L-12_H-768_A-12.bin'
    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    model_bert = BertModel(bert_config)
    model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    print("Load pre-trained parameters.")
    if gpu:  # `gpu` and `device` are module-level globals in the original script
        model_bert.to(device)
    return model_bert, tokenizer, bert_config
Example #15
def get_bert(BERT_PT_PATH, bert_type, do_lower_case):

    bert_config_file = os.path.join(BERT_PT_PATH,
                                    f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=False)  # NOTE: the do_lower_case argument is ignored here
    bert_config.print_status()
    model_bert = BertModel.from_pretrained(bert_type)
    print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #16
def main():
    config = BertConfig.from_json_file(
        './bert/models/uncased_L-12_H-768_A-12/bert_config.json')

    def model_fn(features, labels, mode, params):
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        model = BertModel(config, True, input_ids, input_mask, segment_ids)
        final_hidden = model.get_sequence_output()
        # NOTE: a real model_fn must return a tf.estimator.EstimatorSpec;
        # returning a raw tensor only works while nothing calls the Estimator
        return final_hidden

    est = tf.estimator.Estimator(model_fn)
    print(est)
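A sketch of a valid predict-mode model_fn under the same assumptions, passing the BertConfig in via params (an Estimator's model_fn must return an EstimatorSpec rather than a raw tensor):

def predict_model_fn(features, labels, mode, params):
    model = BertModel(params['bert_config'], False, features["input_ids"],
                      features["input_mask"], features["segment_ids"])
    return tf.estimator.EstimatorSpec(
        mode=mode, predictions={"sequence_output": model.get_sequence_output()})

est = tf.estimator.Estimator(predict_model_fn, params={'bert_config': config})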
Example #17
    def get_model(self):
        logging.info("get bert model")
        graph = tf.Graph()
        with graph.as_default():
            ph_input_ids = tf.placeholder(dtype=tf.int32,
                                          shape=[None, self._seq_length + 2],  # +2 for [CLS] and [SEP]
                                          name="ph_input_ids")
            con = BertConfig.from_json_file(config.PROJECT_ROOT + "/bert_config.json")
            bert_model = BertModel(config=con, is_training=False, input_ids=ph_input_ids,
                                   use_one_hot_embeddings=True)
            output = bert_model.get_sequence_output()
            init = tf.global_variables_initializer()

        sess = tf.Session(graph=graph)
        sess.run(init)

        return sess, ph_input_ids, output
Example #18
def get_bert(BERT_PATH):
    bert_config_file = BERT_PATH + "/bert_config_uncased_L-12_H-768_A-12.json"
    vocab_file = BERT_PATH + "/vocab_uncased_L-12_H-768_A-12.txt"
    init_checkpoint = BERT_PATH + "/pytorch_model_uncased_L-12_H-768_A-12.bin"

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=True)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    print("Load pre-trained BERT parameters.")
    model_bert.to(device)
    return model_bert, tokenizer, bert_config
Example #19
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH,
                                    f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH,
                                   f'pytorch_model_{bert_type}.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #20
def get_bert(BERT_PT_PATH):
    bert_config_file = os.path.join(BERT_PT_PATH, 'bert_config.json')
    vocab_file = os.path.join(BERT_PT_PATH, 'vocab.txt')
    init_checkpoint = os.path.join(BERT_PT_PATH, 'pytorch_model.bin')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file)
    bert_config.print_status()

    model_bert = BertModel(bert_config)

    model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #21
def bert(bert_config_file,
         mode,
         dim,
         input_ids,
         input_mask,
         input_type,
         activation,
         init_checkpoint=None):
    bert_config = BertConfig.from_json_file(bert_config_file)
    bert_model = BertModel(config=bert_config,
                           is_training=mode == tf.estimator.ModeKeys.TRAIN,
                           input_ids=input_ids,
                           input_mask=input_mask,
                           token_type_ids=input_type,
                           scope="bert_query")
    output = bert_model.get_pooled_output()
    if mode == tf.estimator.ModeKeys.TRAIN:
        output = tf.nn.dropout(output, keep_prob=0.9)
    sig = tf.layers.dense(output,
                          dim,
                          activation=activation,
                          kernel_initializer=tf.truncated_normal_initializer(
                              stddev=bert_config.initializer_range),
                          name="bert_query/query")

    tvars = tf.trainable_variables('bert_query')
    initialized_variable_names = {}
    if init_checkpoint:
        (assignment_map,
         initialized_variable_names) = get_assignment_map_from_checkpoint(
             tvars, init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    """
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)
    """

    return sig
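A hypothetical call site inside an Estimator model_fn, assuming the usual BERT feature tensors and the model_fn's `mode` are available:

query_vec = bert(bert_config_file='bert_config.json',
                 mode=mode,
                 dim=128,
                 input_ids=features['input_ids'],
                 input_mask=features['input_mask'],
                 input_type=features['segment_ids'],
                 activation=tf.tanh,
                 init_checkpoint='bert_model.ckpt')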
Example #22
    def __init__(
            self,
            model_fname="/notebooks/embedding/data/sentence-embeddings/bert/tune-ckpt",
            bertconfig_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/bert_config.json",
            vocab_fname="/notebooks/embedding/data/sentence-embeddings/bert/multi_cased_L-12_H-768_A-12/vocab.txt",
            max_seq_length=32,
            dimension=768,
            num_labels=2):

        super().__init__("bert", dimension)
        config = BertConfig.from_json_file(bertconfig_fname)
        self.max_seq_length = max_seq_length
        self.tokenizer = FullTokenizer(vocab_file=vocab_fname,
                                       do_lower_case=False)
        self.model, self.input_ids, self.input_mask, self.segment_ids, self.probs = make_bert_graph(
            config, max_seq_length, 1.0, num_labels, tune=False)
        saver = tf.train.Saver(tf.global_variables())
        self.sess = tf.Session()
        checkpoint_path = tf.train.latest_checkpoint(model_fname)
        saver.restore(self.sess, checkpoint_path)
Example #23
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):

    bert_config_file = os.path.join(BERT_PT_PATH, f"bert_config_{bert_type}.json")
    vocab_file = os.path.join(BERT_PT_PATH, f"vocab_{bert_type}.txt")
    init_checkpoint = os.path.join(BERT_PT_PATH, f"pytorch_model_{bert_type}.bin")

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case
    )
    bert_config.print_status()

    model_bert = BertModel(bert_config)
    if not no_pretraining:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location="cpu"))
        print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #24
    def __init__(self, aggregation_method):
        self.aggregation_method = aggregation_method  # 'cls_max', 'cls_avg', 'cls_attn' or 'cls_transformer'

        self.tokenizer = tokenization.FullTokenizer(vocab_file='vocab.txt')
        self.writer = tf.python_io.TFRecordWriter("output.tfrecords")

        self.run_config = tf.estimator.tpu.RunConfig(
            cluster=None,
            model_dir=None,
            save_checkpoints_steps=1000,
            keep_checkpoint_max=1,
            tpu_config=tf.estimator.tpu.TPUConfig(
                iterations_per_loop=1000,
                num_shards=8,
                per_host_input_for_training=tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2))

        self.model_fn = model_fn_builder(
            bert_config=BertConfig.from_json_file('bert_models_onMSMARCO/vanilla_bert_tiny_on_MSMARCO/bert_config.json'),
            num_labels=2,
            init_checkpoint='bert_models_onMSMARCO/vanilla_bert_tiny_on_MSMARCO/model.ckpt-1600000',
            learning_rate=5e-5,
            num_train_steps=None,
            num_warmup_steps=None,
            use_tpu=False,
            use_one_hot_embeddings=False,
            aggregation_method=self.aggregation_method,
            pretrained_model='bert',
            from_distilled_student=False)

        self.estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=self.model_fn,
            config=self.run_config,
            train_batch_size=32,
            eval_batch_size=32,
            predict_batch_size=32)
Example #25
    def __init__(self, opt, bert_config=None):
        super(SANBertNetwork, self).__init__()
        self.dropout_list = []
        self.bert_config = BertConfig.from_dict(opt)
        self.bert = BertModel(self.bert_config)
        if opt['update_bert_opt'] > 0:
            for p in self.bert.parameters():
                p.requires_grad = False
        mem_size = self.bert_config.hidden_size
        self.decoder_opt = opt['answer_opt']
        self.scoring_list = nn.ModuleList()
        labels = [int(ls) for ls in opt['label_size'].split(',')]
        task_dropout_p = opt['tasks_dropout_p']
        self.bert_pooler = None

        for task, lab in enumerate(labels):
            decoder_opt = self.decoder_opt[task]
            dropout = DropoutWrapper(task_dropout_p[task], opt['vb_dropout'])
            self.dropout_list.append(dropout)
            if decoder_opt == 1:
                out_proj = SANClassifier(mem_size,
                                         mem_size,
                                         lab,
                                         opt,
                                         prefix='answer',
                                         dropout=dropout)
                self.scoring_list.append(out_proj)
            else:
                out_proj = nn.Linear(self.bert_config.hidden_size, lab)
                self.scoring_list.append(out_proj)
        self.opt = opt
        self._my_init()
        self.set_embed(opt)
Example #26
def get_bert(BERT_PT_PATH, bert_type, do_lower_case, no_pretraining):
    bert_config_file = os.path.join(BERT_PT_PATH, 'bert_config.json')  # BERT config file
    vocab_file = os.path.join(BERT_PT_PATH, 'vocab.txt')  # BERT vocabulary file
    init_checkpoint = os.path.join(BERT_PT_PATH, 'pytorch_model.bin')  # pre-trained BERT weights (may not exist)
    """
    BertConfig lives in the bert package's modeling module and is initialised
    from the BERT config file; its from_json_file method reads that file's contents.
    """
    bert_config = BertConfig.from_json_file(bert_config_file)
    """
    tokenization is a module in the bert package;
    its FullTokenizer class holds the vocabulary information.
    """
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)

    # prints the config parameters; has no other effect
    bert_config.print_status()
    """
    BertModel also lives in the bert package's modeling module and is likewise
    initialised from the BERT config; it provides the operations on the BERT
    model (adding layers, loading parameters, and so on).
    """
    model_bert = BertModel(bert_config)

    if no_pretraining:  # skip the pre-trained BERT weights and keep only the team's own model (no .bin needed)
        pass
    else:
        model_bert.load_state_dict(
            torch.load(init_checkpoint,
                       map_location='cpu'))  # load the .bin file, i.e. the pre-trained parameters
        print("Load pre-trained parameters.")
    model_bert.to(device)

    # returns: BERT model, tokenizer, BERT config
    return model_bert, tokenizer, bert_config
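A hypothetical call, assuming `device` is defined at module level as in the original script:

model_bert, tokenizer, bert_config = get_bert(
    BERT_PT_PATH='data/bert', bert_type='uncased_L-12_H-768_A-12',
    do_lower_case=True, no_pretraining=False)
tokens = tokenizer.tokenize('where is the capital of france ?')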
Example #27
def get_bert(BERT_PT_PATH, bert_type, do_lower_case):

    bert_config_file = os.path.join(BERT_PT_PATH,
                                    f'bert_config_{bert_type}.json')
    vocab_file = os.path.join(BERT_PT_PATH, f'vocab_{bert_type}.txt')
    #init_checkpoint = os.path.join(BERT_PT_PATH, f'pytorch_model_{bert_type}.bin')
    #init_checkpoint = os.path.join(BERT_PT_PATH, f'bert_model_{bert_type}.ckpt.data')

    bert_config = BertConfig.from_json_file(bert_config_file)
    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                           do_lower_case=False)  # NOTE: the do_lower_case argument is ignored here
    bert_config.print_status()
    model_bert = BertModel.from_pretrained(bert_type)
    #model_bert.eval()
    '''
    if no_pretraining:
        pass
    else:
        model_bert.load_state_dict(torch.load(init_checkpoint, map_location='cpu'))
    '''
    print("Load pre-trained parameters.")
    model_bert.to(device)

    return model_bert, tokenizer, bert_config
Example #28
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--bert_config_file", default=None, type=str, required=True,
                        help="The config json file corresponding to the pre-trained BERT model. "
                             "This specifies the model architecture.")
    parser.add_argument("--vocab_file", default=None, type=str, required=True,
                        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--debug", default=False, action='store_true', help="Whether to run in debug mode.")
    parser.add_argument("--data_dir", default='data/semeval_14', type=str, help="SemEval data dir")
    parser.add_argument("--train_file", default=None, type=str, help="SemEval xml for training")
    parser.add_argument("--predict_file", default=None, type=str, help="SemEval csv for prediction")
    parser.add_argument("--extraction_file", default=None, type=str, help="pkl file for extraction")
    parser.add_argument("--init_checkpoint", default=None, type=str,
                        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument("--do_lower_case", default=True, action='store_true',
                        help="Whether to lower case the input text. Should be True for uncased "
                             "models and False for cased models.")
    parser.add_argument("--max_seq_length", default=96, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
    parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_predict", default=False, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_pipeline", default=False, action='store_true', help="Whether to run pipeline on the dev set.")
    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
    parser.add_argument("--predict_batch_size", default=32, type=int, help="Total batch size for predictions.")
    parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
                             "of training.")
    parser.add_argument("--save_proportion", default=0.5, type=float,
                        help="Proportion of steps to save models for. E.g., 0.5 = 50% "
                             "of training.")
    parser.add_argument("--verbose_logging", default=False, action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--optimize_on_cpu',
                        default=False,
                        action='store_true',
                        help="Whether to perform optimization and keep the optimizer averages on CPU")
    parser.add_argument('--fp16',
                        default=False,
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=128,
                        help='Loss scaling, positive power of 2 values can improve fp16 convergence.')

    args = parser.parse_args()

    if not args.do_train and not args.do_predict and not args.do_pipeline:
        raise ValueError("At least one of `do_train`, `do_predict` or `do_pipeline` must be True.")

    if args.do_train and not args.train_file:
        raise ValueError(
            "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict and not args.predict_file:
        raise ValueError(
            "If `do_predict` is True, then `predict_file` must be specified.")
    if args.do_pipeline and not args.extraction_file:
        raise ValueError(
            "If `do_pipeline` is True, then `extraction_file` must be specified.")

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info("16-bits training currently not supported in distributed training")
            args.fp16 = False # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("torch_version: {} device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        torch.__version__, device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    if args.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (args.max_seq_length, bert_config.max_position_embeddings))

    tokenizer = tokenization.FullTokenizer(
        vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logger.info('output_dir: {}'.format(args.output_dir))
    save_path = os.path.join(args.output_dir, 'checkpoint.pth.tar')
    log_path = os.path.join(args.output_dir, 'performance.txt')
    network_path = os.path.join(args.output_dir, 'network.txt')
    parameter_path = os.path.join(args.output_dir, 'parameter.txt')

    f = open(parameter_path, "w")
    for arg in sorted(vars(args)):
        print("{}: {}".format(arg, getattr(args, arg)), file=f)
    f.close()

    logger.info("***** Preparing model *****")
    model = BertForSpanAspectClassification(bert_config)
    if args.init_checkpoint is not None and not os.path.isfile(save_path):
        model = bert_load_state_dict(model, torch.load(args.init_checkpoint, map_location='cpu'))
        logger.info("Loading model from pretrained checkpoint: {}".format(args.init_checkpoint))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        model.load_state_dict(checkpoint['model'])
        step = checkpoint['step']
        logger.info("Loading model from finetuned checkpoint: '{}' (step {})"
                    .format(save_path, step))

    f = open(network_path, "w")
    for n, param in model.named_parameters():
        print("name: {}, size: {}, dtype: {}, requires_grad: {}"
              .format(n, param.size(), param.dtype, param.requires_grad), file=f)
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print("Total trainable parameters: {}".format(total_trainable_params), file=f)
    print("Total parameters: {}".format(total_params), file=f)
    f.close()

    logger.info("***** Preparing data *****")
    train_dataloader, num_train_steps = None, None
    eval_examples, eval_features, eval_dataloader = None, None, None
    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)
    if args.do_train:
        logger.info("***** Preparing training *****")
        train_dataloader, num_train_steps = read_train_data(args, tokenizer, logger)
        logger.info("***** Preparing evaluation *****")
        eval_examples, eval_features, eval_dataloader = read_eval_data(args, tokenizer, logger)

    logger.info("***** Preparing optimizer *****")
    optimizer, param_optimizer = prepare_optimizer(args, model, num_train_steps)

    global_step = 0
    if os.path.isfile(save_path):
        checkpoint = torch.load(save_path)
        optimizer.load_state_dict(checkpoint['optimizer'])
        step = checkpoint['step']
        logger.info("Loading optimizer from finetuned checkpoint: '{}' (step {})".format(save_path, step))
        global_step = step

    if args.do_train:
        logger.info("***** Running training *****")
        best_f1 = 0
        save_checkpoints_steps = int(num_train_steps / (5 * args.num_train_epochs))
        start_save_steps = int(num_train_steps * args.save_proportion)
        if args.debug:
            args.num_train_epochs = 1
            save_checkpoints_steps = 20
            start_save_steps = 0
        model.train()
        for epoch in range(int(args.num_train_epochs)):
            logger.info("***** Epoch: {} *****".format(epoch+1))
            global_step, model, best_f1 = run_train_epoch(args, global_step, model, param_optimizer,
                                                          train_dataloader, eval_examples, eval_features,
                                                          eval_dataloader,
                                                          optimizer, n_gpu, device, logger, log_path, save_path,
                                                          save_checkpoints_steps, start_save_steps, best_f1)

    if args.do_predict:
        logger.info("***** Running prediction *****")
        if eval_dataloader is None:
            eval_examples, eval_features, eval_dataloader = read_eval_data(args, tokenizer, logger)

        # restore from best checkpoint
        if save_path and os.path.isfile(save_path) and args.do_train:
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            step = checkpoint['step']
            logger.info("Loading model from finetuned checkpoint: '{}' (step {})"
                        .format(save_path, step))

        model.eval()
        metrics = evaluate(args, model, device, eval_examples, eval_features, eval_dataloader, logger, write_pred=True)
        print("step: {}, P: {:.4f}, R: {:.4f}, F1: {:.4f} (common: {}, retrieved: {}, relevant: {})"
              .format(global_step, metrics['p'], metrics['r'],
                      metrics['f1'], metrics['common'], metrics['retrieved'], metrics['relevant']))

    if args.do_pipeline:
        logger.info("***** Running prediction *****")
        eval_examples, eval_features, eval_dataloader = pipeline_eval_data(args, tokenizer, logger)

        # restore from best checkpoint
        if save_path and os.path.isfile(save_path) and args.do_train:
            checkpoint = torch.load(save_path)
            model.load_state_dict(checkpoint['model'])
            step = checkpoint['step']
            logger.info("Loading model from finetuned checkpoint: '{}' (step {})"
                        .format(save_path, step))

        model.eval()
        metrics = evaluate(args, model, device, eval_examples, eval_features, eval_dataloader, logger, write_pred=True)
        f = open(log_path, "a")
        print("pipeline, step: {}, P: {:.4f}, R: {:.4f}, F1: {:.4f} (common: {}, retrieved: {}, relevant: {})"
              .format(global_step, metrics['p'], metrics['r'],
                      metrics['f1'], metrics['common'], metrics['retrieved'], metrics['relevant']), file=f)
        print(" ", file=f)
        f.close()
Example #29
import collections
import pickle
import json

import tensorflow as tf

from bert import tokenization
from bert.modeling import BertConfig
from utils import (input_fn_builder, make_filename, read_squad_examples,
                   FeatureWriter, convert_examples_to_features)
from train import model_fn_builder, FLAGS
from models.rnn_lstm import create_rnn_lstm_model, LSTMConfig
from models.cnn import CNNConfig, create_cnn_model
from models.cnn_keras import CNNKerasConfig, create_cnnKeras_model
from models.contextualized_cnn import create_contextualized_cnn_model, ContextualizedCNNConfig
from models.fully_connected import create_fully_connected_model, FullyConnectedConfig

DATA_BERT_DIRECTORY = FLAGS.data_bert_directory
BERT_CONFIG_FILE = "%s/bert_config.json" % DATA_BERT_DIRECTORY
bert_config = BertConfig.from_json_file(BERT_CONFIG_FILE)

INIT_CHECKPOINT = FLAGS.output_dir
if FLAGS.init_checkpoint is not None:
    INIT_CHECKPOINT = '%s/%s' % (FLAGS.output_dir, FLAGS.init_checkpoint)

DEV_FILENAME = make_filename('dev', 1., FLAGS.features_dir, FLAGS.fine_tune,
                             FLAGS.n_examples)
print('DEV_FILENAME %s' % DEV_FILENAME)

RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])


def load_and_save_config(filename):
    with tf.gfile.GFile(filename, 'r') as json_data:
Example #30
    def predict(self, inputs, **kwargs):
        """Predicts the resulting tensors.

        Args:
          inputs: A dictionary of input tensors keyed by names.

        Returns:
          predictions: A dictionary of prediction tensors keyed by name.
        """
        is_training = self._is_training
        options = self._model_proto

        token_to_id_layer = token_to_id.TokenToIdLayer(
            options.bert_vocab_file, options.bert_unk_token_id)
        bert_config = BertConfig.from_json_file(options.bert_config_file)
        slim_fc_scope = hyperparams.build_hyperparams(options.fc_hyperparams,
                                                      is_training)()

        # Predict object embedding vectors.
        (num_objects, object_bboxes, object_labels, object_scores,
         object_features, max_num_objects) = _trim_to_max_num_objects(
             inputs[InputFields.num_detections],
             inputs[InputFields.detection_boxes],
             inputs[InputFields.detection_classes],
             inputs[InputFields.detection_scores],
             inputs[InputFields.detection_features],
             max_num_objects=options.max_num_objects)

        object_features = _predict_object_embeddings(
            object_features,
            bert_config.hidden_size,
            slim_fc_scope,
            keep_prob=options.dropout_keep_prob,
            is_training=is_training)

        # Gather text inputs.
        (answer_choices, answer_choices_tag,
         answer_choices_len) = (inputs[self._field_answer_choices],
                                inputs[self._field_answer_choices_tag],
                                inputs[self._field_answer_choices_len])
        batch_size = answer_choices.shape[0]

        answer_choices_tag = _assign_invalid_tags(answer_choices_tag,
                                                  max_num_objects)

        # Convert tokens into token ids.
        answer_choices_token_ids = token_to_id_layer(answer_choices)
        answer_choices_token_ids = tf.reshape(answer_choices_token_ids,
                                              [batch_size * NUM_CHOICES, -1])
        answer_choices_mask = tf.sequence_mask(
            answer_choices_len, maxlen=tf.shape(answer_choices)[-1])
        answer_choices_mask = tf.reshape(answer_choices_mask,
                                         [batch_size * NUM_CHOICES, -1])

        # Create tag features sequence.
        answer_choices_tag = tf.reshape(answer_choices_tag,
                                        [batch_size * NUM_CHOICES, -1])
        answer_choices_tag_embeddings = _ground_tag_using_object_features(
            object_features, answer_choices_tag)

        (tiled_object_masks, tiled_object_ids,
         tiled_object_features) = _tile_objects(
             num_objects, token_to_id_layer(object_labels), object_features)

        # Create Bert model.
        input_ids = tf.concat([answer_choices_token_ids, tiled_object_ids], -1)
        input_tag_embeddings = tf.concat(
            [answer_choices_tag_embeddings, tiled_object_features], 1)
        input_mask = tf.concat([answer_choices_mask, tiled_object_masks], -1)

        output = self._bert_model(
            input_ids,
            input_tag_embeddings,
            input_mask,
            bert_config,
            bert_checkpoint_file=options.bert_checkpoint_file,
            is_training=is_training)

        # Classification layer.
        with slim.arg_scope(slim_fc_scope):
            output = slim.fully_connected(output,
                                          num_outputs=1,
                                          activation_fn=None,
                                          scope='logits')
            output = tf.reshape(output, [batch_size, NUM_CHOICES])

        return {FIELD_ANSWER_PREDICTION: output}