Example #1
    def start_bundle(self):
        bert_config = modeling.BertConfig.from_json_file(
            self._bert_config_file)

        self._tokenizer = tokenization.FullTokenizer(
            vocab_file=self._vocab_file, do_lower_case=self._do_lower_case)

        is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.compat.v1.estimator.tpu.RunConfig(
            master=None,
            tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
                num_shards=1, per_host_input_for_training=is_per_host))

        model_fn = extract_features.model_fn_builder(
            bert_config=bert_config,
            init_checkpoint=self._init_checkpoint,
            layer_indexes=self._layer_indexes,
            use_tpu=False,
            use_one_hot_embeddings=False)

        self._estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            predict_batch_size=1)
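
For context, here is a minimal sketch of how the estimator built in start_bundle above could be used from the DoFn's process method. It assumes the InputExample, convert_examples_to_features and input_fn_builder helpers from the reference extract_features module, plus a hypothetical self._max_seq_length attribute that is not shown in the snippet.

    def process(self, text):
        # wrap the raw text as a single-sentence example
        example = extract_features.InputExample(unique_id=0, text_a=text, text_b=None)
        features = extract_features.convert_examples_to_features(
            examples=[example], seq_length=self._max_seq_length,
            tokenizer=self._tokenizer)
        input_fn = extract_features.input_fn_builder(
            features=features, seq_length=self._max_seq_length)
        # predict() builds the graph from model_fn and restores the init checkpoint
        for result in self._estimator.predict(input_fn, yield_single_examples=True):
            yield result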
Example #2
    def _build_bert_model(self):
        # load pre-trained model config
        bert_config_file = self.bert_model_dir + "bert_config.json"
        bert_config = BertConfig.from_json_file(bert_config_file)

        # code to facilitate TPU usage - not used in this case so can be overlooked
        is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
        run_config = tf.contrib.tpu.RunConfig(
            master=None,
            tpu_config=tf.contrib.tpu.TPUConfig(
                num_shards=8,
                per_host_input_for_training=is_per_host))

        # then load the pre-trained checkpoint and build the BERT model
        checkpoint_file = self.bert_model_dir + 'bert_model.ckpt'
        
        model_fn = model_fn_builder(
            bert_config=bert_config,
            # the bert_model.ckpt file is actually three files, but is referenced as one
            init_checkpoint=checkpoint_file,
            layer_indexes=self.layer_indexes,
            use_tpu=False,
            # the extract_features script recommends setting this to True when using a TPU
            # (apparently much faster)
            use_one_hot_embeddings=False
        )
        estimator = tf.contrib.tpu.TPUEstimator(
            use_tpu=False,
            model_fn=model_fn,
            config=run_config,
            predict_batch_size=32
        )

        return estimator
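
One caveat with the string concatenation above: self.bert_model_dir + "bert_config.json" only resolves correctly when bert_model_dir ends with a path separator. A small sketch of the same path handling with os.path.join (the directory name is a placeholder):

import os

bert_model_dir = "/path/to/uncased_L-12_H-768_A-12"
bert_config_file = os.path.join(bert_model_dir, "bert_config.json")
checkpoint_file = os.path.join(bert_model_dir, "bert_model.ckpt")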
Example #3
 def __init__(self, id, args, worker_address, sink_address):
     super().__init__()
     self.model_dir = args.model_dir
     self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
     self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
     self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
     self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
     self.max_seq_len = args.max_seq_len
     self.worker_id = id
     self.daemon = True
     self.model_fn = model_fn_builder(
         bert_config=modeling.BertConfig.from_json_file(self.config_fp),
         init_checkpoint=self.checkpoint_fp,
         pooling_strategy=args.pooling_strategy,
         pooling_layer=args.pooling_layer)
     os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
     config = tf.ConfigProto()
     config.gpu_options.allow_growth = True
     config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
     self.estimator = Estimator(self.model_fn,
                                config=RunConfig(session_config=config))
     self.exit_flag = multiprocessing.Event()
     self.logger = set_logger('WORKER-%d' % self.worker_id)
     self.worker_address = worker_address
     self.sink_address = sink_address
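
A minimal sketch of how several of these workers might be launched, one per GPU. The class name BertWorker and args.num_worker are assumptions, since the snippet only shows the constructor, which already selects a GPU via CUDA_VISIBLE_DEVICES and caps its memory fraction.

workers = [BertWorker(i, args, worker_address, sink_address)
           for i in range(args.num_worker)]
for w in workers:
    w.start()  # daemon processes, so they exit with the parent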
Example #4
def prepare_bert(bert_path, bert_config_file, bert_vocab_file, init_checkpoint,
                 select_layers):
    bert_config = modeling.BertConfig.from_json_file(bert_config_file)
    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=init_checkpoint,
                                layer_indexes=select_layers,
                                use_tpu=False,
                                use_one_hot_embeddings=False)

    estimator = tf.contrib.tpu.TPUEstimator(model_fn=model_fn,
                                            model_dir=bert_path,
                                            use_tpu=False,
                                            predict_batch_size=32,
                                            config=tf.contrib.tpu.RunConfig())
    #config=tf.contrib.tpu.RunConfig(master=None, tpu_config=tf.contrib.tpu.TPUConfig(num_shards=8, per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2)))

    #config = tf.ConfigProto()
    #config.gpu_options.allow_growth = True
    #config.gpu_options.per_process_gpu_memory_fraction = 0.3
    #estimator = Estimator(model_fn, config=RunConfig(session_config=config), params = {'batch_size': 32}, model_dir=MODEL_DIR)

    tokenizer = tokenization.FullTokenizer(vocab_file=bert_vocab_file,
                                           do_lower_case=False)

    return estimator, tokenizer
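
A hedged usage sketch for prepare_bert; every path below is a placeholder, and select_layers mirrors the extract_features default of the last four transformer layers.

estimator, tokenizer = prepare_bert(
    bert_path='/path/to/model_dir',
    bert_config_file='/path/to/bert_config.json',
    bert_vocab_file='/path/to/vocab.txt',
    init_checkpoint='/path/to/bert_model.ckpt',
    select_layers=[-1, -2, -3, -4])
# weights are only restored when the estimator first builds its graph,
# e.g. on the first predict() call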
Example #5
 def __init__(self, id, args):
     super().__init__()
     self.model_dir = args.model_dir
     self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
     self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
     self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
     self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
     self.max_len = args.max_len
     self.worker_id = id
     self.daemon = True
     self.model_fn = model_fn_builder(
         bert_config=modeling.BertConfig.from_json_file(self.config_fp),
         init_checkpoint=self.checkpoint_fp)
     os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
     self.estimator = Estimator(self.model_fn)
     self.result = []
Example #6
 def __init__(self, id, args):
     super().__init__()
     self.model_dir = args.model_dir
     self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
     self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
     self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
     self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
     self.max_seq_len = args.max_seq_len
     self.worker_id = id
     self.daemon = True
     self.model_fn = model_fn_builder(
         bert_config=modeling.BertConfig.from_json_file(self.config_fp),
         init_checkpoint=self.checkpoint_fp)
     os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
     self.estimator = Estimator(self.model_fn)
     self.dest = None
     self._start_t = time.perf_counter()
     self.socket = None
     self.exit_flag = multiprocessing.Event()
Example #7
 def __init__(self, id, args):
     super().__init__()
     self.model_dir = args.model_dir
     self.config_fp = os.path.join(self.model_dir, 'bert_config.json')
     self.checkpoint_fp = os.path.join(self.model_dir, 'bert_model.ckpt')
     self.vocab_fp = os.path.join(args.model_dir, 'vocab.txt')
     self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
     self.max_seq_len = args.max_seq_len
     self.worker_id = id
     self.daemon = True
     self.model_fn = model_fn_builder(
         bert_config=modeling.BertConfig.from_json_file(self.config_fp),
         init_checkpoint=self.checkpoint_fp,
         pooling_strategy=args.pooling_strategy,
         pooling_layer=args.pooling_layer)
     os.environ['CUDA_VISIBLE_DEVICES'] = str(self.worker_id)
     self.estimator = Estimator(self.model_fn)
     self.exit_flag = multiprocessing.Event()
     self.logger = set_logger('WORKER-%d' % self.worker_id)
Example #8
 def init(self):
     bert_config = modeling.BertConfig.from_json_file(CONFIG)
     model_fn = model_fn_builder(bert_config=bert_config,
                                 init_checkpoint=CKPT,
                                 layer_indexes=_layers,
                                 use_tpu=False,
                                 use_one_hot_embeddings=False)
     self._estimator = tf.contrib.tpu.TPUEstimator(
         model_fn=model_fn,
         model_dir=MODEL_DIR,
         use_tpu=False,
         predict_batch_size=32,
         config=tf.contrib.tpu.RunConfig(
             master=None,
             tpu_config=tf.contrib.tpu.TPUConfig(
                 num_shards=8,
                 per_host_input_for_training=tf.contrib.tpu.
                 InputPipelineConfig.PER_HOST_V2)))
     self._tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB,
                                                  do_lower_case=False)
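
Note that tf.contrib was removed in TensorFlow 2.x, so the tf.contrib.tpu calls above only run on 1.x installs. The same run configuration written against the compat API used in Example #1 would look roughly like this:

run_config = tf.compat.v1.estimator.tpu.RunConfig(
    master=None,
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
        num_shards=8,
        per_host_input_for_training=tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2))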
Example #9
    def __init__(self):
        # the pooling layer index for the original BERT model
        self.pooling_layer = [-2]
        # the pooling strategy for the original BERT model
        self.pooling_strategy = PoolingStrategy.REDUCE_MEAN
        # The maximum total input sequence length after WordPiece tokenization.
        # Sequences longer than this will be truncated, and sequences shorter
        # than this will be padded.
        self.max_seq_len = 128

        self.bert_model_dir = sys_conf["bert_dir"]
        self.config_fp = os.path.join(self.bert_model_dir, "bert_config.json")
        self.ckpt_fp = os.path.join(self.bert_model_dir, "bert_model.ckpt")
        self.vocab_fp = os.path.join(self.bert_model_dir, "vocab.txt")
        self.tokenizer = tokenization.FullTokenizer(vocab_file=self.vocab_fp)
        self.model_fn = model_fn_builder(
            bert_config=modeling.BertConfig.from_json_file(self.config_fp),
            init_checkpoint=self.ckpt_fp,
            pooling_strategy=self.pooling_strategy,
            pooling_layer=self.pooling_layer)
        self.estimator = Estimator(self.model_fn)
Example #10
import os

import tensorflow as tf
from tensorflow.python.estimator.estimator import Estimator
from tensorflow.python.estimator.run_config import RunConfig

from bert import modeling, tokenization
from bert.extract_features import model_fn_builder, convert_lst_to_features, PoolingStrategy

# get the parent directory of the current file
path = os.path.dirname(os.path.abspath(__file__))
model_dir = "/Users/yucong/PycharmProjects/helloAi/bert"
config_fp = os.path.join(model_dir, 'bert_config.json')
checkpoint_fp = os.path.join(model_dir, 'bert_model.ckpt')
vocab_fp = os.path.join(model_dir, 'vocab.txt')
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_fp)
max_seq_len = 10
# leftovers from the worker-class versions of this code; id here is just the Python
# builtin, so these two assignments are effectively unused
worker_id = id
daemon = True
model_fn = model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(config_fp),
    init_checkpoint=checkpoint_fp,
    pooling_strategy=PoolingStrategy.NONE,
    pooling_layer=[-2])
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.per_process_gpu_memory_fraction = 0.3
estimator = Estimator(model_fn,
                      config=RunConfig(session_config=config),
                      model_dir=None)


def input_fn_builder(msg):
    def gen():
        for i in range(1):
            tmp_f = list(convert_lst_to_features(msg, max_seq_len, tokenizer))
            # yield one batch of ids/masks/segment ids built from the converted features
            yield {
                'input_ids': [f.input_ids for f in tmp_f],
                'input_mask': [f.input_mask for f in tmp_f],
                'input_type_ids': [f.input_type_ids for f in tmp_f]
            }
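
    # The original snippet ends above; what follows is a hedged completion based on
    # the bert-as-service pattern this code mirrors: wrap gen() in a tf.data-backed
    # input_fn and return it so the Estimator can consume it.
    def input_fn():
        return tf.data.Dataset.from_generator(
            gen,
            output_types={'input_ids': tf.int32,
                          'input_mask': tf.int32,
                          'input_type_ids': tf.int32},
            output_shapes={'input_ids': (None, max_seq_len),
                           'input_mask': (None, max_seq_len),
                           'input_type_ids': (None, max_seq_len)})

    return input_fn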