Example #1
    def _init_bert_client(model_dir, max_seq_len, device_map,
                          num_worker) -> BertClient:
        """Initialize bert client for sentence embeddings and avoid restarting bert-server if already running.

        For more information, see: https://github.com/hanxiao/bert-as-service
        Bert-server can take a long time to start, take over stdout during training, and create many temp log files.
        It's highly recommended to run bert-server beforehand from command-line in a dedicated folder:
        e.g:
        ~/gym-summarizer/data/bert $
            bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -max_seq_len 40 -device_map 1 2 3 4 -num_worker 4

        :param model_dir: directory containing bert model
        :param max_seq_len: max sequence length for bert
        :return bc: bert-client
        """

        try:
            bc = BertClient()
        except Exception:
            # could not connect to an existing server: fall back to launching one locally.
            # argparse only accepts string arguments, hence the str() conversions; device_map
            # is assumed to be an iterable of GPU ids (cf. the CLI example in the docstring).
            from bert_serving.server.helper import get_args_parser
            from bert_serving.server import BertServer
            args = get_args_parser().parse_args([
                '-model_dir', model_dir, '-max_seq_len', str(max_seq_len),
                '-num_worker', str(num_worker),
                '-device_map', *[str(d) for d in device_map]
            ])
            server = BertServer(args)
            server.start()
            bc = BertClient()

        return bc
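
For reference, the client returned above is a plain BertClient, so encoding is just a call to encode(); a minimal usage sketch (assuming a bert-server is already reachable on the default ports and an uncased BERT-base model, so vectors are 768-dimensional):

from bert_serving.client import BertClient

bc = BertClient()  # connects to a bert-server on the default ports 5555/5556
vecs = bc.encode(['The quick brown fox jumps over the lazy dog.'])
print(vecs.shape)  # (1, 768) for BERT-base with the default REDUCE_MEAN pooling
bc.close()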
Example #2
 def post_init(self):
     from bert_serving.server import BertServer
     from bert_serving.server.helper import get_args_parser
     self.bert_server = BertServer(get_args_parser().parse_args(
         self._bert_args))
     self.bert_server.start()
     self.bert_server.is_ready.wait()
Example #3
class BERTserver:
    """
    Sentence encoder using BERT (server side)
    """

    def __init__(self, path,  port='5555', port_out='5556', pooling_strategy='REDUCE_MEAN'):
        """
        Word vocabulary initialization
        
        Args:
            path (str): BERT pretrained vector path
            port (str, optional, defaults to '5555'): server port for receiving data from client
            port_out(str, optional, defaults to '5556'): server port for sending result to client
            pooling_strategy(str, optional, defaults to `REDUCE_MEAN`): {NONE, REDUCE_MAX, REDUCE_MEAN, REDUCE_MEAN_MAX, FIRST_TOKEN, LAST_TOKEN}
        """
        self.__port = port
        self.__port_out = port_out
        args = get_args_parser().parse_args(['-model_dir', path,
                                     '-port', self.__port,
                                     '-port_out', self.__port_out,
                                     '-max_seq_len', 'NONE',
                                     '-mask_cls_sep',
                                     '-cpu',
                                     '-pooling_strategy', pooling_strategy])
        self.__server = BertServer(args)
        self.__server.start()
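
A hypothetical companion client for this class would connect to the same ports that were passed to BERTserver; a minimal sketch (the model path below is only a placeholder):

from bert_serving.client import BertClient

server = BERTserver('/path/to/uncased_L-12_H-768_A-12')  # placeholder path to a pretrained BERT model
bc = BertClient(port=5555, port_out=5556)  # must match the port/port_out given to BERTserver
embeddings = bc.encode(['each sentence becomes one REDUCE_MEAN-pooled vector'])
bc.close()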
Example #4
def main():
    # NOTE: valid TCP ports are <= 65535, so the '86500'/'86501' values below are out of range as written
    args = get_args_parser().parse_args([
        '-model_dir', r'../data/chinese_L-12_H-768_A-12', '-port', '86500',
        '-port_out', '86501', '-max_seq_len', '512', '-mask_cls_sep', '-cpu'
    ])

    bs = BertServer(args)
    bs.start()
Example #5
 def __init__(self, model_path):
     args = get_args_parser().parse_args([
         '-num_worker', '4', '-model_dir', model_path, '-port', '5555',
         '-port_out', '5556', '-max_seq_len', 'NONE', '-mask_cls_sep',
         '-cpu'
     ])
     # For details, see: https://github.com/hanxiao/bert-as-service
     self._server = BertServer(args)
Example #6
def main():
    args = get_args_parser().parse_args([
        '-model_dir', './uncased_L-12_H-768_A-12', '-port', '5555',
        '-port_out', '5556', '-max_seq_len', '25', '-num_worker', '1',
        '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #7
def start_server(max_seq_len, pretrained_model):
    args = get_args_parser().parse_args([
        '-model_dir', pretrained_model, '-port', '5555', '-port_out', '5556',
        '-pooling_strategy', 'NONE', '-show_tokens_to_client', '-max_seq_len',
        str(max_seq_len), '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #8
def main():
    args = get_args_parser().parse_args([
        '-model_dir', './biobert', '-ckpt_name', 'model.ckpt-1000000', '-port',
        '5555', '-port_out', '5556', '-max_seq_len', '30', '-num_worker', '1',
        '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #9
def start_bert_server():
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    args = get_args_parser().parse_args(['-model_dir', 'YOUR_MODEL_PATH_HERE',
                                         '-port', '5555',
                                         '-port_out', '5556',
                                         '-num_worker', '1',  # '-num_worker' requires an explicit count; '1' is an assumed default
                                         '-cpu'])
    server = BertServer(args)
    server.start()
Example #10
 def __init__(self):
     args = get_args_parser().parse_args([
         '-model_dir',
         '/Data_HDD/zhipengye/projects/bert/multi_cased_L-12_H-768_A-12',
         '-port', '5555', '-port_out', '5556', '-max_seq_len', 'NONE',
         '-mask_cls_sep', '-cpu'
     ])
     self.server = BertServer(args)
     self.server.start()
      print('bert server has started')
Example #11
def main():
    args = get_args_parser().parse_args(
        ['-model_dir', 'uncased_L-12_H-768_A-12'])
    #                                      ,'-port', '5555',
    #                                      '-port_out', '5556',
    #                                      '-max_seq_len', 'NONE',
    #                                      '-mask_cls_sep',
    #                                      '-cpu'])
    server = BertServer(args)
    server.start()
Example #12
def bert_server_start():
    # Thanks to the Harbin Institute of Technology AI team for providing the BERT service
    args = get_args_parser().parse_args(['-num_worker', '1',
                                         '-model_dir', BERT_MODEL_PATH,
                                         '-port', '5555',
                                         '-port_out', '5556',
                                         '-max_seq_len', 'NONE',
                                         '-mask_cls_sep',
                                         '-cpu'])
    bert_server = BertServer(args)
    bert_server.start()
Example #13
def natural_language_to_embeddings(dataset_file):  # dataset in a CSV file format
    args = get_args_parser().parse_args([
        '-model_dir', 'C:\\Users\\Ronak\\Desktop\\uncased_L-12_H-768_A-12',
        '-port', '5555', '-port_out', '5556', '-max_seq_len', 'NONE',
        '-mask_cls_sep', '-cpu'
    ])
    server = BertServer(args)
    server.start()
    bc = BertClient()
    df = pd.read_csv(dataset_file)
    # use the first column of the CSV as the natural-language sentences
    nat_lan_sen = [row[0] for row in df.values.tolist()]
    sen_encodings = bc.encode(nat_lan_sen)
    return sen_encodings
Example #14
def bert_service_start(switch=True):  # start the bert service and its monitor; the argument toggles the service on/off
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    from bert_serving.client import BertClient

    args = get_args_parser().parse_args(
        ['-model_dir', r'models\chinese_L-12_H-768_A-12'])  # raw string so the backslash is not treated as an escape
    server = BertServer(args)
    if switch:
        server.start()
    else:
        pass
Example #15
def start():

    args = get_args_parser().parse_args([
        '-model_dir',
        '/Users/henry/Documents/application/multi-label-bert/data/chinese_L-12_H-768_A-12/',
        '-tuned_model_dir',
        '/Users/henry/Documents/application/nlp_assignments/data/KnowledgeLabel/corpus2/output/',
        '-port', '12544', '-ckpt_name', 'model.ckpt-1000', '-port_out',
        '12546', '-http_port', '12547', '-max_seq_len', '128', '-mask_cls_sep',
        '-show_tokens_to_client', '-pooling_strategy', 'NONE', '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #16
def launch_bert_as_service_server(model_name, layer, encoding_level = None, pooling_strategy = None):
    """
    Launches a BERT-as-service server used to encode the sentences using the designated BERT model
    https://github.com/hanxiao/bert-as-service

    Args:
        - :param: `model_name` (str): the specific bert model to use
        - :param: `layer` (int): the layer of representation to use
        - :param: `encoding_level` (int): n-gram encoding level - designates how many word vectors to combine for each final embedding vector;
                                        if None -> embedding level defaults to the sentence level of each individual sentence
        - :param: `pooling_strategy` (str): the vector combination strategy - used when `encoding_level` is None (sentence-level encoding)
    
    """

    model_path = bert_model_directories[model_name]
    pooling_layer = layers_base[layer]

    server_parameters = ""
        
    if encoding_level is None:

        if pooling_strategy not in pooling_strategies:
            print('"pooling_strategy" must be defined as one of the following:', pooling_strategies)
            return

        server_parameters = get_args_parser().parse_args(['-model_dir', model_path,
                                        '-port', '5555',
                                        '-port_out', '5556',
                                        '-max_seq_len', 'NONE',
                                        '-pooling_layer', pooling_layer,
                                        '-pooling_strategy', pooling_strategy,
                                        '-num_worker', '1'])
    
    elif encoding_level >= 1:
        server_parameters = get_args_parser().parse_args(['-model_dir', model_path,
                                        '-port', '5555',
                                        '-port_out', '5556',
                                        '-max_seq_len', 'NONE',
                                        '-pooling_layer', pooling_layer,
                                        '-pooling_strategy', 'NONE',
                                        '-num_worker', '1'])
    else:
        print('"encoding_level" must be >=1 or None, see README for descriptions')
        return

    server = BertServer(server_parameters)
    print("LAUNCHING SERVER, PLEASE HOLD", '\n')
    server.start()
    # Include a check here that ensures that the server is running before printing the statement below (see the sketch after this example)
    print("SERVER RUNNING, BEGINNING ENCODING...")
Example #17
def get_sentence_embedding(rpath, wpath):
    bc = BertClient()
    with open(wpath, 'w') as wf:
        with open(rpath, 'r') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, total=len(lines)):
                user = json.loads(line.strip())
                tips = [
                    t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
                ]
                emb_tips = bc.encode(tips)
                user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
                wf.write(json.dumps(user) + '\n')
    BertServer.shutdown(port=5555)  # shutdown needs the port of the server to stop; 5555 assumes the default (cf. Examples #23 and #28)
Example #18
def main():
    from bert_serving.server import BertServer
    from bert_serving.server.helper import get_run_args

    import tensorflow as tf
    tf.compat.v1.disable_eager_execution()

    with BertServer(get_run_args()) as server:
        server.join()
Example #19
    def save_vectors(self, save_to):
        max_length = self.corpus.max_length
        corpus_type = self.corpus.corpus_type
        length_threshold = self.corpus.length_threshold
        length = len(self.corpus)
        sentences = self.corpus.sentences
        bert = BertWordEmbedding()
        train = list()
        counter = 0
        with BertServer(bert.start_args):
            with BertClient() as client:
                if corpus_type == "train":
                    for sent in sentences:
                        vector = sent.get_vector(client, bert)
                        labels = np.pad(
                            sent.labels,
                            (0, length_threshold - vector.shape[0]),
                            mode="constant",
                            constant_values=0,
                        )  # padding  0.
                        # for solving the broadcast problem
                        labels = np.expand_dims(labels, axis=0)
                        vector = np.pad(
                            vector,
                            [(0, length_threshold - vector.shape[0]), (0, 0)],
                            mode="constant",
                            constant_values=0,
                        )  # padding  0.
                        train.append((vector, labels))
                        counter += 1
                        logging.info(
                            f"sentence id:=================={counter}")
                        if counter % 2000 == 0 or counter == length:
                            logging.info(
                                f"training set {counter} is done ===========")
                            # write each chunk under its own name instead of repeatedly appending the suffix to save_to
                            chunk_path = save_to + "_" + str(counter)
                            np.save(chunk_path, np.asarray(train))
                            train = list()
                elif corpus_type == "test":
                    test = list()
                    for sent in sentences:
                        vector = sent.get_vector(client, bert)
                        vector = np.pad(
                            vector,
                            [(0, length_threshold - vector.shape[0]), (0, 0)],
                            mode="constant",
                            constant_values=0,
                        )
                        test.append(vector)
                    test = np.asarray(test)
                    np.save(save_to, test)
                    # TODO: break for just 3000

        logging.info("training set is done ===========")
Example #20
def start_bert_server():
    args = get_args_parser().parse_args([
        '-model_dir',
        BERT_MODEL_PATH,
        '-max_seq_len',
        str(MAX_TEXTLEN),
        '-max_batch_size',
        str(MAX_SEQLEN),
        #'-pooling_strategy', 'NONE',
        '-num_worker',
        str(multiprocessing.cpu_count()),
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-cased_tokenization',
        '-cpu'
    ])
    server = BertServer(args)
    server.start()
Example #21
class BertEncoderServer(BaseTextEncoder):
    store_args_kwargs = True
    is_trained = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        bert_args = ['-%s' % v for v in args]
        for k, v in kwargs.items():
            bert_args.append('-%s' % k)
            bert_args.append(str(v))
        self._bert_args = bert_args

    def post_init(self):
        from bert_serving.server import BertServer
        from bert_serving.server.helper import get_args_parser
        self.bert_server = BertServer(get_args_parser().parse_args(self._bert_args))
        self.bert_server.start()
        self.bert_server.is_ready.wait()

    def close(self):
        self.bert_server.close()
Example #22
def main():
    from bert_serving.server import BertServer
    from bert_serving.server.helper import get_run_args
    args = get_run_args()
    server = BertServer(args)
    server.start()
    server.join()
Example #23
def get_model(TUNED_FLAG=False):
    args = [
        '-model_dir',
        'english_L-12_H-768_A-12/',
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-max_seq_len',
        'NONE',
        '-mask_cls_sep',
        '-num_worker',
        '4',
        '-cpu',
    ]
    if TUNED_FLAG:
        args.extend([
            '-tuned_model_dir',
            '/tmp/mrpc_output/',
            '-ckpt_name',
            'model.ckpt-343',
        ])

    bert_args = get_args_parser().parse_args(args)
    server = BertServer(bert_args)
    server.start()
    BertServer.shutdown(port=5555)
Example #24
def run_benchmark(args):
    from copy import deepcopy
    from bert_serving.server import BertServer

    # load vocabulary
    with open(args.client_vocab_file, encoding='utf8') as fp:
        vocab = list(set(vv for v in fp for vv in v.strip().split()))
    print('vocabulary size: %d' % len(vocab))

    # select those non-empty test cases
    all_exp_names = [
        k.replace('test_', '') for k, v in vars(args).items()
        if k.startswith('test_') and v
    ]

    for exp_name in all_exp_names:
        # set common args
        cargs = deepcopy(args)
        exp_vars = vars(args)['test_%s' % exp_name]
        avg_speed = []

        for cvar in exp_vars:
            # override exp args
            setattr(cargs, exp_name, cvar)
            server = BertServer(cargs)
            server.start()
            time.sleep(cargs.wait_till_ready)

            # sleep until server is ready
            all_clients = [
                BenchmarkClient(cargs, vocab) for _ in range(cargs.num_client)
            ]
            for bc in all_clients:
                bc.start()

            clients_speed = []
            for bc in all_clients:
                bc.join()
                clients_speed.append(cargs.client_batch_size / bc.avg_time)
            server.close()

            max_speed, min_speed, cavg_speed = int(max(clients_speed)), int(
                min(clients_speed)), int(mean(clients_speed))

            print('avg speed: %d\tmax speed: %d\tmin speed: %d' %
                  (cavg_speed, max_speed, min_speed),
                  flush=True)

            avg_speed.append(cavg_speed)

        with open(
                'benchmark-%d%s.result' %
            (args.num_worker, '-fp16' if args.fp16 else ''), 'a') as fw:
            print('\n|`%s`\t|samples/s|\n|---|---|' % exp_name, file=fw)
            for cvar, cavg_speed in zip(exp_vars, avg_speed):
                print('|%s\t|%d|' % (cvar, cavg_speed), file=fw)
            # for additional plotting
            print('\n%s = %s\n%s = %s' %
                  (exp_name, exp_vars, 'speed', avg_speed),
                  file=fw)
Example #25
def run_benchmark(args):
    from copy import deepcopy
    from bert_serving.server import BertServer

    # load vocabulary
    with open(args.client_vocab_file, encoding='utf8') as fp:
        vocab = list(set(vv for v in fp for vv in v.strip().split()))
    print('vocabulary size: %d' % len(vocab))

    all_exp_names = [
        k.replace('test_', '') for k in vars(args).keys()
        if k.startswith('test_')
    ]
    fp = open(
        'benchmark-%d%s.result' %
        (args.num_worker, '-fp16' if args.fp16 else ''), 'w')
    for exp_name in all_exp_names:
        # set common args
        cargs = deepcopy(args)
        exp_vars = vars(args)['test_%s' % exp_name]
        avg_speed = []
        fp.write('\n%s\tsamples/s\n' % exp_name)
        for cvar in exp_vars:
            # override exp args
            setattr(cargs, exp_name, cvar)
            server = BertServer(cargs)
            server.start()
            time.sleep(cargs.wait_till_ready)

            # sleep until server is ready
            all_clients = [
                BenchmarkClient(cargs, vocab) for _ in range(cargs.num_client)
            ]
            for bc in all_clients:
                bc.start()

            clients_speed = []
            for bc in all_clients:
                bc.join()
                clients_speed.append(cargs.client_batch_size / bc.avg_time)
            server.close()

            max_speed, min_speed, cavg_speed = int(max(clients_speed)), int(
                min(clients_speed)), int(mean(clients_speed))

            print('avg speed: %d\tmax speed: %d\tmin speed: %d' %
                  (cavg_speed, max_speed, min_speed),
                  flush=True)
            fp.write('%s\t%d\n' % (cvar, cavg_speed))
            fp.flush()
            avg_speed.append(cavg_speed)

        # for plotting
        fp.write('%s\n%s\n' % (exp_vars, avg_speed))
        fp.flush()
    fp.close()
Example #26
def save_emb():

    common = [
        '-model_dir',
        '/home/ydu/BERT/uncased_L-12_H-768_A-12/',
        '-num_worker',
        '2',
        '-port',
        '5555',
        '-port_out',
        '5556',
        '-max_seq_len',
        '128',
        '-max_batch_size',
        '256',
        # '-tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data/',
        # '-ckpt_name', 'model.ckpt-2500',
    ]

    args = get_args_parser().parse_args(common)

    # folder = ['books', 'dvd', 'electronics', 'kitchen']
    data_path = '/home/ydu/BERT/DATA/'
    data_folder = ['metacritic', 'imdb', 'amazon', 'reddit']

    # model_path = 'home/ydu/BERT/bert_mgpu/results/'
    # model_folder = 'amazon-balanced/'
    # model_type = 'bert-tune'
    data = {}

    # setattr(args, 'tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/reddit-pretrain')
    # setattr(args, 'ckpt_name', 'model.ckpt-2500')
    setattr(args, 'tuned_model_dir',
            '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data')
    setattr(args, 'ckpt_name', 'model.ckpt-2500')

    for d in data_folder:
        fn = data_path + d + '/all.tsv'
        print("===========", fn, "================")
        text = read_tsv(fn)
        server = BertServer(args)
        server.start()
        print('wait until server is ready...')
        time.sleep(20)
        print('encoding...')
        bc = BertClient()
        data[d] = bc.encode(text)
        bc.close()
        server.close()

    pickle_name = data_path + 'EMB/allpre_emb.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return pickle_name
Example #27
def extract_topics_all(issues_path, model_dir, topic_file, n_topics):
    """Extract topics for all issues with top n_topics topics"""
    topic_all = []
    text_all, divide_list = combine_issues(issues_path)
    topics = tp.get_topic_list(topic_file)
    topic_embedding = tp.get_topic_embedding(topics, port=3500, port_out=3501, model_path=model_dir)
    #topic_embedding = np.load('../output/topic_embedding.npy')
    print('topic embedding shape = ', topic_embedding.shape)
    stop_words = tp.expand_stopwords()
    print(len(stop_words))
    text_flat_tokenized, text_article_tokenized = tp.bert_tokens(text_all)
    tfidf_biglist = tp.tfidf_vec(text_flat_tokenized, stop_words)
    port_in = 6550
    port_out = 6551
    tmp_dir = './output/tmp'
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    os.environ['ZEROMQ_SOCK_TMP_DIR'] = tmp_dir  # must be set as an environment variable, not a local name
    common = [
        '-model_dir', model_dir,
        '-num_worker', '2',
        '-port', str(port_in),
        '-port_out', str(port_out),
        '-max_seq_len', '20',
        '-max_batch_size', '256',
        '-pooling_strategy', 'NONE',
        '-pooling_layer', '-2',
        '-graph_tmp_dir', tmp_dir,
        '-cpu',
        '-show_tokens_to_client',
    ]
    args = get_args_parser().parse_args(common)
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(20)
    print('encoding...')        
    for issue_num in range(len(text_all)):  
        #issue_num = 0
        divide_list_each = divide_list[issue_num]
        text_one_issue = text_all[issue_num]
        vec = tp.get_word_embedding_server_on(text_one_issue, port=port_in, port_out=port_out)
        topics_issue, sort_topic_sim = tp.get_topics_one_issue(vec,topic_embedding,topics, divide_list_each, 
                                               tfidf_biglist, issue_num, n_topics)
        topic_all.append(topics_issue)       
    server.close()
    topic_folder = './output/topic'
    if not os.path.isdir(topic_folder):
        os.makedirs(topic_folder)
    with open(topic_folder + '/topic.pkl', 'wb') as f:
        pickle.dump(topic_all, f)
    return topic_all
Example #28
def get_word_embedding(rpath, wpath):
    args = get_args_parser().parse_args([
        '-model_dir', BERT_MODEL_PATH, '-max_seq_len',
        str(MAX_TEXTLEN), '-max_batch_size',
        str(MAX_SEQLEN), '-pooling_strategy', 'NONE', '-num_worker', '8',
        '-port', '5555', '-port_out', '5556', '-cased_tokenization', '-cpu'
    ])
    server = BertServer(args)
    server.start()
    bc = BertClient()
    with open(wpath, 'w') as wf:
        with open(rpath, 'r') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, total=len(lines)):
                user = json.loads(line.strip())
                tips = [
                    t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
                ]
                emb_tips = bc.encode(tips)
                user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
                wf.write(json.dumps(user) + '\n')
    BertServer.shutdown(args)
Example #29
    'num_client': 1,
    'pooling_strategy': PoolingStrategy.REDUCE_MEAN,
    'pooling_layer': [-2],
    'gpu_memory_fraction': 0.5,
    'xla': False,
    'cpu': False,
    'verbose': False,
    'device_map': []
}
args = namedtuple('args_namedtuple', ','.join(common.keys()))
for k, v in common.items():
    setattr(args, k, v)

for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(15)
    print('encoding...')
    bc = BertClient(port=common['port'],
                    port_out=common['port_out'],
                    show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)


def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'):
    plt.close()
Example #30
import sys

from bert_serving.server import BertServer
from bert_serving.server.helper import get_run_args

if __name__ == '__main__':
    args = get_run_args()
    server = BertServer(args)
    server.start()
    server.join()