def _init_bert_client(model_dir, max_seq_len, device_map, num_worker) -> BertClient:
    """Initialize a BertClient, starting a local bert-server only if none is running.

    For more information, see: https://github.com/hanxiao/bert-as-service

    Bert-server can take a long time to start, take over stdout during training,
    and create many temp log files. It's highly recommended to run bert-server
    beforehand from the command line in a dedicated folder, e.g.:

        ~/gym-summarizer/data/bert $ bert-serving-start -model_dir uncased_L-12_H-768_A-12/ \
            -max_seq_len 40 -device_map 1 2 3 4 -num_worker 4

    :param model_dir: directory containing the BERT model
    :param max_seq_len: max sequence length for BERT
    :param device_map: GPU device id(s) for the server workers
    :param num_worker: number of server workers
    :return bc: connected bert-client
    """
    try:
        bc = BertClient()
    except Exception:
        # BUG FIX: was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit; narrow to Exception so interrupts still propagate.
        from bert_serving.server.helper import get_args_parser
        from bert_serving.server import BertServer

        # argparse only accepts strings; coerce in case callers pass ints
        args = get_args_parser().parse_args([
            '-model_dir', str(model_dir),
            '-max_seq_len', str(max_seq_len),
            '-device_map', str(device_map),
            '-num_worker', str(num_worker),
        ])
        server = BertServer(args)
        server.start()
        bc = BertClient()
    return bc
def post_init(self):
    """Launch the embedded bert-serving server from the pre-built CLI args and block until ready."""
    from bert_serving.server import BertServer
    from bert_serving.server import get_args_parser

    parsed_args = get_args_parser().parse_args(self._bert_args)
    self.bert_server = BertServer(parsed_args)
    self.bert_server.start()
    self.bert_server.is_ready.wait()
class BERTserver:
    """Sentence encoder using BERT (server side)."""

    def __init__(self, path, port='5555', port_out='5556', pooling_strategy='REDUCE_MEAN'):
        """Start a CPU bert-serving server for the given pretrained model.

        Args:
            path (str): BERT pretrained vector path
            port (str, optional, defaults to '5555'): server port for
                receiving data from the client
            port_out (str, optional, defaults to '5556'): server port for
                sending results to the client
            pooling_strategy (str, optional, defaults to 'REDUCE_MEAN'):
                one of {NONE, REDUCE_MAX, REDUCE_MEAN, REDUCE_MEAN_MAX,
                FIRST_TOKEN, LAST_TOKEN}
        """
        self.__port = port
        self.__port_out = port_out
        cli_args = [
            '-model_dir', path,
            '-port', self.__port,
            '-port_out', self.__port_out,
            '-max_seq_len', 'NONE',
            '-mask_cls_sep',
            '-cpu',
            '-pooling_strategy', pooling_strategy,
        ]
        self.__server = BertServer(get_args_parser().parse_args(cli_args))
        self.__server.start()
def main():
    """Launch a CPU bert-serving server for the Chinese BERT-base model."""
    # BUG FIX: the original ports (86500/86501) exceed the maximum valid TCP
    # port number (65535), so the server could never bind. Use valid ports.
    args = get_args_parser().parse_args([
        '-model_dir', r'../data/chinese_L-12_H-768_A-12',
        '-port', '8650',
        '-port_out', '8651',
        '-max_seq_len', '512',
        '-mask_cls_sep',
        '-cpu'
    ])
    bs = BertServer(args)
    bs.start()
def __init__(self, model_path):
    """Build (but do not start) a local bert-serving server for *model_path*.

    For details, see: https://github.com/hanxiao/bert-as-service
    """
    server_cli = [
        '-num_worker', '4',
        '-model_dir', model_path,
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu',
    ]
    self._server = BertServer(get_args_parser().parse_args(server_cli))
def main():
    """Launch a single-worker CPU bert-serving server for the uncased BERT-base model."""
    cli = [
        '-model_dir', './uncased_L-12_H-768_A-12',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '25',
        '-num_worker', '1',
        '-mask_cls_sep',
        '-cpu',
    ]
    server = BertServer(get_args_parser().parse_args(cli))
    server.start()
def start_server(max_seq_len, pretrained_model):
    """Start a bert-serving server that returns token-level embeddings.

    :param max_seq_len: maximum sequence length passed to the server
    :param pretrained_model: path to the pretrained BERT model directory
    """
    cli = [
        '-model_dir', pretrained_model,
        '-port', '5555',
        '-port_out', '5556',
        '-pooling_strategy', 'NONE',
        '-show_tokens_to_client',
        '-max_seq_len', str(max_seq_len),
        '-mask_cls_sep',
        '-cpu',
    ]
    server = BertServer(get_args_parser().parse_args(cli))
    server.start()
def main():
    """Launch a single-worker CPU bert-serving server for the BioBERT checkpoint."""
    cli = [
        '-model_dir', './biobert',
        '-ckpt_name', 'model.ckpt-1000000',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '30',
        '-num_worker', '1',
        '-mask_cls_sep',
        '-cpu',
    ]
    server = BertServer(get_args_parser().parse_args(cli))
    server.start()
def start_bert_server():
    """Start a local single-worker CPU bert-serving server on ports 5555/5556."""
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer

    # BUG FIX: the original list was ['-num_worker', '-cpu'], which made
    # argparse treat '-cpu' as the value of -num_worker and crash when it
    # tried to convert it to an int. Supply an explicit worker count.
    args = get_args_parser().parse_args([
        '-model_dir', 'YOUR_MODEL_PATH_HERE',
        '-port', '5555',
        '-port_out', '5556',
        '-num_worker', '1',
        '-cpu',
    ])
    server = BertServer(args)
    server.start()
def __init__(self):
    """Start a local CPU bert-serving server for the multilingual cased BERT-base model."""
    args = get_args_parser().parse_args([
        '-model_dir', '/Data_HDD/zhipengye/projects/bert/multi_cased_L-12_H-768_A-12',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu'
    ])
    self.server = BertServer(args)
    self.server.start()
    # BUG FIX: corrected the typo "sever" -> "server" in the startup message
    print('bert server has started')
def main():
    """Start a bert-serving server with default options for the uncased BERT-base model."""
    cli = ['-model_dir', 'uncased_L-12_H-768_A-12']
    server = BertServer(get_args_parser().parse_args(cli))
    server.start()
def bert_server_start():
    """Start the BERT embedding server.

    (Original note, translated: thanks to the HIT AI team for providing the
    BERT service.)
    """
    cli = [
        '-num_worker', '1',
        '-model_dir', BERT_MODEL_PATH,
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(cli))
    bert_server.start()
def natural_language_to_embeddings(dataset_file):
    """Encode the first column of a CSV file into BERT sentence embeddings.

    :param dataset_file: path to a CSV file whose first column holds sentences
    :return: array of sentence encodings from BertClient.encode
    """
    args = get_args_parser().parse_args([
        '-model_dir', 'C:\\Users\\Ronak\\Desktop\\uncased_L-12_H-768_A-12',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu'
    ])
    server = BertServer(args)
    server.start()
    bc = BertClient()
    df = pd.read_csv(dataset_file)
    # IDIOM FIX: first column of each row holds the sentence; a comprehension
    # replaces the original manual append loop.
    nat_lan_sen = [row[0] for row in df.values.tolist()]
    sen_encodings = bc.encode(nat_lan_sen)
    return sen_encodings
def bert_service_start(switch=True):
    """Start the BERT service and its monitor; *switch* toggles whether to start.

    (Original comment translated from Chinese.)
    """
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    from bert_serving.client import BertClient

    parsed = get_args_parser().parse_args(
        ['-model_dir', 'models\chinese_L-12_H-768_A-12'])
    server = BertServer(parsed)
    if switch:
        server.start()
def start():
    """Start a fine-tuned bert-serving server with token-level output (CPU mode)."""
    cli = [
        '-model_dir',
        '/Users/henry/Documents/application/multi-label-bert/data/chinese_L-12_H-768_A-12/',
        '-tuned_model_dir',
        '/Users/henry/Documents/application/nlp_assignments/data/KnowledgeLabel/corpus2/output/',
        '-port', '12544',
        '-ckpt_name', 'model.ckpt-1000',
        '-port_out', '12546',
        '-http_port', '12547',
        '-max_seq_len', '128',
        '-mask_cls_sep',
        '-show_tokens_to_client',
        '-pooling_strategy', 'NONE',
        '-cpu',
    ]
    server = BertServer(get_args_parser().parse_args(cli))
    server.start()
def launch_bert_as_service_server(model_name, layer, encoding_level=None, pooling_strategy=None):
    """Launch a BERT-as-service server used to encode sentences with the chosen BERT model.

    https://github.com/hanxiao/bert-as-service

    Args:
    - :param: `model_name` (str): the specific bert model to use
    - :param: `layer` (int): the layer of representation to use
    - :param 'encoding_level' (int): n-gram encoding level - designates how many
        word vectors to combine for each final embedding vector; if None, the
        embedding level defaults to the sentence level of each sentence
    - :param: `pooling_strategy` (str): the vector combination strategy - used
        when 'encoding_level' is None
    """
    model_path = bert_model_directories[model_name]
    pooling_layer = layers_base[layer]  # assumed already a CLI string - TODO confirm

    # Resolve the pooling strategy once so the arg list is built in one place
    # instead of two near-duplicate branches.
    if encoding_level is None:
        if pooling_strategy not in pooling_strategies:
            print('"pooling_strategy" must be defined as one of the following:', pooling_strategies)
            return
        strategy = pooling_strategy
    elif encoding_level >= 1:
        strategy = 'NONE'
    else:
        print('"encoding_level" must be >=1 or None, see README for descriptions')
        return

    # BUG FIX: the flag is '-num_worker' (singular) and its value must be a
    # plain number; the original passed '-num_workers' with the value '=1',
    # both rejected by the bert-serving argument parser.
    server_parameters = get_args_parser().parse_args([
        '-model_dir', model_path,
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-pooling_layer', pooling_layer,
        '-pooling_strategy', strategy,
        '-num_worker', '1',
    ])

    server = BertServer(server_parameters)
    print("LAUNCHING SERVER, PLEASE HOLD", '\n')
    server.start()
    # TODO: check that the server is actually ready before announcing it
    # BUG FIX: corrected the typo "BEGGINING" in the status message.
    print("SERVER RUNNING, BEGINNING ENCODING...")
def get_sentence_embedding(rpath, wpath):
    """Encode each user's tips via a running BERT server and write the records back out.

    :param rpath: input file, one JSON-encoded user per line
    :param wpath: output file, same records with a 'tips embedding' field added
    """
    bc = BertClient()
    with open(wpath, 'w') as wf:
        with open(rpath, 'r') as rf:
            lines = rf.readlines()
            for line in tqdm(lines, total=len(lines)):
                user = json.loads(line.strip())
                tips = [
                    t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
                ]
                emb_tips = bc.encode(tips)
                user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
                wf.write(json.dumps(user) + '\n')
    # BUG FIX: BertServer.shutdown() requires the server port (it sends a
    # termination message to it); calling it with no arguments fails.
    # 5555 is the bert-serving default — confirm it matches the running server.
    BertServer.shutdown(port=5555)
def main():
    """Run a bert-serving server (TF1-compat mode) until it exits."""
    import tensorflow as tf
    from bert_serving.server import BertServer
    from bert_serving.server.helper import get_run_args

    # bert-serving is TF1-era code; disable eager execution under TF2
    tf.compat.v1.disable_eager_execution()
    with BertServer(get_run_args()) as server:
        server.join()
def save_vectors(self, save_to):
    """Encode the corpus with BERT and save padded (vector, label) arrays as .npy files.

    Training corpora are saved in chunks of 2000 sentences; test corpora are
    saved as a single array.

    :param save_to: base path for the output .npy file(s)
    """
    corpus_type = self.corpus.corpus_type
    length_threshold = self.corpus.length_threshold
    length = len(self.corpus)
    sentences = self.corpus.sentences
    bert = BertWordEmbedding()
    train = list()
    counter = 0
    with BertServer(bert.start_args):
        with BertClient() as client:
            if corpus_type == "train":
                for sent in sentences:
                    vector = sent.get_vector(client, bert)
                    labels = np.pad(
                        sent.labels,
                        (0, length_threshold - vector.shape[0]),
                        mode="constant",
                        constant_values=0,
                    )  # padding 0.
                    # expand dims for solving the broadcast problem
                    labels = np.expand_dims(labels, axis=0)
                    vector = np.pad(
                        vector,
                        [(0, length_threshold - vector.shape[0]), (0, 0)],
                        mode="constant",
                        constant_values=0,
                    )  # padding 0.
                    train.append((vector, labels))
                    counter += 1
                    logging.info(f"sentence id:=================={counter}")
                    if counter % 2000 == 0 or counter == length:
                        logging.info(f"training set{counter} is done ===========")
                        # BUG FIX: the original rebound `save_to` itself each
                        # time, so chunk suffixes accumulated across saves
                        # ("_2000", then "_2000_4000", ...). Use a fresh name.
                        chunk_path = save_to + "_" + str(counter)
                        np.save(chunk_path, np.asarray(train))
                        train = list()
            elif corpus_type == "test":
                test = list()
                for sent in sentences:
                    vector = sent.get_vector(client, bert)
                    vector = np.pad(
                        vector,
                        [(0, length_threshold - vector.shape[0]), (0, 0)],
                        mode="constant",
                        constant_values=0,
                    )
                    test.append(vector)
                test = np.asarray(test)
                np.save(save_to, test)
                # TODO: break for just 3000
    logging.info("training set is done ===========")
def start_bert_server():
    """Start a CPU bert-serving server sized to the local core count."""
    cli = [
        '-model_dir', BERT_MODEL_PATH,
        '-max_seq_len', str(MAX_TEXTLEN),
        '-max_batch_size', str(MAX_SEQLEN),
        # '-pooling_strategy', 'NONE',
        '-num_worker', str(multiprocessing.cpu_count()),
        '-port', '5555',
        '-port_out', '5556',
        '-cased_tokenization',
        '-cpu',
    ]
    server = BertServer(get_args_parser().parse_args(cli))
    server.start()
class BertEncoderServer(BaseTextEncoder):
    """Text encoder that hosts its own embedded bert-serving server."""

    store_args_kwargs = True
    is_trained = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # translate positional/keyword arguments into bert-serving CLI flags
        cli_args = []
        for flag in args:
            cli_args.append('-%s' % flag)
        for key, value in kwargs.items():
            cli_args.extend(('-%s' % key, str(value)))
        self._bert_args = cli_args

    def post_init(self):
        from bert_serving.server import BertServer
        from bert_serving.server import get_args_parser

        parsed = get_args_parser().parse_args(self._bert_args)
        self.bert_server = BertServer(parsed)
        self.bert_server.start()
        self.bert_server.is_ready.wait()

    def close(self):
        self.bert_server.close()
def main():
    """Start a bert-serving server from CLI arguments and wait for it to finish."""
    from bert_serving.server import BertServer
    from bert_serving.server.helper import get_run_args

    server = BertServer(get_run_args())
    server.start()
    server.join()
def get_model(TUNED_FLAG=False):
    """Start a bert-serving server (optionally with a fine-tuned checkpoint), then shut it down.

    :param TUNED_FLAG: when True, also point the server at a fine-tuned model dir
    """
    # BUG FIX: 'num_worker' was missing its leading dash, so argparse rejected
    # it as an unexpected positional argument.
    args = [
        '-model_dir', 'english_L-12_H-768_A-12/',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-num_worker', '4',
        '-cpu',
    ]
    if TUNED_FLAG:
        args.extend([
            '-tuned_model_dir', '/tmp/mrpc_output/',
            '-ckpt_name', 'model.ckpt-343',
        ])
    bert_args = get_args_parser().parse_args(args)
    server = BertServer(bert_args)
    server.start()
    BertServer.shutdown(port=5555)
def run_benchmark(args):
    """Benchmark bert-serving across parameter sweeps and append results as markdown.

    For each non-empty `test_<param>` attribute on *args*, a server is started
    per parameter value, `num_client` BenchmarkClients are run against it, and
    the samples/s figures are appended to a `benchmark-*.result` file.

    :param args: parsed benchmark namespace; must carry `test_*` sweep lists,
        `client_vocab_file`, `num_client`, `client_batch_size`,
        `wait_till_ready`, `num_worker`, and `fp16`
    """
    from copy import deepcopy
    from bert_serving.server import BertServer

    # load vocabulary (unique whitespace-separated tokens from the vocab file)
    with open(args.client_vocab_file, encoding='utf8') as fp:
        vocab = list(set(vv for v in fp for vv in v.strip().split()))
    print('vocabulary size: %d' % len(vocab))

    # select those non-empty test cases
    all_exp_names = [
        k.replace('test_', '') for k, v in vars(args).items()
        if k.startswith('test_') and v
    ]

    for exp_name in all_exp_names:
        # set common args — deepcopy so each sweep starts from the base config
        cargs = deepcopy(args)
        exp_vars = vars(args)['test_%s' % exp_name]
        avg_speed = []

        for cvar in exp_vars:
            # override exp args with the current sweep value
            setattr(cargs, exp_name, cvar)
            server = BertServer(cargs)
            server.start()
            time.sleep(cargs.wait_till_ready)  # sleep until server is ready
            all_clients = [
                BenchmarkClient(cargs, vocab) for _ in range(cargs.num_client)
            ]
            for bc in all_clients:
                bc.start()
            clients_speed = []
            # join each client, then read its measured average time
            for bc in all_clients:
                bc.join()
                clients_speed.append(cargs.client_batch_size / bc.avg_time)
            server.close()
            max_speed, min_speed, cavg_speed = int(max(clients_speed)), int(
                min(clients_speed)), int(mean(clients_speed))
            print('avg speed: %d\tmax speed: %d\tmin speed: %d' %
                  (cavg_speed, max_speed, min_speed), flush=True)
            avg_speed.append(cavg_speed)

        # append this sweep's results as a markdown table
        with open(
                'benchmark-%d%s.result' %
                (args.num_worker, '-fp16' if args.fp16 else ''), 'a') as fw:
            print('\n|`%s`\t|samples/s|\n|---|---|' % exp_name, file=fw)
            for cvar, cavg_speed in zip(exp_vars, avg_speed):
                print('|%s\t|%d|' % (cvar, cavg_speed), file=fw)
            # for additional plotting
            print('\n%s = %s\n%s = %s' %
                  (exp_name, exp_vars, 'speed', avg_speed), file=fw)
def run_benchmark(args):
    """Benchmark bert-serving across parameter sweeps and write results to a file.

    For each `test_<param>` attribute on *args*, a server is started per
    parameter value, `num_client` BenchmarkClients are run against it, and the
    samples/s figures are written to a `benchmark-*.result` file.

    :param args: parsed benchmark namespace; must carry `test_*` sweep lists,
        `client_vocab_file`, `num_client`, `client_batch_size`,
        `wait_till_ready`, `num_worker`, and `fp16`
    """
    from copy import deepcopy
    from bert_serving.server import BertServer

    # load vocabulary
    with open(args.client_vocab_file, encoding='utf8') as fp:
        vocab = list(set(vv for v in fp for vv in v.strip().split()))
    print('vocabulary size: %d' % len(vocab))

    all_exp_names = [
        k.replace('test_', '') for k in vars(args).keys()
        if k.startswith('test_')
    ]

    # BUG FIX: open the result file with a context manager so the handle is
    # closed even if a benchmark run raises; the original opened it manually
    # and leaked it on any error.
    result_fn = 'benchmark-%d%s.result' % (args.num_worker,
                                           '-fp16' if args.fp16 else '')
    with open(result_fn, 'w') as fp:
        for exp_name in all_exp_names:
            # set common args
            cargs = deepcopy(args)
            exp_vars = vars(args)['test_%s' % exp_name]
            avg_speed = []
            fp.write('\n%s\tsamples/s\n' % exp_name)
            for cvar in exp_vars:
                # override exp args
                setattr(cargs, exp_name, cvar)
                server = BertServer(cargs)
                server.start()
                time.sleep(cargs.wait_till_ready)  # sleep until server is ready
                all_clients = [
                    BenchmarkClient(cargs, vocab)
                    for _ in range(cargs.num_client)
                ]
                for bc in all_clients:
                    bc.start()
                clients_speed = []
                for bc in all_clients:
                    bc.join()
                    clients_speed.append(cargs.client_batch_size / bc.avg_time)
                server.close()
                max_speed, min_speed, cavg_speed = int(max(clients_speed)), int(
                    min(clients_speed)), int(mean(clients_speed))
                print('avg speed: %d\tmax speed: %d\tmin speed: %d' %
                      (cavg_speed, max_speed, min_speed), flush=True)
                fp.write('%s\t%d\n' % (cvar, cavg_speed))
                fp.flush()
                avg_speed.append(cavg_speed)
            # for plotting
            fp.write('%s\n%s\n' % (exp_vars, avg_speed))
            fp.flush()
def save_emb():
    """Encode each dataset with a fine-tuned BERT server and pickle the embeddings.

    :return: path of the pickle file holding {dataset_name: embeddings}
    """
    base_args = get_args_parser().parse_args([
        '-model_dir', '/home/ydu/BERT/uncased_L-12_H-768_A-12/',
        '-num_worker', '2',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '128',
        '-max_batch_size', '256',
    ])
    # point the server at the fine-tuned checkpoint
    setattr(base_args, 'tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data')
    setattr(base_args, 'ckpt_name', 'model.ckpt-2500')

    data_path = '/home/ydu/BERT/DATA/'
    data = {}
    for name in ['metacritic', 'imdb', 'amazon', 'reddit']:
        fn = data_path + name + '/all.tsv'
        print("===========", fn, "================")
        text = read_tsv(fn)
        # fresh server per dataset, torn down again after encoding
        server = BertServer(base_args)
        server.start()
        print('wait until server is ready...')
        time.sleep(20)
        print('encoding...')
        bc = BertClient()
        data[name] = bc.encode(text)
        bc.close()
        server.close()

    pickle_name = data_path + 'EMB/allpre_emb.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return pickle_name
def extract_topics_all(issues_path, model_dir, topic_file, n_topics):
    """Extract topics for all issues with top n_topics topics.

    :param issues_path: path to the combined issues corpus
    :param model_dir: BERT model directory for the embedding server
    :param topic_file: file listing candidate topics
    :param n_topics: number of top topics to keep per issue
    :return: list of per-issue topic lists (also pickled to ./output/topic/topic.pkl)
    """
    topic_all = []
    text_all, divide_list = combine_issues(issues_path)
    topics = tp.get_topic_list(topic_file)
    topic_embedding = tp.get_topic_embedding(topics, port=3500, port_out=3501,
                                             model_path=model_dir)
    print('topic embedding shape = ', topic_embedding.shape)
    stop_words = tp.expand_stopwords()
    print(len(stop_words))
    text_flat_tokenized, text_article_tokenized = tp.bert_tokens(text_all)
    tfidf_biglist = tp.tfidf_vec(text_flat_tokenized, stop_words)
    port_in = 6550
    port_out = 6551
    tmp_dir = './output/tmp'
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    # BUG FIX: the original assigned a plain local variable named
    # ZEROMQ_SOCK_TMP_DIR, which has no effect — bert-serving reads this
    # setting from the process environment.
    os.environ['ZEROMQ_SOCK_TMP_DIR'] = tmp_dir
    common = [
        '-model_dir', model_dir,
        '-num_worker', '2',
        '-port', str(port_in),
        '-port_out', str(port_out),
        '-max_seq_len', '20',
        '-max_batch_size', '256',
        '-pooling_strategy', 'NONE',
        '-pooling_layer', '-2',
        '-graph_tmp_dir', tmp_dir,
        '-cpu',
        '-show_tokens_to_client',
    ]
    args = get_args_parser().parse_args(common)
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(20)
    print('encoding...')
    for issue_num in range(len(text_all)):
        divide_list_each = divide_list[issue_num]
        text_one_issue = text_all[issue_num]
        vec = tp.get_word_embedding_server_on(text_one_issue, port=port_in,
                                              port_out=port_out)
        topics_issue, sort_topic_sim = tp.get_topics_one_issue(
            vec, topic_embedding, topics, divide_list_each, tfidf_biglist,
            issue_num, n_topics)
        topic_all.append(topics_issue)
    server.close()
    topic_folder = './output/topic'
    if not os.path.isdir(topic_folder):
        os.makedirs(topic_folder)
    with open(topic_folder + '/topic.pkl', 'wb') as f:
        pickle.dump(topic_all, f)
    return topic_all
def get_word_embedding(rpath, wpath):
    """Encode each user's tips into token-level BERT embeddings and rewrite the records.

    :param rpath: input file, one JSON-encoded user per line
    :param wpath: output file, same records with a 'tips embedding' field added
    """
    cli = [
        '-model_dir', BERT_MODEL_PATH,
        '-max_seq_len', str(MAX_TEXTLEN),
        '-max_batch_size', str(MAX_SEQLEN),
        '-pooling_strategy', 'NONE',
        '-num_worker', '8',
        '-port', '5555',
        '-port_out', '5556',
        '-cased_tokenization',
        '-cpu',
    ]
    args = get_args_parser().parse_args(cli)
    server = BertServer(args)
    server.start()
    bc = BertClient()
    with open(wpath, 'w') as wf, open(rpath, 'r') as rf:
        all_lines = rf.readlines()
        for line in tqdm(all_lines, total=len(all_lines)):
            user = json.loads(line.strip())
            tips = [
                t['text']
                for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]
            ]
            user['fsq']['tips']['tips embedding'] = bc.encode(tips).tolist()
            wf.write(json.dumps(user) + '\n')
    BertServer.shutdown(args)
'num_client': 1, 'pooling_strategy': PoolingStrategy.REDUCE_MEAN, 'pooling_layer': [-2], 'gpu_memory_fraction': 0.5, 'xla': False, 'cpu': False, 'verbose': False, 'device_map': [] } args = namedtuple('args_namedtuple', ','.join(common.keys())) for k, v in common.items(): setattr(args, k, v) for pool_layer in range(1, 13): setattr(args, 'pooling_layer', [-pool_layer]) server = BertServer(args) server.start() print('wait until server is ready...') time.sleep(15) print('encoding...') bc = BertClient(port=common['port'], port_out=common['port_out'], show_server_config=True) subset_vec_all_layers.append(bc.encode(subset_text)) bc.close() server.close() print('done at layer -%d' % pool_layer) def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'): plt.close()
import sys
from bert_serving.server import BertServer
from bert_serving.server.helper import get_run_args

if __name__ == '__main__':
    # parse CLI args, launch the server, and block until it terminates
    run_args = get_run_args()
    bert_server = BertServer(run_args)
    bert_server.start()
    bert_server.join()