def launch_bert_as_service_server(model_name, layer, encoding_level=None, pooling_strategy=None):
    """Launch a bert-as-service server to encode sentences with the chosen BERT model.

    https://github.com/hanxiao/bert-as-service

    Args:
        model_name (str): key into `bert_model_directories` selecting the BERT model.
        layer (int): key into `layers_base` selecting the representation layer.
        encoding_level (int, optional): n-gram encoding level — how many word
            vectors to combine per final embedding. None means sentence-level
            encoding using `pooling_strategy`.
        pooling_strategy (str, optional): vector combination strategy; required
            (and validated against `pooling_strategies`) when `encoding_level` is None.
    """
    model_path = bert_model_directories[model_name]
    pooling_layer = layers_base[layer]

    def _server_args(strategy):
        # BUGFIX: the option is '-num_worker' (singular) and its value is the
        # string '1'; the original passed '-num_workers' and the literal '=1',
        # which the bert-serving argument parser rejects.
        return get_args_parser().parse_args([
            '-model_dir', model_path,
            '-port', '5555',
            '-port_out', '5556',
            '-max_seq_len', 'NONE',
            '-pooling_layer', pooling_layer,
            '-pooling_strategy', strategy,
            '-num_worker', '1',
        ])

    if encoding_level is None:
        if pooling_strategy not in pooling_strategies:
            print('"pooling_strategy" must be defined as one of the following:', pooling_strategies)
            return
        server_parameters = _server_args(pooling_strategy)
    elif encoding_level >= 1:
        # Word-level encoding: no pooling on the server side.
        server_parameters = _server_args('NONE')
    else:
        print('"encoding_level" must be >=1 or None, see README for descriptions')
        return

    server = BertServer(server_parameters)
    print("LAUNCHING SERVER, PLEASE HOLD", '\n')
    server.start()
    # TODO: poll the server until it is actually ready before announcing it.
    print("SERVER RUNNING, BEGINNING ENCODING...")
def get_model(TUNED_FLAG=False):
    """Start a bert-as-service server (optionally with a fine-tuned checkpoint),
    then shut it down on port 5555.

    Args:
        TUNED_FLAG (bool): when True, serve the fine-tuned MRPC checkpoint
            instead of the base English model.
    """
    args = [
        '-model_dir', 'english_L-12_H-768_A-12/',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        # BUGFIX: the flag was written 'num_worker' without its leading dash,
        # so argparse treated it (and '4') as stray positional arguments.
        '-num_worker', '4',
        '-cpu',
    ]
    if TUNED_FLAG:
        args.extend([
            '-tuned_model_dir', '/tmp/mrpc_output/',
            '-ckpt_name', 'model.ckpt-343',
        ])
    bert_args = get_args_parser().parse_args(args)
    server = BertServer(bert_args)
    server.start()
    # NOTE(review): the server is shut down immediately after starting —
    # confirm this is intentional (e.g. a smoke test of the model load).
    BertServer.shutdown(port=5555)
def _init_bert_client(model_dir, max_seq_len, device_map, num_worker) -> BertClient:
    """Initialize bert client for sentence embeddings and avoid restarting bert-server if already running.

    For more information, see: https://github.com/hanxiao/bert-as-service

    Bert-server can take a long time to start, take over stdout during training, and create many temp log files.
    It's highly recommended to run bert-server beforehand from command-line in a dedicated folder:
    e.g: ~/gym-summarizer/data/bert $ bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -max_seq_len 40 -device_map 1 2 3 4 -num_worker 4

    :param model_dir: directory containing bert model
    :param max_seq_len: max sequence length for bert
    :param device_map: GPU device id(s) for the server
    :param num_worker: number of server workers
    :return bc: bert-client
    """
    try:
        bc = BertClient()
    # BUGFIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; catch Exception instead.
    except Exception:
        from bert_serving.server.helper import get_args_parser
        from bert_serving.server import BertServer
        # BUGFIX: argparse only accepts strings, so coerce the numeric
        # arguments; the original passed them through unchanged.
        args = get_args_parser().parse_args([
            '-model_dir', str(model_dir),
            '-max_seq_len', str(max_seq_len),
            '-device_map', str(device_map),
            '-num_worker', str(num_worker),
        ])
        server = BertServer(args)
        server.start()
        bc = BertClient()
    return bc
def __init__(self, model_path):
    """Prepare a local bert-as-service server for the given model directory.

    For parameter details see: https://github.com/hanxiao/bert-as-service
    """
    server_options = [
        '-num_worker', '4',
        '-model_dir', model_path,
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu',
    ]
    parsed = get_args_parser().parse_args(server_options)
    self._server = BertServer(parsed)
def main():
    """Launch a CPU bert-as-service server for the Chinese BERT-base model."""
    # BUGFIX: the original used ports 86500/86501, which exceed the maximum
    # valid TCP port number (65535) and would make the server fail to bind.
    args = get_args_parser().parse_args([
        '-model_dir', r'../data/chinese_L-12_H-768_A-12',
        '-port', '6500',
        '-port_out', '6501',
        '-max_seq_len', '512',
        '-mask_cls_sep',
        '-cpu'
    ])
    bs = BertServer(args)
    bs.start()
def start_server(max_seq_len, pretrained_model):
    """Start a CPU bert-as-service server that returns per-token vectors.

    :param max_seq_len: maximum sequence length accepted by the server
    :param pretrained_model: directory of the pretrained BERT model
    """
    options = [
        '-model_dir', pretrained_model,
        '-port', '5555',
        '-port_out', '5556',
        '-pooling_strategy', 'NONE',
        '-show_tokens_to_client',
        '-max_seq_len', str(max_seq_len),
        '-mask_cls_sep',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(options))
    bert_server.start()
def main():
    """Start a single-worker CPU bert-as-service server for uncased BERT-base."""
    options = [
        '-model_dir', './uncased_L-12_H-768_A-12',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '25',
        '-num_worker', '1',
        '-mask_cls_sep',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(options))
    bert_server.start()
def main():
    """Start a single-worker CPU bert-as-service server for a BioBERT checkpoint."""
    options = [
        '-model_dir', './biobert',
        '-ckpt_name', 'model.ckpt-1000000',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '30',
        '-num_worker', '1',
        '-mask_cls_sep',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(options))
    bert_server.start()
def start_bert_server():
    """Start a single-worker, CPU-only bert-as-service server."""
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    # BUGFIX: '-num_worker' takes an integer value; the original passed no
    # value, so argparse consumed '-cpu' as the worker count and failed.
    args = get_args_parser().parse_args(['-model_dir', 'YOUR_MODEL_PATH_HERE',
                                         '-port', '5555',
                                         '-port_out', '5556',
                                         '-num_worker', '1',
                                         '-cpu'])
    server = BertServer(args)
    server.start()
def main():
    """Start a bert-as-service server for the uncased BERT-base model."""
    # Previously considered options, kept for reference:
    # '-port', '5555', '-port_out', '5556', '-max_seq_len', 'NONE',
    # '-mask_cls_sep', '-cpu'
    parsed = get_args_parser().parse_args(
        ['-model_dir', 'uncased_L-12_H-768_A-12'])
    bert_server = BertServer(parsed)
    bert_server.start()
def __init__(self):
    """Start a CPU bert-as-service server for the multilingual cased model."""
    args = get_args_parser().parse_args([
        '-model_dir',
        '/Data_HDD/zhipengye/projects/bert/multi_cased_L-12_H-768_A-12',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu'
    ])
    self.server = BertServer(args)
    self.server.start()
    # BUGFIX: corrected typo in the status message ('sever' -> 'server').
    print('bert server has started')
def save_emb():
    """Encode each dataset's sentences with a fine-tuned BERT and pickle them.

    For every data folder, a fresh bert-as-service server (pointed at the
    tuned checkpoint) is started, the folder's `all.tsv` sentences are
    encoded, and all per-dataset embedding arrays are dumped to one pickle.

    :return: path of the pickle file the embeddings were written to
    """
    # Base server configuration; the tuned checkpoint is attached below.
    common = [
        '-model_dir', '/home/ydu/BERT/uncased_L-12_H-768_A-12/',
        '-num_worker', '2',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '128',
        '-max_batch_size', '256',
        # '-tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data/',
        # '-ckpt_name', 'model.ckpt-2500',
    ]
    args = get_args_parser().parse_args(common)
    # folder = ['books', 'dvd', 'electronics', 'kitchen']
    data_path = '/home/ydu/BERT/DATA/'
    data_folder = ['metacritic', 'imdb', 'amazon', 'reddit']
    # model_path = 'home/ydu/BERT/bert_mgpu/results/'
    # model_folder = 'amazon-balanced/'
    # model_type = 'bert-tune'
    data = {}
    # setattr(args, 'tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/reddit-pretrain')
    # setattr(args, 'ckpt_name', 'model.ckpt-2500')
    # Override the parsed args so the server loads the fine-tuned checkpoint.
    setattr(args, 'tuned_model_dir', '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data')
    setattr(args, 'ckpt_name', 'model.ckpt-2500')
    for d in data_folder:
        fn = data_path + d + '/all.tsv'
        print("===========", fn, "================")
        text = read_tsv(fn)
        # A new server per dataset: started, used for encoding, then closed.
        server = BertServer(args)
        server.start()
        print('wait until server is ready...')
        time.sleep(20)  # NOTE(review): fixed wait; assumes server is up in 20s
        print('encoding...')
        bc = BertClient()
        data[d] = bc.encode(text)
        bc.close()
        server.close()
    pickle_name = data_path + 'EMB/allpre_emb.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return pickle_name
def bert_server_start():
    """Start a single-worker CPU bert-as-service server.

    (Credit to the HIT AI team for the BERT service.)
    """
    options = [
        '-num_worker', '1',
        '-model_dir', BERT_MODEL_PATH,
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(options))
    bert_server.start()
def natural_language_to_embeddings(dataset_file):
    """Encode the first column of a CSV dataset into BERT sentence embeddings.

    :param dataset_file: dataset in a CSV file format
    :return: array of sentence encodings
    """
    server_args = get_args_parser().parse_args([
        '-model_dir', 'C:\\Users\\Ronak\\Desktop\\uncased_L-12_H-768_A-12',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu',
    ])
    bert_server = BertServer(server_args)
    bert_server.start()
    client = BertClient()
    rows = pd.read_csv(dataset_file).values.tolist()
    # The natural-language sentence is the first field of each row.
    sentences = [row[0] for row in rows]
    sen_encodings = client.encode(sentences)
    return sen_encodings
def bert_service_start(switch=True):
    """Start the bert-as-service server; does nothing when `switch` is False."""
    from bert_serving.server.helper import get_args_parser
    from bert_serving.server import BertServer
    from bert_serving.client import BertClient
    parsed = get_args_parser().parse_args(
        ['-model_dir', 'models\chinese_L-12_H-768_A-12'])
    service = BertServer(parsed)
    if switch:
        service.start()
def extract_topics_all(issues_path, model_dir, topic_file, n_topics):
    """Extract topics for all issues with top n_topics topics.

    :param issues_path: path of the issues corpus
    :param model_dir: directory of the BERT model used for embeddings
    :param topic_file: file listing candidate topics
    :param n_topics: number of top topics to keep per issue
    :return: list with one topic result per issue (also pickled to disk)
    """
    topic_all = []
    text_all, divide_list = combine_issues(issues_path)
    topics = tp.get_topic_list(topic_file)
    topic_embedding = tp.get_topic_embedding(topics, port=3500, port_out=3501, model_path=model_dir)
    #topic_embedding = np.load('../output/topic_embedding.npy')
    print('topic embedding shape = ', topic_embedding.shape)
    stop_words = tp.expand_stopwords()
    print(len(stop_words))
    text_flat_tokenized, text_article_tokenized = tp.bert_tokens(text_all)
    tfidf_biglist = tp.tfidf_vec(text_flat_tokenized, stop_words)
    port_in = 6550
    port_out = 6551
    tmp_dir = './output/tmp'
    if not os.path.isdir(tmp_dir):
        os.makedirs(tmp_dir)
    # BUGFIX: the original assigned to a plain local variable named
    # ZEROMQ_SOCK_TMP_DIR, which has no effect; the setting must be placed
    # in the process environment for bert-serving/ZeroMQ to see it.
    os.environ['ZEROMQ_SOCK_TMP_DIR'] = tmp_dir
    common = [
        '-model_dir', model_dir,
        '-num_worker', '2',
        '-port', str(port_in),
        '-port_out', str(port_out),
        '-max_seq_len', '20',
        '-max_batch_size', '256',
        '-pooling_strategy', 'NONE',
        '-pooling_layer', '-2',
        '-graph_tmp_dir', tmp_dir,
        '-cpu',
        '-show_tokens_to_client',
    ]
    args = get_args_parser().parse_args(common)
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(20)  # NOTE(review): fixed wait; assumes server is ready in 20s
    print('encoding...')
    for issue_num in range(len(text_all)):
        divide_list_each = divide_list[issue_num]
        text_one_issue = text_all[issue_num]
        vec = tp.get_word_embedding_server_on(text_one_issue, port=port_in, port_out=port_out)
        topics_issue, sort_topic_sim = tp.get_topics_one_issue(
            vec, topic_embedding, topics, divide_list_each, tfidf_biglist, issue_num, n_topics)
        topic_all.append(topics_issue)
    server.close()
    topic_folder = './output/topic'
    if not os.path.isdir(topic_folder):
        os.makedirs(topic_folder)
    with open(topic_folder + '/topic.pkl', 'wb') as f:
        pickle.dump(topic_all, f)
    return topic_all
def start():
    """Start a bert-as-service server serving a fine-tuned Chinese BERT checkpoint."""
    options = [
        '-model_dir',
        '/Users/henry/Documents/application/multi-label-bert/data/chinese_L-12_H-768_A-12/',
        '-tuned_model_dir',
        '/Users/henry/Documents/application/nlp_assignments/data/KnowledgeLabel/corpus2/output/',
        '-port', '12544',
        '-ckpt_name', 'model.ckpt-1000',
        '-port_out', '12546',
        '-http_port', '12547',
        '-max_seq_len', '128',
        '-mask_cls_sep',
        '-show_tokens_to_client',
        '-pooling_strategy', 'NONE',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(options))
    bert_server.start()
def start_bert_server():
    """Start a CPU bert-as-service server sized to this machine's core count."""
    options = [
        '-model_dir', BERT_MODEL_PATH,
        '-max_seq_len', str(MAX_TEXTLEN),
        '-max_batch_size', str(MAX_SEQLEN),
        #'-pooling_strategy', 'NONE',
        '-num_worker', str(multiprocessing.cpu_count()),
        '-port', '5555',
        '-port_out', '5556',
        '-cased_tokenization',
        '-cpu',
    ]
    bert_server = BertServer(get_args_parser().parse_args(options))
    bert_server.start()
def __init__(self, path, port='5555', port_out='5556', pooling_strategy='REDUCE_MEAN'):
    """Start a bert-as-service server backed by the given pretrained model.

    Args:
        path (str): BERT pretrained vector path
        port (str, optional, defaults to '5555'): server port for receiving data from client
        port_out (str, optional, defaults to '5556'): server port for sending result to client
        pooling_strategy (str, optional, defaults to `REDUCE_MEAN`):
            {NONE, REDUCE_MAX, REDUCE_MEAN, REDUCE_MEAN_MAX, FIRST_TOKEN, LAST_TOKEN}
    """
    self.__port = port
    self.__port_out = port_out
    server_options = [
        '-model_dir', path,
        '-port', self.__port,
        '-port_out', self.__port_out,
        '-max_seq_len', 'NONE',
        '-mask_cls_sep',
        '-cpu',
        '-pooling_strategy', pooling_strategy,
    ]
    self.__server = BertServer(get_args_parser().parse_args(server_options))
    self.__server.start()
def get_word_embedding(rpath, wpath):
    """Encode each user's tips with BERT and write the enriched records out.

    :param rpath: input path, one JSON user record per line
    :param wpath: output path for records augmented with tip embeddings
    """
    server_args = get_args_parser().parse_args([
        '-model_dir', BERT_MODEL_PATH,
        '-max_seq_len', str(MAX_TEXTLEN),
        '-max_batch_size', str(MAX_SEQLEN),
        '-pooling_strategy', 'NONE',
        '-num_worker', '8',
        '-port', '5555',
        '-port_out', '5556',
        '-cased_tokenization',
        '-cpu'
    ])
    server = BertServer(server_args)
    server.start()
    client = BertClient()
    with open(wpath, 'w') as wf, open(rpath, 'r') as rf:
        lines = rf.readlines()
        for line in tqdm(lines, total=len(lines)):
            user = json.loads(line.strip())
            # Encode at most MAX_SEQLEN tips per user.
            tips = [t['text']
                    for t in user['fsq']['tips']['tips content'][:MAX_SEQLEN]]
            emb_tips = client.encode(tips)
            user['fsq']['tips']['tips embedding'] = emb_tips.tolist()
            wf.write(json.dumps(user) + '\n')
    BertServer.shutdown(server_args)
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer

# BUGFIX: the model path was a plain (non-raw) string in which '\n' (inside
# '...\ngnlab...') is interpreted as a newline, and it carried a stray leading
# space; use a raw string so the Windows backslashes stay literal.
# NOTE(review): the directory name ends in 'A-1' -- the Google release is
# usually 'chinese_L-12_H-768_A-12'; confirm the path on disk.
args = get_args_parser().parse_args(
    ['-model_dir', r'F:\learn\ngnlab\KG\chinese_L-12_H-768_A-1'])
server = BertServer(args)
server.start()
# BUGFIX: `sys` is used below but was not imported in this script.
import sys
import time

from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
from bert_serving.client import BertClient
from bert_serving.server.bert import tokenization

if len(sys.argv) < 4:
    print('please provide embeddings, conl file and port')
    # BUGFIX: exit with a non-zero status on bad usage (0 signals success).
    exit(1)
port1 = int(sys.argv[3])
port2 = port1 + 1
args = get_args_parser().parse_args([
    '-model_dir', sys.argv[1],
    '-port', str(port1),
    '-port_out', str(port2),
    '-max_seq_len', 'NONE',
    '-pooling_strategy', 'NONE',
    '-mask_cls_sep',
    '-cpu'
])
print('starting bert')
server = BertServer(args)
server.start()
print('started')
#os.system('bert-serving-start -pooling_strategy NONE -model_dir ' + sys.argv[1] + ' -num_worker=1 > /dev/null 2> /dev/null &')
time.sleep(30)  # give the server time to come up; TODO: poll readiness instead
print('starting client')
bc = BertClient(port=port1, port_out=port2)
print('done')
time.sleep(30)  # is this necessary?
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer

# Single-worker CPU server for the cased model on ports 7010/7011.
server_options = [
    '-model_dir', 'cased',
    '-port', '7010',
    '-port_out', '7011',
    '-max_seq_len', 'NONE',
    '-mask_cls_sep',
    '-num_worker', '1',
    '-cpu',
]
args = get_args_parser().parse_args(server_options)
server = BertServer(args)
server.start()
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer
import pandas as pd
from bert_serving.client import BertClient

# Input/output locations and the local BERT model directory.
file_path = '/Users/yuchk/PycharmProjects/IMDB/0_dataset/orign/train_imdb.tsv'
model_path = '/Users/yuchk/PycharmProjects/IMDB/0_dataset/bert_model'
res_file_path = '/Users/yuchk/PycharmProjects/IMDB/0_dataset/orign/encode_train_imdb.tsv'

# Start the encoding server on ports 5558/5559.
args = get_args_parser().parse_args([
    '-model_dir', model_path,
    '-port', '5558',
    '-port_out', '5559',
    '-max_seq_len', 'NONE',
    '-mask_cls_sep',
    '-cpu',
])
server = BertServer(args)
server.start()

# Encode every sentence and persist the augmented table.
bc = BertClient(port=5558, port_out=5559)
df = pd.read_csv(file_path, usecols=['sen', 'tag'], sep='\t')
df['encode'] = df['sen'].apply(lambda sentence: bc.encode([sentence])[0])
df.to_csv(res_file_path, sep="\t", encoding="utf-8",
          columns=['sen', "tag", "encode"], header=True, index=False)
from bert_serving.server import BertServer
# BUGFIX: get_args_parser and BertClient are used below but were never
# imported in this script.
from bert_serving.server.helper import get_args_parser
from bert_serving.client import BertClient
import spacy
from spacymoji import Emoji

# in the following, "tweets" stands for Facebook post
line_done = 0
# Candidate models: multi_cased_L-12_H-768_A-12 / uncased_L-12_H-768_A-12
bert_model_dir = 'pretrained_bert/multi_cased_L-12_H-768_A-12'
check_empty = ""
output_file = open("train_whole_lines.csv", "a")
args = get_args_parser().parse_args(['-model_dir', bert_model_dir,
                                     '-port', '5555',
                                     '-port_out', '5556',
                                     '-max_seq_len', 'NONE',
                                     '-mask_cls_sep', '-cpu', '-num_worker=1',
                                     '-pooling_strategy', 'CLS_TOKEN'])
server = BertServer(args)
server.start()
bc = BertClient(ip='localhost')
# large vocabulary used in final solution
#nlp = spacy.load("en_core_web_lg")
# small vocabulary is used for testing purpose
print("spaCy en_core_web_sm loading...")
nlp = spacy.load("en_core_web_sm")
print("spaCy loaded")
# we use this library to translate image unicode of emoji ":)" into words "smiling face"
emoji = Emoji(nlp)
# on another CPU machine
# from bert_serving.client import BertClient
# bc = BertClient(ip='127.0.0.1')  # ip address of the GPU machine
# bc.encode(['First do it', 'then do it right', 'then do it better'])
from bert_serving.server.helper import get_args_parser
from bert_serving.server import BertServer

args = get_args_parser().parse_args(['-model_dir', './multi_cased_L-12_H-768_A-12/',
                                     '-port', '5555',
                                     '-port_out', '5556',
                                     '-max_seq_len', 'NONE',
                                     '-mask_cls_sep',
                                     '-cpu'])
server = BertServer(args)
server.start()
# BUGFIX: removed a stray junk token ('LYcTaWoRu4b8bjGh') that would raise
# NameError at import time.
import os

from bert_serving.server import BertServer
from bert_serving.server.helper import get_args_parser

# Avoid conflict
USE_CPU = True

arg_list = [
    '-model_dir', os.path.join(os.getcwd(), 'uncased_L-12_H-768_A-12'),
    '-port', '23333',
    '-num_worker=1',
]
if USE_CPU:
    arg_list += ['-cpu']
args = get_args_parser().parse_args(arg_list)

if __name__ == '__main__':
    bert_server = BertServer(args)
    bert_server.start()
def load_data(emb_type='w2v', collapse_classes=False, fold=None, num_folds=1, random_state=None, force_reload=False, drop_feat_idx=[]):
    """Load (and, when stale, regenerate) cross-validation fold data.

    Reads dataset.csv, normalizes labels, and — if the cached folds do not
    match the dataset hash — regenerates linguistic features and BERT
    embeddings, splits them into folds, then recurses to read them back.

    :param emb_type: which embedding folds to read (e.g. 'bert'/'only_bert')
    :param collapse_classes: merge 'mfalse'/'mtrue' into 'false'/'true'
    :param fold: index of the test fold (dev fold is fold+1, wrapping to 0)
    :param num_folds: total number of folds; asserted to be > 2
    :param random_state: seed for the dataset shuffle
    :param force_reload: drop cached hashes so everything is regenerated
    :param drop_feat_idx: feature column indexes to drop
        NOTE(review): mutable default argument — shared across calls.
    :return: (train_data, train_target, dev_data, dev_target, test_data,
              test_target, label_to_oh)

    NOTE(review): recovered from a whitespace-mangled source; the indentation
    below is a best-effort reconstruction — verify against project history.
    """
    print('Loading data from',dataset_dir)
    data = pd.read_csv(dataset_dir+"/dataset.csv", sep=',')
    if force_reload:
        reset_hash()
    print("size of initial \"dataset\":",len(data))
    data = data.drop_duplicates(subset='o_url', keep='first')
    print("after dropping duplicates:",len(data))
    data.o_body = data.o_body.astype('str')
    data.verdict = data.verdict.astype('str')
    data['verdict'] = data['verdict'].str.lower()
    #data = data[data['o_body'].map(len) > MIN_BODY_LEN]
    #print("after dropping origins with less than "+str(MIN_BODY_LEN)+" chars:",len(data))
    data = data.reset_index()
    if(collapse_classes):
        print("labels before collapse classes:", data.verdict.unique())
        data.loc[data['verdict'] == "mfalse", 'verdict'] = 'false'
        data.loc[data['verdict'] == "mtrue", 'verdict'] = 'true'
    labels = ['true', 'false']
    print(data['verdict'].value_counts())
    data = data.loc[data.verdict.isin(labels)]
    print("considered labels:", data.verdict.unique())
    print("after dropping invalid labels:",len(data))
    #creating hash
    json_data = data.to_json().encode()
    data = data.sample(frac=1, random_state=random_state)
    df_hash = hashlib.sha256(json_data).hexdigest()
    # One-hot encoding lookup for the two labels.
    labels_idx = [labels.index(label) for label in labels]
    labels_one_hot = np.eye(len(labels))[labels_idx]
    label_to_oh = {label:labels_one_hot[labels.index(label)] for label in labels}
    print("MEMORY: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    assert (num_folds > 2), "Needs at least three folds for Dev/Train/Test to be different from each other"
    #generate and save the folds:
    # NOTE(review): this loop shadows the `fold` parameter; both branches
    # below return, so at most one iteration runs.
    for fold in range(num_folds):
        bucket_size = int(len(data.index)/num_folds)
        fold_dev = fold+1
        if fold == num_folds-1:
            fold_dev = 0
        if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx):
            #TODO modify these two lines back!!!
            df = data[['o_body','verdict']].copy()
            #df = data[['claim','verdict']].copy()
            df = df.rename(columns={"o_body": "body"})
            #df = df.rename(columns={"claim": "body"})
            # NOTE(review): apply() result is discarded — clean_text likely
            # has no effect here unless it mutates in place; confirm.
            df.body.apply(clean_text)
            lens = np.asarray([len(e.split(" ")) for e in df['body'].values])
            #df = df[lens < MAX_SENT_LEN]
            df.reset_index(drop = True, inplace = True)
            df.to_csv(data_dir+'/data.csv', sep="\t", index=False)
            num_entries = len(df)
            #plots the data distribution by number of words
            print("Number of entries: ", num_entries)
            print("True/False: ",df.groupby('verdict').count())
            print("Mean and Std of number of words per document: ",np.mean(lens),np.std(lens), "\n")
            #sns.distplot(lens)
            #plt.show()
            ###################################
            ############# FEATURES ############
            ###################################
            #check if new linguistic features should be generated
            flag_concat = False
            if not check_hash(df_hash, num_folds, stage="complexity"):
                flag_concat = True
                #Generate the features ndarray and save it to a pickle
                try:
                    feat.generate_complexity()
                except Exception as e:
                    print(traceback.format_exc())
                    input("Error occured while GENERATING COMPLEXITY. Press any key to exit.")
                    sys.exit(1)
                savehash("complexity", hashcode=df_hash)
            if not check_hash(df_hash, num_folds, stage="specificity"):
                flag_concat = True
                try:
                    feat.generate_specificity()
                except Exception as e:
                    print(traceback.format_exc())
                    input("Error occured while GENERATING SPECIFICITY. Press any key to exit.")
                    sys.exit(1)
                savehash("specificity", hashcode=df_hash)
            if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="features"):
                flag_concat = True
                try:
                    features = feat.generateFeats()
                except Exception as e:
                    print(traceback.format_exc())
                    input("Error occured while GENERATING FEATURES. Press any key to exit.")
                    sys.exit(1)
                save_p(data_dir+"/features", features)
                print("Generated Features. Saved to pickle.")
                print("Features Shape:", features.shape)
                savehash("features", hashcode=df_hash, drop_feat_idx=drop_feat_idx)
            #check if drop_features is NOT the same
            if not check_hash(df_hash, num_folds, drop_feat_idx=drop_feat_idx, stage="drop_feat"):
                flag_concat = True
                savehash("drop_feat", hashcode=df_hash, drop_feat_idx=drop_feat_idx)
            print("MEMORY AFTER FEATURES: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            ####################################
            ############### BERT ###############
            ####################################
            #check if new bert should be generated
            if not check_hash(df_hash, num_folds, stage="bert"):
                try:
                    #creates the shuffle order (not random)
                    index_shuf = list(range(len(df)))
                    #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
                    bert_folds = np.array_split(index_shuf, num_folds)
                    bert_folds = [a.tolist() for a in bert_folds]
                    #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
                    fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]
                    #I think this should start as True
                    flag = {idx:True for idx in range(len(bert_folds))}
                    #get the starting time:
                    start_time = time.time()
                    #start the bert-as-a-service server
                    bert_dir = os.environ.get("BERT_BASE_DIR")
                    print(bert_dir)
                    args = get_args_parser().parse_args(['-model_dir', bert_dir, '-port', '5555', '-port_out', '5556', '-max_seq_len', '512', '-mask_cls_sep'])
                    server = BertServer(args)
                    server.start()
                    print(num_folds)
                    #delete the bert.csv files inside the folds
                    for i in range(num_folds):
                        filename = data_dir+"/folds/"+str(i)+"/bert.csv"
                        if os.path.exists(filename):
                            subprocess.call("rm -rf "+filename, shell=True, cwd=data_dir)
                    #TODO make this process read only one fold at a time
                    for fold, idx in zip(fold_idx, index_shuf):
                        #generates the encodings for the texts
                        bc = BertClient(check_version=False)
                        b = bc.encode([df.body[idx]])[0]
                        bert_df = pd.DataFrame([b], columns=['f'+str(e) for e in range(len(b))])
                        bert_df.to_csv(data_dir+"/folds/"+str(fold)+"/bert.csv", mode='a+', index=False, header=flag[fold])
                        flag[fold] = False
                    #stops the bert-as-a-service server
                    shut_args = get_shutdown_parser().parse_args(['-ip','localhost','-port','5555','-timeout','5000'])
                    server.shutdown(shut_args)
                    #print total time
                    delta_time = time.time() - start_time
                    print('Time Taken: for BERT generation:', time.strftime("%H:%M:%S",time.gmtime(delta_time)))
                except Exception as e:
                    print(traceback.format_exc())
                    input("Error occured while fine training BERT. Press any key to exit.")
                    sys.exit(1)
                print("BERT Embeddings Saved")
                savehash("bert", df_hash)
            #########################################
            ## CONCATENATION, SHUFFLING AND SAVING ##
            #########################################
            #if not check_hash(df_hash, num_folds, stage="concat"):
            if flag_concat:
                features = read_p(data_dir+"/features")
                features = np.delete(features,drop_feat_idx,axis=1)
                #normalize features
                features = np.nan_to_num(features)
                # Rescale each feature column to the [-2, 2] range.
                features_t = features.T
                for c in range(features_t.shape[0]):
                    row = features_t[c]
                    features_t[c] = np.interp(row, (np.min(row), np.max(row)), (-2, +2))
                features = features_t.T
                #delete labels and folds folders
                for i in range(num_folds):
                    subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/labels", shell=True, cwd=data_dir)
                    subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/features+bert.csv", shell=True, cwd=data_dir)
                    subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/bert", shell=True, cwd=data_dir)
                    subprocess.call("rm -rf "+data_dir+"/folds/"+str(i)+"/only_bert", shell=True, cwd=data_dir)
                #creates the shuffle order (not random)
                index_shuf = list(range(len(df)))
                #LABELS
                labels = [label_to_oh[label].tolist() for label in df['verdict'].values.tolist()]
                labels = [labels[i] for i in index_shuf]
                label_folds = np.array_split(labels, num_folds)
                for i in range(num_folds):
                    fold_dir = data_dir+"/folds/"+str(i)
                    if not os.path.exists(fold_dir):
                        os.mkdir(fold_dir)
                    save_p(fold_dir+"/labels", label_folds[i])
                #creates a list of N=folds lists, each inner list contains the index of the elements of each fold
                bert_folds = np.array_split(index_shuf, num_folds)
                bert_folds = [a.tolist() for a in bert_folds]
                #creates an ordered list of N=entries of integers(:folds) indicating the fold idx of each entry
                fold_idx = [bert_folds.index(list(sl)) for e in index_shuf for sl in bert_folds if e in list(sl)]
                #TODO make this process read only one fold at a time
                for fold in range(num_folds):
                    b_fold_csv = pd.read_csv(data_dir+"/folds/"+str(fold)+"/bert.csv")
                    #gets only the indexes
                    count = sum([1 for fidx,_ in zip(fold_idx, index_shuf) if fold == fidx])
                    for idx in range(count):
                        #print("csv:",b_fold_csv)
                        #print("len",len(b_fold_csv))
                        #print("count: ", count)
                        #print("range(count): ",range(count))
                        b = b_fold_csv.iloc[idx]
                        entry = np.concatenate((features[idx,:],b))
                        feat_df = pd.DataFrame([entry], columns=['f'+str(e) for e in range(len(entry))])
                        feat_df.to_csv(data_dir+"/folds/"+str(fold)+"/features+bert.csv", mode='a+', index=False, header=False)
                for i in range(num_folds):
                    fold_dir = data_dir+"/folds/"+str(i)
                    bert = np.genfromtxt(fold_dir+"/features+bert.csv", delimiter=',')
                    only_bert = np.genfromtxt(fold_dir+"/bert.csv", delimiter=',')
                    print("saving bert fold ",str(i), bert.shape)
                    save_p(fold_dir+"/bert", bert)
                    save_p(fold_dir+"/only_bert", only_bert)
                print("MEMORY AFTER FOLDS SAVING: ",resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
                savehash("concat", hashcode=df_hash)
            # Verify every generation stage is consistent before recursing.
            checks = ["bert", "features", "concat", "complexity", "specificity"]
            for e in checks:
                print(e)
                print(check_hash(df_hash,num_folds,stage=e))
                if not (check_hash(df_hash,num_folds,stage=e, drop_feat_idx=drop_feat_idx)):
                    print('Problem at Generation of data!')
                    print("Stage: "+e)
                    return
            print('Generation of data successfully done!')
            savehash("data", hashcode=df_hash)
            savehash("folds", hashcode=str(num_folds))
            # Recurse so the freshly generated folds are read by the else branch.
            return load_data(emb_type=emb_type, collapse_classes=collapse_classes, fold=fold, num_folds=num_folds, random_state=random_state, drop_feat_idx=drop_feat_idx)
        else:
            print("Reading already processed data")
            #returns the selected emb type (bert/w2v)
            test_data = read_p(data_dir+"/folds/"+str(fold)+"/"+emb_type)
            test_target = read_p(data_dir+"/folds/"+str(fold)+"/labels")
            dev_data = read_p(data_dir+"/folds/"+str(fold_dev)+"/"+emb_type)
            #dev_data = np.ndarray(dev_data)
            dev_target = read_p(data_dir+"/folds/"+str(fold_dev)+"/labels")
            train_data_filenames = [data_dir+"/folds/"+str(i)+"/"+emb_type for i in range(num_folds) if i not in [fold,fold_dev]]
            train_data = np.concatenate([read_p(fn) for fn in train_data_filenames], axis=0)
            train_target_filenames = [data_dir+"/folds/"+str(i)+"/labels" for i in range(num_folds) if i not in [fold,fold_dev]]
            train_target = np.concatenate([read_p(fn) for fn in train_target_filenames], axis=0)
            return train_data, train_target, dev_data, dev_target, test_data, test_target, label_to_oh
# Server arguments shared across all runs; `port`, `port_out`,
# `subset_text` and `subset_vec_all_layers` are defined elsewhere
# in this script.
common = [
    '-model_dir', '/bert_model/chinese_L-12_H-768_A-12/',
    '-num_worker', '2',
    '-port', str(port),
    '-port_out', str(port_out),
    '-max_seq_len', '20',
    # '-client_batch_size', '2048',
    '-max_batch_size', '256',
    # '-num_client', '1',
    '-pooling_strategy', 'REDUCE_MEAN',
    '-pooling_layer', '-2',
    '-gpu_memory_fraction', '0.2',
    '-device','3',
]
args = get_args_parser().parse_args(common)
# Encode the same text subset once per BERT layer (-1 .. -12), restarting
# the server each time with a different pooling layer.
for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])  # override the parsed value
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(20)  # NOTE(review): fixed wait; assumes server is ready in 20s
    print('encoding...')
    bc = BertClient(port=port, port_out=port_out, show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)
#save bert vectors and labels
def precompute_embeddings(path_to_binary: Path, path_to_batches: Path, batch_size: int = 100,
                          bert_model_dir: str = "bert/uncased_L-12_H-768_A-12"):
    """Precompute and store sentence embeddings, along with articles and summaries, in batches.

    :param path_to_binary: Path to pre-tokenized binaries (train/valid/test.bin,
        from https://github.com/JafferWilson/Process-Data-of-CNN-DailyMail)
    :param path_to_batches: Path prefix for storing batches of embeddings/articles/summaries
    :param batch_size: Number of articles per batch.
    :param bert_model_dir: Directory of bert model (https://github.com/hanxiao/bert-as-service)
    """
    from bert_serving.client import BertClient
    from tensorflow.core.example import example_pb2
    import struct
    import nltk
    nltk.download('punkt')

    # load bert client (and bert server if not already running)
    try:
        bc = BertClient()
    # BUGFIX: was a bare `except:`; catch Exception so SystemExit and
    # KeyboardInterrupt still propagate.
    except Exception:
        from bert_serving.server.helper import get_args_parser
        from bert_serving.server import BertServer
        # BUGFIX: argparse only accepts strings — the original passed the
        # ints 40 and 4, which crashes parse_args. Also pass the device map
        # as separate values ('-device_map' takes nargs='+' integers, so the
        # comma-joined '1,2,3,4' would fail int() conversion).
        args = get_args_parser().parse_args(['-model_dir', bert_model_dir,
                                             '-max_seq_len', '40',
                                             '-num_worker', '4',
                                             '-device_map', '1', '2', '3', '4'])
        server = BertServer(args)
        server.start()
        bc = BertClient()
    print("Bert client loaded...")

    # load articles and summaries
    articles: List[List[str]] = []
    summaries: List[str] = []
    reader = open(path_to_binary, 'rb')
    i = 0
    while True:
        len_bytes = reader.read(8)
        if not len_bytes:
            break  # finished reading this file
        str_len = struct.unpack('q', len_bytes)[0]
        example_str = struct.unpack('%ds' % str_len, reader.read(str_len))[0]
        example = example_pb2.Example.FromString(example_str)
        try:
            article = example.features.feature['article'].bytes_list.value[0].decode('utf-8')
            summary = example.features.feature['abstract'].bytes_list.value[0].decode('utf-8')
            if len(article) != 0:
                articles.append(nltk.sent_tokenize(article))
                summaries.append(summary.replace("<s>", "").replace("</s>", ""))
                i += 1
                if not i % 1000:
                    print(f"loaded {i} articles...")
        except ValueError:
            print("Failed retrieving an article or abstract.")
    # BUGFIX: close the input file instead of leaking the handle.
    reader.close()
    print(f"Articles and summaries read from path: {path_to_binary}...")

    # precompute embeddings, and store batches of embeddings/articles/summaries
    for i in tqdm(range(0, len(articles), batch_size)):
        j = min(len(articles), i + batch_size)
        print(f"embedding articles {i}-{j}...")
        a = articles[i:j]
        s = summaries[i:j]
        articles_tensor = bc.encode(sum(a, []))
        np.savez_compressed(f"{path_to_batches}.article_tensors.{i}.npz", articles_tensor)
        with open(f"{path_to_batches}.sentencized_articles.{i}.pkl", 'wb') as f:
            pickle.dump(a, f)
        with open(f"{path_to_batches}.summaries.{i}.pkl", 'wb') as f:
            pickle.dump(s, f)
        # BUGFIX: removed the dead `i += batch_size` — the for statement
        # rebinds `i` each iteration, so the increment had no effect.