async def do_get_question_api(question: str):
    """Look up questions similar to *question* in the vector store.

    Returns a dict with 'status' (bool) and 'msg' (the matched questions on
    success, or an error description on failure).
    """
    if not question:
        return {'status': False, 'msg': 'Please enter the query.'}
    # Initialise every handle up front so the finally block only closes the
    # ones actually opened.  The original referenced cursor/conn/bc in
    # finally even when connect_postgres_server() raised, which turned the
    # real error into an UnboundLocalError.
    conn = cursor = bc = None
    try:
        conn = connect_postgres_server()
        cursor = conn.cursor()
        client = milvus_client()
        bc = BertClient(ip=BERT_HOST, port=BERT_PORT, check_length=False)
        output = get_similar_question(question, client, conn, cursor, bc)
        if output:
            return {'status': True, 'msg': output}
        return {'status': False, 'msg': 'No similar questions in the database'}
    except Exception as e:
        print('search failed: ', e)  # typo fix: "faild" -> "failed"
        return {'status': False, 'msg': 'Failed to search, please try again.'}
    finally:
        # Close whatever was opened, in reverse order of acquisition.
        if bc is not None:
            bc.close()
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
class FeatureExtractor:
    """Embeds the text of a Document through a Bert-as-a-Service client.

    Attributes:
        _document (Document): Holds the text extracted from one PDF; each
            Line exposes a ``text`` field to encode and an ``encoding``
            field that receives the BERT vector.
        _bc (BertClient): Open connection to the BertServer.
    """

    def __init__(self, document):
        self._document = document
        self._bc = BertClient()

    def encode(self):
        """Encode every line of the document and attach the vectors in place."""
        lines = self._document.lines
        vectors = self._bc.encode([ln.text for ln in lines])
        for ln, vec in zip(lines, vectors):
            ln.encoding = vec
        return self._document

    def end(self):
        """Release the BertClient connection to the BertServer."""
        self._bc.close()
async def do_load_api(file: UploadFile = File(...)):
    """Save an uploaded file under QA_data/ and load its contents into the store.

    Returns a dict with 'status' (bool) and 'msg' (load_data's message, or an
    error description).
    """
    try:
        text = await file.read()
        fname = file.filename
        dirs = "QA_data/"
        if not os.path.exists(dirs):
            os.makedirs(dirs)
        # NOTE: kept the original concatenation ("QA_data//name"); the double
        # slash is harmless to the filesystem.
        fname_path = dirs + "/" + fname
        with open(fname_path, 'wb') as f:
            f.write(text)
    except Exception as e:
        # Original swallowed the exception silently; at least log it.
        print("save file failed: ", e)
        return {'status': False, 'msg': 'Failed to load data.'}
    # Pre-declare handles so finally can't hit UnboundLocalError when
    # connect_postgres_server() (or any later step) raises.
    conn = cursor = bc = None
    try:
        conn = connect_postgres_server()
        cursor = conn.cursor()
        client = milvus_client()
        bc = BertClient(ip=BERT_HOST, port=BERT_PORT, check_length=False)
        status, message = load_data(fname_path, client, conn, cursor, bc)
        return {'status': status, 'msg': message}
    except Exception as e:
        print("load data failed: ", e)  # typo fix: "faild" -> "failed"
        return {'status': False, 'msg': 'Failed to load data.'}
    finally:
        if bc is not None:
            bc.close()
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
def save_emb():
    """Encode four review corpora with a tuned BERT model and pickle the result.

    A fresh BertServer is started (and shut down) for every corpus; the
    embeddings are gathered into one dict keyed by corpus name and written
    to a single pickle file, whose path is returned.
    """
    server_opts = [
        '-model_dir', '/home/ydu/BERT/uncased_L-12_H-768_A-12/',
        '-num_worker', '2',
        '-port', '5555',
        '-port_out', '5556',
        '-max_seq_len', '128',
        '-max_batch_size', '256',
    ]
    args = get_args_parser().parse_args(server_opts)
    setattr(args, 'tuned_model_dir',
            '/home/ydu/BERT/bert_mgpu/pretrain_output/10k-32b-all4data')
    setattr(args, 'ckpt_name', 'model.ckpt-2500')

    data_path = '/home/ydu/BERT/DATA/'
    embeddings = {}
    for corpus in ['metacritic', 'imdb', 'amazon', 'reddit']:
        fn = data_path + corpus + '/all.tsv'
        print("===========", fn, "================")
        text = read_tsv(fn)

        server = BertServer(args)
        server.start()
        print('wait until server is ready...')
        time.sleep(20)  # give the server time to load the model
        print('encoding...')
        bc = BertClient()
        embeddings[corpus] = bc.encode(text)
        bc.close()
        server.close()

    pickle_name = data_path + 'EMB/allpre_emb.pickle'
    with open(pickle_name, 'wb') as handle:
        pickle.dump(embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return pickle_name
async def do_insert_api(data_path: str):
    """Insert vectors from *data_path* into the index.

    Returns the insert status (or the exception) formatted as a string.
    """
    # Pre-declare handles: the original called cursor.close() in finally
    # even when init_conn() raised, so the caught exception was replaced by
    # an UnboundLocalError escaping from the finally block.
    conn = cursor = bc = None
    try:
        conn, cursor = init_conn()
        bc = BertClient(ip=BERT_HOST, port=BERT_PORT, check_length=False)
        status = do_insert(data_path, index_client, conn, cursor, bc)
        return "{0}".format(status)
    except Exception as e:
        return "{0}".format(e)
    finally:
        if bc is not None:
            bc.close()
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
class BertEmbedd:
    """Thin wrapper around a bert-as-service BertClient for text embeddings."""

    def __init__(self):
        self.logger = logging.getLogger(__name__)
        self.logger.setLevel(logging.INFO)
        # No connection until get_connection() succeeds; lets
        # close_connection() be called safely at any time.
        self.connection = None

    def get_connection(self, inport=8000, outport=8010):
        '''
        Sets up a connection with the specified BERT server api.
        :param inport: Port for pushing data from client to server (defaults to 8000)
        :param outport: Port for publishing results from server to client (defaults to 8010)
        :return: Bert client object connected to a BertServer
        '''
        try:
            self.connection = BertClient(port=inport, port_out=outport)
        except Exception as e:
            # BUG FIX: the original used an undefined module-level `logger`
            # (NameError); use the instance logger instead.
            self.logger.error(f'Connection to BERT server failed: {str(e)}')
        else:
            # BUG FIX: only log success when the client was actually created
            # (the original logged success even after a failure).
            self.logger.info("Connection to BERT server was successful.")

    def close_connection(self):
        # Safe to call even if get_connection() was never run or failed.
        if self.connection:
            self.connection.close()
            self.connection = None

    def get_encode(self, data, istokenized=True, isblocked=True):
        '''
        :param data: list of sentences (preprocessed, tokenized-rejoined)
        :param istokenized: whether the input is already tokenized
        :param isblocked: whether to block until the server replies
        :return: encoded sentence/token-level embeddings, rows correspond to
            sentences; empty list on failure
        :type: numpy.ndarray or list[list[float]]
        '''
        self.logger.info('sending new request...')
        try:
            # encode tokenized sentences
            result = self.connection.encode(data, blocking=isblocked,
                                            is_tokenized=istokenized)
            self.logger.info('encoding job done')
        except Exception as e:
            self.logger.error(f'getting encodes from BERT failed: {str(e)}')
            return []
        return result
class BertEncoder(BaseTextEncoder):
    """Text encoder backed by a remote bert-as-service server.

    Constructor arguments are stored verbatim and forwarded to the
    BertClient created in :meth:`post_init`.
    """
    store_args_kwargs = True
    is_trained = True

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Keep the raw arguments so the client can be built after init.
        self._bc_encoder_args = args
        self._bc_encoder_kwargs = kwargs

    def post_init(self):
        # Imported lazily so the module loads without bert-serving installed.
        from bert_serving.client import BertClient
        self.bc_encoder = BertClient(*self._bc_encoder_args,
                                     **self._bc_encoder_kwargs)

    @batching
    def encode(self, text: List[str], *args, **kwargs) -> np.ndarray:
        """Delegate encoding of *text* to the remote BERT server."""
        vectors = self.bc_encoder.encode(text, *args, **kwargs)
        return vectors  # type: np.ndarray

    def close(self):
        """Shut down the underlying BertClient connection."""
        self.bc_encoder.close()
def get_vec(question_list, prcess_conifg):
    """Vectorise each question using either BERT or word2vec.

    The branch is chosen by ``prcess_conifg.vec_type`` ("bert" or "word");
    any other value yields an empty result list.
    """
    question_vec = []
    if prcess_conifg.vec_type == "bert":
        bc = BertClient(ip="127.0.0.1")
        for q in question_list:
            question_vec.append(bert_vec(bc, q)[0])
        bc.close()
    if prcess_conifg.vec_type == "word":
        model_file = '/data/dataset/news_12g_baidubaike_20g_novel_90g_embedding_64.bin'
        model = gensim.models.KeyedVectors.load_word2vec_format(model_file, binary=True)
        print("load 模型完成")
        # Strip English letters, digits and punctuation before embedding
        # (optional cleanup step).
        pattern = u'[a-zA-Z0-9’!"#$%&\'()*+,-./:;<=>?@,。?★、…【】《》?“”‘’![\\]^_`{|}~]+'
        for q in question_list:
            cleaned = re.sub(pattern, '', q)
            question_vec.append(word_vec(model, cleaned))
    return question_vec
class BertSupport:
    """Similarity and distance helpers on top of a bert-as-service client."""

    def __init__(self, use_timeout=True):
        # The BertClient talks to the BertServer at DEV_BERT_SERVER_IP.
        if use_timeout:
            self.bc = BertClient(ip=DEV_BERT_SERVER_IP, timeout=15000,
                                 check_version=False)
        else:
            self.bc = BertClient(ip=DEV_BERT_SERVER_IP, check_version=False)

    def compute_cosine(self, word1, word2):
        """Cosine similarity of the two texts' embeddings, rescaled to [0, 1].

        Returns 0 when either text is empty.
        """
        if word1 == "" or word2 == "":
            return 0
        a = self.bc.encode([word1, word2])
        vector_a = np.mat(a[0])
        vector_b = np.mat(a[1])
        num = float(vector_a * vector_b.T)
        denom = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
        cos = num / denom
        # Rescale cosine from [-1, 1] into a [0, 1] similarity score.
        return 0.5 + 0.5 * cos

    def word_list_vector(self, wordList):
        """Map each word in *wordList* to its embedding as a numpy matrix."""
        encodings = self.bc.encode(wordList)
        # Idiom fix: zip instead of an index loop over a.__len__().
        return {word: np.mat(vec) for word, vec in zip(wordList, encodings)}

    def compute_distance(self, word1, word2):
        """Absolute value of the summed component-wise embedding difference."""
        a = self.bc.encode([word1, word2])
        return np.abs(np.sum(a[0] - a[1]))

    def close(self):
        self.bc.close()
# NOTE(review): this fragment starts mid-script — `common`, `subset_text`,
# `subset_vec_all_layers`, `subset_label` and the imports are defined outside
# this view, and the trailing `vis` helper is truncated mid-call.

# Build a lightweight args object for BertServer out of the `common` dict
# (attributes are set on the namedtuple *class*, which BertServer reads).
args = namedtuple('args_namedtuple', ','.join(common.keys()))
for k, v in common.items():
    setattr(args, k, v)

# Sweep pooling layers -1 .. -12, restarting the server per layer and
# collecting one embedding matrix per layer.
for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(15)  # give the server time to load the model
    print('encoding...')
    bc = BertClient(port=common['port'], port_out=common['port_out'], show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)

def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'):
    # Plot each per-layer 2-D embedding in its own subplot (truncated below).
    plt.close()
    fig = plt.figure()
    plt.rcParams['figure.figsize'] = [21, 7]
    for idx, ebd in enumerate(embed):
        ax = plt.subplot(2, 6, idx + 1)
        vis_x = ebd[:, 0]
        vis_y = ebd[:, 1]
        plt.scatter(vis_x, vis_y, c=subset_label,
def page_rank_texts(self, texts: list):
    """Re-rank candidate texts by PageRank over an embedding-similarity graph,
    then select top-ranked texts (restored to original order) until
    ``self.maxClip`` tokens are exceeded.

    Raises:
        RuntimeError: if the texts could not be encoded after all retries
            (the original crashed later with a NameError instead).
    """
    from bert_serving.client import BertClient
    import networkx as nx

    # begin re-ranking
    if self.is_tokenized:
        for i in range(len(texts)):
            texts[i] = self.tokenizer.tokenize(texts[i])

    # Retry the encoding a few times.  BUG FIXES vs. original:
    #  * `encoder` was unbound in the except branch when the very first
    #    BertClient() constructor raised (UnboundLocalError);
    #  * bare `except:` replaced with `except Exception`;
    #  * `encoded_texts` was unbound after retry exhaustion.
    encoded_texts = None
    retry = 5
    while retry > 0:
        encoder = None
        try:
            encoder = BertClient(ip="ring-gpu-3", port=5555,
                                 check_length=False, timeout=3000)
            encoded_texts = encoder.encode(texts, is_tokenized=self.is_tokenized)
            break
        except Exception:
            if encoder is not None:
                encoder.close()
            retry -= 1
            if retry < 1:
                print("error")
                break
            print("left try", retry)
    if encoded_texts is None:
        raise RuntimeError("failed to encode texts with BertClient")

    # Fully-connected graph weighted by embedding dot products.
    G = nx.Graph()
    for i in range(len(texts)):
        for j in range(i):
            w = np.dot(encoded_texts[i], encoded_texts[j])
            G.add_edge(i, j, weight=w)
    # NOTE(review): pagerank_numpy was removed in networkx >= 3.0; switch to
    # nx.pagerank if the dependency is upgraded.
    rank_scores = nx.pagerank_numpy(G)
    ranks = sorted(rank_scores.items(), key=lambda x: x[1], reverse=True)
    ranks = [idx for idx, _score in ranks]

    # Greedily take texts in rank order until the token budget is exceeded.
    selected = []
    sumTokens = 0
    while ranks:
        txt = texts[ranks[0]]
        if self.is_tokenized:
            txt = " ".join(txt)
        sumTokens += len(txt.split())
        selected.append((txt, ranks[0]))
        if sumTokens > self.maxClip:
            break
        del ranks[0]
    # Restore document order among the selected texts.
    selected = sorted(selected, key=lambda x: x[1])
    return [txt for txt, _idx in selected]
def main1(args):
    """Lifelong-learning training loop: train the shared model on each domain
    in sequence (updating optimizer masks per domain), then evaluate on the
    accumulated test sets of all domains seen so far.
    """
    log_file = 'logs_' + args.name + str(args.start)  # args.start appended so parallel runs stay distinguishable
    model_dir = 'models_' + args.name + str(args.start)
    exp = common.Experiment(log_file, model_dir)  # deletes any pre-existing model_dir of the same name before each run
    # we = ph.utils.WordEmbedding()  # modified original helper: embeddings can be queried directly after init
    bert_client = BertClient(ip='202.201.242.38')
    trainer = Main(args.name + str(args.start), 768)  # model name + 768-dim (BERT) input vectors
    ph.initialize_global_variables()
    #
    test_list = []
    for i in range(domain_num):
        print('***********************')
        print('domain:' + str(i) + dom_list[i])
        train_data, dev_data, test_data = build_dataset_LL(
            i, args, bert_client)  # rewritten loader: reads one domain's data
        train_ds = common.TrainSource(train_data, i)  # DataSource from the original code, used as a DataLoader
        dev_ds = common.TrainSource(dev_data, i)
        test_ds = common.TrainSource(test_data, i)
        test_list.append(test_ds)
        # exp.load_model(trainer)  # resume from a previous model; the first run trains from scratch
        seq_stat = trainer.stat.read_stat(trainer.flat_seq)
        states_stat = trainer.stat.read_stat(trainer.flat_states)
        # Update per-domain gradient masks on the GRU weights (wz/wr/wh from
        # sequence stats, uz/ur/uh from hidden-state stats).
        trainer._optimizer.update_mask(trainer.shared.cell.wz, seq_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.wr, seq_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.wh, seq_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.uz, states_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.ur, states_stat, i)
        trainer._optimizer.update_mask(trainer.shared.cell.uh, states_stat, i)
        trainer.add_data_trainer(train_ds, 64)  # batch size (comment in original said 32)
        # trainer.add_screen_logger('train', ('Loss', 'Norm'), interval=1)  # logs training progress
        trainer.add_data_validator(test_ds, 64, interval=20)  # equivalent to model.eval()
        # trainer.add_screen_logger(  # eval results
        #     "validate",
        #     ('hit_pos', 'hit_neg', 'pred_pos', 'pred_neg', 'Error'),
        #     message='[%d]' % i,
        #     interval=20
        # )
        trainer.add_fitter(common.DevFitter(dev_ds, 64, 20))
        trainer.fit(args.num_loops)
        trainer.clear_fitters()  # training for this domain ends here
        # exp.dump_model(trainer)  # persist the model
    # test turn
    # NOTE(review): reconstructed from a collapsed source line — this loop is
    # placed after the domain loop since test_list accumulates across domains;
    # `i` below is the last domain index from the loop above. Confirm against
    # the original file.
    for test_data in test_list:
        trainer.add_data_validator(test_data, 64,
                                   interval=1)  # model.eval()
        trainer.add_screen_logger(  # print test results
            "validate",
            ('hit_pos', 'hit_neg', 'pred_pos', 'pred_neg', 'Error'),
            message='[%d]' % i,
            interval=1)
        trainer.fit(1)
        trainer.clear_fitters()
    trainer.stat.update_stats()
    bert_client.close()
    return 0
class CorpusSearcher(object):
    """Semantic search over a corpus of BERT-embedded quotes using faiss."""

    def __init__(self, model_file='spacy-2.2/data/embedded_corpus.pkl'):
        from os.path import expanduser
        self.bc = BertClient()
        self.model_file = expanduser(model_file)

    def train(self, quotes, source_col='text'):
        """Embed quotes[source_col] and persist the DataFrame to pickle."""
        embeddings = self.bc.encode(quotes[source_col].to_list())
        quotes['EMBEDDINGS'] = embeddings.tolist()
        # Persist to pickle
        quotes.to_pickle(self.model_file)

    def train_corpus(self, data_file, source_col='text'):
        # e.g. f'{cf.conf_dir}/stack/crawlers/langcrs/all_{lang}.json'
        dfjson = pd.read_json(data_file)
        self.train(dfjson, source_col=source_col)

    def load_quotes_and_embeddings(self, file):
        """Load the pickled corpus.

        Returns (quotes DataFrame without the EMBEDDINGS column,
        row-normalized embedding matrix).
        """
        quotes = pd.read_pickle(file)
        # change dtype in place for memory efficiency
        quotes['EMBEDDINGS'] = quotes['EMBEDDINGS'].apply(
            lambda arr: np.array(arr, dtype='float32')
        )
        quote_embeddings = np.stack(quotes.EMBEDDINGS.values)
        # BUG FIX: DataFrame.drop is not in-place; the original discarded the
        # result, so the memory-reducing column drop never happened.
        quotes = quotes.drop('EMBEDDINGS', axis='columns')
        # NOTE(review): rows are normalized by their *sum*, not their L2 norm,
        # so the IndexFlatL2 search below is not exact cosine distance.  Kept
        # as-is to preserve the existing ranking behaviour — confirm intent.
        embedding_sums = quote_embeddings.sum(axis=1)
        normed_embeddings = quote_embeddings / embedding_sums[:, np.newaxis]
        return quotes, normed_embeddings

    def create_index(self, embeddings):
        """Create a faiss L2 index over the quote embeddings for fast
        similarity search."""
        import faiss
        dim = embeddings.shape[1]
        index = faiss.IndexFlatL2(dim)
        index.add(embeddings)
        return index

    def search(self, text, cols, top_result=5):
        """Return, for each column in *cols*, the values of the *top_result*
        quotes nearest to *text*."""
        text_embedding = self.bc.encode([text])
        normalized_text_embedding = text_embedding / text_embedding.sum()
        quotes, embeddings = self.load_quotes_and_embeddings(self.model_file)
        index = self.create_index(embeddings)
        _, idx = index.search(normalized_text_embedding, top_result)
        rs = []
        for col in cols:
            rs.append(quotes.iloc[idx.flatten()][col].values)
        return rs

    @staticmethod
    def parse_controls(results):
        """Flatten {lang: [sentence, ...]} into (translate, lang, translit) tuples."""
        rs = []
        for lang, v in results.items():
            for sent in v:
                rs.append((sent['translate'], lang, sent['translit']))
        return rs

    def run(self, text, langs=None, top_result=5, summary=False, verbose=True):
        """
        $ python -m sagas.corpus.searcher run 'I read a letter.'
        $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id
        $ python -m sagas.corpus.searcher run 'I read a letter.' ja,id,fa 2 True False

        :param text:
        :return:
        """
        # First find English sentences similar to the given text.
        relevant_quotes, relevant_chapters = self.search(text, ['text', 'chapter'], top_result)
        summary_info = []
        for q in range(top_result):
            tc.emp('magenta', '>' + relevant_quotes[q])
            tc.emp('green', relevant_chapters[q])
            if langs is not None:
                # The corpus is aligned on English, so the other-language
                # sentences can be looked up by the English sentence directly:
                # search_in_list('I write a letter.', ['ja', 'fa', 'id'])
                results = search_in_list(relevant_quotes[q], langs)
                if verbose:
                    tc.emp('blue', json.dumps(results, indent=2, ensure_ascii=False))
                if summary:
                    all_types = []
                    rs_c = CorpusSearcher.parse_controls(results)
                    for r in rs_c:
                        if r[2] != '':
                            tc.emp('red', f".. {r[2]}")
                        types = sents_summary(r[0], r[1])
                        all_types.extend(types)
                    summary_info.append((relevant_quotes[q], all_types))
            tc.emp('cyan', '✁', '-' * 30)
        for s in summary_info:
            tc.info(s)

    def end(self):
        self.bc.close()