def create_query_vector(query):
    return BertClient().encode([query])[0]
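# Note: the one-liner above constructs a new BertClient (and its ZeroMQ
# sockets) on every call. A minimal sketch of the cheaper pattern -- create
# one client and reuse it -- assuming the same bert-serving setup; the shared
# client name here is illustrative:
from bert_serving.client import BertClient

_shared_bc = BertClient()  # created once at module import


def create_query_vector_shared(query):
    # encode() takes a list of strings and returns one vector per string
    return _shared_bc.encode([query])[0]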
'''
After installing the bert-serving server and client from pip, start the server:
bert-serving-start -model_dir chinese_L-12_H-768_A-12/ (the pretrained model) -num_worker=2
'''
from bert_serving.client import BertClient
from tqdm import tqdm
import pickle

bc = BertClient(ip='localhost')
data = []
file = 'baidu_95.csv'  # corpus
with open(file, 'r', encoding='utf8') as f:
    for line in tqdm(f.readlines()):
        words = line.split(' ')
        data.append(bc.encode(words))

save_path = 'bert_serving_vector.pkl'
with open(save_path, 'wb') as f:
    pickle.dump(data, f)
print('successfully saved vectors to {}'.format(save_path))
def embed(lst):
    from bert_serving.client import BertClient
    bc = BertClient()
    vec = bc.encode(lst)
    print('vec.shape: ', vec.shape)
    return vec
"-port_out", "5556", "-max_seq_len", "NONE", "-pooling_strategy", "NONE", "-mask_cls_sep", "-cpu", ] SHUT_ARGS = ["-ip", "localhost", "-port", "5555", "-timeout", "5000"] class BertWordEmbedding: def __init__(self): self.start_args = get_args_parser().parse_args(START_ARGS) self.shut_args = get_shutdown_parser().parse_args(SHUT_ARGS) def vectorize(self, client, tokens): vecs = np.squeeze(client.encode(tokens, is_tokenized=True))[1:-1] return vecs if __name__ == "__main__": bert = BertWordEmbedding() tokens = [["hello", "world", "!"]] with BertServer(bert.start_args): with BertClient() as client: vecs = bert.vectorize(client, tokens) print(vecs) print(vecs.shape)
    return 1 / (1 + math.exp(-100 * (cosine - 0.95)))


# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-n", "--news", required=True, help="text of the news")
args = vars(ap.parse_args())

# query = "Covid19 is deadly and spreads through 5G"  # change or input query here
query = args["news"]
json_path = "who_scrap.json"  # data corpus
indices = []
scores = []
with BertClient(port=5555, port_out=5556, check_version=False) as bc:
    Pairs, dataframe = testing(query, json_path)
    print("Start testing")
    for i, p in enumerate(Pairs):
        try:
            score = scoring(p)
            # print("Similarity of Pair {}: ".format(i + 1), score)
            if score > 0.7:
                indices.append(i)
                scores.append(score)
        except Exception:
            print("no text found for entry {}".format(i + 1))

result_df = dataframe.iloc[indices]
weight = 0
sentiments = 0
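# The scoring function above is a steep logistic curve centered at
# cosine = 0.95: scores swing from near 0 to near 1 over a few hundredths of
# cosine similarity. A quick check of the curve (plain math, no BERT needed):
import math


def steep_sigmoid(cosine):
    return 1 / (1 + math.exp(-100 * (cosine - 0.95)))


for c in (0.90, 0.93, 0.95, 0.97, 0.99):
    print(c, round(steep_sigmoid(c), 3))
# 0.90 -> ~0.007, 0.93 -> ~0.119, 0.95 -> 0.5, 0.97 -> ~0.881, 0.99 -> ~0.982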
def create_kui_data_for(path_patch_kui, path_defects4f_c, path_supply_data,
                        path_FSE_defects4j, model):
    print('model: {}'.format(model))
    with open('../data/experiment3/kui_data_for_' + model + '.pickle', 'wb') as f:
        if model == 'doc':
            m = Doc2Vec.load('../data/model/doc_frag.model')
        elif model == 'bert':
            # max_seq_len=360
            m = BertClient(check_length=False)
        else:
            print('error')

        # buggy_array = np.array([])
        # patched_array = np.array([])
        # label_array = np.array([])
        sets = set()
        cnt = 0
        label_array, buggy_array, patched_array = list(), list(), list()

        # Xiong's 139 data
        path_patch_supply = path_supply_data
        path_jsons = os.path.join(path_patch_supply, 'INFO')
        json_files = os.listdir(path_jsons)
        for j in json_files:
            with open(os.path.join(path_jsons, j), 'r') as f1:
                info_dict = json.load(f1)
            if j.split('.')[0] not in data_139:
                continue
            if info_dict['project'] == 'Mockito':
                continue
            if info_dict['correctness'] == 'Correct':
                label = 1
            elif info_dict['correctness'] == 'Incorrect':
                label = 0
            else:
                continue
            path_patch = os.path.join(path_patch_supply, j.split('.')[0])
            bug_vec, patched_vec = get_sample_supply2(model, path_patch, m, sets)
            # filter duplication
            if type(bug_vec) is not np.ndarray:
                continue
            if cnt == 0:
                buggy_array = bug_vec.reshape((1, -1))
                patched_array = patched_vec.reshape((1, -1))
                label_array = [label]
            else:
                buggy_array = np.concatenate(
                    (buggy_array, bug_vec.reshape((1, -1))), axis=0)
                patched_array = np.concatenate(
                    (patched_array, patched_vec.reshape((1, -1))), axis=0)
                label_array.append(label)
            cnt += 1
            print('cnt: {}'.format(cnt))

        # Kui's dataset
        for root, dirs, files in os.walk(path_patch_kui):
            if files == ['.DS_Store']:
                continue
            # files = sorted(files, key=lambda x: int(x.split('-')[1].split('.')[0]))
            if files == []:
                continue
            if root.split('/')[-1].startswith('Mockito'):
                continue
            label, bug_vec, patched_vec = get_sample(model, files, root, m, sets)
            # filter duplication
            if type(bug_vec) is not np.ndarray:
                continue
            if cnt == 0:
                buggy_array = bug_vec.reshape((1, -1))
                patched_array = patched_vec.reshape((1, -1))
                label_array = [label]
            else:
                buggy_array = np.concatenate(
                    (buggy_array, bug_vec.reshape((1, -1))), axis=0)
                patched_array = np.concatenate(
                    (patched_array, patched_vec.reshape((1, -1))), axis=0)
                label_array.append(label)
            cnt += 1
            print('cnt: {}'.format(cnt))

        # label=1: developers' correct patches
        for bug in bug_folder:
            bug_path = os.path.join(path_defects4f_c, bug)
            correct_patches = os.path.join(bug_path, 'patches')
            for patch in os.listdir(correct_patches):
                if not patch.endswith('src.patch'):
                    continue
                path_patch = os.path.join(correct_patches, patch)
                try:
                    label, bug_vec, patched_vec = get_sample_supply(
                        model, path_patch, m, sets)
                    # filter duplication
                    if type(bug_vec) is not np.ndarray:
                        continue
                    if cnt == 0:
                        buggy_array = bug_vec.reshape((1, -1))
                        patched_array = patched_vec.reshape((1, -1))
                        label_array = [label]
                    else:
                        buggy_array = np.concatenate(
                            (buggy_array, bug_vec.reshape((1, -1))), axis=0)
                        patched_array = np.concatenate(
                            (patched_array, patched_vec.reshape((1, -1))), axis=0)
                        label_array.append(label)
                except Exception as e:
                    print(e)
                    continue
                cnt += 1
                print('cnt: {}'.format(cnt))

        # big dataset
        # # FSE correct
        # cor = path_FSE_defects4j + 'Correct'
        # patchName = os.listdir(cor)
        # for pn in patchName:
        #     pf = os.path.join(cor, pn)
        #     try:
        #         label, bug_vec, patched_vec = get_sample_supply(model, pf, m, sets)
        #         if type(bug_vec) is not np.ndarray:
        #             continue
        #         if cnt == 0:
        #             buggy_array = bug_vec.reshape((1, -1))
        #             patched_array = patched_vec.reshape((1, -1))
        #             label_array = [label]
        #         else:
        #             buggy_array = np.concatenate((buggy_array, bug_vec.reshape((1, -1))), axis=0)
        #             patched_array = np.concatenate((patched_array, patched_vec.reshape((1, -1))), axis=0)
        #             label_array.append(label)
        #     except Exception as e:
        #         print(e)
        #         continue
        #     cnt += 1
        #     print('cnt: {}'.format(cnt))
        #
        # # FSE incorrect
        # cor = path_FSE_defects4j + 'Incorrect'
        # patchName = os.listdir(cor)
        # for pn in patchName:
        #     pf = os.path.join(cor, pn)
        #     try:
        #         label, bug_vec, patched_vec = get_sample_supply_fseincorrect(model, pf, m, sets)
        #         if type(bug_vec) is not np.ndarray:
        #             continue
        #         if cnt == 0:
        #             buggy_array = bug_vec.reshape((1, -1))
        #             patched_array = patched_vec.reshape((1, -1))
        #             label_array = [label]
        #         else:
        #             buggy_array = np.concatenate((buggy_array, bug_vec.reshape((1, -1))), axis=0)
        #             patched_array = np.concatenate((patched_array, patched_vec.reshape((1, -1))), axis=0)
        #             label_array.append(label)
        #     except Exception as e:
        #         print(e)
        #         continue
        #     cnt += 1
        #     print('cnt: {}'.format(cnt))

        label_array = np.array(label_array)
        data = label_array, buggy_array, patched_array
        pickle.dump(data, f)
# coding: utf-8

# In[1]:

import matplotlib.pyplot as plt
from bert_serving.client import BertClient

bc = BertClient(ip='127.0.0.1', port=8701, port_out=8702, show_server_config=True)

# In[27]:

vec = bc.encode(
    ['First do it',  # [CLS] First do it [SEP] [word embedding for padding symbol]
     'then do it right',
     'then do it better',
     'In the middle of nowhere, you will find that you are nobody, nobody in the middle of nowhere.'],
    show_tokens=True)
print(vec[0].shape, vec[1])

for idx_sentence in range(len(vec[1])):
    print('\n', vec[1][idx_sentence])
    for idx_token in range(len(vec[1][idx_sentence])):
        print(vec[1][idx_sentence][idx_token], '\t',
              vec[0][idx_sentence][idx_token][0:5])

vec = vec[0]
plt.subplot(2, 1, 1)
plt.plot(vec[0][0:5].T)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 28 12:45:30 2018

@author: lihuixian
"""
# bert-serving-start -model_dir /model/chinese_L-12_H-768_A-12/ -num_worker=4 -port 5555 -port_out 5556
from bert_serving.client import BertClient
import os
import numpy
import glob

bc = BertClient(ip='192.168.13.19', port=5555, port_out=5556)

readPath = '/Users/lihuixian/Documents/2018analysis/bert3/first10.2'
savePath = '/Users/lihuixian/Documents/2018analysis/bert3/vector_first10.2'
files = glob.glob('%s/*.txt' % readPath)
for path in files:
    filename = os.path.basename(path)[:-4]
    print(filename)
    with open(path) as f:
        lines = f.readlines()
    avector = bc.encode(lines)
    print(avector)
    numpy.savetxt(r'%s/%s.csv' % (savePath, filename), avector)
    print('written successfully')
def embed_sentences(self):
    bc = BertClient()
    embedding = bc.encode(self.sentences)
    return embedding
def _get_BaaS():
    assert spu.is_port_in_use(cfg.bert_port), \
        f'Bert As Service port not in use ({cfg.bert_port}).'
    return BertClient(ignore_all_checks=True)
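# `spu.is_port_in_use` is project-specific and not shown in this excerpt. A
# minimal sketch of what such a check might look like, assuming a plain TCP
# probe of localhost (a hypothetical helper, not the project's actual one):
import socket


def is_port_in_use(port: int, host: str = 'localhost') -> bool:
    # connect_ex returns 0 when something is listening on host:port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex((host, port)) == 0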
app = Flask(__name__)

prefix_q = 'Q: '
prefix_a = 'A: '
topk = 5

with open('./QA_TravelAgancy.txt') as fp:
    questions = [v.replace(prefix_q, '').strip() for v in fp
                 if v.strip() and v.startswith(prefix_q)]
    print('%d questions loaded, avg. len of %d' %
          (len(questions), np.mean([len(d.split()) for d in questions])))

with open('./QA_TravelAgancy.txt') as fp:
    answers = [v.replace(prefix_a, '').strip() for v in fp
               if v.strip() and v.startswith(prefix_a)]
    print('%d answers loaded, avg. len of %d' %
          (len(answers), np.mean([len(d.split()) for d in answers])))

bc = BertClient(ip='195.246.57.106', port=5555, port_out=5556, check_length=False)
doc_vecs = bc.encode(questions)


@app.route('/')
@as_json
def hello_world():
    return {'message': 'Hello World'}


@app.route('/gsug', methods=['POST'])
@as_json
def sendAnswers():
    data = request.form
    query = data['query']
    query_vec = bc.encode([query])[0]
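# The handler above stops after encoding the query. A minimal sketch of the
# usual next step -- rank `doc_vecs` by cosine similarity and return the top-k
# answers. The names `doc_vecs`, `answers`, and `topk` come from the code
# above; the ranking itself is the standard bert-as-service retrieval pattern,
# an assumption about the elided code rather than this app's confirmed logic:
import numpy as np


def rank_answers(query_vec, doc_vecs, answers, topk=5):
    # cosine similarity reduces to a normalized dot product; normalizing the
    # query vector as well would not change the ranking order
    score = np.sum(query_vec * doc_vecs, axis=1) / np.linalg.norm(doc_vecs, axis=1)
    topk_idx = np.argsort(score)[::-1][:topk]
    return [{'answer': answers[i], 'score': float(score[i])} for i in topk_idx]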
    'pooling_layer': [-2],
    'gpu_memory_fraction': 0.5
}

args = namedtuple('args_namedtuple', ','.join(common.keys()))
for k, v in common.items():
    setattr(args, k, v)

for pool_layer in range(1, 13):
    setattr(args, 'pooling_layer', [-pool_layer])
    server = BertServer(args)
    server.start()
    print('wait until server is ready...')
    time.sleep(15)
    print('encoding...')
    bc = BertClient(port=common['port'], port_out=common['port_out'],
                    show_server_config=True)
    subset_vec_all_layers.append(bc.encode(subset_text))
    bc.close()
    server.close()
    print('done at layer -%d' % pool_layer)


def vis(embed, vis_alg='PCA', pool_alg='REDUCE_MEAN'):
    plt.close()
    fig = plt.figure()
    plt.rcParams['figure.figsize'] = [21, 7]
    for idx, ebd in enumerate(embed):
        ax = plt.subplot(2, 6, idx + 1)
        vis_x = ebd[:, 0]
        vis_y = ebd[:, 1]
import json
import os
from tqdm import tqdm
from copy import deepcopy
from bert_serving.client import BertClient

bc = BertClient(output_fmt='list')

# input: CORD-19 dataset in directory ./data/
# output: each paper as its own JSON, trimmed and ready to upload to
# Elasticsearch, in ./trimmed_papers/


def get_bert_encoding(text: str) -> list:
    return bc.encode([text])[0]


def write_json(output_file: str, data: dict, counter: int) -> int:
    with open(output_file, 'w') as f:
        json.dump(data, f)
    return counter + 1


def handle_file(data: dict, p_num_offset: int):
    keep = {}
    keys = data.keys()
    p_num = p_num_offset
    if 'abstract' in keys:
def cosine_similarity(self, c1, c2):
    # bc = BertClient(check_length=False)
    bc = BertClient()
    vectors = bc.encode([c1.text, c2.text])
    cosine = 1.0 - scipy.spatial.distance.cosine(vectors[0], vectors[1])
    return cosine
def __init__(self, ip='localhost', port=5555, port_out=5556):
    print('Initializing ranker...')
    self.bc = BertClient(ip, port, port_out)
    print('Ranker established.')
""" Example script to create elasticsearch documents. """ import argparse import json from pandas import read_csv from bert_serving.client import BertClient bc = BertClient(output_fmt='list', check_length=False) def create_document(doc, emb, index_name): return { '_op_type': 'index', '_index': index_name, 'title': doc['title'], 'purpose': doc['purpose'], 'documents_submission_date_start': doc['documents_submission_date_start'], 'documents_submission_date_end': doc['documents_submission_date_end'], 'documents_submission_time_end': doc['documents_submission_time_end'], 'is_urgent': doc['is_urgent'], 'fund_name': doc['fund_name'], 'country': doc['country'], 'allowed_participant_countries': doc['allowed_participant_countries'], 'id': doc['id'], 'field_of_knoweledge': doc['field_of_knoweledge'], # 'specific_objectives': doc['specific_objectives'], #! # 'expected_impact': doc['expected_impact'], #! 'topic_description': doc['topic_description'], #! 'allowed_participants': doc['allowed_participants'],
def main():
    args = docopt('''Compute BERT embeddings for target words and calculate change metrics.

    Usage:
        diasense.py <language> <corpus1> <corpus2> <targets> <result_path> <sent_limit>

        <language> = english, german, swedish or latin
        <corpus1> = path to corpus1 (txt-file)
        <corpus2> = path to corpus2 (txt-file)
        <targets> = path to target words (txt-file)
        <result_path> = path to directory for results
        <sent_limit> = number of sentences considered in calculation of BERT embeddings in each corpus
    ''')

    language = args['<language>']
    corpus1 = args['<corpus1>']
    corpus2 = args['<corpus2>']
    targets = args['<targets>']
    result_path = args['<result_path>']
    sent_limit = int(args['<sent_limit>'])

    with open(targets, 'r') as target_in:
        target_list = [line.rstrip() for line in target_in]

    metrics_path = result_path + '/metrics'
    if not os.path.exists(metrics_path):
        os.mkdir(metrics_path)
    metrics = metrics_path + '/metrics.txt'
    delta_later_out = metrics_path + '/delta_later.txt'
    delta_compare_out = metrics_path + '/delta_compare.txt'
    metrics = open(metrics, 'w')
    delta_later_out = open(delta_later_out, 'w')
    delta_compare_out = open(delta_compare_out, 'w')
    metrics.write('TARGET\tEARLIER\tEARLIER_STD\tLATER\tLATER_STD\tCOMPARE\tCOMPARE_MIXED\tDELTA_LATER\tDELTA_COMPARE\n')

    with open(corpus1, 'r') as corpus:
        corpus1 = [line.rstrip() for line in corpus]
    with open(corpus2, 'r') as corpus:
        corpus2 = [line.rstrip() for line in corpus]

    # get the BERT sentence encoder; bert-as-service should be started beforehand (separately in your terminal)
    # recommendation: bert-serving-start -pooling_strategy NONE -show_tokens_to_client -model_dir multi_cased_L-12_H-768_A-12 -max_seq_len=128
    bc = BertClient(check_length=False)

    for target in target_list:
        # get sentences in which the target occurs
        sentences_c1 = get_sentences(corpus1, target)
        sentences_c2 = get_sentences(corpus2, target)
        if len(sentences_c1) > sent_limit:
            sentences_c1 = sentences_c1[0:sent_limit]
        if len(sentences_c2) > sent_limit:
            sentences_c2 = sentences_c2[0:sent_limit]

        # corpus1
        # get sentence embeddings (embeddings and tokenization) for sentences which contain the target word in corpus1
        embed_target_c1, tokens_target_c1 = bc.encode(sentences_c1, show_tokens=True)
        # get word embeddings for target words
        target_embeddings_c1 = word_embeddings(embed_target_c1, tokens_target_c1, target)

        # earlier
        earlier_dist = []
        # get all pairwise distances between target word embeddings in corpus1
        for i, embed in enumerate(target_embeddings_c1):
            j = 1
            while i + j in range(len(target_embeddings_c1)):
                dist = cosine(target_embeddings_c1[i], target_embeddings_c1[i + j])
                earlier_dist.append(dist)
                j += 1
        # mean of all distances in corpus1 (= earlier)
        earlier = np.mean(np.array(earlier_dist))
        # standard deviation of earlier
        earlier_std = np.std(np.array(earlier_dist), axis=0)

        # corpus2
        # get sentence embeddings (embeddings and tokenization) for sentences which contain the target word in corpus2
        embed_target_c2, tokens_target_c2 = bc.encode(sentences_c2, show_tokens=True)
        # get word embeddings for target words
        target_embeddings_c2 = word_embeddings(embed_target_c2, tokens_target_c2, target)

        # later
        later_dist = []
        # get all pairwise distances between target word embeddings in corpus2
        for i, embed in enumerate(target_embeddings_c2):
            j = 1
            while i + j in range(len(target_embeddings_c2)):
                dist = cosine(target_embeddings_c2[i], target_embeddings_c2[i + j])
                later_dist.append(dist)
                j += 1
        # mean of all distances in corpus2 (= later)
        later = np.mean(np.array(later_dist))
        # standard deviation of later
        later_std = np.std(np.array(later_dist), axis=0)

        # delta_later
        delta_later = later - earlier

        # compare
        compare_dist = []
        # get all distances between pairs of target word embeddings, where one
        # embedding is from corpus1 and the other from corpus2
        for embed in target_embeddings_c1:
            for embed2 in target_embeddings_c2:
                dist = cosine(embed, embed2)
                compare_dist.append(dist)
        # mean of distances between pairs (= compare)
        compare = np.mean(np.array(compare_dist))

        # compare_mixed
        all_embeddings = np.concatenate((target_embeddings_c1, target_embeddings_c2), axis=0)
        mixed_dist = []
        # get all pairwise distances between all target word embeddings in corpus1 and corpus2
        for i, embed in enumerate(all_embeddings):
            j = 1
            while i + j in range(len(all_embeddings)):
                dist = cosine(all_embeddings[i], all_embeddings[i + j])
                mixed_dist.append(dist)
                j += 1
        # mean of distances between all target word embeddings
        compare_mixed = np.mean(np.array(mixed_dist))

        # delta_compare (here redefined as compare - compare_mixed)
        delta_compare = abs(compare - compare_mixed)

        metrics.write(target + '\t' + str(earlier) + '\t' + str(earlier_std) +
                      '\t' + str(later) + '\t' + str(later_std) + '\t' +
                      str(compare) + '\t' + str(compare_mixed) + '\t' +
                      str(delta_later) + '\t' + str(delta_compare) + '\n')
        delta_later_out.write(target + '\t' + str(delta_later) + '\n')
        delta_compare_out.write(target + '\t' + str(delta_compare) + '\n')
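# `word_embeddings` is used above but not defined in this excerpt. With the
# recommended server flags (-pooling_strategy NONE -show_tokens_to_client),
# bc.encode returns per-token vectors plus the server-side tokenization, so a
# plausible sketch -- an assumption, not the script's actual helper -- picks
# the vector at the target token's position in each sentence (it ignores
# targets that get split into multiple wordpieces):
import numpy as np


def word_embeddings(embeddings, tokenizations, target):
    # embeddings: (n_sentences, max_seq_len, hidden)
    # tokenizations: lists of wordpiece tokens, aligned with embedding rows
    target_vecs = []
    for sent_emb, sent_tokens in zip(embeddings, tokenizations):
        if target in sent_tokens:
            target_vecs.append(sent_emb[sent_tokens.index(target)])
    return np.array(target_vecs)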
from elasticsearch import Elasticsearch
from bert_serving.client import BertClient
from elasticsearch.exceptions import ConnectionError, NotFoundError

# total number of responses
SEARCH_SIZE = 1

# establishing connections
bc = BertClient(ip='localhost', output_fmt='list', check_length=False)
client = Elasticsearch('localhost:9200')

# this query is used as the search term, feel free to change
query = 'machine learning'
query_vector = bc.encode([query])[0]

script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, doc['abstract_vector']) + 1.0",
            "params": {"query_vector": query_vector}
        }
    }
}

try:
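# The script_score query above only works if `abstract_vector` was indexed as
# a dense_vector. A minimal sketch of the index mapping this assumes (the
# index name is illustrative; 768 matches BERT-base output):
index_mapping = {
    "mappings": {
        "properties": {
            "abstract": {"type": "text"},
            "abstract_vector": {"type": "dense_vector", "dims": 768}
        }
    }
}
# e.g. client.indices.create(index='papers', body=index_mapping)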
args = sys.argv
if args[1] == 'w2v':
    print('loading w2v model...')
    model = api.load('word2vec-google-news-300')
    model_abbr = 'w2v'
    embed_html = word2vec_avg
    main()
elif args[1] == 'bert':
    if args[2] == 'server':
        bc = BertClient(ip='iccluster037.iccluster.epfl.ch', check_length=False)
        print('connection with server established')

        def bert_avg_server(body):
            return bert_avg(body, server=True, bc=bc)

        embed_html = bert_avg_server
    else:
        print('loading bert model...')
        tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
        bert_model = BertModel.from_pretrained('bert-base-cased',
                                               output_hidden_states=True)
        bert_model.eval()
no_improve = 0
valid_slot = 0
test_slot = 0
valid_intent = 0
test_intent = 0
valid_err = 0
test_err = 0
best_epoch_num = 0
eval_loss = 0.0
result_records = []

if arg.use_bert:
    from bert_serving.client import BertClient
    bc = BertClient(ip=arg.bert_ip)

while True:
    if data_processor is None:
        # for UNK purposes
        if arg.use_unk == True:
            unker = UNKer(os.path.join(full_train_path, arg.input_file),
                          os.path.join(full_train_path,
                                       arg.input_file + ".unk." + arg.unk_priority),
                          os.path.join(full_train_path, arg.slot_file),
                          ratio=arg.unk_ratio,
                          threshold=arg.unk_threshold,
                          priority=arg.unk_priority)
        data_processor = DataProcessor(
'''Install the bert-serving-server model from pip:
pip install bert-serving-server
pip install bert-serving-client
Then place the downloaded Chinese checkpoint chinese_L-12_H-768_A-12 under the
bert-serving model directory and start the server. In cmd, run:
bert-serving-start -model_dir (path to the Chinese checkpoint) -num_worker=1
Once started, the client can be called to turn text into word vectors.'''
from bert_serving.client import BertClient
import numpy as np
import pandas as pd
import time
import tensorflow as tf

bc = BertClient(port=5555, port_out=5556)


def ner_test():
    with BertClient(show_server_config=False, check_version=False,
                    check_length=False) as bc:
        start_t = time.perf_counter()
        str1 = '1月24日,新华社对外发布了中央对雄安新区的指导意见,洋洋洒洒1.2万多字,17次提到北京,4次提到天津,信息量很大,其实也回答了人们关心的很多问题。'
        str1 = list(str1)
        rst = bc.encode([str1], is_tokenized=True)
        print('rst:', rst)
        print(len(rst[0]))
        print(time.perf_counter() - start_t)


# file = pd.read_csv("dd.txt")
# data = file.to_string()
# for line in data:
#     result.append(line.strip("\n"))

# class_test()
ner_test()
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
from bert_serving.client import BertClient

bc = BertClient(check_length=True)
torch.manual_seed(1)

raw_data = [("How are you? I am well".lower(), [0, 0, 0, 0, 0, 1]),
            ("Who are you? I am me".lower(), [0, 0, 0, 0, 0, 1]),
            ("What are you? I am me".lower(), [0, 0, 0, 1, 1, 0])]

training_data = []
#### CURRENTLY TAKES FIRST WORD VECTOR FOR TESTING -- NEEDS UPDATE ####
for dataPoint in raw_data:
    sentence = dataPoint[0]
    sentenceVec = [num for num in bc.encode([sentence])[0][1]]
    training_data.append((torch.tensor(sentenceVec), dataPoint[1]))

tag_to_ix = {0: 0, 1: 1}

# These will usually be more like 32 or 64 dimensional.
# We will keep them small, so we can see how the weights change as we train.
EMBEDDING_DIM = 6
HIDDEN_DIM = 30
from elasticsearch import Elasticsearch
from bert_serving.client import BertClient
import itertools

bc = BertClient(ip='10.51.101.101', check_length=False)
es = Elasticsearch(['https://cypher.es.eu-central-1.aws.cloud.es.io:9243'],
                   http_auth=('elastic', 'UVrF6kyW58KrBzxoffp2YRKH'))


def remove_duplicates_from_list(combined):
    ret = []
    checked = []
    for c in combined:
        if c[0] not in checked:
            ret.append(c)
            checked.append(c[0])
    return ret


def findRelevantHits(in_query):
    in_query_vector = bc.encode([in_query])[0].tolist()
    queries = {
        'bert': {
            "script_score": {
                "query": {"match_all": {}},
                "script": {
                    "source": "cosineSimilarity(params.in_query_vector, doc['vector']) + 1.0",
                    "params": {
def build_nn_graph(self, instance):
    word_vec = self.word_embed(instance.word_seq.unsqueeze(0))

    # generate BERT word embeddings via bert-as-service, no finetuning
    if self.bert_emb > 0:
        from bert_serving.client import BertClient
        bc = BertClient(port=8880)
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        tokens = []
        orig_to_tok_index = []  # 0 -> 0, 1 -> len(all word_pieces)
        for i, word in enumerate(instance.input):
            orig_to_tok_index.append(len(tokens))
            word_tokens = tokenizer.tokenize(word)
            for sub_token in word_tokens:
                tokens.append(sub_token)
        vec = bc.encode([tokens], show_tokens=True, is_tokenized=True)
        vec = vec[0][:, 1:, :][:, orig_to_tok_index, :]
        bert_vec = torch.tensor(vec).to(word_vec.device)

    word_rep = [word_vec]
    if self.char_emb_size > 0:
        char_seq_tensor = instance.char_seq_tensor.unsqueeze(0)
        char_seq_len = instance.char_seq_len.unsqueeze(0)
        char_features = self.char_bilstm.get_last_hiddens(
            char_seq_tensor, char_seq_len)
        word_rep.append(char_features)
    word_rep = torch.cat(word_rep, 2)

    # concatenate BERT word embeddings
    if self.bert_emb > 0:
        word_rep = torch.cat((word_rep, bert_vec), 2)

    word_rep = self.word_drop(word_rep)
    lstm_out, (hn, cn) = self.rnn(word_rep, None)
    lstm_out = self.lstm_drop(lstm_out)
    lstm_out = lstm_out.squeeze(0)
    linear_output = self.linear(lstm_out).squeeze(0)  # score of each node

    instance_len = instance.size()
    lstm_hidden_size = self.lstm_hidden_size
    seg_embs = {}
    for i in range(instance_len):
        for j in range(i, instance_len):
            if i == 0 and j + 1 == instance_len:
                segment_emb = torch.cat([
                    lstm_out[j][:lstm_hidden_size],
                    lstm_out[i][lstm_hidden_size:]
                ], 0)
            elif i == 0 and j + 1 < instance_len:
                segment_emb = torch.cat([
                    lstm_out[j][:lstm_hidden_size],
                    lstm_out[i][lstm_hidden_size:] -
                    lstm_out[j + 1][lstm_hidden_size:]
                ], 0)
            elif i > 0 and j + 1 == instance_len:
                segment_emb = torch.cat([
                    lstm_out[j][:lstm_hidden_size] -
                    lstm_out[i - 1][:lstm_hidden_size],
                    lstm_out[i][lstm_hidden_size:]
                ], 0)
            else:
                segment_emb = torch.cat([
                    lstm_out[j][:lstm_hidden_size] -
                    lstm_out[i - 1][:lstm_hidden_size],
                    lstm_out[i][lstm_hidden_size:] -
                    lstm_out[j + 1][lstm_hidden_size:]
                ], 0)
            seg_embs[i, j] = segment_emb

    span_score = {}
    polar_score = {}
    for i in range(instance_len):
        for j in range(i, instance_len):
            span_score[i, j] = self.linear_span(seg_embs[i, j])

    offset = [i for i in range(self.pos_embed_range_max)]
    offset = torch.LongTensor(offset)
    offset_score = self.pos_embed(offset)
    offset_score = self.pos_embed_linear(offset_score)

    zero_col = torch.zeros(1, self.label_size).to(NetworkConfig.DEVICE)
    return (torch.cat([linear_output, zero_col], 0), span_score, polar_score,
            offset_score, lstm_out, seg_embs)
from bert_serving.client import BertClient
import tensorflow as tf
import numpy as np
import os
import pandas as pd
import time
import progressbar

bc = BertClient()  # IP address of the GPU machine

dir_name = "/Users/jaswanttummala/downloads/questions.csv"
question1 = np.array(pd.read_csv(dir_name, usecols=["question1"]))
question2 = np.array(pd.read_csv(dir_name, usecols=["question2"]))
question1 = question1.tolist()
question2 = question2.tolist()

temp1 = []
temp2 = []
for i in range(len(question1)):
    temp1.append(str(question1[i][0]))
    temp2.append(str(question2[i][0]))
temp1.append("")
temp2.append("")


def divide_chunks(l, n):
    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]
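# A sketch of how divide_chunks is presumably meant to be used: encode the
# question lists in fixed-size batches so a single huge encode() call does not
# overwhelm the server. The batch size of 256 is an illustrative choice, not
# taken from this script:
def encode_in_chunks(bc, sentences, chunk_size=256):
    vecs = [bc.encode(chunk) for chunk in divide_chunks(sentences, chunk_size)]
    return np.concatenate(vecs, axis=0)

# e.g. q1_vecs = encode_in_chunks(bc, temp1)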
import sys

from bert_serving.client import BertClient


def send_without_block(bc, data, repeat=10):
    # encoding without blocking:
    print('sending all data without blocking...')
    for _ in range(repeat):
        bc.encode(data, blocking=False)
    print('all sent!')


if __name__ == '__main__':
    bc = BertClient(port=int(sys.argv[1]), port_out=int(sys.argv[2]))

    num_repeat = 20
    with open('../README.md') as fp:
        data = [v for v in fp if v.strip()]

    send_without_block(bc, data, num_repeat)
    num_expect_vecs = len(data) * num_repeat

    # then fetch all
    print('now waiting until all results are available...')
    vecs = bc.fetch_all(concat=True)
    print('received %s, expected: %d' % (vecs.shape, num_expect_vecs))

    # now send it again
import sys

try:
    import numpy as np
    from sklearn.cluster import KMeans
    from bert_serving.client import BertClient
    from sklearn.metrics import pairwise_distances_argmin_min
    from flask import Flask, jsonify, request
    from flask_cors import CORS
    from nltk import sent_tokenize
except ImportError:
    sys.exit('Error importing modules')

app = Flask(__name__)
CORS(app)
bc = BertClient(check_length=False)


@app.route('/summary', methods=['POST'])
def summary():
    req = request.get_json(force=True)
    text = req['text']
    sent_list = sent_tokenize(text)
    sent_list = [sent for sent in sent_list if len(sent) > 20]
    encoded = bc.encode(sent_list).tolist()
    n_clusters = int(np.ceil(len(encoded)**0.5))
    kmeans = KMeans(n_clusters=n_clusters)
    kmeans = kmeans.fit(encoded)
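# The handler above stops after fitting KMeans. The import of
# pairwise_distances_argmin_min suggests the usual finish for this
# extractive-summary pattern: for each cluster centroid, keep the sentence
# closest to it, then return the kept sentences in document order. A
# self-contained sketch of that step, an assumption about the elided code:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min


def pick_summary_sentences(encoded, sent_list, n_clusters):
    # for each centroid, take the single nearest sentence,
    # then join the chosen sentences in their original order
    kmeans = KMeans(n_clusters=n_clusters).fit(encoded)
    closest, _ = pairwise_distances_argmin_min(kmeans.cluster_centers_, encoded)
    return ' '.join(sent_list[i] for i in sorted(set(closest)))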
def validate(model, dataloader):
    """
    Compute the loss and accuracy of a model on some validation dataset.

    Args:
        model: A torch module for which the loss and accuracy must be computed.
        dataloader: A DataLoader object to iterate over the validation data.

    Returns:
        epoch_time: The total time to compute the loss and accuracy on the
            entire validation set.
        epoch_accuracy: The accuracy computed on the entire validation set.
    """
    criterion = nn.CrossEntropyLoss(reduction='none')
    criterion_all = nn.CrossEntropyLoss()
    l2dist = PairwiseDistance(2)

    running_loss_entailment, running_loss_neutral, running_loss_contradiction = 0.0, 0.0, 0.0
    adv_loss_entailment, adv_loss_neutral, adv_loss_contradiction = None, None, None

    # keep the model in train mode: the FGSM step below needs gradients
    model.train()
    device = model.device

    epoch_start = time.time()
    running_loss = 0.0
    running_accuracy = 0.0
    total_num = 0
    bc = BertClient(check_length=False)
    batch = dataloader

    for batch_index in range(len(dataloader['labels'])):
        # Move input and output data to the GPU if one is used.
        premises = torch.tensor(bc.encode(
            batch["premises"][batch_index])).to(device)
        hypotheses = torch.tensor(bc.encode(
            batch["hypotheses"][batch_index])).to(device)
        labels = torch.tensor(batch["labels"][batch_index]).to(device)

        logits, probs, _ = model(premises, hypotheses)
        pred = torch.argmax(logits, dim=1)
        loss = criterion(logits, labels)
        running_loss += loss.sum().item()
        # running_accuracy += correct_predictions(probs, labels)
        total_num += len(labels)

        np_labels = labels.cpu().numpy()
        np_loss = loss.detach().cpu().numpy()
        np_pred = pred.detach().cpu().numpy()
        # 'entailment': 0, 'neutral': 1, 'contradiction': 2
        running_loss_entailment += np_loss[(np_labels == 0)].sum()  # &(np_pred==1)
        running_loss_neutral += np_loss[(np_labels == 1)].sum()  # &(np_pred==0)
        running_loss_contradiction += np_loss[(np_labels == 2)].sum()  # &(np_pred==1)

        # adversarial examples
        premises_adv, hypotheses_adv = fgsm(
            premises, hypotheses, pred, model,
            criterion_all)  # eps=0.05, if_infnity=True
        logits_adv, probs_adv, _ = model(premises_adv, hypotheses_adv)
        running_accuracy += correct_predictions(probs, labels)
        adv_loss = ShannonEntropy(logits_adv, probs)
        # adv_loss = criterion(logits_adv, pred)
        np_adv_loss = adv_loss.detach().cpu().numpy()
        # np_probs_adv = torch.max(probs_adv, dim=1)[0].detach().cpu().numpy()

        if batch_index == 0:
            adv_loss_entailment = np_adv_loss[np_labels == 0]
            adv_loss_neutral = np_adv_loss[np_labels == 1]
            adv_loss_contradiction = np_adv_loss[np_labels == 2]
        else:
            adv_loss_entailment = np.concatenate(
                (adv_loss_entailment, np_adv_loss[(np_labels == 0)]), axis=0)
            adv_loss_neutral = np.concatenate(
                (adv_loss_neutral, np_adv_loss[(np_labels == 1)]), axis=0)
            adv_loss_contradiction = np.concatenate(
                (adv_loss_contradiction, np_adv_loss[(np_labels == 2)]), axis=0)
        # if batch_index == 10:
        #     break

    epoch_time = time.time() - epoch_start
    epoch_accuracy = running_accuracy / total_num
    print(running_loss_entailment, running_loss_neutral,
          running_loss_contradiction)

    # losses = np.concatenate((adv_loss_pos, adv_loss_neg), axis=0)
    # labels = np.concatenate((np.ones_like(adv_loss_pos), np.zeros_like(adv_loss_neg)), axis=0)
    # auc_score = roc_auc(labels, losses)
    adv_loss_entailment = adv_loss_entailment[adv_loss_entailment < 1.5]
    adv_loss_neutral = adv_loss_neutral[adv_loss_neutral < 1.5]
    adv_loss_contradiction = adv_loss_contradiction[adv_loss_contradiction < 1.5]
    creterion_func(adv_loss_entailment, adv_loss_neutral, adv_loss_contradiction)
    # print('[ROC_AUC] score: %.2f%%' % (100. * auc_score))

    return epoch_time, epoch_accuracy
from bert_serving.client import BertClient
import datetime

bc = BertClient()

m = 2
# def trigger_func(m):
file_dir = './why_merged_' + str(m) + '_set.tsv'
trigger = []
with open(file_dir, 'r') as f:
    line = f.readline()
    while line:
        trigger.append(line[:-1])
        line = f.readline()

print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
node_feat_vec_H0 = bc.encode(trigger)
print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
node_feat_vec_H0.tofile('./node_feat_vec_H0_cutoff_' + str(m) + '.txt')
print(node_feat_vec_H0.shape)
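# Note: ndarray.tofile writes raw binary regardless of the .txt extension, and
# it discards the array shape. A sketch of reading the file back, assuming
# float32 output (bert-serving's default dtype) and a 768-dim BERT-base model:
import numpy as np

vecs = np.fromfile('./node_feat_vec_H0_cutoff_2.txt', dtype=np.float32)
vecs = vecs.reshape(-1, 768)  # one row per encoded line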
def __init__(self):
    self.bc = BertClient()