def eval_reverse_proposal(input_original, masked_sent, input_ids_old, pos_set, reverse_action_set, sim=None):
    proposal_prob_reverse = 1.0  # Q(x|x')
    input_ids_tmp = np.array(masked_sent)
    for step_i in range(len(pos_set)):
        ind = pos_set[step_i]  # note: here the positions are exchanged
        action = reverse_action_set[step_i]
        old_tok = input_ids_old[ind]

        # word replacement (action: 0)
        if action == 0:
            prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
            input_ids_tmp[ind] = old_tok
            proposal_prob_reverse *= prob_mask[old_tok]  # Q(x|x')
            if sim is not None:
                proposal_prob_reverse *= similarity(input_ids_tmp, input_original)

        # word insertion (action: 1)
        if action == 1:
            prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
            input_ids_tmp[ind] = old_tok
            proposal_prob_reverse *= prob_mask[old_tok]  # Q(x|x')
            if sim is not None:
                proposal_prob_reverse *= similarity(input_ids_tmp, input_original)

        # word deletion (action: 2)
        if action == 2:
            input_ids_tmp = input_ids_tmp  # already deleted
            proposal_prob_reverse *= 1.0  # Q(x|x')

    return proposal_prob_reverse, input_ids_tmp
def main(path_to_data, file_d, query):
    d2s = load(open(path_to_data + 'dense_to_sparse.json', 'r'))
    i2t = load(open(path_to_data + 'ID-title_dict.json', 'r'))
    t2i = load(open(path_to_data + 'title-ID_dict.json', 'r'))
    s2d = load(open(path_to_data + 'sparse_to_dense.json', 'r'))

    p_c = [query]
    p_ids = list(map(concept_to_dense_id, [t2i], [s2d], p_c))
    print("The query is '{0}'".format(p_c[0]))

    similarity(p_ids, path_to_data, file_d, d2s=d2s, i2t=i2t)
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """Find the sentence in the knowledge base most similar to the input sentence.

    simType: 'simple', 'simple_pos', or 'vec'
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt的simType类型不存在: {}'.format(simType)

    # If no word vectors are loaded, fall back to the simple_pos method.
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions)

    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return '抱歉,我没有理解您的意思。请您准确描述问题。'

    return maxSim.a
def main():
    parser = argparse.ArgumentParser(
        description='Rank corpus based on laser cosine distance')
    parser.add_argument('--debug', help='debug mode', action='store_true')
    parser.add_argument('--src_sents', help='source sentences')
    parser.add_argument('--tgt_sents', help='target sentences')
    parser.add_argument('--src_embs', help='laser embeddings for source sentences')
    parser.add_argument('--tgt_embs', help='laser embeddings for target sentences')
    parser.add_argument('--output_path', help='path to ranked corpus')
    parser.add_argument('--output_corpus', help='path to ranked corpus')
    o = parser.parse_args()

    try:
        os.makedirs(o.output_path)
    except FileExistsError:
        # directory already exists
        pass

    output_corpus = os.path.join(o.output_path, o.output_corpus)

    src_emb = load_laser_embs(o.src_embs)
    tgt_emb = load_laser_embs(o.tgt_embs)

    sim = []
    for v1, v2 in zip(src_emb, tgt_emb):
        sim.append(similarity(v1, v2))

    # indices of sentence pairs, sorted by decreasing similarity
    sim_sorted = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)

    with open(output_corpus, 'w') as output, open(o.src_sents, 'r') as src, open(o.tgt_sents, 'r') as tgt:
        src = src.readlines()
        tgt = tgt.readlines()

        pbar = tqdm.tqdm(total=len(src))
        for similarity_index in sim_sorted:
            pbar.update(1)
            src_sentence = src[similarity_index].strip()
            tgt_sentence = tgt[similarity_index].strip()
            # Exclude almost identical sentences or too short sentence-pairs;
            # exclude sentences containing a lot of numbers
            if levenshtein_distance(src_sentence, tgt_sentence) < 30 or perc_numeric(src_sentence) > 0.3:
                continue
            output.write('{0}\t{1}'.format(src[similarity_index].strip(), tgt[similarity_index]))

        output.close()
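# NOTE (editor's sketch): several snippets in this listing call a vector-level
# similarity(v1, v2) helper that is not shown. A minimal cosine-similarity
# stand-in is sketched below, assuming plain Python lists or 1-D NumPy arrays of
# equal length; the name and contract are assumptions, not the original code.
import numpy as np

def cosine_similarity_sketch(v1, v2):
    """Cosine similarity between two 1-D vectors (sketch)."""
    v1, v2 = np.asarray(v1, dtype=np.float64), np.asarray(v2, dtype=np.float64)
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    if denom == 0.0:
        return 0.0  # convention: treat similarity with a zero vector as 0
    return float(np.dot(v1, v2) / denom)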
def search_thread(results, desc_repr, codevecs, i, n_results, sim_measure):
    # 1. compute code similarities
    if sim_measure == 'cos':
        chunk_sims = np.dot(codevecs, desc_repr.T)[:, 0]  # [pool_size]
    else:
        chunk_sims = similarity(codevecs, desc_repr, sim_measure)  # [pool_size]

    # 2. select the top K results
    negsims = np.negative(chunk_sims)
    maxinds = np.argpartition(negsims, kth=n_results - 1)
    maxinds = maxinds[:n_results]
    chunk_codes = [codebase[i][k] for k in maxinds]
    chunk_sims = chunk_sims[maxinds]
    results.extend(zip(chunk_codes, chunk_sims))
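# NOTE (editor's sketch): the top-K selection above negates the scores and uses
# np.argpartition, which finds the K largest similarities in O(pool_size) time
# without fully sorting. A small self-contained illustration with toy,
# hypothetical scores:
import numpy as np

scores = np.array([0.1, 0.9, 0.3, 0.7, 0.5])
k = 2
top_k = np.argpartition(-scores, kth=k - 1)[:k]   # indices of the 2 best scores (unordered)
top_k = top_k[np.argsort(-scores[top_k])]         # optional: order them by score
print(top_k)  # -> [1 3]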
def main():
    word_pair, simi = load_standard('./wordsim353_annotator1.txt')
    # model = load_w2v_model('../../paper/word2vec/vec.txt', logging)
    model_path = '../../paper/data/srwe_model/wiki_small.w2v.r.0.001.model'
    model = load_w2v_model(model_path, logging)

    new_simi = []
    for pair in word_pair:
        if pair[0] not in model or pair[1] not in model:
            logging.error('%s not in vocab.' % (pair[0] if pair[0] not in model else pair[1]))
            new_simi.append(0.0)
            continue
        new_simi.append(similarity(model[pair[0]], model[pair[1]]))

    print(model_path)
    res = scipy.stats.spearmanr(simi, new_simi)
    print(res)
def generate_article():
    keywords = request.form.get("topic")
    if keywords is None:
        return render_template("home.html")
    else:
        keywords = keywords.split(" ")
        kwords = []
        for word in keywords:
            kwords.append(word.lower())
        keywords = kwords

        articles = []
        for file in os.listdir("articles/"):
            if file.endswith(".txt"):
                text = open(os.path.join("articles/", file), "r").read()
                source = file[:file.index("-")]
                articles.append(Article(text, source))

        weighted_articles = []
        for art in articles:
            weighted_articles.append((similarity(art.vector, keywords), art))
        weighted_articles = sorted(weighted_articles, key=lambda x: -x[0])

        temp = []
        for pair in weighted_articles:
            if pair[0] > 0:
                temp.append(pair)
        weighted_articles = temp

        if len(weighted_articles) >= 3:
            model = weighted_articles[0:3]
        else:
            model = weighted_articles

        articles = []
        for pair in model:
            art = pair[1]
            articles.append(art)

        generated_article, sources = group_sentences(articles)
        title = ""
        art_text = ""
        for sentence in generated_article:
            art_text += sentence[0] + " "
        if len(generated_article) > 0:
            title = create_title(art_text)
        else:
            title = "Sorry, we couldn't find any related articles!"

        # generate the text and display it somehow
        tit_text = title.decode('utf8')
        art_text = art_text.decode('utf8')
        return render_template("home.html", title=tit_text, article=art_text)
def train(self):
    embedded = self.creat_model()
    lr = tf.placeholder(dtype=tf.float32, name="learning_rate")  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    sim_matrix = similarity(embedded, w, b)
    loss = loss_cal(sim_matrix, type=config.loss)

    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(zip(grads_rescale, vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        os.makedirs(os.path.join(config.model_path, "Check_Point"), exist_ok=True)  # make folder to save model
        os.makedirs(os.path.join(config.model_path, "logs"), exist_ok=True)  # make folder to save log
        writer = tf.summary.FileWriter(os.path.join(config.model_path, "logs"), sess.graph)

        lr_factor = 1  # lr decay factor (1/2 per 10000 iteration)
        loss_acc = 0   # accumulated loss (for running average of loss)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                            feed_dict={self.fingerprint_input: random_batch(),
                                                       lr: config.lr * lr_factor})
            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)  # write at tensorboard
            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss
            if (iter + 1) % 1000 == 0:
                lr_factor /= 2  # lr decay
                print("learning rate is decayed! current lr : ", config.lr * lr_factor)
            if (iter + 1) % 1000 == 0:
                saver.save(sess, os.path.join(config.model_path, "./Check_Point/model.ckpt"),
                           global_step=iter // 1000)
                print("model is saved!")
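# NOTE (editor's sketch): the similarity(embedded, w, b) used by the GE2E-style
# training code in this listing is not defined here. In the GE2E loss
# (Wan et al., 2018), entry S[j*M + i, k] is a scaled cosine similarity
# w * cos(e_ji, c_k) + b between utterance embedding e_ji and speaker centroid
# c_k. The NumPy sketch below mirrors that definition for an embedding matrix of
# shape [N*M, P] (N speakers, M utterances each); it omits the "exclude the
# utterance from its own centroid" refinement and is an assumption, not the
# repository's TensorFlow implementation.
import numpy as np

def ge2e_similarity_sketch(embedded, w, b, N, M):
    e = embedded / np.linalg.norm(embedded, axis=1, keepdims=True)  # unit-length embeddings, [N*M, P]
    centers = e.reshape(N, M, -1).mean(axis=1)                      # speaker centroids, [N, P]
    centers = centers / np.linalg.norm(centers, axis=1, keepdims=True)
    cos = e @ centers.T                                             # cosine similarities, [N*M, N]
    return w * cos + b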
def worst_among_most_similar(population, child, goal_function, c_f=None, s=None):
    '''Create new population pool according to the worst among the most similar strategy.

    Args:
        population: The population pool.
        child: New chromosome to be added to the population.
        goal_function: The function we are optimising.

    Returns:
        The resulting population.
    '''
    if c_f is None:
        c_f = parameters.cf
    if s is None:
        s = parameters.s

    # parent_A = random.choice(population)
    # crowding_selection_group = random.sample(population, s)
    cf_groups = []
    for i in range(c_f):
        # cf_groups.append(random.sample(population, s))
        cf_groups.append(population.sample(s))

    most_similar = []
    for group in cf_groups:
        most_similar.append(group.ix[
            group['decoded'].apply(lambda x: similarity(child.decoded, x)).idxmax()])
        # most_similar.append(max(group, key=lambda x: similarity(child, x)))

    most_similar = pd.DataFrame(most_similar)
    worst = most_similar.fitness.idxmin()
    population = population.drop(worst)
    child.name = worst
    population = population.append(child)

    return population
def form_children(population, c_s=None, c_f=None, s=None):
    '''Select a pair of parents and form children.

    Args:
        population: The population selection pool.

    Returns:
        The resulting children.
    '''
    if c_s is None:
        c_s = parameters.cs
    if c_f is None:
        c_f = parameters.cf
    if s is None:
        s = parameters.s

    # parent_a = random.choice(population)
    # crowding_selection_group = random.sample(population, c_s)
    parent_a = population.sample(1).iloc[0]
    crowding_selection_group = population.sample(c_s)

    # pool_values['decoded'].apply(lambda x: euclidean(x, peak)).idxmin()
    parent_b = crowding_selection_group.ix[
        crowding_selection_group['decoded'].apply(lambda x: similarity(parent_a.decoded, x)).idxmin()]
    # parent_b.orient('index')
    # parent_b = max(crowding_selection_group, key=lambda x: similarity(parent_a, x))

    child_a, child_b = crossover(parent_a, parent_b)
    child_a = mutation(child_a)
    child_b = mutation(child_b)

    return child_a, child_b
def clusterOpinion(self, cluster, threshold):
    opinions = cluster.getOpinions()
    num = len(opinions)
    clusters = []
    checked1 = []
    for i in range(num):
        oc = OpinionCluster()
        opinion1 = opinions[i]
        if opinion1 in checked1:
            continue
        if opinion1 not in oc.getOpinions():
            oc.addOpinion(opinion1)
            checked1.append(opinion1)

        for j in range(i + 1, num):
            opinion2 = opinions[j]
            if opinion2 in checked1:
                continue
            sim = similarity(opinion1.opinion, opinion2.opinion)
            if sim > threshold:
                if opinion2 not in oc.getOpinions():
                    oc.addOpinion(opinion2)
                    checked1.append(opinion2)

        clusters.append(oc)
    return clusters
def getSummary(self, freqStrLen):
    opinionStrs = []
    for op in self._opinions:
        opinion = op.opinion
        opinionStrs.append(opinion)

    # count character frequencies
    word_counter = collections.Counter(list("".join(opinionStrs))).most_common()

    freqStr = ""
    for item in word_counter:
        if item[1] >= freqStrLen:
            freqStr += item[0]

    maxSim = -1
    maxOpinion = ""
    for opinion in opinionStrs:
        sim = similarity(freqStr, opinion)
        if sim > maxSim:
            maxSim = sim
            maxOpinion = opinion

    return maxOpinion
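# NOTE (editor's sketch): the opinion-clustering snippets above compare two
# opinion strings with similarity(str_a, str_b) against a threshold, but that
# helper is not shown in this listing. One plausible stand-in is a
# character-level ratio from difflib, sketched below; the name and behaviour
# are assumptions, not the original implementation.
from difflib import SequenceMatcher

def string_similarity(a, b):
    """Character-level similarity in [0, 1] between two strings (sketch)."""
    return SequenceMatcher(None, a, b).ratio()

# Example: string_similarity("价格便宜", "价格很便宜") is roughly 0.89.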
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(zip(grads_rescale, vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        if os.path.exists(path):
            print("Restore from {}".format(os.path.join(path, "Check_Point/model.ckpt-2")))
            saver.restore(sess, os.path.join(path, "Check_Point/model.ckpt-2"))  # restore variables from selected ckpt file
        else:
            os.makedirs(os.path.join(path, "Check_Point"), exist_ok=True)  # make folder to save model
            os.makedirs(os.path.join(path, "logs"), exist_ok=True)  # make folder to save log

        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor (1/2 per 10000 iteration)
        loss_acc = 0   # accumulated loss (for running average of loss)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                            feed_dict={batch: random_batch(), lr: config.lr * lr_factor})
            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)  # write at tensorboard
            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss
            if (iter + 1) % 10000 == 0:
                lr_factor /= 2  # lr decay
                print("learning rate is decayed! current lr : ", config.lr * lr_factor)
            if (iter + 1) % 10000 == 0:
                saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"), global_step=iter // 10000)
                print("model is saved!")
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :], shape=[config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model[-1]):  # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break
        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        print("test file path : ", config.test_path)

        '''
        test speaker: p225--p243
        '''
        # return similarity matrix after enrollment and verification
        time1 = time.time()  # for check inference time
        if config.tdsv:
            S = sess.run(similarity_matrix, feed_dict={enroll: random_batch(shuffle=False, noise_filenum=1),
                                                       verif: random_batch(shuffle=False, noise_filenum=2)})
        else:
            S = sess.run(similarity_matrix, feed_dict={enroll: random_batch(shuffle=False),
                                                       verif: random_batch(shuffle=False, utter_start=config.M)})
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()

        np.set_printoptions(precision=2)
        print("inference time for %d utterences : %0.2fs" % (2 * config.M * config.N, time2 - time1))
        print(S)  # print similarity matrix

        # calculating EER
        diff = 1
        EER = 0
        EER_thres = 0
        EER_FAR = 0
        EER_FRR = 0

        # through thresholds calculate false acceptance ratio (FAR) and false reject ratio (FRR)
        for thres in [0.01 * i + 0.5 for i in range(50)]:
            S_thres = S > thres

            # False acceptance ratio = false acceptance / mismatched population (enroll speaker != verification speaker)
            FAR = sum([np.sum(S_thres[i]) - np.sum(S_thres[i, :, i]) for i in range(config.N)]) / (config.N - 1) / config.M / config.N

            # False reject ratio = false reject / matched population (enroll speaker = verification speaker)
            FRR = sum([config.M - np.sum(S_thres[i][:, i]) for i in range(config.N)]) / config.M / config.N

            # Save threshold when FAR = FRR (=EER)
            if diff > abs(FAR - FRR):
                diff = abs(FAR - FRR)
                EER = (FAR + FRR) / 2
                EER_thres = thres
                EER_FAR = FAR
                EER_FRR = FRR

        print("\nEER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)" % (EER, EER_thres, EER_FAR, EER_FRR))
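# NOTE (editor's sketch): the EER sweep above can be read as a standalone
# helper. Given a score tensor S of shape [N, M, N] (verification speaker j,
# utterance i, enrolled speaker k), FAR counts over-threshold scores against
# non-matching speakers and FRR counts under-threshold scores against the
# matching speaker. This is a restatement of the loop above in NumPy for
# clarity, not part of the original repository.
import numpy as np

def equal_error_rate_sketch(S, thresholds=None):
    N, M, _ = S.shape
    if thresholds is None:
        thresholds = [0.01 * i + 0.5 for i in range(50)]
    best = (1.0, 0.0, 0.0)  # (|FAR - FRR|, EER, threshold)
    for thres in thresholds:
        S_thres = S > thres
        far = sum(np.sum(S_thres[i]) - np.sum(S_thres[i, :, i]) for i in range(N)) / ((N - 1) * M * N)
        frr = sum(M - np.sum(S_thres[i][:, i]) for i in range(N)) / (M * N)
        if abs(far - frr) < best[0]:
            best = (abs(far - frr), (far + frr) / 2, thres)
    return best[1], best[2]  # EER and the threshold where FAR is closest to FRR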
first = [encoded_file_animals[8]]
second = [encoded_file_animals[9]]
third = [encoded_file_buildings[0]]

first = Variable(torch.LongTensor(first))
second = Variable(torch.LongTensor(second))
third = Variable(torch.LongTensor(third))

first_emb = mean_vectors(emb(first).data.numpy()[0])
second_emb = mean_vectors(emb(second).data.numpy()[0])
third_emb = mean_vectors(emb(third).data.numpy()[0])

output1 = model(first)
output2 = model(second)
output3 = model(third)

v1 = output1.data.numpy()[0][0]
v2 = output2.data.numpy()[0][0]
v3 = output3.data.numpy()[0][0]

print(first_emb)
print(second_emb)
print(third_emb)

print(similarity(v1, v2))
print(similarity(v2, v3))

print(similarity(first_emb, second_emb))
print(similarity(second_emb, third_emb))
def main():
    if os.path.exists(config.use_output_path):
        os.system('rm ' + config.use_output_path)
    with open(config.use_output_path, 'a') as g:
        g.write(str(config) + '\n\n')

    sim = config.sim
    # sta_vec = list(np.zeros([config.num_steps - 1]))
    config.shuffle = False

    # original sentence input
    use_data = dataset_str(config.use_data_path)
    config.batch_size = 1
    step_size = config.step_size

    start_time = time.time()
    proposal_cnt = 0
    accept_cnt = 0
    all_samples = []
    all_acc_samples = []
    all_chosen_samples = []

    for sen_id in range(use_data.length):
        sent_ids = use_data.token_ids[sen_id]
        keys = use_data.keys[sen_id]
        searcher = ConstraintSearch(keys)
        sequence_length = len(sent_ids)

        # generate for each sentence
        sta_vec = np.zeros(sequence_length)
        input_ids = np.array(sent_ids)
        input_original = use_data.tokens[sen_id]
        prev_inds = []
        old_prob = def_sent_scorer(tokenizer.decode(input_ids))
        old_prob_pen = penalty_constraint(searcher.count_unsafisfied_constraint(searcher.sent2tag(input_ids)))
        if sim is not None:
            old_prob *= similarity(input_ids, input_original, sta_vec)

        outputs = []
        output_p = []
        for iter in range(config.sample_time):
            pos_set = np.array(get_sample_positions(sequence_length, prev_inds, step_size))
            prev_inds = pos_set
            proposal_cnt += 1

            search_cands, constr_num = searcher.search_template(input_ids, pos_set)
            group_prob = 1.0
            new_prob_pen = penalty_constraint(constr_num)
            original_temp = searcher.sent2tag(input_ids)
            original_constr_num = searcher.count_unsafisfied_constraint(original_temp)
            input_ids_old = np.array(input_ids)

            if len(search_cands) == 0:
                print('No candidate satisfies constraints. Continue.', pos_set)
            else:
                candidates = []
                candidate_probs = []
                for cand_template, action_set in search_cands:
                    masked_sent, adjusted_pos_set = mask_sentence(input_ids, pos_set, action_set)
                    proposal_prob, input_ids_tmp = eval_template(searcher, input_original, cand_template,
                                                                 masked_sent, adjusted_pos_set, action_set, sim=None)
                    input_text_tmp = tokenizer.decode(input_ids_tmp)
                    new_prob = def_sent_scorer(input_text_tmp)
                    if sim is not None:
                        sim_constr = similarity(input_ids_tmp, input_original, sta_vec)
                        new_prob *= sim_constr
                    candidates.append((input_ids_tmp, proposal_prob, cand_template, action_set, adjusted_pos_set))
                    candidate_probs.append(new_prob)

                candidate_probs_norm = normalize(np.array(candidate_probs))
                cand_idx = sample_from_candidate(np.array(candidate_probs_norm))
                input_ids_tmp, proposal_prob, cand_template, action_set, adjusted_pos_set = candidates[cand_idx]
                new_prob = candidate_probs[cand_idx]
                input_ids_new = np.array(input_ids_tmp)
                new_pos_set = np.array(adjusted_pos_set)
                print(cand_template)
                print(tokenizer.decode(input_ids_new).encode('utf8', errors='ignore'))

                # evaluate reverse proposal
                reverse_action_set = get_reverse_action_set(action_set)
                reverse_search_cands, reverse_min_constr_num = searcher.search_template(input_ids_new, new_pos_set, prune=False)
                reverse_group_prob = penalty_constraint(original_constr_num - reverse_min_constr_num)
                reverse_search_cands_pruned = [(x[0], x[2]) for x in reverse_search_cands if x[1] == original_constr_num]

                # check reverse search cand
                reverse_search_cand_str = [','.join(x[0]) for x in reverse_search_cands]
                original_temp_str = ','.join(original_temp)
                if original_temp_str not in reverse_search_cand_str:
                    print('Warning', original_temp, cand_template, pos_set, action_set, new_pos_set)
                if len(reverse_search_cands_pruned) == 0:
                    print('Warning')
                    reverse_search_cands_pruned = [original_temp]

                # evaluate reverse_candidate_probs_norm
                reverse_cand_idx = -1
                reverse_candidate_probs = []
                for c_idx, (reverse_cand_template, r_action_set) in enumerate(reverse_search_cands_pruned):
                    if ','.join(reverse_cand_template) == original_temp_str:
                        reverse_candidate_probs.append(old_prob)
                        reverse_cand_idx = c_idx
                    else:
                        masked_sent, new_adjusted_pos_set = mask_sentence(input_ids_new, new_pos_set, r_action_set)
                        _, r_input_ids_tmp = eval_template(searcher, input_original, reverse_cand_template,
                                                           masked_sent, new_adjusted_pos_set, r_action_set, sim=None)
                        r_input_text_tmp = tokenizer.decode(r_input_ids_tmp)
                        r_new_prob = def_sent_scorer(r_input_text_tmp)
                        if sim is not None:
                            sim_constr = similarity(input_ids_tmp, input_original, sta_vec)
                            r_new_prob *= sim_constr
                        # candidates.append((input_ids_tmp, proposal_prob))
                        reverse_candidate_probs.append(r_new_prob)
                reverse_candidate_probs_norm = normalize(np.array(reverse_candidate_probs))

                # evaluate proposal_prob_reverse
                r_masked_sent, pos_set_ = mask_sentence(input_ids_new, new_pos_set, reverse_action_set)
                assert (pos_set == pos_set_).all()
                proposal_prob_reverse, input_ids_tmp_0 = eval_reverse_proposal(
                    input_original, r_masked_sent, input_ids_old, pos_set, reverse_action_set, sim=None)
                if (input_ids_tmp_0 != input_ids_old).any():
                    print('Warning, ', input_ids_old, input_ids_new, input_ids_tmp_0)
                assert (input_ids_tmp_0 == input_ids_old).all()

                # decide acceptance
                sequence_length_new = len(input_ids_new)
                input_text_new = tokenizer.decode(input_ids_new)
                if proposal_prob == 0.0 or old_prob == 0.0:
                    alpha_star = 1.0
                else:
                    alpha_star = (comb(sequence_length_new, 3) * proposal_prob_reverse * reverse_group_prob *
                                  reverse_candidate_probs_norm[reverse_cand_idx] * new_prob * new_prob_pen) / \
                                 (comb(sequence_length, 3) * proposal_prob * group_prob *
                                  candidate_probs_norm[cand_idx] * old_prob * old_prob_pen)
                alpha = min(1, alpha_star)

                all_samples.append([input_text_new, new_prob * new_prob_pen, new_prob, constr_num,
                                    bert_scorer.sent_score(input_ids_new, log_prob=True),
                                    gpt2_scorer.sent_score(input_text_new, ppl=True)])
                if tokenizer.decode(input_ids_new) not in output_p:
                    outputs.append(all_samples[-1])
                if outputs != []:
                    output_p.append(outputs[-1][0])
                print(alpha, old_prob, proposal_prob, new_prob, new_prob * new_prob_pen, proposal_prob_reverse)

                if choose_action([alpha, 1 - alpha]) == 0 and (new_prob > old_prob * config.threshold or just_acc() == 0):
                    if tokenizer.decode(input_ids_new) != tokenizer.decode(input_ids):
                        accept_cnt += 1
                        print('Accept')
                        all_acc_samples.append(all_samples[-1])
                    input_ids = input_ids_new
                    sequence_length = sequence_length_new
                    assert sequence_length == len(input_ids)
                    old_prob = new_prob
            print('')

        # choose output from samples
        for num in range(config.min_length, 0, -1):
            outputss = [x for x in outputs if len(x[0].split()) >= num]
            print(num, outputss)
            if outputss != []:
                break
        if outputss == []:
            outputss.append([tokenizer.decode(input_ids), 0])
        outputss = sorted(outputss, key=lambda x: x[1])[::-1]
        with open(config.use_output_path, 'a') as g:
            g.write(outputss[0][0] + '\t' + str(outputss[0][1]) + '\n')
        all_chosen_samples.append(outputss[0])

        print('Sentence %d, used time %.2f\n' % (sen_id, time.time() - start_time))
        print(proposal_cnt, accept_cnt, float(accept_cnt / proposal_cnt))

    print("All samples:")
    all_samples_ = list(zip(*all_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    print("All accepted samples:")
    all_samples_ = list(zip(*all_acc_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    print("All chosen samples:")
    all_samples_ = list(zip(*all_chosen_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    with open(config.use_output_path + '-result.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f, delimiter='\t')
        csv_writer.writerow(['Sentence', 'Prob_sim', 'Constraint_num', 'Log_prob', 'PPL'])
        csv_writer.writerows(all_samples)
import time

if __name__ == '__main__':
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.per_process_gpu_memory_fraction = 0.1
    sess = tf.Session(config=tf_config)

    N, M = 4, 5
    embed = tf.placeholder(dtype=tf.float32, shape=(N * 2 * M, 3))

    # new loss
    embed_1 = embed[:N * M]
    embed_2 = embed[N * M:]
    center_1 = embedd2center(embed_1, N, M)
    center_2 = embedd2center(embed_2, N, M)
    new_loss = loss_cal(similarity(embed_1, 1.0, 0.0, N, M, center_2), name='softmax', N=N, M=M) + \
               loss_cal(similarity(embed_2, 1.0, 0.0, N, M, center_1), name='softmax', N=N, M=M)

    # old loss
    old_loss = loss_cal(similarity(embed, 1.0, 0.0, N, M * 2), N=N, M=M * 2)

    sess.run(tf.global_variables_initializer())
    arr = np.random.rand(N * M * 2, 128)

    times = []
    print('Calculating old loss')
    x = sess.run(old_loss, feed_dict={embed: arr})
    print(x)

    times = []
    print('Calculating new loss')
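# NOTE (editor's sketch): embedd2center is referenced above but not defined in
# this listing. From its use (N speakers, M utterances each, centers passed to
# similarity as the comparison side), it presumably averages the M embeddings of
# each speaker. A NumPy equivalent is sketched below as an assumption; the
# original is presumably a TensorFlow op.
import numpy as np

def embedd2center_sketch(embed, N, M):
    """Average the M utterance embeddings of each of the N speakers -> [N, P]."""
    return np.reshape(embed, (N, M, -1)).mean(axis=1)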
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [rnn_cell.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = rnn_cell.MultiRNNCell(lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :], shape=[config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model.split('-')[-1]):  # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break
        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        print("test file path : ", config.test_path)

        # return similarity matrix after enrollment and verification
        time1 = time.time()  # for check inference time
        if config.tdsv:
            S = sess.run(similarity_matrix, feed_dict={enroll: random_batch(shuffle=False, noise_filenum=1),
                                                       verif: random_batch(shuffle=False, noise_filenum=2)})
        else:
            S = sess.run(similarity_matrix, feed_dict={enroll: random_batch(shuffle=False),
                                                       verif: random_batch(shuffle=False, utter_start=0)})
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()

        np.set_printoptions(precision=2)
        print("inference time for %d utterences : %0.2fs" % (2 * config.M * config.N, time2 - time1))
        print(S)  # print similarity matrix

        # multi-speaker enrollment with a single utterance per speaker:
        # compare the last verification speaker against each enrolled speaker
        arr = []
        for i in range(config.N - 1):
            sim = S[-1, :, i]
            r = np.max(abs(sim))
            arr.append(r)
        # threshold = S[-1]
        # arr = np.delete(threshold[0], -1)
        print(arr)
        # max_th = max(arr)

        if (arr[0] > 0.69) | (arr[1] > 0.75) | (arr[2] > 0.69) | (arr[3] > 0.73) | (arr[4] > 0.75) \
                | (arr[5] > 0.75) | (arr[6] > 0.73):
            r = 1
        else:
            r = 0
        print(r)

        # threshold = S[-1]
        # arr = np.delete(threshold[0], -1)
        # print(arr)
        # max_th = max(abs(arr))
        # if max_th >= 0.80:
        #     r = 1
        # else:
        #     r = 0
        # print(r)

        return r
def validate(valid_set, model, pool_size, K, sim_measure):
    """
    simple validation in a code pool.
    @param: poolsize - size of the code pool, if -1, load the whole test set
    """
    def ACC(real, predict):
        sum = 0.0
        for val in real:
            try:
                index = predict.index(val)
            except ValueError:
                index = -1
            if index != -1:
                sum = sum + 1
        return sum / float(len(real))

    def MAP(real, predict):
        sum = 0.0
        for id, val in enumerate(real):
            try:
                index = predict.index(val)
            except ValueError:
                index = -1
            if index != -1:
                sum = sum + (id + 1) / float(index + 1)
        return sum / float(len(real))

    def MRR(real, predict):
        sum = 0.0
        for val in real:
            try:
                index = predict.index(val)
            except ValueError:
                index = -1
            if index != -1:
                sum = sum + 1.0 / float(index + 1)
        return sum / float(len(real))

    def NDCG(real, predict):
        dcg = 0.0
        idcg = IDCG(len(real))
        for i, predictItem in enumerate(predict):
            if predictItem in real:
                itemRelevance = 1
                rank = i + 1
                dcg += (math.pow(2, itemRelevance) - 1.0) * (math.log(2) / math.log(rank + 1))
        return dcg / float(idcg)

    def IDCG(n):
        idcg = 0
        itemRelevance = 1
        for i in range(n):
            idcg += (math.pow(2, itemRelevance) - 1.0) * (math.log(2) / math.log(i + 2))
        return idcg

    model.eval()
    device = next(model.parameters()).device

    data_loader = torch.utils.data.DataLoader(dataset=valid_set, batch_size=pool_size,
                                              shuffle=True, drop_last=True, num_workers=1)  # batch_size=10000

    accs, mrrs, maps, ndcgs = [], [], [], []
    code_reprs, desc_reprs = [], []
    n_processed = 0
    for batch in tqdm(data_loader):
        if len(batch) == 6:  # toks, tok_len, descs, desc_len, bad_descs, bad_desc_len
            code_batch = [tensor.to(device) for tensor in batch[:2]]
            desc_batch = [tensor.to(device) for tensor in batch[2:4]]
        elif len(batch) == 4:
            code_batch = [tensor.to(device) for tensor in batch[:2]]
            desc_batch = [tensor.to(device) for tensor in batch[2:4]]
        # else:  # code_ids, type_ids, code_mask, good_ids, good_mask, bad_ids, bad_mask
        #     code_batch = [tensor.to(device) for tensor in batch[:3]]
        #     desc_batch = [tensor.to(device) for tensor in batch[3:5]]
        #     assert(False, 'something wrong in the valid dataloader.')

        with torch.no_grad():
            code_repr = model.code_encoding(*code_batch).data.cpu().numpy().astype(np.float32)
            desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32)  # [poolsize x hid_size]
            if sim_measure == 'cos':
                code_repr = normalize(code_repr)
                desc_repr = normalize(desc_repr)
        code_reprs.append(code_repr)
        desc_reprs.append(desc_repr)
        n_processed += batch[0].size(0)  # batch_size
    code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)

    for k in tqdm(range(0, n_processed, pool_size)):
        code_pool, desc_pool = code_reprs[k:k + pool_size], desc_reprs[k:k + pool_size]
        for i in range(min(10000, pool_size)):  # for i in range(pool_size):
            desc_vec = np.expand_dims(desc_pool[i], axis=0)  # [1 x dim]
            n_results = K
            if sim_measure == 'cos':
                sims = np.dot(code_pool, desc_vec.T)[:, 0]  # [pool_size]
            else:
                sims = similarity(code_pool, desc_vec, sim_measure)  # [pool_size]

            negsims = np.negative(sims)
            predict = np.argpartition(negsims, kth=n_results - 1)  # predict = np.argsort(negsims)
            predict = predict[:n_results]
            predict = [int(k) for k in predict]
            real = [i]
            accs.append(ACC(real, predict))
            mrrs.append(MRR(real, predict))
            maps.append(MAP(real, predict))
            ndcgs.append(NDCG(real, predict))

    return np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
def train(path, args):
    tf.reset_default_graph()  # reset graph
    timestamp = time_string() if args.time_string is None else args.time_string

    # draw graph
    feeder = Feeder(args.train_filename, args, hparams)
    output_classes = max([int(f) for f in feeder.total_emt]) + 1 if args.model_type in ['emt', 'accent'] \
        else max([int(f) for f in feeder.total_spk]) + 1

    batch = tf.placeholder(shape=[args.N * args.M, None, config.n_mels], dtype=tf.float32)  # input batch (time x batch x n_mel)
    labels = tf.placeholder(shape=[args.N * args.M], dtype=tf.int32)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedded = triple_lstm(batch)
    print("Training {} Discriminator Model".format(args.model_type))
    encoder = ReferenceEncoder(filters=hparams.reference_filters, kernel_size=(3, 3), strides=(2, 2),
                               is_training=True,
                               scope='Tacotron_model/inference/pretrained_ref_enc_{}'.format(args.model_type),
                               depth=hparams.reference_depth)  # [N, 128]
    embedded = encoder(batch)
    embedded = normalize(embedded)

    if args.discriminator:
        logit = tf.layers.dense(embedded, output_classes,
                                name='Tacotron_model/inference/pretrained_ref_enc_{}_dense'.format(args.model_type))
        labels_one_hot = tf.one_hot(tf.to_int32(labels), output_classes)
        # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_one_hot))
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit, labels=labels_one_hot))
        acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(labels_one_hot, 1), predictions=tf.argmax(logit, 1))
        val_acc, val_acc_op = tf.metrics.accuracy(labels=tf.argmax(labels_one_hot, 1), predictions=tf.argmax(logit, 1))
    else:
        # loss
        sim_matrix = similarity(embedded, w, b, args.N, args.M, P=hparams.reference_depth)
        print("similarity matrix size: ", sim_matrix.shape)
        loss = loss_cal(sim_matrix, args.N, args.M, type=config.loss)
        val_acc_op = tf.constant(1.)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
    if args.discriminator:
        grads_rescale = grads
    else:
        grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
        grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b

    train_op = optimizer.apply_gradients(zip(grads_rescale, vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=20)

    loss_window = ValueWindow(100)
    acc_window = ValueWindow(100)
    val_loss_window = ValueWindow(5)
    val_acc_window = ValueWindow(5)

    # training session
    with tf.Session() as sess:
        tf.local_variables_initializer().run()
        tf.global_variables_initializer().run()

        checkpoint_folder = os.path.join(path, "checkpoints", timestamp)
        logs_folder = os.path.join(path, "logs", timestamp)
        os.makedirs(checkpoint_folder, exist_ok=True)  # make folder to save model
        os.makedirs(logs_folder, exist_ok=True)  # make folder to save log
        model_name = '{}_disc_model.ckpt'.format(args.model_type)
        checkpoint_path = os.path.join(checkpoint_folder, model_name)

        if args.restore:
            checkpoint_state = tf.train.get_checkpoint_state(checkpoint_folder)
            if checkpoint_state and checkpoint_state.model_checkpoint_path:
                print('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                print('No model to load at {}'.format(checkpoint_folder))
                saver.save(sess, checkpoint_path, global_step=global_step)
        else:
            print('Starting new training!')
            saver.save(sess, checkpoint_path, global_step=global_step)

        writer = tf.summary.FileWriter(logs_folder, sess.graph)
        lr_factor = 1  # lr decay factor (1/2 per 10000 iteration)

        iterations = 30000 if args.model_type == 'emt' else config.iteration
        for iter in range(iterations):
            if args.discriminator:
                batch_iter, _, labels_iter = feeder.random_batch_disc()
            else:
                batch_iter, _, labels_iter = feeder.random_batch()

            # run forward and backward propagation and update parameters
            step, _, loss_cur, summary, acc_cur = sess.run([global_step, train_op, loss, merged, acc_op],
                                                           feed_dict={batch: batch_iter, labels: labels_iter,
                                                                      lr: config.lr * lr_factor})
            loss_window.append(loss_cur)
            acc_window.append(acc_cur)

            if step % 10 == 0:
                writer.add_summary(summary, step)  # write at tensorboard

            if (step + 1) % 20 == 0:
                val_loss_cur_batch = 0
                val_acc_cur_batch = 0
                for iter in range(VAL_ITERS):
                    if args.discriminator:
                        batch_iter, _, labels_iter = feeder.random_batch_disc(TEST=True)
                    else:
                        batch_iter, _, labels_iter = feeder.random_batch(TEST=True)
                    # run forward propagation on the validation batch
                    val_loss_cur, val_acc_cur = sess.run([loss, val_acc_op],
                                                         feed_dict={batch: batch_iter, labels: labels_iter})
                    val_loss_cur_batch += val_loss_cur
                    val_acc_cur_batch += val_acc_cur
                val_loss_cur_batch /= VAL_ITERS
                val_acc_cur_batch /= VAL_ITERS
                val_loss_window.append(val_loss_cur_batch)
                val_acc_window.append(val_acc_cur_batch)

                message = "(iter : %d) loss: %.4f" % ((step + 1), loss_window.average)
                if args.discriminator:
                    message += ', acc: {:.2f}%'.format(acc_window.average)
                message += ", val_loss: %.4f" % (val_loss_window.average)
                if args.discriminator:
                    message += ', val_acc: {:.2f}%'.format(val_acc_window.average)
                print(message)

            lr_changed = False
            if args.model_type == 'emt':
                if step > 6000:
                    lr_changed = True if lr_factor != .01 else False
                    lr_factor = .01
                elif step > 4000:
                    lr_changed = True if lr_factor != .1 else False
                    lr_factor = .1
                if lr_changed:
                    print("learning rate is decayed! current lr : ", config.lr * lr_factor)
            elif args.model_type == 'spk':
                if step > 300:  # 4000
                    lr_changed = True if lr_factor != .01 else False
                    lr_factor = .01
                elif step > 180:  # 2500
                    lr_changed = True if lr_factor != .1 else False
                    lr_factor = .1
                if lr_changed:
                    print("learning rate is decayed! current lr : ", config.lr * lr_factor)

            if step % config.save_checkpoint_iters == 0:
                saver.save(sess, checkpoint_path, global_step=global_step)
def repr_code(args, ast2id, code2id, nl2id, id2nl):
    with torch.no_grad():
        device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
        config = getattr(configs, 'config_' + args.model)()

        ##### Define model ######
        logger.info('Constructing Model..')
        logger.info(os.getcwd())
        model = getattr(models, args.model)(config, ast2id)  # initialize the model
        if args.reload_from > 0:
            ckpt_path = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
            model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model = model.to(device)
        model.eval()

        pool_size = 100
        sim_measure = 'cos'
        # data_path = args.data_path + args.dataset + '/'
        '''
        use_set = eval(config['dataset_name'])(data_path, config['use_names'], config['name_len'],
                                               config['use_apis'], config['api_len'],
                                               config['use_tokens'], config['tokens_len'])

        data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=args.batch_size,
                                                  shuffle=False, drop_last=False, num_workers=1)
        '''
        valid_data_set = TreeDataSet(file_name=args.data_dir + '/train.json',
                                     ast_path=args.data_dir + '/tree/train/',
                                     ast2id=ast2id,
                                     nl2id=nl2id,
                                     max_ast_size=args.code_max_len,
                                     max_simple_name_size=args.max_simple_name_len,
                                     k=args.k,
                                     max_comment_size=args.comment_max_len,
                                     use_code=True,
                                     desc=config['valid_desc'],
                                     desclen=config['desc_len'])
        data_loader = DataLoaderX(dataset=valid_data_set, batch_size=args.batch_size, shuffle=False, num_workers=2)

        accs, mrrs, maps, ndcgs = [], [], [], []
        code_reprs, desc_reprs = [], []
        n_processed = 0
        for batch in tqdm(data_loader):
            if len(batch) == 8:  # seq_tensor, rel_par, rel_bro, rel_semantic, descs, desc_len, bad_descs, bad_desc_len
                code_batch = [tensor.to(device).long() for tensor in batch[:4]]
                desc_batch = [tensor.to(device).long() for tensor in batch[4:6]]
            with torch.no_grad():
                code_repr = addCodeMaskToCalcuCodeRepr(model, *code_batch).data.cpu().numpy().astype(np.float32)
                desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(np.float32)  # [poolsize x hid_size]
                if sim_measure == 'cos':
                    code_repr = normalize(code_repr)
                    desc_repr = normalize(desc_repr)
            code_reprs.append(code_repr)
            desc_reprs.append(desc_repr)
            n_processed += batch[0].size(0)
        code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)

        n_processed -= (n_processed % 100)
        for k in tqdm(range(0, n_processed - pool_size, pool_size)):
            code_pool, desc_pool = code_reprs[k:k + pool_size], desc_reprs[k:k + pool_size]
            sum = 0.0
            for i in range(min(10000, pool_size)):  # for i in range(pool_size):
                desc_vec = np.expand_dims(desc_pool[i], axis=0)  # [1 x dim]
                if sim_measure == 'cos':
                    sims = np.dot(code_pool, desc_vec.T)[:, 0]  # [pool_size]
                else:
                    sims = similarity(code_pool, desc_vec, sim_measure)  # [pool_size]
                if sims[i] > 0.4:
                    sum += 1
                # negsims = np.negative(sims.T)
                # predict = np.argpartition(negsims, kth=n_results-1)  # predict = np.argsort(negsims)
                # predict = predict[:n_results]
                # predict = [int(k) for k in predict]
                # real = [i]
                # for val in real:
                #     try:
                #         index = predict.index(val)
                #     except ValueError:
                #         index = -1
                #     if index != -1:
                #         sum = sum + 1
            accs.append(sum / float(pool_size))
            # accs.append(ACC(real, predict))
            # mrrs.append(MRR(real, predict))
            # maps.append(MAP(real, predict))
            # ndcgs.append(NDCG(real, predict))

        logger.info({'acc': np.mean(accs), 'err': 1 - np.mean(accs)})
        return {'acc': np.mean(accs), 'err': 1 - np.mean(accs)}
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(zip(grads_rescale, vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()
    iter = 0

    # training session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if config.restore:  # Restore saved model if the user requested it, default = True
            try:
                ckpt = tf.train.latest_checkpoint(checkpoint_dir=os.path.join(path, "Check_Point"))
                # if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                #     print('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                # saver = tf.train.import_meta_graph(os.path.join(path, "Check_Point/model.cpkt.meta"))
                # ckpt = tf.train.load_checkpoint(os.path.join(path, "Check_Point/model"))
                saver.restore(sess, ckpt)
                # else:
                #     print('No model to load at {}'.format(save_dir))
                #     saver.save(sess, checkpoint_path, global_step=global_step)
            except:
                print('Cannot restore checkpoint exception')
            # if loaded == 0:
            #     raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")
            # print("train file path : ", config.test_path)
        else:
            os.makedirs(os.path.join(path, "Check_Point"), exist_ok=True)  # make folder to save model
            os.makedirs(os.path.join(path, "logs"), exist_ok=True)  # make folder to save log

        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor (1/2 per 10000 iteration)
        loss_acc = 0   # accumulated loss (for running average of loss)
        iter = 0

        training_data_size = len(os.listdir(config.train_path))
        print("train_size: ", training_data_size)
        prev_iter = -1

        # while iter < config.iteration:
        while iter < config.iteration:
            prev_iter = iter
            # run forward and backward propagation and update parameters
            iter, _, loss_cur, summary = sess.run([global_step, train_op, loss, merged],
                                                  feed_dict={batch: random_batch(), lr: config.lr * lr_factor})

            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if iter - prev_iter > 1:
                epoch = config.N * (iter + 1) // training_data_size
                # lr_factor = lr_factor / (2**(epoch//100))
                lr_factor = lr_factor / (2**(iter // 10000))
                print("restored epoch:", epoch)
                print("restored learning rate:", lr_factor * config.lr)

            # if iter % 1000 == 0:
            #     writer.add_summary(summary, iter)  # write at tensorboard

            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss
            # if config.N * (iter+1) % training_data_size == 0:
            #     epoch = epoch + 1
            #     print("epoch: ", epoch)

            if (iter + 1) % 10000 == 0:
                lr_factor /= 2
                print("learning rate is decayed! current lr : ", config.lr * lr_factor)
            # if ((config.N * (iter+1)) / training_data_size) % 100 == 0:
            #     lr_factor = lr_factor / 2
            #     print("learning factor: ", lr_factor)
            #     print("learning rate is decayed! current lr : ", config.lr*lr_factor)

            if (iter + 1) % 5000 == 0:
                saver.save(sess, os.path.join(path, "Check_Point/model.ckpt"), global_step=iter)
                writer.add_summary(summary, iter)  # write at tensorboard
                print("model is saved!")
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(zip(grads_rescale, vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    # with tf.Session() as sess:
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        tf.global_variables_initializer().run()
        os.makedirs(os.path.join(path, "Check_Point"), exist_ok=True)  # make folder to save model
        os.makedirs(os.path.join(path, "logs"), exist_ok=True)  # make folder to save log
        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor (1/2 per 10000 iteration)
        loss_acc = 0   # accumulated loss (for running average of loss)

        train_times = []  # timing of each training iteration (added 2020/05/20 16:30)
        total_times = 0   # total training time (added 2020/05/20 16:30)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            begin_time = time.clock()  # record the start time of this training iteration (2020/05/20 16:30)
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                            feed_dict={batch: random_batch(), lr: config.lr * lr_factor})
            end_time = time.clock()  # record the end time of this training iteration (2020/05/20 16:30)
            total_times += end_time - begin_time
            train_times.append(str(begin_time) + '_' + str(end_time) + '_' + str(end_time - begin_time))

            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)  # write at tensorboard
            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss
                print("iter:{},耗时:{}s".format(iter, str(end_time - begin_time)))
            if (iter + 1) % 10000 == 0:
                lr_factor /= 2  # lr decay
                print("learning rate is decayed! current lr : ", config.lr * lr_factor)
            if (iter + 1) % 10000 == 0:
                saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"), global_step=iter // 10000)
                print("model is saved!")

        # save the final model (added 2020/05/20 16:30)
        saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"), global_step=iter)
        print("model is saved!")

        # write the per-iteration timing statistics to a file
        with open('GE2E_epoch说话人{}_batch说话人{}_人均音频数{}_iter{}_迭代耗时.txt'.format(
                config.spk_num, config.N, config.M, config.iteration), mode='w', encoding='utf-8') as wf:
            wf.write("epoch说话人{}个;batch说话人:{}个;人均音频数:{}条;迭代总次数:{};平均每次训练迭代耗时:{}\n".format(
                config.spk_num, config.N, config.M, config.iteration, total_times / config.iteration))
            wf.write("开始训练时间_结束训练时间_耗时\n")
            for line in train_times:
                wf.write(line + '\n')
def test(self):
    enroll = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32,
                            name="enroll")  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32,
                           name="verif")  # verification batch (time x batch x n_mel)
    self.fingerprint_input = tf.concat([enroll, verif], axis=1, name="fingerprint_input")

    embedded = self.creat_model()

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :], shape=[config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", config.model_path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(config.model_path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model[-1]):  # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break
        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        print("test file path : ", "data/test")

        # return similarity matrix after enrollment and verification
        time1 = time.time()  # for check inference time
        S = sess.run(similarity_matrix, feed_dict={enroll: random_batch(shuffle=False),
                                                   verif: random_batch(shuffle=False, utter_start=config.M)})
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()

        np.set_printoptions(precision=2)
        print("inference time for %d utterences : %0.2fs" % (2 * config.M * config.N, time2 - time1))
        print(S)  # print similarity matrix

        # calculating EER
        diff = 1
        EER = 0
        EER_thres = 0
        EER_FAR = 0
        EER_FRR = 0

        # through thresholds calculate false acceptance ratio (FAR) and false reject ratio (FRR)
        for thres in [0.01 * i + 0.5 for i in range(50)]:
            S_thres = S > thres

            # False acceptance ratio = false acceptance / mismatched population (enroll speaker != verification speaker)
            FAR = sum([np.sum(S_thres[i]) - np.sum(S_thres[i, :, i]) for i in range(config.N)]) / (config.N - 1) / config.M / config.N

            # False reject ratio = false reject / matched population (enroll speaker = verification speaker)
            FRR = sum([config.M - np.sum(S_thres[i][:, i]) for i in range(config.N)]) / config.M / config.N

            # Save threshold when FAR = FRR (=EER)
            if diff > abs(FAR - FRR):
                diff = abs(FAR - FRR)
                EER = (FAR + FRR) / 2
                EER_thres = thres
                EER_FAR = FAR
                EER_FRR = FRR

        print("\nEER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)" % (EER, EER_thres, EER_FAR, EER_FRR))
def get_embeddings(path, args):
    tf.reset_default_graph()  # reset graph

    if args.time_string is None:
        raise ValueError('must provide valid time_string')

    emb_dir = os.path.join(path, 'embeddings')
    os.makedirs(emb_dir, exist_ok=True)
    meta_path = os.path.join(emb_dir, 'meta.tsv')
    emb_path = (os.path.join(emb_dir, 'emb_emt.tsv') if args.model_type == 'emt'
                else os.path.join(emb_dir, 'emb_spk.tsv'))

    # draw graph
    feeder = Feeder(args.train_filename, args, hparams)

    datasets = ['emt4', 'vctk'] if args.model_type == 'emt' else ['vctk']
    num_datasets = len(datasets)

    batch = tf.placeholder(shape=[num_datasets * args.N * args.M, None, config.n_mels],
                           dtype=tf.float32)  # input batch (batch x time x n_mel)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedded = triple_lstm(batch)
    print("{} Discriminator Model".format(args.model_type))
    encoder = ReferenceEncoder(
        filters=hparams.reference_filters,
        kernel_size=(3, 3),
        strides=(2, 2),
        is_training=True,
        scope='Tacotron_model/inference/pretrained_ref_enc_{}'.format(args.model_type),
        depth=hparams.reference_depth)  # [N, 128]
    embedded = encoder(batch)

    # loss
    sim_matrix = similarity(embedded, w, b, num_datasets * args.N, args.M,
                            P=hparams.reference_depth)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, num_datasets * args.N, args.M, type=config.loss)

    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        checkpoint_folder = os.path.join(path, "checkpoints", args.time_string)
        checkpoint_state = tf.train.get_checkpoint_state(checkpoint_folder)
        if checkpoint_state and checkpoint_state.model_checkpoint_path:
            print('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
            saver.restore(sess, checkpoint_state.model_checkpoint_path)
        else:
            raise ValueError('No model to load at {}'.format(checkpoint_folder))

        feeder_batch, meta = feeder.emb_batch(make_meta=True, datasets=datasets)
        emb, loss = sess.run([embedded, loss], feed_dict={batch: feeder_batch})
        print("loss: {:.4f}".format(loss))

        meta.to_csv(meta_path, sep='\t', index=False)
        pd.DataFrame(emb).to_csv(emb_path, sep='\t', index=False, header=False)
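# Usage sketch: the TSV pair written by get_embeddings() can be read back with pandas,
# e.g. for inspection or the TensorBoard embedding projector. The paths and the absence
# of a header row in the embedding file follow the snippet above; the columns of meta.tsv
# are not known here, so this only checks shapes.
import pandas as pd

emb = pd.read_csv('embeddings/emb_spk.tsv', sep='\t', header=None)
meta = pd.read_csv('embeddings/meta.tsv', sep='\t')
print(emb.shape, meta.shape)  # one embedding row per metadata row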
def get_similar_country_names(self, country_name):
    rate, most_similars = utils.most_similar(
        utils.similarity(self.normalized_country_items.keys(), country_name, 0.7))
    return most_similars
def test(path):
    start_test_time = time.time()
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N * config.M, 40],
                            dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N * config.M, 40],
                           dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj)
                      for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32,
                                       time_major=True)  # for TI-SV, dynamic rnn must be used
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(
        tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :],
                                  shape=[config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model[-1]):  # find ckpt file which matches configured model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break

        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        # return similarity matrix after enrollment and verification
        enroll_batchs = {}
        for folder in os.listdir(config.enroll_path):
            enroll_dir = os.path.join(config.enroll_path, folder)
            if not os.path.isdir(enroll_dir):
                continue
            enroll_batchs[folder] = random_batch(path=enroll_dir, shuffle=False)

        after_enroll_time = time.time()
        num_verification = 0
        if config.tdsv:
            for folder in os.listdir(config.verification_path):
                verification_dir = os.path.join(config.verification_path, folder)
                if not os.path.isdir(verification_dir):
                    continue
                print('Verification Result of ' + folder)
                num_verification += 1
                verification_batch = random_batch(path=verification_dir, shuffle=False)
                for key in enroll_batchs.keys():
                    enroll_batch = enroll_batchs[key]
                    S = sess.run(similarity_matrix,
                                 feed_dict={enroll: enroll_batch, verif: verification_batch})
                    S = S.reshape([config.N, config.M, -1])
                    np.set_printoptions(precision=2)
                    print('Score between ' + folder + '-' + key)
                    print(np.mean(S))  # print mean similarity score

        duration = time.time() - start_test_time
        avg_verification_duration = (time.time() - after_enroll_time) / num_verification
        print('Test duration:' + str(duration) + 's')
        print('Verification duration:' + str(avg_verification_duration) + 's')
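# Hedged sketch of turning the mean similarity printed above into an accept/reject
# decision. The 0.6 threshold is an assumption for illustration, not a value from this
# code; in practice it would come from an EER sweep like the one in the other test().
import numpy as np

def verify(score_matrix, threshold=0.6):
    """score_matrix: (N, M, N) similarity scores for one enroll/verification pair."""
    return float(np.mean(score_matrix)) >= threshold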
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N * config.M, 40],
                            dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N * config.M, 40],
                           dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj)
                      for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32,
                                       time_major=True)  # for TI-SV, dynamic rnn must be used
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(
        tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :],
                                  shape=[config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)
    loss = loss_cal(similarity_matrix, type=config.loss)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        # ckpt = tf.train.get_checkpoint_state(path)
        # checkpoints = ckpt.all_model_checkpoint_paths
        i = 139999
        least_loss = 99999
        # print("checkpoints : ", checkpoints)
        while i < 399999:
            saver.restore(sess, os.path.join(path, "model.ckpt-" + str(i)))
            S, L = sess.run([similarity_matrix, loss],
                            feed_dict={enroll: random_batch(shuffle=False),
                                       verif: random_batch(shuffle=False, utter_start=config.M)})
            S = S.reshape([config.N, config.M, -1])
            print("test file path : ", config.test_path)

            np.set_printoptions(precision=2)
            # print(S)
            if L < least_loss:
                # diff = abs(FAR-FRR)
                perfect_step = i
                least_loss = L
            print(i)
            print(str(L / (config.N * config.M)))
            i = i + 2500

        print("\ncheckpoint: " + str(perfect_step) + " (loss:%0.2f)" % (least_loss))
def run_one_step(self, generation_step):
    population_size = utils.profile_value(self.conf['population_profile'], generation_step,
                                          self.conf['num_generations'])
    n_children = max(1, int(self.conf['birth_rate'] * population_size))
    intra_depot = generation_step % self.conf['extra_depot_every'] != 0

    L = []
    for _ in range(n_children):
        pool = self.selection()
        p1 = np.random.choice(pool)
        # mate p1 with the individual in the population that is least similar to it
        similarities = [utils.similarity(p1, each) for each in self.population]
        p2 = self.population[np.argmin(similarities)]
        offspring = self.create_offspring(p1, p2)
        L.append(offspring)

    L = np.concatenate((L, self.population))
    for each in L:
        if each.total_violation() > 0 and np.random.choice((0, 1), p=(0.8, 0.2)):
            each.repair()
        if np.random.choice((0, 1), p=(.9, .1)):
            self.mutation(each, True)

    scores = np.argsort([each.fitness_score(self.penalty_multiplier) for each in L])
    if scores[0] != self.best_score:
        self.best_score = scores[0]
        self.best_count += 1

    acis = [each.average_capacity_infeasibility() for each in self.population]
    self.mean_violations.append(np.mean(acis))
    if len(self.mean_violations) > 20:
        self.mean_violations = self.mean_violations[-20:]

    ''' Penalty parameter adjustment '''
    if generation_step % 4 == 0:
        prop_feasible = np.mean(np.array(self.mean_violations) == 0)
        if prop_feasible - 0.05 > self.conf['fraction_feasible_population']:
            # allow more violations -- except if the multiplier is low
            if not self.penalty_multiplier <= 1:
                self.penalty_multiplier *= 0.85
        elif prop_feasible + 0.05 < self.conf['fraction_feasible_population']:
            # allow fewer violations
            self.penalty_multiplier *= 1.2

    self.best = scores[0]
    if self.best_count >= 100:
        print('hey now I change')
        old_population = L[scores[:population_size // 4]]
        new_population = self.generate_initial_population(population_size - len(old_population))
        self.population = np.concatenate((old_population, new_population))
        self.best_count = 0
    else:
        # ipdb.set_trace()
        self.population = L[scores[:population_size]]
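# utils.similarity above is a project helper not shown here. A plausible stand-in for
# route-based genomes (purely an assumption for illustration) is the Jaccard overlap of
# undirected edges between two tours, which run_one_step() would minimise to pick a
# dissimilar mate.
def edge_similarity(route_a, route_b):
    edges = lambda r: {frozenset(pair) for pair in zip(r, r[1:])}
    ea, eb = edges(route_a), edges(route_b)
    return len(ea & eb) / max(1, len(ea | eb))

# e.g. edge_similarity([1, 2, 3, 4], [1, 3, 2, 4]) -> 0.2 (one shared edge out of five)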
def main():
    if os.path.exists(config.use_output_path):
        os.system('rm ' + config.use_output_path)
    with open(config.use_output_path, 'a') as g:
        g.write(str(config) + '\n\n')
    # for item in config.record_time:
    #     if os.path.exists(config.use_output_path + str(item)):
    #         os.system('rm ' + config.use_output_path + str(item))

    # CGMH sampling for paraphrase
    sim = config.sim
    # sta_vec=list(np.zeros([config.num_steps-1]))
    config.shuffle = False

    # original sentence input
    use_data = dataset_str(config.use_data_path)
    config.batch_size = 1
    step_size = config.step_size

    start_time = time.time()
    proposal_cnt = 0
    accept_cnt = 0
    all_samples = []
    all_acc_samples = []
    all_chosen_samples = []
    for sen_id in range(use_data.length):
        sent_ids = use_data.token_ids[sen_id]
        keys = use_data.keys[sen_id]
        searcher = ConstraintSearch(keys)
        sequence_length = len(sent_ids)

        # generate for each sentence
        sta_vec = np.zeros(sequence_length)
        input_ids = np.array(sent_ids)
        input_original = use_data.tokens[sen_id]
        prev_inds = []
        old_prob = def_sent_scorer(tokenizer.decode(input_ids))
        old_prob *= penalty_constraint(
            searcher.count_unsafisfied_constraint(searcher.sent2tag(input_ids)))
        if sim is not None:
            old_prob *= similarity(input_ids, input_original, sta_vec)
        outputs = []
        output_p = []

        for iter in range(config.sample_time):
            # if iter in config.record_time:
            #     with open(config.use_output_path, 'a', encoding='utf-8') as g:
            #         g.write(bert_scorer.tokenizer.decode(input_ids)+'\n')
            # print(bert_scorer.tokenizer.decode(input_ids).encode('utf8', errors='ignore'))
            pos_set = get_sample_positions(sequence_length, prev_inds, step_size)
            action_set = [choose_action(config.action_prob) for i in range(len(pos_set))]
            # if not check_constraint(input_ids):
            #     if 0 not in pos_set:
            #         pos_set[-1] = 0
            keep_non = config.keep_non
            masked_sent, adjusted_pos_set = mask_sentence(input_ids, pos_set, action_set)
            prev_inds = pos_set

            proposal_prob = 1.0          # Q(x'|x)
            proposal_prob_reverse = 1.0  # Q(x|x')
            input_ids_tmp = np.array(masked_sent)  # copy
            sequence_length_tmp = sequence_length
            for step_i in range(len(pos_set)):
                ind = adjusted_pos_set[step_i]
                ind_old = pos_set[step_i]
                action = action_set[step_i]
                if config.restrict_constr:
                    if step_i == len(pos_set) - 1:
                        use_constr = True
                    else:
                        use_constr = False
                else:
                    use_constr = True

                # word replacement (action: 0)
                if action == 0:
                    prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
                    input_candidate, prob_candidate, reverse_candidate_idx, _ = \
                        generate_candidate_input_with_mask(input_ids_tmp, sequence_length_tmp, ind,
                                                           prob_mask, config.search_size,
                                                           old_tok=input_ids[ind_old], mode=action)
                    if sim is not None and use_constr:
                        similarity_candidate = similarity_batch(input_candidate, input_original, sta_vec)
                        prob_candidate = prob_candidate * similarity_candidate
                    prob_candidate_norm = normalize(prob_candidate)
                    prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
                    input_ids_tmp = input_candidate[prob_candidate_ind]  # changed
                    proposal_prob *= prob_candidate_norm[prob_candidate_ind]             # Q(x'|x)
                    proposal_prob_reverse *= prob_candidate_norm[reverse_candidate_idx]  # Q(x|x')
                    sequence_length_tmp += 0
                    print('action:0', prob_candidate_norm[prob_candidate_ind],
                          prob_candidate_norm[reverse_candidate_idx])

                # word insertion (action: 1)
                if action == 1:
                    prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
                    input_candidate, prob_candidate, reverse_candidate_idx, non_idx = \
                        generate_candidate_input_with_mask(input_ids_tmp, sequence_length_tmp, ind,
                                                           prob_mask, config.search_size, mode=action,
                                                           old_tok=input_ids[ind_old],
                                                           keep_non=keep_non)
                    if sim is not None and use_constr:
                        similarity_candidate = similarity_batch(input_candidate, input_original, sta_vec)
                        prob_candidate = prob_candidate * similarity_candidate
                    prob_candidate_norm = normalize(prob_candidate)
                    prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
                    input_ids_tmp = input_candidate[prob_candidate_ind]
                    if prob_candidate_ind == non_idx:
                        if input_ids_tmp[-1] == PAD_IDX:
                            input_ids_tmp = input_ids_tmp[:-1]
                        print('action:1 insert non', 1.0, 1.0)
                    else:
                        proposal_prob *= prob_candidate_norm[prob_candidate_ind]  # Q(x'|x)
                        proposal_prob_reverse *= 1.0  # Q(x|x'), reverse action is deleting
                        sequence_length_tmp += 1
                        print('action:1', prob_candidate_norm[prob_candidate_ind], 1.0)

                # word deletion (action: 2)
                if action == 2:
                    input_ids_for_del = np.concatenate(
                        [input_ids_tmp[:ind], [MASK_IDX], input_ids_tmp[ind:]])
                    if keep_non:
                        non_cand = np.array(input_ids_for_del)
                        non_cand[ind] = input_ids[ind_old]
                        input_candidate = np.array([input_ids_tmp, non_cand])
                        prob_candidate = np.array(
                            [bert_scorer.sent_score(x) for x in input_candidate])
                        non_idx = 1
                        if sim is not None and use_constr:
                            similarity_candidate = similarity_batch(input_candidate, input_original, sta_vec)
                            prob_candidate = prob_candidate * similarity_candidate
                        prob_candidate_norm = normalize(prob_candidate)
                        prob_candidate_ind = sample_from_candidate(prob_candidate_norm)
                        input_ids_tmp = input_candidate[prob_candidate_ind]
                    else:
                        non_idx = -1
                        prob_candidate_ind = 0
                        input_ids_tmp = input_ids_tmp  # already deleted

                    if prob_candidate_ind == non_idx:
                        print('action:2 delete non', 1.0, 1.0)
                    else:
                        # add mask, for evaluating reverse probability
                        prob_mask = bert_scorer.mask_score(input_ids_for_del, ind, mode=0)
                        input_candidate, prob_candidate, reverse_candidate_idx, _ = \
                            generate_candidate_input_with_mask(input_ids_for_del, sequence_length_tmp, ind,
                                                               prob_mask, config.search_size, mode=0,
                                                               old_tok=input_ids[ind_old])
                        if sim is not None:
                            similarity_candidate = similarity_batch(input_candidate, input_original, sta_vec)
                            prob_candidate = prob_candidate * similarity_candidate
                        prob_candidate_norm = normalize(prob_candidate)
                        proposal_prob *= 1.0  # Q(x'|x)
                        proposal_prob_reverse *= prob_candidate_norm[reverse_candidate_idx]  # Q(x|x'), reverse action is inserting
                        sequence_length_tmp -= 1
                        print('action:2', 1.0, prob_candidate_norm[reverse_candidate_idx])

            new_prob = def_sent_scorer(tokenizer.decode(input_ids_tmp))
            new_prob *= penalty_constraint(
                searcher.count_unsafisfied_constraint(searcher.sent2tag(input_ids_tmp)))
            if sim is not None:
                sim_constr = similarity(input_ids_tmp, input_original, sta_vec)
                new_prob *= sim_constr

            input_text_tmp = tokenizer.decode(input_ids_tmp)
            all_samples.append([
                input_text_tmp, new_prob,
                searcher.count_unsafisfied_constraint(searcher.sent2tag(input_ids_tmp)),
                bert_scorer.sent_score(input_ids_tmp, log_prob=True),
                gpt2_scorer.sent_score(input_text_tmp, ppl=True)
            ])
            if tokenizer.decode(input_ids_tmp) not in output_p:
                outputs.append(all_samples[-1])
            if outputs != []:
                output_p.append(outputs[-1][0])

            if proposal_prob == 0.0 or old_prob == 0.0:
                alpha_star = 1.0
            else:
                alpha_star = (proposal_prob_reverse * new_prob) / (proposal_prob * old_prob)
            alpha = min(1, alpha_star)
            print(tokenizer.decode(input_ids_tmp).encode('utf8', errors='ignore'))
            print(alpha, old_prob, proposal_prob, new_prob, proposal_prob_reverse)
            proposal_cnt += 1
            if choose_action([alpha, 1 - alpha]) == 0 and (
                    new_prob > old_prob * config.threshold or just_acc() == 0):
                if tokenizer.decode(input_ids_tmp) != tokenizer.decode(input_ids):
                    accept_cnt += 1
                    print('Accept')
                    all_acc_samples.append(all_samples[-1])
                input_ids = input_ids_tmp
                sequence_length = sequence_length_tmp
                old_prob = new_prob

        # choose output from samples
        for num in range(config.min_length, 0, -1):
            outputss = [x for x in outputs if len(x[0].split()) >= num]
            print(num, outputss)
            if outputss != []:
                break
        if outputss == []:
            outputss.append([tokenizer.decode(input_ids), 0])
        outputss = sorted(outputss, key=lambda x: x[1])[::-1]
        with open(config.use_output_path, 'a') as g:
            g.write(outputss[0][0] + '\t' + str(outputss[0][1]) + '\n')
        all_chosen_samples.append(outputss[0])

        print('Sentence %d, used time %.2f\n' % (sen_id, time.time() - start_time))
        print(proposal_cnt, accept_cnt, float(accept_cnt / proposal_cnt))

    print("All samples:")
    all_samples_ = list(zip(*all_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))
    print("All accepted samples:")
    all_samples_ = list(zip(*all_acc_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))
    print("All chosen samples:")
    all_samples_ = list(zip(*all_chosen_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    with open(config.use_output_path + '-result.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f, delimiter='\t')
        csv_writer.writerow(['Sentence', 'Prob_sim', 'Constraint_num', 'Log_prob', 'PPL'])
        csv_writer.writerows(all_samples)
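# The acceptance test in main() follows the standard Metropolis-Hastings ratio
# alpha = min(1, Q(x|x') * pi(x') / (Q(x'|x) * pi(x))). A minimal stand-alone helper for
# that step, guarding the zero cases the same way the loop above does (names here are
# illustrative, not from the original code):
import numpy as np

def mh_accept(old_prob, new_prob, proposal_prob, proposal_prob_reverse):
    if proposal_prob == 0.0 or old_prob == 0.0:
        alpha = 1.0
    else:
        alpha = min(1.0, (proposal_prob_reverse * new_prob) / (proposal_prob * old_prob))
    return np.random.rand() < alpha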
# print(len(load_files_to_string(data + "/Supergirl")))
sum_list = batman_beyond_list + batman_animated_list + dharma_greg_list + teen_titans_list + will_grace_list + \
           superman_list + supergirl_list
all_words = list(set(sum_list))

batman_beyond_bow = create_bag_of_words(batman_beyond, all_words)
batman_animated_bow = create_bag_of_words(batman_animated, all_words)
dharma_greg_bow = create_bag_of_words(dharma_greg, all_words)
teen_titans_bow = create_bag_of_words(teen_titans, all_words)
will_grace_bow = create_bag_of_words(will_grace, all_words)
superman_bow = create_bag_of_words(superman, all_words)
supergirl_bow = create_bag_of_words(supergirl, all_words)

print("Batman and Batman")
print(similarity(batman_beyond_bow, batman_animated_bow))
print("Batman and Dharma")
print(similarity(batman_beyond_bow, dharma_greg_bow))
print("Will and Dharma")
print(similarity(will_grace_bow, dharma_greg_bow))
print("Batman and Teen Titans")
print(similarity(batman_beyond_bow, teen_titans_bow))
print("Superman and Supergirl")
print(similarity(superman_bow, supergirl_bow))
print("Superman and Dharma")
print(similarity(superman_bow, dharma_greg_bow))
print("Superman and Batman")
print(similarity(superman_bow, batman_animated_bow))

bow_list = [("Batman Beyond", batman_beyond_bow), ("Batman Animated", batman_animated_bow),
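# create_bag_of_words() and similarity() are defined elsewhere in the script above. A hedged
# stand-in pair, assuming plain term counts over a shared vocabulary and cosine similarity
# between the count vectors (function names here are hypothetical):
import math
from collections import Counter

def bow_vector(tokens, vocabulary):
    counts = Counter(tokens)
    return [counts[word] for word in vocabulary]

def bow_cosine(bow_a, bow_b):
    dot = sum(a * b for a, b in zip(bow_a, bow_b))
    norm = math.sqrt(sum(a * a for a in bow_a)) * math.sqrt(sum(b * b for b in bow_b))
    return dot / norm if norm else 0.0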
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(shape=[None, config.N * config.M, 40],
                           dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj)
                      for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32,
                                       time_major=True)  # for TI-SV, dynamic rnn must be used
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(zip(grads_rescale, vars),
                                         global_step=global_step)  # gradient update operation

    # check variable memory
    variable_count = np.sum(
        np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        if os.path.exists(path):
            print("Restore from {}".format(os.path.join(path, "Check_Point/model.ckpt-2")))
            saver.restore(sess, os.path.join(path, "Check_Point/model.ckpt-2"))  # restore variables from selected ckpt file
        else:
            os.makedirs(os.path.join(path, "Check_Point"), exist_ok=True)  # make folder to save model
            os.makedirs(os.path.join(path, "logs"), exist_ok=True)  # make folder to save log

        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor (halved every 3000 iterations)
        loss_acc = 0  # accumulated loss (for running average of loss)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                            feed_dict={batch: random_batch(),
                                                       lr: config.lr * lr_factor})

            loss_acc += loss_cur  # accumulated loss over each 100 iterations

            if iter % 10 == 0:
                writer.add_summary(summary, iter)  # write to tensorboard

            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss

            if (iter + 1) % 3000 == 0:
                lr_factor /= 2  # lr decay
                print("learning rate is decayed! current lr : ", config.lr * lr_factor)

            if (iter + 1) % 2500 == 0:
                saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"),
                           global_step=iter // 2500)
                print("model is saved!")
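# The per-variable gradient rescaling in train() (0.01x for w and b after global-norm
# clipping) can be written framework-agnostically. A NumPy sketch, assuming w and b are
# the first two entries of the gradient list, as in the TF code above:
import numpy as np

def rescale_gradients(grads, clip_norm=3.0, wb_scale=0.01, num_wb=2):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    scale = min(1.0, clip_norm / (global_norm + 1e-12))  # same effect as tf.clip_by_global_norm
    clipped = [g * scale for g in grads]
    return [wb_scale * g for g in clipped[:num_wb]] + clipped[num_wb:]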
def found_similar(text, items):
    rate, similar_items = utils.most_similar(utils.similarity(items, text, min_ratio=0.96))
    if len(similar_items) > 0:
        return similar_items[0]
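# utils.similarity / utils.most_similar used above are project helpers not shown here.
# A hedged stand-in built on difflib's SequenceMatcher ratio (function names are
# hypothetical): fuzzy_matches keeps candidates at or above min_ratio, and
# pick_most_similar returns the best ratio together with all items that reach it.
from difflib import SequenceMatcher

def fuzzy_matches(items, text, min_ratio=0.7):
    scored = [(SequenceMatcher(None, text, item).ratio(), item) for item in items]
    return [(ratio, item) for ratio, item in scored if ratio >= min_ratio]

def pick_most_similar(scored):
    if not scored:
        return 0.0, []
    best = max(ratio for ratio, _ in scored)
    return best, [item for ratio, item in scored if ratio == best]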