Example No. 1
def eval_reverse_proposal(input_original,
                          masked_sent,
                          input_ids_old,
                          pos_set,
                          reverse_action_set,
                          sim=None):
    proposal_prob_reverse = 1.0  # Q(x|x')
    input_ids_tmp = np.array(masked_sent)
    for step_i in range(len(pos_set)):
        ind = pos_set[step_i]  # note: here the positions are exchanged
        action = reverse_action_set[step_i]
        old_tok = input_ids_old[ind]
        # word replacement (action: 0)
        if action == 0:
            prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
            input_ids_tmp[ind] = old_tok
            proposal_prob_reverse *= prob_mask[old_tok]  # Q(x|x')
            if sim is not None:
                proposal_prob_reverse *= similarity(input_ids_tmp,
                                                    input_original)

        # word insertion (action: 1)
        if action == 1:
            prob_mask = bert_scorer.mask_score(input_ids_tmp, ind, mode=0)
            input_ids_tmp[ind] = old_tok
            proposal_prob_reverse *= prob_mask[old_tok]  # Q(x|x')
            if sim is not None:
                proposal_prob_reverse *= similarity(input_ids_tmp,
                                                    input_original)

        # word deletion (action: 2)
        if action == 2:
            input_ids_tmp = input_ids_tmp  # already deleted
            proposal_prob_reverse *= 1.0  # Q(x|x')
    return proposal_prob_reverse, input_ids_tmp
Example No. 2
def main(path_to_data, file_d, query):
    d2s = load(open(path_to_data + 'dense_to_sparse.json', 'r'))
    i2t = load(open(path_to_data + 'ID-title_dict.json', 'r'))
    t2i = load(open(path_to_data + 'title-ID_dict.json', 'r'))
    s2d = load(open(path_to_data + 'sparse_to_dense.json', 'r'))
    p_c = [query]
    p_ids = list(map(concept_to_dense_id, [t2i], [s2d], p_c))
    print("The query is '{0}'".format(p_c[0]))
    similarity(p_ids, path_to_data, file_d, d2s=d2s, i2t=i2t)
Example No. 3
    def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
        """
        Find the sentence in the knowledge base most similar to the input sentence.
        simType: 'simple', 'simple_pos' or 'vec'
        """
        self.lastTxt.append(intxt)
        if simType not in ('simple', 'simple_pos', 'vec'):
            return 'error: unsupported simType for maxSimTxt: {}'.format(simType)

        # if no word-vector model is loaded, fall back to the simple_pos method
        embedding = self.vecModel
        if simType == 'vec' and not embedding:
            simType = 'simple_pos'

        for t in self.zhishiku:
            questions = t.q_vec if simType == 'vec' else t.q_word
            in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(
                intxt)

            t.sim = max(
                similarity(
                    in_vec, question, method=simType, embedding=embedding)
                for question in questions)
        maxSim = max(self.zhishiku, key=lambda x: x.sim)
        logger.info('maxSim=' + format(maxSim.sim, '.0%'))

        if maxSim.sim < simCondision:
            return 'Sorry, I did not understand you. Please describe your question more precisely.'

        return maxSim.a
Example No. 4
def main():
    parser = argparse.ArgumentParser(
        description='Rank corpus based on laser cosine distance')
    parser.add_argument('--debug', help='debug mode', action='store_true')
    parser.add_argument('--src_sents', help='source sentences')
    parser.add_argument('--tgt_sents', help='target sentences')
    parser.add_argument('--src_embs',
                        help='laser embeddings for source sentences')
    parser.add_argument('--tgt_embs',
                        help='laser embeddings for target sentences')
    parser.add_argument('--output_path', help='output directory for the ranked corpus')
    parser.add_argument('--output_corpus', help='path to ranked corpus')
    o = parser.parse_args()

    try:
        os.makedirs(o.output_path)
    except FileExistsError:
        # directory already exists
        pass

    output_corpus = os.path.join(o.output_path, o.output_corpus)

    src_emb = load_laser_embs(o.src_embs)
    tgt_emb = load_laser_embs(o.tgt_embs)

    sim = []
    for v1, v2 in zip(src_emb, tgt_emb):
        sim.append(similarity(v1, v2))

    sim_sorted = sorted(range(len(sim)), key=lambda k: sim[k], reverse=True)

    with open(output_corpus,
              'w') as output, open(o.src_sents,
                                   'r') as src, open(o.tgt_sents, 'r') as tgt:
        src = src.readlines()
        tgt = tgt.readlines()

        pbar = tqdm.tqdm(total=len(src))

        for similarity_index in sim_sorted:
            pbar.update(1)
            src_sentence = src[similarity_index].strip()
            tgt_sentence = tgt[similarity_index].strip()

            # Exclude almost identical sentences or too short sentence-pairs;
            # exclude sentences containing a lot of numbers
            if levenshtein_distance(
                    src_sentence,
                    tgt_sentence) < 30 or perc_numeric(src_sentence) > 0.3:
                continue

            output.write('{0}\t{1}'.format(src[similarity_index].strip(),
                                           tgt[similarity_index]))

    output.close()
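
Neither similarity() nor load_laser_embs() is defined in this snippet; a common reading is that similarity() here is the cosine similarity of two LASER embedding vectors. A minimal standalone sketch under that assumption (the name cosine_similarity and the toy vectors are hypothetical):

import numpy as np

# Hypothetical stand-in for the similarity() call above, assuming plain
# cosine similarity between two embedding vectors.
def cosine_similarity(v1, v2):
    v1, v2 = np.asarray(v1, dtype=float), np.asarray(v2, dtype=float)
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

print(cosine_similarity([1.0, 0.0, 1.0], [1.0, 1.0, 0.0]))  # 0.5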
Example No. 5
def search_thread(results, desc_repr, codevecs, i, n_results, sim_measure):
    # 1. compute code similarities
    if sim_measure == 'cos':
        chunk_sims = np.dot(codevecs, desc_repr.T)[:, 0]  # [pool_size]
    else:
        chunk_sims = similarity(codevecs, desc_repr, sim_measure)  # [pool_size]

    # 2. select the top K results
    negsims = np.negative(chunk_sims)
    maxinds = np.argpartition(negsims, kth=n_results - 1)
    maxinds = maxinds[:n_results]
    chunk_codes = [codebase[i][k] for k in maxinds]
    chunk_sims = chunk_sims[maxinds]
    results.extend(zip(chunk_codes, chunk_sims))
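
The top-K selection above negates the scores and uses np.argpartition to avoid a full sort. A self-contained toy run of that pattern (the score values are invented):

import numpy as np

# Toy run of the argpartition-based top-K selection used above.
chunk_sims = np.array([0.2, 0.9, 0.4, 0.7, 0.1])
n_results = 2
negsims = np.negative(chunk_sims)
maxinds = np.argpartition(negsims, kth=n_results - 1)[:n_results]
print(maxinds, chunk_sims[maxinds])  # the two highest scores, in no particular order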
Example No. 6
def main():
    word_pair, simi = load_standard('./wordsim353_annotator1.txt')
    #model = load_w2v_model('../../paper/word2vec/vec.txt', logging)
    model_path = '../../paper/data/srwe_model/wiki_small.w2v.r.0.001.model'
    model = load_w2v_model(model_path, logging)
    new_simi = []
    for pair in word_pair:
        if pair[0] not in model or pair[1] not in model:
            logging.error('%s not in vocab.' % (pair[0] if pair[0] not in model else pair[1]))
            new_simi.append(0.0)
            continue
        new_simi.append(similarity(model[pair[0]], model[pair[1]]))
    print(model_path)
    res = scipy.stats.spearmanr(simi, new_simi)
    print(res)
Example No. 7
def generate_article():
    keywords = request.form.get("topic")
    if keywords is None:
        return render_template("home.html")
    else:
        keywords = keywords.split(" ")
        kwords = []
        for word in keywords:
            kwords.append(word.lower())
        keywords = kwords

        articles = []
        for file in os.listdir("articles/"):
            if file.endswith(".txt"):
                text = open(os.path.join("articles/", file), "r").read()
                source = file[:file.index("-")]
                articles.append(Article(text, source))
        weighted_articles = []
        for art in articles:
            weighted_articles.append((similarity(art.vector, keywords), art))
        weighted_articles = sorted(weighted_articles, key=lambda x: -x[0])
        temp = []
        for pair in weighted_articles:
            if pair[0] > 0:
                temp.append(pair)
        weighted_articles = temp
        if len(weighted_articles) >= 3:
            model = weighted_articles[0:3]
        else:
            model = weighted_articles
        articles = []
        for pair in model:
            art = pair[1]
            articles.append(art)
        generated_article, sources = group_sentences(articles)
        title = ""
        art_text = ""
        for sentence in generated_article:
            art_text += sentence[0] + " "
        if len(generated_article) > 0:
            title = create_title(art_text)
        else:
            title = "Sorry, we couldn't find any related articles!"
        # generate the text and display it somehow
        tit_text = title.decode('utf8')
        art_text = art_text.decode('utf8')
        return render_template("home.html", title=tit_text, article=art_text)
Example No. 8
 def train(self):
     embedded = self.creat_model()
     lr = tf.placeholder(dtype=tf.float32, name="learning_rate")  # learning rate
     global_step = tf.Variable(0, name='global_step', trainable=False)
     w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
     b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))
     sim_matrix = similarity(embedded, w, b)
     loss = loss_cal(sim_matrix, type=config.loss)
     trainable_vars = tf.trainable_variables()  # get variable list
     optimizer = optim(lr)  # get optimizer (type is determined by configuration)
     grads, vars = zip(*optimizer.compute_gradients(loss))  # compute gradients of variables with respect to loss
     grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
     grads_rescale = [0.01 * grad for grad in grads_clip[:2]] + grads_clip[2:]  # smaller gradient scale for w, b
     train_op = optimizer.apply_gradients(zip(grads_rescale, vars),
                                          global_step=global_step)  # gradient update operation
     # check variables memory
     variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
     print("total variables :", variable_count)
     tf.summary.scalar("loss", loss)
     merged = tf.summary.merge_all()
     saver = tf.train.Saver()
     with tf.Session() as sess:
         tf.global_variables_initializer().run()
         os.makedirs(os.path.join(config.model_path, "Check_Point"), exist_ok=True)  # make folder to save model
         os.makedirs(os.path.join(config.model_path, "logs"), exist_ok=True)  # make folder to save log
         writer = tf.summary.FileWriter(os.path.join(config.model_path, "logs"), sess.graph)
         lr_factor = 1  # lr decay factor (halved every 1000 iterations)
         loss_acc = 0  # accumulated loss ( for running average of loss)
         for iter in range(config.iteration):
             # run forward and backward propagation and update parameters
             _, loss_cur, summary = sess.run([train_op, loss, merged],
                                             feed_dict={self.fingerprint_input: random_batch(),
                                                        lr: config.lr * lr_factor})
             loss_acc += loss_cur  # accumulated loss for each 100 iteration
             if iter % 10 == 0:
                 writer.add_summary(summary, iter)  # write at tensorboard
             if (iter + 1) % 100 == 0:
                 print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                 loss_acc = 0  # reset accumulated loss
             if (iter + 1) % 1000 == 0:
                 lr_factor /= 2  # lr decay
                 print("learning rate is decayed! current lr : ", config.lr * lr_factor)
             if (iter + 1) % 1000 == 0:
                 saver.save(sess, os.path.join(config.model_path, "./Check_Point/model.ckpt"),
                            global_step=iter // 1000)
                 print("model is saved!")
Example No. 9
def worst_among_most_similar(population,
                            child,
                            goal_function,
                            c_f = None,
                            s = None):
    '''Create new population pool according to the worst among the most similar strategy.

    Args:
        population: The population pool.
        child: New chromosome to be added to the population.
        goal_function: The function we are optimising.

    Returns:
        The resulting population
    '''
    if c_f is None:
        c_f = parameters.cf
    if s is None:
        s = parameters.s
    # parent_A = random.choice(population)
    #
    # crowding_selection_group = random.sample(population, s)


    cf_groups = []
    for i in range(c_f):
        # cf_groups.append(random.sample(population, s))
        cf_groups.append(population.sample(s))

    most_similar = []

    for group in cf_groups:
        most_similar.append(group.loc[
            group['decoded'].apply(lambda x: similarity(child.decoded,x)).idxmax()
        ])
        # most_similar.append(max(group, key=lambda x: similarity(child, x)))

    most_similar = pd.DataFrame(most_similar)

    worst = most_similar.fitness.idxmin()
    population = population.drop(worst)

    child.name=worst
    population = population.append(child)

    return population
Example No. 10
def form_children(population,
                  c_s = None,
                  c_f = None,
                  s = None):
    '''Select a pair of parents and form children

    Args:
        population: The population selection pool.

    Returns:
        The resulting children
    '''
    if c_s is None:
        c_s = parameters.cs
    if c_f is None:
        c_f = parameters.cf
    if s is None:
        s = parameters.s
    # import pdb; pdb.set_trace();
    # parent_a = random.choice(population)

    # crowding_selection_group = random.sample(population, c_s)

    parent_a = population.sample(1).iloc[0]

    crowding_selection_group = population.sample(c_s)
    # import pdb; pdb.set_trace()
    # pool_values['decoded'].apply(lambda x: euclidean(x, peak)).idxmin()
    parent_b = crowding_selection_group.loc[
        crowding_selection_group['decoded'].apply(lambda x: similarity(parent_a.decoded, x)).idxmin()
    ]
    # parent_b.orient('index')

    # parent_b = max(crowding_selection_group, key= lambda x: similarity(parent_a, x))

    # import pdb; pdb.set_trace()

    child_a, child_b = crossover(parent_a, parent_b)

    child_a = mutation(child_a)
    child_b = mutation(child_b)

    return child_a, child_b
Example No. 11
 def clusterOpinion(self, cluster, threshold):
     opinions = cluster.getOpinions()
     num = len(opinions)
     clusters = []
     checked1 = []
     for i in range(num):
         oc = OpinionCluster()
         opinion1 = opinions[i]
         if opinion1 in checked1:
             continue
         if opinion1 not in oc.getOpinions():
             oc.addOpinion(opinion1)
         checked1.append(opinion1)
         for j in range(i + 1, num):
             opinion2 = opinions[j]
             if opinion2 in checked1:
                 continue
             sim = similarity(opinion1.opinion, opinion2.opinion)
             if sim > threshold:
                 if opinion2 not in oc.getOpinions():
                     oc.addOpinion(opinion2)
                 checked1.append(opinion2)
         clusters.append(oc)
     return clusters
Example No. 12
    def getSummary(self, freqStrLen):
        opinionStrs = []
        for op in self._opinions:
            opinion = op.opinion
            opinionStrs.append(opinion)

        # count character frequencies
        word_counter = collections.Counter(list(
            "".join(opinionStrs))).most_common()

        freqStr = ""
        for item in word_counter:
            if item[1] >= freqStrLen:
                freqStr += item[0]

        maxSim = -1
        maxOpinion = ""
        for opinion in opinionStrs:
            sim = similarity(freqStr, opinion)
            if sim > maxSim:
                maxSim = sim
                maxOpinion = opinion

        return maxOpinion
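
The character-frequency step above can be exercised on its own; the opinion strings below are invented and freqStrLen=2 is arbitrary:

import collections

# Toy illustration of building freqStr: keep characters that occur at
# least freqStrLen times across all opinion strings.
opinionStrs = ["good price", "good quality", "poor quality"]
freqStrLen = 2
word_counter = collections.Counter(list("".join(opinionStrs))).most_common()
freqStr = "".join(ch for ch, cnt in word_counter if cnt >= freqStrLen)
print(freqStr)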
Example No. 13
def train(path):
    tf.reset_default_graph()    # reset graph

    # draw graph
    batch = tf.placeholder(shape= [None, config.N*config.M, 40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype= tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer= np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer= np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
        embedded = outputs[-1]                            # the last output is the embedded d-vector
        embedded = normalize(embedded)                    # normalize
    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars= tf.trainable_variables()                # get variable list
    optimizer= optim(lr)                                    # get optimizer (type is determined by configuration)
    grads, vars= zip(*optimizer.compute_gradients(loss))    # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)      # l2 norm clipping by 3
    grads_rescale= [0.01*grad for grad in grads_clip[:2]] + grads_clip[2:]   # smaller gradient scale for w, b
    train_op= optimizer.apply_gradients(zip(grads_rescale, vars), global_step= global_step)   # gradient update operation

    # check variables memory
    variable_count = np.sum(np.array([np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        if(os.path.exists(path)):
            print("Restore from {}".format(os.path.join(path, "Check_Point/model.ckpt-2")))
            saver.restore(sess, os.path.join(path, "Check_Point/model.ckpt-2"))  # restore variables from selected ckpt file
        else:
            os.makedirs(os.path.join(path, "Check_Point"), exist_ok=True)  # make folder to save model
            os.makedirs(os.path.join(path, "logs"), exist_ok=True)          # make folder to save log

        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1   # lr decay factor ( 1/2 per 10000 iteration)
        loss_acc = 0    # accumulated loss ( for running average of loss)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                  feed_dict={batch: random_batch(), lr: config.lr*lr_factor})

            loss_acc += loss_cur    # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)   # write at tensorboard
            if (iter+1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter+1),loss_acc/100))
                loss_acc = 0                        # reset accumulated loss
            if (iter+1) % 10000 == 0:
                lr_factor /= 2                      # lr decay
                print("learning rate is decayed! current lr : ", config.lr*lr_factor)
            if (iter+1) % 10000 == 0:
                saver.save(sess, os.path.join(path, "./Check_Point/model.ckpt"), global_step=iter//10000)
                print("model is saved!")
Example No. 14
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N*config.M, 40], dtype=tf.float32) # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N*config.M, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
        embedded = outputs[-1]                            # the last output is the embedded d-vector
        embedded = normalize(embedded)                    # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:config.N*config.M, :], shape= [config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N*config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model[-1]):    # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break

        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        print("test file path : ", config.test_path)
        '''
            test speaker:p225--p243
        '''
        # return similarity matrix after enrollment and verification
        time1 = time.time() # for check inference time
        if config.tdsv:
            S = sess.run(similarity_matrix, feed_dict={enroll:random_batch(shuffle=False, noise_filenum=1),
                                                       verif:random_batch(shuffle=False, noise_filenum=2)})
        else:
            S = sess.run(similarity_matrix, feed_dict={enroll:random_batch(shuffle=False),
                                                       verif:random_batch(shuffle=False, utter_start=config.M)})
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()

        np.set_printoptions(precision=2)
        print("inference time for %d utterances : %0.2fs"%(2*config.M*config.N, time2-time1))
        print(S)    # print similarity matrix

        # calculating EER
        diff = 1; EER=0; EER_thres = 0; EER_FAR=0; EER_FRR=0

        # through thresholds calculate false acceptance ratio (FAR) and false reject ratio (FRR)
        for thres in [0.01*i+0.5 for i in range(50)]:
            S_thres = S>thres

            # False acceptance ratio = false acceptance / mismatched population (enroll speaker != verification speaker)
            FAR = sum([np.sum(S_thres[i])-np.sum(S_thres[i,:,i]) for i in range(config.N)])/(config.N-1)/config.M/config.N

            # False reject ratio = false reject / matched population (enroll speaker = verification speaker)
            FRR = sum([config.M-np.sum(S_thres[i][:,i]) for i in range(config.N)])/config.M/config.N

            # Save threshold when FAR = FRR (=EER)
            if diff> abs(FAR-FRR):
                diff = abs(FAR-FRR)
                EER = (FAR+FRR)/2
                EER_thres = thres
                EER_FAR = FAR
                EER_FRR = FRR

        print("\nEER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)"%(EER,EER_thres,EER_FAR,EER_FRR))
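
The EER sweep at the end of this example can be reproduced in isolation on a synthetic similarity matrix. The sketch below reuses the same FAR/FRR formulas; N, M, the random scores and the +0.25 offset for matched pairs are arbitrary choices:

import numpy as np

# Standalone toy version of the threshold sweep above, on a made-up
# [N, M, N] similarity matrix (no model or session required).
N, M = 4, 5
rng = np.random.default_rng(0)
S = rng.uniform(0.3, 0.7, size=(N, M, N))
for i in range(N):
    S[i, :, i] += 0.25  # matched speaker pairs get higher scores

diff, EER, EER_thres, EER_FAR, EER_FRR = 1, 0, 0, 0, 0
for thres in [0.01 * i + 0.5 for i in range(50)]:
    S_thres = S > thres
    FAR = sum(np.sum(S_thres[i]) - np.sum(S_thres[i, :, i]) for i in range(N)) / (N - 1) / M / N
    FRR = sum(M - np.sum(S_thres[i][:, i]) for i in range(N)) / M / N
    if diff > abs(FAR - FRR):
        diff = abs(FAR - FRR)
        EER, EER_thres, EER_FAR, EER_FRR = (FAR + FRR) / 2, thres, FAR, FRR

print("EER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)" % (EER, EER_thres, EER_FAR, EER_FRR))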
Example No. 15
first = [encoded_file_animals[8]]
second = [encoded_file_animals[9]]
third = [encoded_file_buildings[0]]

first = Variable(torch.LongTensor(first))
second = Variable(torch.LongTensor(second))
third = Variable(torch.LongTensor(third))

first_emb = mean_vectors(emb(first).data.numpy()[0])
second_emb = mean_vectors(emb(second).data.numpy()[0])
third_emb = mean_vectors(emb(third).data.numpy()[0])

output1 = model(first)
output2 = model(second)
output3 = model(third)

v1 = output1.data.numpy()[0][0]
v2 = output2.data.numpy()[0][0]
v3 = output3.data.numpy()[0][0]

print(first_emb)
print(second_emb)
print(third_emb)


print(similarity(v1, v2))
print(similarity(v2, v3))

print(similarity(first_emb, second_emb))
print(similarity(second_emb, third_emb))
Example No. 16
def main():
    if os.path.exists(config.use_output_path):
        os.system('rm ' + config.use_output_path)
    with open(config.use_output_path, 'a') as g:
        g.write(str(config) + '\n\n')
    sim = config.sim
    # sta_vec=list(np.zeros([config.num_steps-1]))
    config.shuffle = False
    # original sentence input
    use_data = dataset_str(config.use_data_path)
    config.batch_size = 1
    step_size = config.step_size

    start_time = time.time()
    proposal_cnt = 0
    accept_cnt = 0
    all_samples = []
    all_acc_samples = []
    all_chosen_samples = []
    for sen_id in range(use_data.length):
        sent_ids = use_data.token_ids[sen_id]
        keys = use_data.keys[sen_id]
        searcher = ConstraintSearch(keys)
        sequence_length = len(sent_ids)
        #generate for each sentence
        sta_vec = np.zeros(sequence_length)
        input_ids = np.array(sent_ids)
        input_original = use_data.tokens[sen_id]
        prev_inds = []
        old_prob = def_sent_scorer(tokenizer.decode(input_ids))
        old_prob_pen = penalty_constraint(
            searcher.count_unsafisfied_constraint(
                searcher.sent2tag(input_ids)))
        if sim is not None:
            old_prob *= similarity(input_ids, input_original, sta_vec)

        outputs = []
        output_p = []
        for iter in range(config.sample_time):
            pos_set = np.array(
                get_sample_positions(sequence_length, prev_inds, step_size))
            prev_inds = pos_set
            proposal_cnt += 1

            search_cands, constr_num = searcher.search_template(
                input_ids, pos_set)
            group_prob = 1.0
            new_prob_pen = penalty_constraint(constr_num)
            original_temp = searcher.sent2tag(input_ids)
            original_constr_num = searcher.count_unsafisfied_constraint(
                original_temp)
            input_ids_old = np.array(input_ids)
            if len(search_cands) == 0:
                print('No candidate satisfies constraints. Continue.', pos_set)
            else:
                candidates = []
                candidate_probs = []
                for cand_template, action_set in search_cands:
                    masked_sent, adjusted_pos_set = mask_sentence(
                        input_ids, pos_set, action_set)
                    proposal_prob, input_ids_tmp = eval_template(
                        searcher,
                        input_original,
                        cand_template,
                        masked_sent,
                        adjusted_pos_set,
                        action_set,
                        sim=None)
                    input_text_tmp = tokenizer.decode(input_ids_tmp)
                    new_prob = def_sent_scorer(input_text_tmp)
                    if sim is not None:
                        sim_constr = similarity(input_ids_tmp, input_original,
                                                sta_vec)
                        new_prob *= sim_constr
                    candidates.append(
                        (input_ids_tmp, proposal_prob, cand_template,
                         action_set, adjusted_pos_set))
                    candidate_probs.append(new_prob)

                candidate_probs_norm = normalize(np.array(candidate_probs))
                cand_idx = sample_from_candidate(
                    np.array(candidate_probs_norm))
                input_ids_tmp, proposal_prob, cand_template, action_set, adjusted_pos_set = candidates[
                    cand_idx]
                new_prob = candidate_probs[cand_idx]
                input_ids_new = np.array(input_ids_tmp)
                new_pos_set = np.array(adjusted_pos_set)
                print(cand_template)
                print(
                    tokenizer.decode(input_ids_new).encode('utf8',
                                                           errors='ignore'))

                # evaluate reverse proposal
                reverse_action_set = get_reverse_action_set(action_set)
                reverse_search_cands, reverse_min_constr_num = searcher.search_template(
                    input_ids_new, new_pos_set, prune=False)
                reverse_group_prob = penalty_constraint(original_constr_num -
                                                        reverse_min_constr_num)
                reverse_search_cands_pruned = [(x[0], x[2])
                                               for x in reverse_search_cands
                                               if x[1] == original_constr_num]

                # check reverse search cand
                reverse_search_cand_str = [
                    ','.join(x[0]) for x in reverse_search_cands
                ]
                original_temp_str = ','.join(original_temp)
                if original_temp_str not in reverse_search_cand_str:
                    print('Warning', original_temp, cand_template, pos_set,
                          action_set, new_pos_set)
                if len(reverse_search_cands_pruned) == 0:
                    print('Warning')
                    reverse_search_cands_pruned = [original_temp]

                # evaluate reverse_candidate_probs_norm
                reverse_cand_idx = -1
                reverse_candidate_probs = []
                for c_idx, (reverse_cand_template, r_action_set
                            ) in enumerate(reverse_search_cands_pruned):
                    if ','.join(reverse_cand_template) == original_temp_str:
                        reverse_candidate_probs.append(old_prob)
                        reverse_cand_idx = c_idx
                    else:
                        masked_sent, new_adjusted_pos_set = mask_sentence(
                            input_ids_new, new_pos_set, r_action_set)
                        _, r_input_ids_tmp = eval_template(
                            searcher,
                            input_original,
                            reverse_cand_template,
                            masked_sent,
                            new_adjusted_pos_set,
                            r_action_set,
                            sim=None)
                        r_input_text_tmp = tokenizer.decode(r_input_ids_tmp)
                        r_new_prob = def_sent_scorer(r_input_text_tmp)
                        if sim is not None:
                            sim_constr = similarity(input_ids_tmp,
                                                    input_original, sta_vec)
                            r_new_prob *= sim_constr
                        # candidates.append((input_ids_tmp, proposal_prob))
                        reverse_candidate_probs.append(r_new_prob)
                reverse_candidate_probs_norm = normalize(
                    np.array(reverse_candidate_probs))

                # evaluate proposal_prob_reverse
                r_masked_sent, pos_set_ = mask_sentence(
                    input_ids_new, new_pos_set, reverse_action_set)
                assert (pos_set == pos_set_).all()
                proposal_prob_reverse, input_ids_tmp_0 = \
                 eval_reverse_proposal(input_original, r_masked_sent, input_ids_old, pos_set, reverse_action_set, sim=None)

                if (input_ids_tmp_0 != input_ids_old).any():
                    print('Warning, ', input_ids_old, input_ids_new,
                          input_ids_tmp_0)
                assert (input_ids_tmp_0 == input_ids_old).all()

                # decide acceptance
                sequence_length_new = len(input_ids_new)
                input_text_new = tokenizer.decode(input_ids_new)
                if proposal_prob == 0.0 or old_prob == 0.0:
                    alpha_star = 1.0
                else:
                    alpha_star = (comb(sequence_length_new, 3) * proposal_prob_reverse * reverse_group_prob *
                                  reverse_candidate_probs_norm[reverse_cand_idx] * new_prob * new_prob_pen) / \
                                 (comb(sequence_length, 3) * proposal_prob  * group_prob *
                                  candidate_probs_norm[cand_idx] * old_prob * old_prob_pen)
                alpha = min(1, alpha_star)

                all_samples.append([
                    input_text_new, new_prob * new_prob_pen, new_prob,
                    constr_num,
                    bert_scorer.sent_score(input_ids_new, log_prob=True),
                    gpt2_scorer.sent_score(input_text_new, ppl=True)
                ])
                if tokenizer.decode(input_ids_new) not in output_p:
                    outputs.append(all_samples[-1])
                if outputs != []:
                    output_p.append(outputs[-1][0])
                print(alpha, old_prob, proposal_prob, new_prob,
                      new_prob * new_prob_pen, proposal_prob_reverse)
                if choose_action([
                        alpha, 1 - alpha
                ]) == 0 and (new_prob > old_prob * config.threshold
                             or just_acc() == 0):
                    if tokenizer.decode(input_ids_new) != tokenizer.decode(
                            input_ids):
                        accept_cnt += 1
                        print('Accept')
                        all_acc_samples.append(all_samples[-1])
                    input_ids = input_ids_new
                    sequence_length = sequence_length_new
                    assert sequence_length == len(input_ids)
                    old_prob = new_prob
                print('')

        # choose output from samples
        for num in range(config.min_length, 0, -1):
            outputss = [x for x in outputs if len(x[0].split()) >= num]
            print(num, outputss)
            if outputss != []:
                break
        if outputss == []:
            outputss.append([tokenizer.decode(input_ids), 0])
        outputss = sorted(outputss, key=lambda x: x[1])[::-1]
        with open(config.use_output_path, 'a') as g:
            g.write(outputss[0][0] + '\t' + str(outputss[0][1]) + '\n')
        all_chosen_samples.append(outputss[0])

        print('Sentence %d, used time %.2f\n' %
              (sen_id, time.time() - start_time))
    print(proposal_cnt, accept_cnt, float(accept_cnt / proposal_cnt))

    print("All samples:")
    all_samples_ = list(zip(*all_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    print("All accepted samples:")
    all_samples_ = list(zip(*all_acc_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    print("All chosen samples:")
    all_samples_ = list(zip(*all_chosen_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    with open(config.use_output_path + '-result.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f, delimiter='\t')
        csv_writer.writerow(
            ['Sentence', 'Prob_sim', 'Constraint_num', 'Log_prob', 'PPL'])
        csv_writer.writerows(all_samples)
Example No. 17
import time

if __name__ == '__main__':
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.per_process_gpu_memory_fraction = 0.1
    sess = tf.Session(config=tf_config)

    N, M = 4, 5
    embed = tf.placeholder(dtype=tf.float32, shape=(N * 2 * M, 3))

    # new loss
    embed_1 = embed[:N * M]
    embed_2 = embed[N * M:]
    center_1 = embedd2center(embed_1, N, M)
    center_2 = embedd2center(embed_2, N, M)
    new_loss = loss_cal(similarity(embed_1, 1.0, 0.0, N, M, center_2), name='softmax', N=N, M=M) + \
               loss_cal(similarity(embed_2, 1.0, 0.0, N, M, center_1), name='softmax', N=N, M=M)

    # old loss
    old_loss = loss_cal(similarity(embed, 1.0, 0.0, N, M * 2), N=N, M=M * 2)
    sess.run(tf.global_variables_initializer())

    arr = np.random.rand(N * M * 2, 128)

    times = []
    print('Calculating old loss')
    x = sess.run(old_loss, feed_dict={embed: arr})
    print(x)

    times = []
    print('Calculating new loss')
Example No. 18
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(
        shape=[None, config.N * config.M, 40],
        dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(
        shape=[None, config.N * config.M, 40],
        dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            rnn_cell.LSTMCell(num_units=config.hidden, num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = rnn_cell.MultiRNNCell(lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(
        tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :],
                                  shape=[config.N, config.M, -1]),
                       axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed,
                                   w=1.,
                                   b=0.,
                                   center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(
            checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(
                    model.split('-')[-1]
            ):  # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(
                    sess, model)  # restore variables from selected ckpt file
                break

        if loaded == 0:
            raise AssertionError(
                "ckpt file does not exist! Check config.model_num or config.model_path."
            )

        print("test file path : ", config.test_path)

        # return similarity matrix after enrollment and verification
        time1 = time.time()  # for check inference time
        if config.tdsv:
            S = sess.run(similarity_matrix,
                         feed_dict={
                             enroll: random_batch(shuffle=False,
                                                  noise_filenum=1),
                             verif: random_batch(shuffle=False,
                                                 noise_filenum=2)
                         })
        else:
            S = sess.run(similarity_matrix,
                         feed_dict={
                             enroll: random_batch(shuffle=False),
                             verif: random_batch(shuffle=False, utter_start=0)
                         })
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()
        np.set_printoptions(precision=2)
        print("inference time for %d utterances : %0.2fs" %
              (2 * config.M * config.N, time2 - time1))
        print(S)  # print similarity matrix

        # multi-speaker, single-utterance voice enrollment

        arr = []
        for i in range(config.N - 1):
            sim = S[-1, :, i]
            r = np.max(abs(sim))
            arr.append(r)
        # threshold = S[-1]
        # arr = np.delete(threshold[0], -1)
        print(arr)
        # max_th = max(arr)
        if (arr[0] > 0.69) | (arr[1] > 0.75) | (arr[2] > 0.69) | (arr[3] > 0.73) | (arr[4] > 0.75)\
                | (arr[5] > 0.75) | (arr[6] > 0.73):
            r = 1
        else:
            r = 0
        print(r)

        # threshold = S[-1]
        # arr = np.delete(threshold[0], -1)
        # print(arr)
        # max_th = max(abs(arr))
        # if max_th >= 0.80:
        #     r = 1
        # else:
        #     r = 0
        # print(r)

        return r
Example No. 19
def validate(valid_set, model, pool_size, K, sim_measure):
    """
    simple validation in a code pool. 
    @param: poolsize - size of the code pool, if -1, load the whole test set
    """
    def ACC(real, predict):
        sum = 0.0
        for val in real:
            try:
                index = predict.index(val)
            except ValueError:
                index = -1
            if index != -1: sum = sum + 1
        return sum / float(len(real))

    def MAP(real, predict):
        sum = 0.0
        for id, val in enumerate(real):
            try:
                index = predict.index(val)
            except ValueError:
                index = -1
            if index != -1: sum = sum + (id + 1) / float(index + 1)
        return sum / float(len(real))

    def MRR(real, predict):
        sum = 0.0
        for val in real:
            try:
                index = predict.index(val)
            except ValueError:
                index = -1
            if index != -1: sum = sum + 1.0 / float(index + 1)
        return sum / float(len(real))

    def NDCG(real, predict):
        dcg = 0.0
        idcg = IDCG(len(real))
        for i, predictItem in enumerate(predict):
            if predictItem in real:
                itemRelevance = 1
                rank = i + 1
                dcg += (math.pow(2, itemRelevance) -
                        1.0) * (math.log(2) / math.log(rank + 1))
        return dcg / float(idcg)

    def IDCG(n):
        idcg = 0
        itemRelevance = 1
        for i in range(n):
            idcg += (math.pow(2, itemRelevance) - 1.0) * (math.log(2) /
                                                          math.log(i + 2))
        return idcg

    model.eval()
    device = next(model.parameters()).device

    data_loader = torch.utils.data.DataLoader(
        dataset=valid_set,
        batch_size=pool_size,
        shuffle=True,
        drop_last=True,
        num_workers=1)  # batch_size=10000
    accs, mrrs, maps, ndcgs = [], [], [], []
    code_reprs, desc_reprs = [], []
    n_processed = 0
    for batch in tqdm(data_loader):
        if len(batch) == 6:  # toks, tok_len, descs, desc_len, bad_descs, bad_desc_len
            code_batch = [tensor.to(device) for tensor in batch[:2]]
            desc_batch = [tensor.to(device) for tensor in batch[2:4]]
        elif len(batch) == 4:
            code_batch = [tensor.to(device) for tensor in batch[:2]]
            desc_batch = [tensor.to(device) for tensor in batch[2:4]]
        #else: # code_ids, type_ids, code_mask, good_ids, good_mask, bad_ids, bad_mask
        #code_batch = [tensor.to(device) for tensor in batch[:3]]
        #desc_batch = [tensor.to(device) for tensor in batch[3:5]]
        #assert(False, 'something wrong in the valid dataloader.')
        with torch.no_grad():
            code_repr = model.code_encoding(
                *code_batch).data.cpu().numpy().astype(np.float32)
            desc_repr = model.desc_encoding(
                *desc_batch).data.cpu().numpy().astype(
                    np.float32)  # [poolsize x hid_size]
            if sim_measure == 'cos':
                code_repr = normalize(code_repr)
                desc_repr = normalize(desc_repr)
        code_reprs.append(code_repr)
        desc_reprs.append(desc_repr)
        n_processed += batch[0].size(0)  # batch_size
    code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)

    for k in tqdm(range(0, n_processed, pool_size)):
        code_pool, desc_pool = code_reprs[k:k + pool_size], desc_reprs[k:k + pool_size]
        for i in range(min(10000, pool_size)):  # for i in range(pool_size):
            desc_vec = np.expand_dims(desc_pool[i], axis=0)  # [1 x dim]
            n_results = K
            if sim_measure == 'cos':
                sims = np.dot(code_pool, desc_vec.T)[:, 0]  # [pool_size]
            else:
                sims = similarity(code_pool, desc_vec,
                                  sim_measure)  # [pool_size]

            negsims = np.negative(sims)
            predict = np.argpartition(negsims, kth=n_results -
                                      1)  #predict=np.argsort(negsims)#
            predict = predict[:n_results]
            predict = [int(k) for k in predict]
            real = [i]
            accs.append(ACC(real, predict))
            mrrs.append(MRR(real, predict))
            maps.append(MAP(real, predict))
            ndcgs.append(NDCG(real, predict))
    return np.mean(accs), np.mean(mrrs), np.mean(maps), np.mean(ndcgs)
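
As a quick sanity check of the nested metrics defined above: with real = [3] and predict = [7, 3, 1], ACC(real, predict) returns 1.0 (the relevant item is retrieved) and MRR(real, predict) returns 1 / (1 + 1) = 0.5, since the item sits at index 1 of the ranked list.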
Example No. 20
def train(path, args):
    tf.reset_default_graph()  # reset graph
    timestamp = time_string() if args.time_string is None else args.time_string

    # draw graph
    feeder = Feeder(args.train_filename, args, hparams)

    output_classes = max(
        [int(f) for f in feeder.total_emt]) + 1 if args.model_type in [
            'emt', 'accent'
        ] else max([int(f) for f in feeder.total_spk]) + 1

    batch = tf.placeholder(
        shape=[args.N * args.M, None, config.n_mels],
        dtype=tf.float32)  # input batch (time x batch x n_mel)
    labels = tf.placeholder(shape=[args.N * args.M], dtype=tf.int32)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedded = triple_lstm(batch)
    print("Training {} Discriminator Model".format(args.model_type))
    encoder = ReferenceEncoder(
        filters=hparams.reference_filters,
        kernel_size=(3, 3),
        strides=(2, 2),
        is_training=True,
        scope='Tacotron_model/inference/pretrained_ref_enc_{}'.format(
            args.model_type),
        depth=hparams.reference_depth)  # [N, 128])
    embedded = encoder(batch)
    embedded = normalize(embedded)

    if args.discriminator:
        logit = tf.layers.dense(
            embedded,
            output_classes,
            name='Tacotron_model/inference/pretrained_ref_enc_{}_dense'.format(
                args.model_type))
        labels_one_hot = tf.one_hot(tf.to_int32(labels), output_classes)
        # loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logit,labels=labels_one_hot))
        loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=logit,
                                                    labels=labels_one_hot))
        acc, acc_op = tf.metrics.accuracy(labels=tf.argmax(labels_one_hot, 1),
                                          predictions=tf.argmax(logit, 1))
        val_acc, val_acc_op = tf.metrics.accuracy(
            labels=tf.argmax(labels_one_hot, 1),
            predictions=tf.argmax(logit, 1))
    else:
        # loss
        sim_matrix = similarity(embedded,
                                w,
                                b,
                                args.N,
                                args.M,
                                P=hparams.reference_depth)
        print("similarity matrix size: ", sim_matrix.shape)
        loss = loss_cal(sim_matrix, args.N, args.M, type=config.loss)
        val_acc_op = tf.constant(1.)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(
        lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(
        loss))  # compute gradients of variables with respect to loss

    if args.discriminator:
        grads_rescale = grads
    else:
        grads_clip, _ = tf.clip_by_global_norm(grads,
                                               3.0)  # l2 norm clipping by 3
        grads_rescale = [0.01 * grad for grad in grads_clip[:2]
                         ] + grads_clip[2:]  # smaller gradient scale for w, b

    train_op = optimizer.apply_gradients(
        zip(grads_rescale,
            vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(
        np.array([
            np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars
        ]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver(max_to_keep=20)
    loss_window = ValueWindow(100)
    acc_window = ValueWindow(100)
    val_loss_window = ValueWindow(5)
    val_acc_window = ValueWindow(5)

    # training session
    with tf.Session() as sess:
        tf.local_variables_initializer().run()
        tf.global_variables_initializer().run()

        checkpoint_folder = os.path.join(path, "checkpoints", timestamp)
        logs_folder = os.path.join(path, "logs", timestamp)
        os.makedirs(checkpoint_folder,
                    exist_ok=True)  # make folder to save model
        os.makedirs(logs_folder, exist_ok=True)  # make folder to save log
        model_name = '{}_disc_model.ckpt'.format(args.model_type)
        checkpoint_path = os.path.join(checkpoint_folder, model_name)

        if args.restore:
            checkpoint_state = tf.train.get_checkpoint_state(checkpoint_folder)
            if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                print('Loading checkpoint {}'.format(
                    checkpoint_state.model_checkpoint_path))
                saver.restore(sess, checkpoint_state.model_checkpoint_path)
            else:
                print('No model to load at {}'.format(checkpoint_folder))
                saver.save(sess, checkpoint_path, global_step=global_step)
        else:
            print('Starting new training!')
            saver.save(sess, checkpoint_path, global_step=global_step)

        writer = tf.summary.FileWriter(logs_folder, sess.graph)
        lr_factor = 1  # lr decay factor ( 1/2 per 10000 iteration)

        iterations = 30000 if args.model_type == 'emt' else config.iteration
        for iter in range(iterations):
            if args.discriminator:
                batch_iter, _, labels_iter = feeder.random_batch_disc()
            else:
                batch_iter, _, labels_iter = feeder.random_batch()
            # run forward and backward propagation and update parameters
            step, _, loss_cur, summary, acc_cur = sess.run(
                [global_step, train_op, loss, merged, acc_op],
                feed_dict={
                    batch: batch_iter,
                    labels: labels_iter,
                    lr: config.lr * lr_factor
                })

            loss_window.append(loss_cur)
            acc_window.append(acc_cur)

            if step % 10 == 0:
                writer.add_summary(summary, step)  # write at tensorboard
            if (step + 1) % 20 == 0:
                val_loss_cur_batch = 0
                val_acc_cur_batch = 0
                for _ in range(VAL_ITERS):
                    if args.discriminator:
                        batch_iter, _, labels_iter = feeder.random_batch_disc(
                            TEST=True)
                    else:
                        batch_iter, _, labels_iter = feeder.random_batch(
                            TEST=True)
                    # run forward propagation only to compute validation loss and accuracy
                    val_loss_cur, val_acc_cur = sess.run([loss, val_acc_op],
                                                         feed_dict={
                                                             batch: batch_iter,
                                                             labels:
                                                             labels_iter
                                                         })
                    val_loss_cur_batch += val_loss_cur
                    val_acc_cur_batch += val_acc_cur
                val_loss_cur_batch /= VAL_ITERS
                val_acc_cur_batch /= VAL_ITERS
                val_loss_window.append(val_loss_cur_batch)
                val_acc_window.append(val_acc_cur_batch)

                message = "(iter : %d) loss: %.4f" % (
                    (step + 1), loss_window.average)
                if args.discriminator:
                    message += ', acc: {:.2f}%'.format(acc_window.average)
                message += ", val_loss: %.4f" % (val_loss_window.average)
                if args.discriminator:
                    message += ', val_acc: {:.2f}%'.format(
                        val_acc_window.average)
                print(message)

            lr_changed = False
            if args.model_type == 'emt':
                if step > 6000:
                    lr_changed = True if lr_factor != .01 else False
                    lr_factor = .01
                elif step > 4000:
                    lr_changed = True if lr_factor != .1 else False
                    lr_factor = .1
                if lr_changed:
                    print("learning rate is decayed! current lr : ",
                          config.lr * lr_factor)
            elif args.model_type == 'spk':
                if step > 300:  #4000:
                    lr_changed = True if lr_factor != .01 else False
                    lr_factor = .01
                elif step > 180:  #2500:
                    lr_changed = True if lr_factor != .1 else False
                    lr_factor = .1
                if lr_changed:
                    print("learning rate is decayed! current lr : ",
                          config.lr * lr_factor)
            if step % config.save_checkpoint_iters == 0:
                saver.save(sess, checkpoint_path, global_step=global_step)
Example No. 21
def repr_code(args, ast2id, code2id, nl2id, id2nl):
    with torch.no_grad():
        device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
        config = getattr(configs, 'config_' + args.model)()

        ##### Define model ######
        logger.info('Constructing Model..')
        logger.info(os.getcwd())
        model = getattr(models, args.model)(config, ast2id)  # initialize the model
        if args.reload_from > 0:
            ckpt_path = f'./output/{args.model}/{args.dataset}/models/step{args.reload_from}.h5'
            model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model = model.to(device)
        model.eval()
        pool_size=100
        sim_measure='cos'
        #data_path = args.data_path + args.dataset + '/'
        '''
        use_set = eval(config['dataset_name'])(data_path, config['use_names'], config['name_len'],
                                  config['use_apis'], config['api_len'],
                                  config['use_tokens'], config['tokens_len'])
    
        data_loader = torch.utils.data.DataLoader(dataset=use_set, batch_size=args.batch_size, 
                                      shuffle=False, drop_last=False, num_workers=1)
        '''
        valid_data_set = TreeDataSet(file_name=args.data_dir + '/train.json',
                                     ast_path=args.data_dir + '/tree/train/',
                                     ast2id=ast2id,
                                     nl2id=nl2id,
                                     max_ast_size=args.code_max_len,
                                     max_simple_name_size=args.max_simple_name_len,
                                     k=args.k,
                                     max_comment_size=args.comment_max_len,
                                     use_code=True,
                                     desc=config['valid_desc'],
                                     desclen=config['desc_len']
                                     )
        data_loader = DataLoaderX(dataset=valid_data_set,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=2)
        accs, mrrs, maps, ndcgs = [], [], [], []
        code_reprs, desc_reprs = [], []
        n_processed = 0
        for batch in tqdm(data_loader):
            if len(batch) == 8:  # seq_tensor, rel_par, rel_bro, rel_semantic, descs, desc_len, bad_descs, bad_desc_len
                code_batch = [tensor.to(device).long() for tensor in batch[:4]]
                desc_batch = [tensor.to(device).long() for tensor in batch[4:6]]
            with torch.no_grad():

                code_repr = addCodeMaskToCalcuCodeRepr(model, *code_batch).data.cpu().numpy().astype(np.float32)
                desc_repr = model.desc_encoding(*desc_batch).data.cpu().numpy().astype(
                    np.float32)  # [poolsize x hid_size]
                if sim_measure == 'cos':
                    code_repr = normalize(code_repr)
                    desc_repr = normalize(desc_repr)
            code_reprs.append(code_repr)
            desc_reprs.append(desc_repr)
            n_processed += batch[0].size(0)
        code_reprs, desc_reprs = np.vstack(code_reprs), np.vstack(desc_reprs)
        n_processed -= (n_processed % pool_size)  # truncate to a whole number of evaluation pools
        for k in tqdm(range(0, n_processed - pool_size, pool_size)):
            code_pool, desc_pool = code_reprs[k:k + pool_size], desc_reprs[k:k + pool_size]
            hit_count = 0.0  # pairs whose matched cosine similarity exceeds the threshold
            for i in range(min(10000, pool_size)):  # for i in range(pool_size):
                desc_vec = np.expand_dims(desc_pool[i], axis=0)  # [1 x dim]

                if sim_measure == 'cos':
                    sims = np.dot(code_pool, desc_vec.T)[:, 0]  # [pool_size]
                else:
                    sims = similarity(code_pool, desc_vec, sim_measure)  # [pool_size]
                if sims[i] > 0.4:
                    hit_count += 1
                # negsims=np.negative(sims.T)
                # predict = np.argpartition(negsims, kth=n_results-1)#predict=np.argsort(negsims)#
                # predict = predict[:n_results]
                #
                # predict = [int(k) for k in predict]
                # real = [i]
                # for val in real:
                #     try:
                #         index = predict.index(val)
                #     except ValueError:
                #         index = -1
                #     if index != -1: sum = sum + 1

            accs.append(hit_count / float(pool_size))
            # accs.append(ACC(real,predict))
            # mrrs.append(MRR(real,predict))
            # maps.append(MAP(real,predict))
            # ndcgs.append(NDCG(real,predict))
        logger.info({'acc': np.mean(accs), 'err': 1 - np.mean(accs)})
        return {'acc': np.mean(accs), 'err': 1 - np.mean(accs)}
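The commented-out block above sketches the more common pool-based retrieval evaluation (rank the paired code snippet among the pool instead of thresholding the raw cosine score). A minimal, self-contained version of that metric, assuming L2-normalized numpy matrices shaped like code_pool and desc_pool above (the function name and the top-k cutoff are illustrative):

import numpy as np

def recall_at_k(code_pool, desc_pool, k=10):
    """Fraction of descriptions whose paired code (same row index) ranks in the top k by cosine score."""
    sims = desc_pool @ code_pool.T            # [pool_size, pool_size]; rows are description queries
    ranks = np.argsort(-sims, axis=1)         # candidate codes sorted by descending similarity
    hits = [i in ranks[i, :k] for i in range(desc_pool.shape[0])]
    return float(np.mean(hits))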
Exemplo n.º 22
0
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(
        shape=[None, config.N * config.M,
               40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            tf.contrib.rnn.LSTMCell(num_units=config.hidden,
                                    num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(
        lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(
        loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]
                     ] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(
        zip(grads_rescale,
            vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(
        np.array([
            np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars
        ]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()
    iter = 0

    # training session
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if config.restore:

            # Restore saved model if the user requested it, default = True
            try:
                ckpt = tf.train.latest_checkpoint(
                    checkpoint_dir=os.path.join(path, "Check_Point"))

                #                if (checkpoint_state and checkpoint_state.model_checkpoint_path):
                #                    print('Loading checkpoint {}'.format(checkpoint_state.model_checkpoint_path))
                #saver = tf.train.import_meta_graph(os.path.join(path,"Check_Point/model.cpkt.meta"))

                #ckpt = tf.train.load_checkpoint(os.path.join(path,"Check_Point/model"))
                saver.restore(sess, ckpt)

#                else:
#                    print('No model to load at {}'.format(save_dir))

#                    saver.save(sess, checkpoint_path, global_step=global_step)

            except Exception as e:
                print('Cannot restore checkpoint:', e)

        #if loaded == 0:
        #    raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")

        #print("train file path : ", config.test_path)

        else:

            os.makedirs(os.path.join(path, "Check_Point"),
                        exist_ok=True)  # make folder to save model
            os.makedirs(os.path.join(path, "logs"),
                        exist_ok=True)  # make folder to save log

        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor ( 1/2 per 10000 iteration)
        loss_acc = 0  # accumulated loss ( for running average of loss)
        iter = 0
        training_data_size = len(os.listdir(config.train_path))
        print("train_size: ", training_data_size)
        prev_iter = -1

        #        while iter  < config.iteration :
        while iter < config.iteration:
            prev_iter = iter

            # run forward and backward propagation and update parameters
            iter, _, loss_cur, summary = sess.run(
                [global_step, train_op, loss, merged],
                feed_dict={
                    batch: random_batch(),
                    lr: config.lr * lr_factor
                })

            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if (iter - prev_iter > 1):
                epoch = config.N * (iter + 1) // training_data_size
                #lr_factor = lr_factor / (2**(epoch//100))
                lr_factor = lr_factor / (2**(iter // 10000))
                print("restored epoch:", epoch)
                print("restored learning rate:", lr_factor * config.lr)

            #if iter % 1000 == 0:
            #    writer.add_summary(summary, iter)   # write at tensorboard
            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss

            #if config.N * (iter+1) % training_data_size == 0:
            #    epoch = epoch + 1
            #    print("epoch: ", epoch)

            if (iter + 1) % 10000 == 0:
                lr_factor /= 2
                print("learning rate is decayed! current lr : ",
                      config.lr * lr_factor)

            #if ((config.N * (iter+1)) / training_data_size)%100  == 0:
            #    lr_factor = lr_factor / 2
            #    print("learning factor: " , lr_factor)
            #    print("learning rate is decayed! current lr : ", config.lr*lr_factor)

            if (iter + 1) % 5000 == 0:
                saver.save(sess,
                           os.path.join(path, "Check_Point/model.ckpt"),
                           global_step=iter)
                writer.add_summary(summary, iter)  # write at tensorboard
                print("model is saved!")
Exemplo n.º 23
0
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(
        shape=[None, config.N * config.M,
               40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            tf.contrib.rnn.LSTMCell(num_units=config.hidden,
                                    num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(
        lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(
        loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]
                     ] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(
        zip(grads_rescale,
            vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(
        np.array([
            np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars
        ]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    # with tf.Session() as sess:
    gpu_options = tf.GPUOptions(allow_growth=True)
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        tf.global_variables_initializer().run()
        os.makedirs(os.path.join(path, "Check_Point"),
                    exist_ok=True)  # make folder to save model
        os.makedirs(os.path.join(path, "logs"),
                    exist_ok=True)  # make folder to save log
        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor ( 1/2 per 10000 iteration)
        loss_acc = 0  # accumulated loss ( for running average of loss)

        train_times = []  # per-iteration timing records: "start_end_elapsed" (added 2020/05/20)
        total_times = 0  # cumulative training time in seconds (added 2020/05/20)
        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            # record the start time of this training iteration
            begin_time = time.clock()
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                            feed_dict={
                                                batch: random_batch(),
                                                lr: config.lr * lr_factor
                                            })
            # record the end time of this training iteration
            end_time = time.clock()
            total_times += end_time - begin_time
            train_times.append(
                str(begin_time) + '_' + str(end_time) + '_' +
                str(end_time - begin_time))  # "start_end_elapsed" record

            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)  # write at tensorboard
            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss
                print("iter: {}, elapsed: {}s".format(
                    iter, str(end_time - begin_time)))
            if (iter + 1) % 10000 == 0:
                lr_factor /= 2  # lr decay
                print("learning rate is decayed! current lr : ",
                      config.lr * lr_factor)
            if (iter + 1) % 10000 == 0:
                saver.save(sess,
                           os.path.join(path, "./Check_Point/model.ckpt"),
                           global_step=iter // 10000)
                print("model is saved!")
        # save the final model
        saver.save(sess,
                   os.path.join(path, "./Check_Point/model.ckpt"),
                   global_step=iter)
        print("model is saved!")

        # write the timing records to a file
        with open('GE2E_epoch_speakers{}_batch_speakers{}_utts_per_speaker{}_iter{}_timing.txt'.format(
                config.spk_num, config.N, config.M, config.iteration),
                  mode='w',
                  encoding='utf-8') as wf:
            wf.write(
                "epoch speakers: {}; batch speakers: {}; utterances per speaker: {}; "
                "total iterations: {}; average time per training iteration: {}s\n".format(
                    config.spk_num, config.N, config.M, config.iteration,
                    total_times / config.iteration))
            wf.write("start_time_end_time_elapsed\n")
            for line in train_times:
                wf.write(line + '\n')
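A side note on the timing code above: time.clock() was deprecated and removed in Python 3.8. A small context-manager alternative that records the same (start, end, elapsed) triples with time.perf_counter(), offered as a sketch rather than as part of the original script:

import time
from contextlib import contextmanager

@contextmanager
def timed(records):
    """Append a (start, end, elapsed) wall-clock record, in seconds, for the wrapped block."""
    start = time.perf_counter()
    try:
        yield
    finally:
        end = time.perf_counter()
        records.append((start, end, end - start))

# usage sketch:
# timings = []
# with timed(timings):
#     sess.run([train_op, loss, merged], feed_dict={...})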
Exemplo n.º 24
0
    def test(self):
        enroll = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32,
                                name="enroll")  # enrollment batch (time x batch x n_mel)
        verif = tf.placeholder(shape=[None, config.N * config.M, 40], dtype=tf.float32,
                               name="verif")  # verification batch (time x batch x n_mel)
        self.fingerprint_input = tf.concat([enroll, verif], axis=1, name="fingerprint_input")
        embedded = self.creat_model()
        enroll_embed = normalize(
            tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :], shape=[config.N, config.M, -1]), axis=1))
        # verification embedded vectors
        verif_embed = embedded[config.N * config.M:, :]
        similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)
        saver = tf.train.Saver(var_list=tf.global_variables())
        with tf.Session() as sess:
            tf.global_variables_initializer().run()
            # load model
            print("model path :", config.model_path)
            ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(config.model_path, "Check_Point"))
            ckpt_list = ckpt.all_model_checkpoint_paths
            loaded = 0
            for model in ckpt_list:
                if config.model_num == int(model[-1]):  # find ckpt file which matches configuration model number
                    print("ckpt file is loaded !", model)
                    loaded = 1
                    saver.restore(sess, model)  # restore variables from selected ckpt file
                    break
            if loaded == 0:
                raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")
            print("test file path : ", "data/test")
            # return similarity matrix after enrollment and verification
            time1 = time.time()  # for check inference time

            S = sess.run(similarity_matrix, feed_dict={enroll: random_batch(shuffle=False),
                                                       verif: random_batch(shuffle=False, utter_start=config.M)})
            S = S.reshape([config.N, config.M, -1])
            time2 = time.time()
            np.set_printoptions(precision=2)
            print("inference time for %d utterences : %0.2fs" % (2 * config.M * config.N, time2 - time1))
            print(S)  # print similarity matrix
            # calculating EER
            diff = 1
            EER = 0
            EER_thres = 0
            EER_FAR = 0
            EER_FRR = 0
            # through thresholds calculate false acceptance ratio (FAR) and false reject ratio (FRR)
            for thres in [0.01 * i + 0.5 for i in range(50)]:
                S_thres = S > thres
                # False acceptance ratio = false acceptance / mismatched population (enroll speaker != verification speaker)
                FAR = sum([np.sum(S_thres[i]) - np.sum(S_thres[i, :, i]) for i in range(config.N)]) / (
                        config.N - 1) / config.M / config.N
                # False reject ratio = false reject / matched population (enroll speaker = verification speaker)
                FRR = sum([config.M - np.sum(S_thres[i][:, i]) for i in range(config.N)]) / config.M / config.N
                # Save threshold when FAR = FRR (=EER)
                if diff > abs(FAR - FRR):
                    diff = abs(FAR - FRR)
                    EER = (FAR + FRR) / 2
                    EER_thres = thres
                    EER_FAR = FAR
                    EER_FRR = FRR
            print("\nEER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)" % (EER, EER_thres, EER_FAR, EER_FRR))
Exemplo n.º 25
0
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(
        shape=[None, config.N * config.M, 40],
        dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(
        shape=[None, config.N * config.M, 40],
        dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            tf.contrib.rnn.LSTMCell(num_units=config.hidden,
                                    num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(
        tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :],
                                  shape=[config.N, config.M, -1]),
                       axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed,
                                   w=1.,
                                   b=0.,
                                   center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(
            checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(
                    model[-1]
            ):  # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(
                    sess, model)  # restore variables from selected ckpt file
                break

        if loaded == 0:
            raise AssertionError(
                "ckpt file does not exist! Check config.model_num or config.model_path."
            )

        print("test file path : ", config.test_path)
        '''
            test speaker:p225--p243
        '''
        # return similarity matrix after enrollment and verification
        time1 = time.time()  # for check inference time
        if config.tdsv:
            S = sess.run(similarity_matrix,
                         feed_dict={
                             enroll: random_batch(shuffle=False,
                                                  noise_filenum=1),
                             verif: random_batch(shuffle=False,
                                                 noise_filenum=2)
                         })
        else:
            S = sess.run(similarity_matrix,
                         feed_dict={
                             enroll:
                             random_batch(shuffle=False),
                             verif:
                             random_batch(shuffle=False, utter_start=config.M)
                         })
        S = S.reshape([config.N, config.M, -1])
        time2 = time.time()

        np.set_printoptions(precision=2)
        print("inference time for %d utterences : %0.2fs" %
              (2 * config.M * config.N, time2 - time1))
        print(S)  # print similarity matrix

        # calculating EER
        diff = 1
        EER = 0
        EER_thres = 0
        EER_FAR = 0
        EER_FRR = 0

        # through thresholds calculate false acceptance ratio (FAR) and false reject ratio (FRR)
        for thres in [0.01 * i + 0.5 for i in range(50)]:
            S_thres = S > thres

            # False acceptance ratio = false acceptance / mismatched population (enroll speaker != verification speaker)
            FAR = sum([
                np.sum(S_thres[i]) - np.sum(S_thres[i, :, i])
                for i in range(config.N)
            ]) / (config.N - 1) / config.M / config.N

            # False reject ratio = false reject / matched population (enroll speaker = verification speaker)
            FRR = sum(
                [config.M - np.sum(S_thres[i][:, i])
                 for i in range(config.N)]) / config.M / config.N

            # Save threshold when FAR = FRR (=EER)
            if diff > abs(FAR - FRR):
                diff = abs(FAR - FRR)
                EER = (FAR + FRR) / 2
                EER_thres = thres
                EER_FAR = FAR
                EER_FRR = FRR

        print("\nEER : %0.2f (thres:%0.2f, FAR:%0.2f, FRR:%0.2f)" %
              (EER, EER_thres, EER_FAR, EER_FRR))
Exemplo n.º 26
0
def get_embeddings(path, args):
    tf.reset_default_graph()  # reset graph
    if args.time_string is None:
        raise ValueError('must provide valid time_string')

    emb_dir = os.path.join(path, 'embeddings')
    os.makedirs(emb_dir, exist_ok=True)
    meta_path = os.path.join(emb_dir, 'meta.tsv')

    emb_path = os.path.join(
        emb_dir, 'emb_emt.tsv') if args.model_type == 'emt' else os.path.join(
            emb_dir, 'emb_spk.tsv')

    # draw graph
    feeder = Feeder(args.train_filename, args, hparams)
    datasets = ['emt4', 'vctk'] if args.model_type == 'emt' else ['vctk']
    num_datasets = len(datasets)

    batch = tf.placeholder(
        shape=[num_datasets * args.N * args.M, None, config.n_mels],
        dtype=tf.float32)  # input batch (time x batch x n_mel)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedded = triple_lstm(batch)
    print("{} Discriminator Model".format(args.model_type))
    encoder = ReferenceEncoder(
        filters=hparams.reference_filters,
        kernel_size=(3, 3),
        strides=(2, 2),
        is_training=True,
        scope='Tacotron_model/inference/pretrained_ref_enc_{}'.format(
            args.model_type),
        depth=hparams.reference_depth)  # [N, 128])
    embedded = encoder(batch)

    # loss
    sim_matrix = similarity(embedded,
                            w,
                            b,
                            num_datasets * args.N,
                            args.M,
                            P=hparams.reference_depth)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix,
                    num_datasets * args.N,
                    args.M,
                    type=config.loss)

    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        checkpoint_folder = os.path.join(path, "checkpoints", args.time_string)

        checkpoint_state = tf.train.get_checkpoint_state(checkpoint_folder)
        if (checkpoint_state and checkpoint_state.model_checkpoint_path):
            print('Loading checkpoint {}'.format(
                checkpoint_state.model_checkpoint_path))
            saver.restore(sess, checkpoint_state.model_checkpoint_path)
        else:
            raise ValueError(
                'No model to load at {}'.format(checkpoint_folder))
        feeder_batch, meta = feeder.emb_batch(make_meta=True,
                                              datasets=datasets)
        emb, loss = sess.run([embedded, loss], feed_dict={batch: feeder_batch})
        print("loss: {:.4f}".format(loss))
        meta.to_csv(meta_path, sep='\t', index=False)
        pd.DataFrame(emb).to_csv(emb_path, sep='\t', index=False, header=False)
Exemplo n.º 27
0
    def get_similar_country_names(self, country_name):
        rate, most_similars = utils.most_similar(
            utils.similarity(self.normalized_country_items.keys(), country_name, 0.7))
        return most_similars
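utils.similarity and utils.most_similar are project-specific helpers whose exact behavior is not shown here. A self-contained, standard-library approximation of the same ratio-thresholded fuzzy lookup (a hypothetical stand-in, not the project's implementation):

import difflib

def similar_names(candidates, query, min_ratio=0.7):
    """Return the candidate strings closest to `query` whose match ratio meets the threshold."""
    scored = [(difflib.SequenceMatcher(None, query.lower(), c.lower()).ratio(), c)
              for c in candidates]
    best_ratio = max((r for r, _ in scored), default=0.0)
    return [c for r, c in scored if r >= min_ratio and r == best_ratio]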
Exemplo n.º 28
0
def test(path):
    start_test_time = time.time()
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(shape=[None, config.N*config.M, 40], dtype=tf.float32) # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(shape=[None, config.N*config.M, 40], dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [tf.contrib.rnn.LSTMCell(num_units=config.hidden, num_proj=config.proj) for i in range(config.num_layer)]
        lstm = tf.contrib.rnn.MultiRNNCell(lstm_cells)    # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(cell=lstm, inputs=batch, dtype=tf.float32, time_major=True)   # for TI-VS must use dynamic rnn
        embedded = outputs[-1]                            # the last output is the embedded d-vector
        embedded = normalize(embedded)                    # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(tf.reduce_mean(tf.reshape(embedded[:config.N*config.M, :], shape= [config.N, config.M, -1]), axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N*config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed, w=1., b=0., center=enroll_embed)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model
        print("model path :", path)
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=os.path.join(path, "Check_Point"))
        ckpt_list = ckpt.all_model_checkpoint_paths
        loaded = 0
        for model in ckpt_list:
            if config.model_num == int(model[-1]):    # find ckpt file which matches configuration model number
                print("ckpt file is loaded !", model)
                loaded = 1
                saver.restore(sess, model)  # restore variables from selected ckpt file
                break

        if loaded == 0:
            raise AssertionError("ckpt file does not exist! Check config.model_num or config.model_path.")


        # return similarity matrix after enrollment and verification
        enroll_batchs = {}
        for folder in os.listdir(config.enroll_path):
            enroll_dir = os.path.join(config.enroll_path, folder)
            if not os.path.isdir(enroll_dir):
                continue
            enroll_batchs[folder] = random_batch(path=enroll_dir, shuffle=False)

        after_enroll_time = time.time()
        num_verification = 0
        if config.tdsv:
            for folder in os.listdir(config.verification_path):
                verification_dir = os.path.join(config.verification_path, folder)
                if not os.path.isdir(verification_dir):
                    continue
                print('Verification Result of ' + folder)
                num_verification += 1
                verification_batch = random_batch(path=verification_dir, shuffle=False)
                for key in enroll_batchs.keys():
                    enroll_batch = enroll_batchs[key]
                    S = sess.run(similarity_matrix, feed_dict={enroll: enroll_batch, verif: verification_batch})
                    S = S.reshape([config.N, config.M, -1])
                    np.set_printoptions(precision=2)
                    print('Score between ' + folder + '-' + key)
                    print(np.mean(S))    # print similarity matrix
    duration = time.time() - start_test_time
    print('Test duration: ' + str(duration) + 's')
    if num_verification > 0:  # guard against division by zero when no verification folders were processed
        avg_verification_duration = (time.time() - after_enroll_time) / num_verification
        print('Average verification duration: ' + str(avg_verification_duration) + 's')
Exemplo n.º 29
0
def test(path):
    tf.reset_default_graph()

    # draw graph
    enroll = tf.placeholder(
        shape=[None, config.N * config.M, 40],
        dtype=tf.float32)  # enrollment batch (time x batch x n_mel)
    verif = tf.placeholder(
        shape=[None, config.N * config.M, 40],
        dtype=tf.float32)  # verification batch (time x batch x n_mel)
    batch = tf.concat([enroll, verif], axis=1)

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            tf.contrib.rnn.LSTMCell(num_units=config.hidden,
                                    num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_cells)  # make lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize

    print("embedded size: ", embedded.shape)

    # enrollment embedded vectors (speaker model)
    enroll_embed = normalize(
        tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :],
                                  shape=[config.N, config.M, -1]),
                       axis=1))
    # verification embedded vectors
    verif_embed = embedded[config.N * config.M:, :]

    similarity_matrix = similarity(embedded=verif_embed,
                                   w=1.,
                                   b=0.,
                                   center=enroll_embed)
    loss = loss_cal(similarity_matrix, type=config.loss)

    saver = tf.train.Saver(var_list=tf.global_variables())
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        # load model

        #ckpt = tf.train.get_checkpoint_state(path)
        #checkpoints =  ckpt.all_model_checkpoint_paths
        i = 139999
        least_loss = 99999
        perfect_step = i  # checkpoint step with the lowest loss seen so far
        #print("checkpoints : ",checkpoints)

        while (i < 399999):
            saver.restore(sess, os.path.join(path, "model.ckpt-" + str(i)))

            S, L = sess.run(
                [similarity_matrix, loss],
                feed_dict={
                    enroll: random_batch(shuffle=False),
                    verif: random_batch(shuffle=False, utter_start=config.M)
                })
            S = S.reshape([config.N, config.M, -1])
            print("test file path : ", config.test_path)
            np.set_printoptions(precision=2)
            #print(S)

            if L < least_loss:
                #diff = abs(FAR-FRR)
                perfect_step = i
                least_loss = L

            print(i)
            print(str(L / (config.N * config.M)))
            i = i + 2500

        print("\ncheckpoint: " + str(perfect_step) + " (loss:%0.2f)" %
              (least_loss))
Exemplo n.º 30
0
    def run_one_step(self, generation_step):
        population_size = utils.profile_value(self.conf['population_profile'],
                                              generation_step,
                                              self.conf['num_generations'])
        n_children = max(1, int(self.conf['birth_rate'] * population_size))
        intra_depot = generation_step % self.conf['extra_depot_every'] != 0

        L = []
        for _ in range(n_children):
            pool = self.selection()
            p1 = np.random.choice(pool)
            similarities = [
                utils.similarity(p1, each) for each in self.population
            ]
            p2 = self.population[np.argmin(similarities)]

            # NOTE: the line below overrides the similarity-based choice of p2
            # above and simply draws both parents at random from the pool.
            p1, p2 = np.random.choice(pool, 2, replace=False)
            offspring = self.create_offspring(p1, p2)
            L.append(offspring)

        L = np.concatenate((L, self.population))

        for each in L:
            if each.total_violation() > 0 and np.random.choice(
                (0, 1), p=(0.8, 0.2)):
                each.repair()
            if np.random.choice((0, 1), p=(.9, .1)):
                self.mutation(each, True)

        scores = np.argsort(
            [each.fitness_score(self.penalty_multiplier) for each in L])
        if scores[0] != self.best_score:
            self.best_score = scores[0]
        self.best_count += 1
        acis = [
            each.average_capacity_infeasibility() for each in self.population
        ]
        self.mean_violations.append(np.mean(acis))
        if len(self.mean_violations) > 20:
            self.mean_violations = self.mean_violations[-20:]
        '''
        Penalty parameter adjustment
        '''
        if generation_step % 4 == 0:

            prop_feasible = np.mean(np.array(self.mean_violations) == 0)
            if prop_feasible - 0.05 > self.conf['fraction_feasible_population']:
                # allow more violations -- except if the multiplier is low
                if not self.penalty_multiplier <= 1:
                    self.penalty_multiplier *= 0.85
            elif prop_feasible + 0.05 < self.conf[
                    'fraction_feasible_population']:
                # allow fewer violations
                self.penalty_multiplier *= 1.2

        self.best = scores[0]
        if self.best_count >= 100:
            print('hey now I change')
            old_population = L[scores[:population_size // 4]]
            new_population = self.generate_initial_population(
                population_size - len(old_population))
            self.population = np.concatenate((old_population, new_population))
            self.best_count = 0
        else:
            # ipdb.set_trace()
            self.population = L[scores[:population_size]]
Exemplo n.º 31
0
def main():
    if os.path.exists(config.use_output_path):
        os.system('rm ' + config.use_output_path)
    with open(config.use_output_path, 'a') as g:
        g.write(str(config) + '\n\n')
    # for item in config.record_time:
    # 	if os.path.exists(config.use_output_path + str(item)):
    # 		os.system('rm ' + config.use_output_path + str(item))
    #CGMH sampling for paraphrase
    sim = config.sim
    # sta_vec=list(np.zeros([config.num_steps-1]))
    config.shuffle = False
    #original sentence input
    use_data = dataset_str(config.use_data_path)
    config.batch_size = 1
    step_size = config.step_size

    start_time = time.time()
    proposal_cnt = 0
    accept_cnt = 0
    all_samples = []
    all_acc_samples = []
    all_chosen_samples = []
    for sen_id in range(use_data.length):
        sent_ids = use_data.token_ids[sen_id]
        keys = use_data.keys[sen_id]
        searcher = ConstraintSearch(keys)
        sequence_length = len(sent_ids)
        #generate for each sentence
        sta_vec = np.zeros(sequence_length)
        input_ids = np.array(sent_ids)
        input_original = use_data.tokens[sen_id]
        prev_inds = []
        old_prob = def_sent_scorer(tokenizer.decode(input_ids))
        old_prob *= penalty_constraint(
            searcher.count_unsafisfied_constraint(
                searcher.sent2tag(input_ids)))
        if sim is not None:
            old_prob *= similarity(input_ids, input_original, sta_vec)

        outputs = []
        output_p = []
        for iter in range(config.sample_time):
            # if iter in config.record_time:
            # 	with open(config.use_output_path, 'a', encoding='utf-8') as g:
            # 		g.write(bert_scorer.tokenizer.decode(input_ids)+'\n')
            # print(bert_scorer.tokenizer.decode(input_ids).encode('utf8', errors='ignore'))
            pos_set = get_sample_positions(sequence_length, prev_inds,
                                           step_size)
            action_set = [
                choose_action(config.action_prob) for i in range(len(pos_set))
            ]
            # if not check_constraint(input_ids):
            # 	if 0 not in pos_set:
            # 		pos_set[-1] = 0
            keep_non = config.keep_non
            masked_sent, adjusted_pos_set = mask_sentence(
                input_ids, pos_set, action_set)
            prev_inds = pos_set

            proposal_prob = 1.0  # Q(x'|x)
            proposal_prob_reverse = 1.0  # Q(x|x')
            input_ids_tmp = np.array(masked_sent)  # copy
            sequence_length_tmp = sequence_length

            for step_i in range(len(pos_set)):

                ind = adjusted_pos_set[step_i]
                ind_old = pos_set[step_i]
                action = action_set[step_i]
                if config.restrict_constr:
                    if step_i == len(pos_set) - 1:
                        use_constr = True
                    else:
                        use_constr = False
                else:
                    use_constr = True
                #word replacement (action: 0)
                if action == 0:
                    prob_mask = bert_scorer.mask_score(input_ids_tmp,
                                                       ind,
                                                       mode=0)
                    input_candidate, prob_candidate, reverse_candidate_idx, _ = \
                     generate_candidate_input_with_mask(input_ids_tmp, sequence_length_tmp, ind, prob_mask, config.search_size,
                                                        old_tok=input_ids[ind_old], mode=action)
                    if sim is not None and use_constr:
                        similarity_candidate = similarity_batch(
                            input_candidate, input_original, sta_vec)
                        prob_candidate = prob_candidate * similarity_candidate
                    prob_candidate_norm = normalize(prob_candidate)
                    prob_candidate_ind = sample_from_candidate(
                        prob_candidate_norm)
                    input_ids_tmp = input_candidate[
                        prob_candidate_ind]  # changed
                    proposal_prob *= prob_candidate_norm[
                        prob_candidate_ind]  # Q(x'|x)
                    proposal_prob_reverse *= prob_candidate_norm[
                        reverse_candidate_idx]  # Q(x|x')
                    sequence_length_tmp += 0
                    print('action:0', prob_candidate_norm[prob_candidate_ind],
                          prob_candidate_norm[reverse_candidate_idx])

                #word insertion(action:1)
                if action == 1:
                    prob_mask = bert_scorer.mask_score(input_ids_tmp,
                                                       ind,
                                                       mode=0)

                    input_candidate, prob_candidate, reverse_candidate_idx, non_idx = \
                     generate_candidate_input_with_mask(input_ids_tmp, sequence_length_tmp, ind, prob_mask, config.search_size,
                                                        mode=action, old_tok=input_ids[ind_old], keep_non=keep_non)

                    if sim is not None and use_constr:
                        similarity_candidate = similarity_batch(
                            input_candidate, input_original, sta_vec)
                        prob_candidate = prob_candidate * similarity_candidate
                    prob_candidate_norm = normalize(prob_candidate)
                    prob_candidate_ind = sample_from_candidate(
                        prob_candidate_norm)
                    input_ids_tmp = input_candidate[prob_candidate_ind]
                    if prob_candidate_ind == non_idx:
                        if input_ids_tmp[-1] == PAD_IDX:
                            input_ids_tmp = input_ids_tmp[:-1]
                        print('action:1 insert non', 1.0, 1.0)
                    else:
                        proposal_prob *= prob_candidate_norm[
                            prob_candidate_ind]  # Q(x'|x)
                        proposal_prob_reverse *= 1.0  # Q(x|x'), reverse action is deleting
                        sequence_length_tmp += 1
                        print('action:1',
                              prob_candidate_norm[prob_candidate_ind], 1.0)

                #word deletion(action: 2)
                if action == 2:
                    input_ids_for_del = np.concatenate(
                        [input_ids_tmp[:ind], [MASK_IDX], input_ids_tmp[ind:]])
                    if keep_non:
                        non_cand = np.array(input_ids_for_del)
                        non_cand[ind] = input_ids[ind_old]
                        input_candidate = np.array([input_ids_tmp, non_cand])
                        prob_candidate = np.array([
                            bert_scorer.sent_score(x) for x in input_candidate
                        ])
                        non_idx = 1
                        if sim is not None and use_constr:
                            similarity_candidate = similarity_batch(
                                input_candidate, input_original, sta_vec)
                            prob_candidate = prob_candidate * similarity_candidate
                        prob_candidate_norm = normalize(prob_candidate)
                        prob_candidate_ind = sample_from_candidate(
                            prob_candidate_norm)
                        input_ids_tmp = input_candidate[prob_candidate_ind]
                    else:
                        non_idx = -1
                        prob_candidate_ind = 0
                        input_ids_tmp = input_ids_tmp  # already deleted

                    if prob_candidate_ind == non_idx:
                        print('action:2 delete non', 1.0, 1.0)
                    else:
                        # add mask, for evaluating reverse probability
                        prob_mask = bert_scorer.mask_score(input_ids_for_del,
                                                           ind,
                                                           mode=0)
                        input_candidate, prob_candidate, reverse_candidate_idx, _ = \
                         generate_candidate_input_with_mask(input_ids_for_del, sequence_length_tmp, ind, prob_mask,
                                                            config.search_size, mode=0, old_tok=input_ids[ind_old])

                        if sim is not None:
                            similarity_candidate = similarity_batch(
                                input_candidate, input_original, sta_vec)
                            prob_candidate = prob_candidate * similarity_candidate
                        prob_candidate_norm = normalize(prob_candidate)

                        proposal_prob *= 1.0  # Q(x'|x)
                        proposal_prob_reverse *= prob_candidate_norm[
                            reverse_candidate_idx]  # Q(x|x'), reverse action is inserting
                        sequence_length_tmp -= 1

                        print('action:2', 1.0,
                              prob_candidate_norm[reverse_candidate_idx])

            new_prob = def_sent_scorer(tokenizer.decode(input_ids_tmp))
            new_prob *= penalty_constraint(
                searcher.count_unsafisfied_constraint(
                    searcher.sent2tag(input_ids_tmp)))
            if sim is not None:
                sim_constr = similarity(input_ids_tmp, input_original, sta_vec)
                new_prob *= sim_constr
            input_text_tmp = tokenizer.decode(input_ids_tmp)
            all_samples.append([
                input_text_tmp, new_prob,
                searcher.count_unsafisfied_constraint(
                    searcher.sent2tag(input_ids_tmp)),
                bert_scorer.sent_score(input_ids_tmp, log_prob=True),
                gpt2_scorer.sent_score(input_text_tmp, ppl=True)
            ])
            if tokenizer.decode(input_ids_tmp) not in output_p:
                outputs.append(all_samples[-1])
            if outputs != []:
                output_p.append(outputs[-1][0])
            if proposal_prob == 0.0 or old_prob == 0.0:
                alpha_star = 1.0
            else:
                alpha_star = (proposal_prob_reverse *
                              new_prob) / (proposal_prob * old_prob)
            alpha = min(1, alpha_star)
            print(
                tokenizer.decode(input_ids_tmp).encode('utf8',
                                                       errors='ignore'))
            print(alpha, old_prob, proposal_prob, new_prob,
                  proposal_prob_reverse)
            proposal_cnt += 1
            if choose_action([alpha, 1 - alpha]) == 0 and (
                    new_prob > old_prob * config.threshold or just_acc() == 0):
                if tokenizer.decode(input_ids_tmp) != tokenizer.decode(
                        input_ids):
                    accept_cnt += 1
                    print('Accept')
                    all_acc_samples.append(all_samples[-1])
                input_ids = input_ids_tmp
                sequence_length = sequence_length_tmp
                old_prob = new_prob

        # choose output from samples
        for num in range(config.min_length, 0, -1):
            outputss = [x for x in outputs if len(x[0].split()) >= num]
            print(num, outputss)
            if outputss != []:
                break
        if outputss == []:
            outputss.append([tokenizer.decode(input_ids), 0])
        outputss = sorted(outputss, key=lambda x: x[1])[::-1]
        with open(config.use_output_path, 'a') as g:
            g.write(outputss[0][0] + '\t' + str(outputss[0][1]) + '\n')
        all_chosen_samples.append(outputss[0])

        print('Sentence %d, used time %.2f\n' %
              (sen_id, time.time() - start_time))
    print(proposal_cnt, accept_cnt, float(accept_cnt / proposal_cnt))

    print("All samples:")
    all_samples_ = list(zip(*all_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    print("All accepted samples:")
    all_samples_ = list(zip(*all_acc_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    print("All chosen samples:")
    all_samples_ = list(zip(*all_chosen_samples))
    for metric in all_samples_[1:]:
        print(np.mean(np.array(metric)))

    with open(config.use_output_path + '-result.csv', 'w', newline='') as f:
        csv_writer = csv.writer(f, delimiter='\t')
        csv_writer.writerow(
            ['Sentence', 'Prob_sim', 'Constraint_num', 'Log_prob', 'PPL'])
        csv_writer.writerows(all_samples)
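The acceptance test above (alpha_star and the choose_action draw) is the standard Metropolis-Hastings ratio alpha = min(1, Q(x|x') * pi(x') / (Q(x'|x) * pi(x))). A standalone sketch of that step, with the same zero-probability fallback used above (function and argument names are illustrative):

import numpy as np

def mh_accept(old_prob, new_prob, proposal_prob, proposal_prob_reverse):
    """Return True if the proposed state x' should be accepted."""
    if proposal_prob == 0.0 or old_prob == 0.0:
        alpha = 1.0  # degenerate case: accept, as in the sampling loop above
    else:
        alpha = min(1.0, (proposal_prob_reverse * new_prob) / (proposal_prob * old_prob))
    return np.random.random() < alpha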
Exemplo n.º 32
0
# print(len(load_files_to_string(data+"/Supergirl")))

sum_list = batman_beyond_list + batman_animated_list + dharma_greg_list + teen_titans_list + will_grace_list + \
           superman_list + supergirl_list
all_words = list(set(sum_list))

batman_beyond_bow = create_bag_of_words(batman_beyond, all_words)
batman_animated_bow = create_bag_of_words(batman_animated, all_words)
dharma_greg_bow = create_bag_of_words(dharma_greg, all_words)
teen_titans_bow = create_bag_of_words(teen_titans, all_words)
will_grace_bow = create_bag_of_words(will_grace, all_words)
superman_bow = create_bag_of_words(superman, all_words)
supergirl_bow = create_bag_of_words(supergirl, all_words)

print("Batman i batman")
print(similarity(batman_beyond_bow, batman_animated_bow))
print("Batman i dharma")
print(similarity(batman_beyond_bow, dharma_greg_bow))
print("Will i dharma")
print(similarity(will_grace_bow, dharma_greg_bow))
print("Batman i teen titans")
print(similarity(batman_beyond_bow, teen_titans_bow))
print("Superman i supergirl")
print(similarity(superman_bow, supergirl_bow))
print("Superman i dharma")
print(similarity(superman_bow, dharma_greg_bow))
print("Superman i batman")
print(similarity(superman_bow, batman_animated_bow))

bow_list = [("Batman Beyond", batman_beyond_bow),
            ("Batman Animated", batman_animated_bow),
Exemplo n.º 33
0
        time_major=True)  # for TI-VS must use dynamic rnn
    embedded = outputs[-1]  # the last output is the embedded d-vector
    embedded = normalize(embedded)  # normalize

print("embedded size: ", embedded.shape)

# enrollment embedded vectors (speaker model)
enroll_embed = normalize(
    tf.reduce_mean(tf.reshape(embedded[:config.N * config.M, :],
                              shape=[config.N, config.M, -1]),
                   axis=1))
# verification embedded vectors
verif_embed = embedded[config.N * config.M:, :]

similarity_matrix = similarity(embedded=verif_embed,
                               w=1.,
                               b=0.,
                               center=enroll_embed)

saver = tf.train.Saver(var_list=tf.global_variables())
with tf.Session() as sess:
    tf.global_variables_initializer().run()

    # load model
    print("model path :", path)
    ckpt = tf.train.get_checkpoint_state(
        checkpoint_dir=os.path.join(path, "Check_Point"))
    ckpt_list = ckpt.all_model_checkpoint_paths
    loaded = 0
    for model in ckpt_list:
        if config.model_num == int(
                model[-1]
Exemplo n.º 34
0
def train(path):
    tf.reset_default_graph()  # reset graph

    # draw graph
    batch = tf.placeholder(
        shape=[None, config.N * config.M,
               40], dtype=tf.float32)  # input batch (time x batch x n_mel)
    lr = tf.placeholder(dtype=tf.float32)  # learning rate
    global_step = tf.Variable(0, name='global_step', trainable=False)
    w = tf.get_variable("w", initializer=np.array([10], dtype=np.float32))
    b = tf.get_variable("b", initializer=np.array([-5], dtype=np.float32))

    # embedding lstm (3-layer default)
    with tf.variable_scope("lstm"):
        lstm_cells = [
            tf.contrib.rnn.LSTMCell(num_units=config.hidden,
                                    num_proj=config.proj)
            for i in range(config.num_layer)
        ]
        lstm = tf.contrib.rnn.MultiRNNCell(
            lstm_cells)  # define lstm op and variables
        outputs, _ = tf.nn.dynamic_rnn(
            cell=lstm, inputs=batch, dtype=tf.float32,
            time_major=True)  # for TI-VS must use dynamic rnn
        embedded = outputs[-1]  # the last output is the embedded d-vector
        embedded = normalize(embedded)  # normalize
    print("embedded size: ", embedded.shape)

    # loss
    sim_matrix = similarity(embedded, w, b)
    print("similarity matrix size: ", sim_matrix.shape)
    loss = loss_cal(sim_matrix, type=config.loss)

    # optimizer operation
    trainable_vars = tf.trainable_variables()  # get variable list
    optimizer = optim(
        lr)  # get optimizer (type is determined by configuration)
    grads, vars = zip(*optimizer.compute_gradients(
        loss))  # compute gradients of variables with respect to loss
    grads_clip, _ = tf.clip_by_global_norm(grads, 3.0)  # l2 norm clipping by 3
    grads_rescale = [0.01 * grad for grad in grads_clip[:2]
                     ] + grads_clip[2:]  # smaller gradient scale for w, b
    train_op = optimizer.apply_gradients(
        zip(grads_rescale,
            vars), global_step=global_step)  # gradient update operation

    # check variables memory
    variable_count = np.sum(
        np.array([
            np.prod(np.array(v.get_shape().as_list())) for v in trainable_vars
        ]))
    print("total variables :", variable_count)

    # record loss
    loss_summary = tf.summary.scalar("loss", loss)
    merged = tf.summary.merge_all()
    saver = tf.train.Saver()

    # training session
    with tf.Session() as sess:
        tf.global_variables_initializer().run()

        if os.path.exists(path):
            print("Restore from {}".format(
                os.path.join(path, "Check_Point/model.ckpt-2")))
            saver.restore(sess, os.path.join(path, "Check_Point/model.ckpt-2")
                          )  # restore variables from selected ckpt file
        else:
            os.makedirs(os.path.join(path, "Check_Point"),
                        exist_ok=True)  # make folder to save model
            os.makedirs(os.path.join(path, "logs"),
                        exist_ok=True)  # make folder to save log

        writer = tf.summary.FileWriter(os.path.join(path, "logs"), sess.graph)
        epoch = 0
        lr_factor = 1  # lr decay factor ( 1/2 per 10000 iteration)
        loss_acc = 0  # accumulated loss ( for running average of loss)

        for iter in range(config.iteration):
            # run forward and backward propagation and update parameters
            _, loss_cur, summary = sess.run([train_op, loss, merged],
                                            feed_dict={
                                                batch: random_batch(),
                                                lr: config.lr * lr_factor
                                            })

            loss_acc += loss_cur  # accumulated loss for each 100 iteration

            if iter % 10 == 0:
                writer.add_summary(summary, iter)  # write at tensorboard
            if (iter + 1) % 100 == 0:
                print("(iter : %d) loss: %.4f" % ((iter + 1), loss_acc / 100))
                loss_acc = 0  # reset accumulated loss
            if (iter + 1) % 3000 == 0:
                lr_factor /= 2  # lr decay
                print("learning rate is decayed! current lr : ",
                      config.lr * lr_factor)
            if (iter + 1) % 2500 == 0:
                saver.save(sess,
                           os.path.join(path, "./Check_Point/model.ckpt"),
                           global_step=iter // 2500)
                print("model is saved!")
def found_similar(text, items):
    rate, similar_items = utils.most_similar(utils.similarity(items, text, min_ratio=0.96))

    if len(similar_items) > 0:
        return similar_items[0]