Example #1
def test_brae_corpus():
    source_phrase, target_phrase, src_tar_pair, src_word_dict, tar_word_dict = read_forced_decode("../data/fd.txt")
    config = BRAEConfig("../conf/brae.conf")
    src_embedding = WordEmbedding(src_word_dict, dim=50)
    tar_embedding = WordEmbedding(tar_word_dict, dim=50)
    brae = BilingualPhraseRAE(src_embedding, tar_embedding, config=config)
    brae.train_using_lbfgs(source_phrase, target_phrase, src_tar_pair)
Example #2
 def test_something(self):
     bin_word_map = WordEmbedding.load_word2vec_word_map(
         "text.bin", binary=True, unicode_errors='replace')
     embedding = WordEmbedding(bin_word_map,
                               filename="text.bin",
                               unicode_errors='replace')
     # Placeholder assertion: the test passes as long as loading the binary
     # word2vec map and building the embedding above did not raise.
     self.assertEqual(True, True)
Example #3
def pre_model(src_dict, tar_dict, config, verbose):
    if rand_word_init:
        src_embedding = WordEmbedding(src_dict, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, dim=config.dim)
    else:
        src_embedding = WordEmbedding(src_dict, filename="data/zh.token.dim%d.bin" % config.dim, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, filename="data/en.token.dim%d.bin" % config.dim, dim=config.dim)
    return BilingualPhraseRAEISOMAP(src_embedding, tar_embedding, config=config, verbose=verbose)
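`rand_word_init` above is a module-level flag that the snippet itself does not define. A minimal sketch of how this helper might be driven; the flag value, the config path, and the embedding file names are assumptions taken from the neighbouring examples:

rand_word_init = False
config = BRAEConfig("../conf/brae.conf")  # assumed config path
src_dict = WordEmbedding.load_word2vec_word_map(
    "data/zh.token.dim%d.bin" % config.dim, binary=True, oov=True)
tar_dict = WordEmbedding.load_word2vec_word_map(
    "data/en.token.dim%d.bin" % config.dim, binary=True, oov=True)
model = pre_model(src_dict, tar_dict, config, verbose=True)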
Example #4
def pre_model(src_dict, tar_dict, config, verbose):
    if rand_word_init:
        src_embedding = WordEmbedding(src_dict, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, dim=config.dim)
    else:
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % config.dim
        src_embedding = WordEmbedding(src_dict, filename=zh_embedding_name, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, filename=en_embedding_name, dim=config.dim)
    return BilingualPhraseRAE(src_embedding, tar_embedding, config=config, verbose=verbose)
Example #5
def test_brae():
    np.random.seed(0)
    src_word_idx = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    src_embedding = WordEmbedding(src_word_idx, dim=3)
    tar_word_idx = {"a": 0, "b": 1, "c": 2, "d": 3, "e": 4}
    tar_embedding = WordEmbedding(tar_word_idx, dim=3)
    src_pos, src_neg = [[1], ], [[4]]
    tar_pos, tar_neg = [[1], ], [[4]]
    brae = BilingualPhraseRAE(src_embedding, tar_embedding)
    src_pos_nodes, src_pos_seq = brae.source_encoder.generate_node_path(src_pos)
    src_neg_nodes, src_neg_seq = brae.source_encoder.generate_node_path(src_neg)
    tar_pos_nodes, tar_pos_seq = brae.target_encoder.generate_node_path(tar_pos)
    tar_neg_nodes, tar_neg_seq = brae.target_encoder.generate_node_path(tar_neg)
    print brae.compute_result_grad(src_pos_nodes, src_pos_seq, src_neg_nodes, src_neg_seq,
                                   tar_pos_nodes, tar_pos_seq, tar_neg_nodes, tar_neg_seq)
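A hedged note on the inputs above: each phrase handed to generate_node_path is a list of word indices into the corresponding embedding, so the single-index toy phrases generalize directly to longer ones. A small sketch under that assumption, reusing the brae object built in the test:

src_pos_multi = [[0, 1], [2, 3, 4]]  # two source phrases, e.g. "a b" and "c d e"
multi_nodes, multi_seq = brae.source_encoder.generate_node_path(src_pos_multi)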
Example #6
def bi_normal(seed):
    # pre_logger()
    np.random.seed(seed)
    train, dev, test, word_idx = read_sst(
        u"sst.bi.train",
        u"sst.bi.dev",
        u"sst.bi.test",
    )
    # embedding = WordEmbedding(word_idx, filename=u"GoogleNews-vectors-negative300.bin")
    embedding_initializer = UniformInitializer(scale=0.1)
    weight_initializer = GlorotUniformInitializer()
    # embedding = WordEmbedding(word_idx, filename=u"imdb.50.bin", initializer=embedding_initializer)
    embedding = WordEmbedding(word_idx,
                              dim=64,
                              initializer=embedding_initializer)
    from src.recurrent import RecurrentClassifier
    classifier = RecurrentClassifier(embedding,
                                     recurrent_encoder=RecurrentNormEncoder,
                                     in_dim=embedding.dim,
                                     hidden_dim=64,
                                     initializer=weight_initializer,
                                     batch_size=64,
                                     num_label=2,
                                     pooling="final",
                                     activation="tanh")
    classifier.train(train, dev, test)
Example #7
def test_rae_sentiment():
    train, dev, test, word_idx = read_sst(u"E:\\Corpus\\mr\\mr.shuffle.train",
                                          u"E:\\Corpus\\mr\\mr.shuffle.test",
                                          u"E:\\Corpus\\mr\\mr.shuffle.test",
                                          )
    embedding = WordEmbedding(word_idx, dim=3)  # fname=u"F:\\Corpus\\imdb.50.bin")
    classifier = PhraseRAEClassifier(embedding=embedding, n_out=2, uniform_range=0.01, normalize=False,
                                     weight_rec=0.001, weight_l2=0.01, dropout=0, verbose=True)
    classifier.fit(train, dev, test)
Example #8
def pre_model(src_dict, tar_dict, config, verbose):
    if rand_word_init:
        src_embedding = WordEmbedding(src_dict, dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict, dim=config.dim)
    else:
        en_embedding_name = "data/embedding/en.token.min%d.dim%d.bin" % (
            config.min_count, config.dim)
        zh_embedding_name = "data/embedding/zh.token.min%d.dim%d.bin" % (
            config.min_count, config.dim)
        src_embedding = WordEmbedding(src_dict,
                                      filename=zh_embedding_name,
                                      dim=config.dim)
        tar_embedding = WordEmbedding(tar_dict,
                                      filename=en_embedding_name,
                                      dim=config.dim)
    return BilingualPhraseRAEBiLinear(src_embedding,
                                      tar_embedding,
                                      config=config,
                                      verbose=verbose)
Example #9
def pre_classifier(word_idx, embedding_name, labels_nums, word_dim, hidden_dims, batch_size, dropout, act):
    hidden_dims = [int(hidden) for hidden in hidden_dims.split("_")]
    embedding_initializer = UniformInitializer(scale=0.1)
    weight_initializer = GlorotUniformInitializer()
    embedding = WordEmbedding(word_idx, dim=word_dim, filename=embedding_name, binary=True,
                              initializer=embedding_initializer, add_unknown_word=True)
    classifier = MultiTaskHierarchicalClassifier(
        embedding, in_dim=embedding.dim, hidden_dims=hidden_dims,
        initializer=weight_initializer, batch_size=batch_size,
        dropout=dropout, labels_nums=labels_nums, activation=act)
    return classifier
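A hypothetical call to the factory above. The toy vocabulary, the embedding path, and labels_nums as a per-task list of label counts are assumptions; the remaining arguments mirror the signature:

word_idx = {"good": 0, "bad": 1, "fine": 2}
classifier = pre_classifier(
    word_idx, embedding_name="data/embedding/en.token.dim50.bin",
    labels_nums=[2, 5], word_dim=50, hidden_dims="100_100",
    batch_size=64, dropout=0.5, act="tanh")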
Example #10
def test_cnn():
    import numpy as np
    np.random.seed(0)
    train, dev, test, word_idx = read_sst(
        u"C:\\Users\\roger\\NLP\\Corpus\\sst_bi\\sst.bi.train",
        u"C:\\Users\\roger\\NLP\\Corpus\\sst_bi\\sst.bi.dev",
        u"C:\\Users\\roger\\NLP\\Corpus\\sst_bi\\sst.bi.test",
    )
    embedding = WordEmbedding(
        word_idx,
        dim=5)  # fname=u"F:\\Corpus\\GoogleNews-vectors-negative300.bin")
    classifier = ShallowCNNClassifier(embedding,
                                      n_out=2,
                                      verbose=True,
                                      weight_l2=0.001)
    classifier.fit(train, dev, test)
    acc, pred = classifier.test(test[0], test[1])
    print acc
Example #11
def bi():
    pre_logger()
    train, dev, test, word_idx = read_sst(
        u"sst.bi.train",
        u"sst.bi.dev",
        u"sst.bi.test",
    )
    # embedding = WordEmbedding(word_idx, filename=u"GoogleNews-vectors-negative300.bin")
    embedding_initializer = UniformInitializer(scale=0.1)
    weight_initializer = GlorotUniformInitializer()
    # embedding = WordEmbedding(word_idx, filename=u"imdb.50.bin", initializer=embedding_initializer)
    embedding = WordEmbedding(word_idx,
                              dim=50,
                              initializer=embedding_initializer)
    classifier = EmbeddingClassifier(
        embedding,
        in_dim=embedding.dim,
        hidden_dim=50,
        initializer=weight_initializer,
        batch_size=64,
        num_label=2,
        activation="tanh",
    )
    classifier.train(train, dev, test)
Example #12
def main():
    config_name = sys.argv[1]
    forced_decode_data = "../gbrae/data/250w/tune_hyperparameter/tune.data"
    brae_config = BRAEConfig(config_name)
    train_data = "../gbrae/data/250w/tune_hyperparameter/train/tune.train"
    dev_data = "../gbrae/data/250w/tune_hyperparameter/dev/tune.dev"
    test_data = "../gbrae/data/250w/tune_hyperparameter/test/tune.test"
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
        brae_config.dim,
        brae_config.weight_rec,
        brae_config.weight_sem,
        brae_config.weight_l2,
        brae_config.alpha,
        brae_config.random_seed,
        brae_config.batch_size,
        brae_config.min_count,
        brae_config.optimizer.param["lr"],
    )
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"
    start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
    pre_logger("brae_" + train_name)
    np.random.seed(brae_config.random_seed)
    if start_iter == 0:
        print "Load Dict ..."
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
        tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name,
                                                             binary=True,
                                                             oov=True)
        src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name,
                                                             binary=True,
                                                             oov=True)
        print "Compiling Model ..."
        brae = pre_model(src_word_dict,
                         tar_word_dict,
                         brae_config,
                         verbose=True)
        print "Load All Data ..."
        src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
            forced_decode_data, src_word_dict, tar_word_dict)
        src_train = [p[WORD_INDEX] for p in src_phrases]
        tar_train = [p[WORD_INDEX] for p in tar_phrases]
        print "Write Binary Data ..."
        with open(temp_model, 'wb') as fout:
            pickle.dump(src_train, fout)
            pickle.dump(tar_train, fout)
            pickle.dump(src_tar_pair, fout)
            pickle.dump(brae, fout)
            pickle.dump(np.random.get_state(), fout)
        if end_iter == 1:
            exit(1)
    else:
        with open(temp_model, 'rb') as fin:
            src_train = pickle.load(fin)
            tar_train = pickle.load(fin)
            src_tar_pair = pickle.load(fin)
            brae = pickle.load(fin)
            np.random.set_state(pickle.load(fin))
    src_phrase2id = dict()
    tar_phrase2id = dict()
    for phrase, i in zip(src_phrases, xrange(len(src_phrases))):
        src_phrase2id[phrase[TEXT_INDEX]] = i
    for phrase, i in zip(tar_phrases, xrange(len(tar_phrases))):
        tar_phrase2id[phrase[TEXT_INDEX]] = i
    train_pair = load_sub_data_pair(train_data, src_phrase2id, tar_phrase2id)
    dev_pair = load_sub_data_pair(dev_data, src_phrase2id, tar_phrase2id)
    test_pair = load_sub_data_pair(test_data, src_phrase2id, tar_phrase2id)
    brae.tune_hyper_parameter(src_train,
                              tar_train,
                              train_pair,
                              dev_pair,
                              test_pair,
                              brae_config,
                              model_name,
                              start_iter=start_iter,
                              end_iter=end_iter)
    brae.save_model("%s.tune.model" % model_name)
Example #13
def main():
    min_count = int(sys.argv[1])
    dim = 50
    '''
    forced_decode_data = "data/brae.train.data"
    src_count_path = "data/src.trans.data"
    tar_count_path = "data/tar.trans.data"
    tar_para_path = "data/tar.para.data"
    src_para_path = "data/src.para.data"
    gbrae_data_name = "model/gbrae.data.min.count.%d.pkl" % min_count
    gbrae_dict_name = "model/gbrae.dict.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "model/gbrae.phrase.text.dict.pkl"
    '''
    forced_decode_data = "data/250w/tune_hyperparameter/tune.data"
    src_count_path = "data/250w/tune_hyperparameter/tune.data"
    #tar_count_path = "data/250w/phrase-table.filtered"
    tar_para_path = "data/250w/enBP_alignPhraProb.xml"
    src_para_path = "data/250w/chBP_alignPhraProb.xml"
    gbrae_data_name = "data/250w/tune_hyperparameter/gbrae.data.tune.min.count.%d.pkl" % min_count
    gbrae_dict_name = "data/250w/tune_hyperparameter/train/gbrae.dict.tune.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "data/250w/tune_hyperparameter/gbrae.tune.phrase.text.dict.pkl"
    print "Load Word Dict ..."
    en_embedding_name = "data/embedding/en.token.dim%d.bin" % dim
    zh_embedding_name = "data/embedding/zh.token.dim%d.bin" % dim
    tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name,
                                                         binary=True,
                                                         oov=True)
    src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name,
                                                         binary=True,
                                                         oov=True)
    print "Load All Data ..."
    src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
        forced_decode_data, src_word_dict, tar_word_dict)
    print "Load Para Data ..."
    src_phrases = read_para_list(src_para_path, src_phrases, src_word_dict)
    tar_phrases = read_para_list(tar_para_path, tar_phrases, tar_word_dict)
    print "Load Trans Data ..."
    src_phrases, tar_phrases = read_trans_list(src_count_path, src_phrases,
                                               tar_phrases, src_word_dict,
                                               tar_word_dict)
    #tar_phrases, src_phrases = read_trans_list(tar_count_path, tar_phrases, src_phrases,
    #tar_word_dict, src_word_dict)
    src_phrase2id = dict()
    tar_phrase2id = dict()
    for phrase, i in zip(src_phrases, xrange(len(src_phrases))):
        src_phrase2id[phrase[TEXT_INDEX]] = i
    for phrase, i in zip(tar_phrases, xrange(len(tar_phrases))):
        tar_phrase2id[phrase[TEXT_INDEX]] = i
    src_phrases = clean_text(src_phrases)
    tar_phrases = clean_text(tar_phrases)
    with open(gbrae_dict_name, 'wb') as fout:
        print "Write Word Dict ..."
        pickle.dump(src_word_dict, fout)
        pickle.dump(tar_word_dict, fout)
    with open(gbrae_data_name, 'wb') as fout:
        print "Write Source Phrases Data ..."
        pickle.dump(src_phrases, fout)
        print "Write Target Phrases Data ..."
        pickle.dump(tar_phrases, fout)
        pickle.dump(src_tar_pair, fout)
    with open(gbrae_phrase_dict_name, 'wb') as fout:
        print "Write Source Phrases Dictionary ..."
        pickle.dump(src_phrase2id, fout)
        print "Write Target Phrases Dictionary ..."
        pickle.dump(tar_phrase2id, fout)
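The artifacts written above can be restored by reading them back in the same order they were dumped; a minimal sketch using the same *_name paths defined in main():

with open(gbrae_dict_name, 'rb') as fin:
    src_word_dict = pickle.load(fin)
    tar_word_dict = pickle.load(fin)
with open(gbrae_data_name, 'rb') as fin:
    src_phrases = pickle.load(fin)
    tar_phrases = pickle.load(fin)
    src_tar_pair = pickle.load(fin)
with open(gbrae_phrase_dict_name, 'rb') as fin:
    src_phrase2id = pickle.load(fin)
    tar_phrase2id = pickle.load(fin)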
Example #14
def main(_):
    phrase_file = FLAGS.phrase_file
    src_para_file = FLAGS.src_para
    tar_para_file = FLAGS.tar_para
    trans_file = FLAGS.trans_file
    src_phrase_list, tar_phrase_list, bi_phrase_list, src_word_idx, tar_word_idx = prepare_data(
        phrase_file, src_para_file, tar_para_file, trans_file)
    ssbrae_config = SSBRAEConfig(FLAGS.config_name)
    src_word_embedding = WordEmbedding(src_word_idx,
                                       dim=50,
                                       name="src_word_embedding")
    tar_word_embedding = WordEmbedding(tar_word_idx,
                                       dim=50,
                                       name="tar_word_embedding")
    sess = tf.Session()

    ssbrae_encoder = SSBRAEEncoder(
        src_word_embedding, tar_word_embedding, ssbrae_config.activation,
        ssbrae_config.normalize, ssbrae_config.weight_rec,
        ssbrae_config.weight_sem, ssbrae_config.weight_embedding,
        ssbrae_config.alpha, ssbrae_config.beta, ssbrae_config.max_src_len,
        ssbrae_config.max_tar_len, ssbrae_config.n_epoch,
        ssbrae_config.batch_size, ssbrae_config.dropout,
        ssbrae_config.optimizer_config, ssbrae_config.para,
        ssbrae_config.trans, ssbrae_config.para_num, ssbrae_config.trans_num,
        sess)

    train_phrase_list = bi_phrase_list[:-2 * ssbrae_encoder.batch_size]
    valid_phrase_list = bi_phrase_list[-2 * ssbrae_encoder.batch_size:-ssbrae_encoder.batch_size]
    test_phrase_list = bi_phrase_list[-ssbrae_encoder.batch_size:]

    pre_logger("ssbrae")
    logger.info("Now train ssbrae encoder\n")
    for i in range(ssbrae_encoder.n_epoch):
        logger.info("Now train ssbrae encoder epoch %d\n" % i)
        start_time = time.time()
        losses = []

        train_phrase_index = get_train_sequence(train_phrase_list,
                                                ssbrae_encoder.batch_size)
        num_batches = int(len(train_phrase_index) / ssbrae_encoder.batch_size)
        for j in range(num_batches):
            (src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight, src_tar_trans,\
             tar_src_trans, src_tar_trans_weight, tar_src_trans_weight) = ssbrae_encoder.get_batch(src_phrase_list,
                                                                                                  tar_phrase_list,
                                                                                                  train_phrase_list,
                                                                                                  train_phrase_index,
                                                                                                  src_word_idx,
                                                                                                  tar_word_idx, j)
            result = ssbrae_encoder.ssbrae_train_step(
                src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para,
                src_para_weight, tar_para_weight, src_tar_trans, tar_src_trans,
                src_tar_trans_weight, tar_src_trans_weight)
            if ssbrae_encoder.para and ssbrae_encoder.trans:
                logger.info(
                    "train ssbrae_para epoch %d, step %d, total loss:%f, loss_l2: %f, loss_rec: %f,"
                    "loss_sem:%f, loss_para:%f, loss_trans:%f\n" %
                    (i, j, result[1], result[2], result[3], result[4],
                     result[5], result[6]))
            elif ssbrae_encoder.para and not ssbrae_encoder.trans:
                logger.info(
                    "train ssbrae_para epoch %d, step %d, total loss:%f, loss_l2: %f, loss_rec: %f,"
                    "loss_sem:%f, loss_para:%f\n" %
                    (i, j, result[1], result[2], result[3], result[4],
                     result[5]))
            elif ssbrae_encoder.trans and not ssbrae_encoder.para:
                logger.info(
                    "train ssbrae_para epoch %d, step %d, total loss:%f, loss_l2: %f, loss_rec: %f,"
                    "loss_sem:%f, loss_trans:%f\n" %
                    (i, j, result[1], result[2], result[3], result[4],
                     result[5]))
            else:
                raise ValueError("No such configuration")
            losses.append(result[1:])

        use_time = time.time() - start_time

        valid_phrase_index = get_train_sequence(valid_phrase_list,
                                                ssbrae_encoder.batch_size)
        num_batches = int(len(valid_phrase_index) / ssbrae_encoder.batch_size)
        dev_loss = []
        for j in range(num_batches):
            (src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight, src_tar_trans, \
             tar_src_trans, src_tar_trans_weight, tar_src_trans_weight) = ssbrae_encoder.get_batch(src_phrase_list,
                                                                                                   tar_phrase_list,
                                                                                                   valid_phrase_list,
                                                                                                   valid_phrase_index,
                                                                                                   src_word_idx,
                                                                                                   tar_word_idx, j)
            dev_loss.append(
                ssbrae_encoder.ssbrae_predict_step(
                    src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para,
                    src_para_weight, tar_para_weight, src_tar_trans,
                    tar_src_trans, src_tar_trans_weight, tar_src_trans_weight))
        logger.info("train ssbrae encoder epoch %d, use time:%d\n" %
                    (i, use_time))
        ave_train_loss = np.average(losses, axis=0)
        ave_dev_loss = np.average(dev_loss, axis=0)
        if ssbrae_encoder.para and ssbrae_encoder.trans:
            logger.info(
                "train: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f, trans loss:%f\n"
                % (ave_train_loss[0], ave_train_loss[1], ave_train_loss[2],
                   ave_train_loss[3], ave_train_loss[4], ave_train_loss[5]))
            logger.info(
                "dev: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f, trans loss:%f"
                % (ave_dev_loss[0], ave_dev_loss[1], ave_dev_loss[2],
                   ave_dev_loss[3], ave_dev_loss[4], ave_dev_loss[5]))
        elif ssbrae_encoder.para and not ssbrae_encoder.trans:
            logger.info(
                "train: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f\n"
                % (ave_train_loss[0], ave_train_loss[1], ave_train_loss[2],
                   ave_train_loss[3], ave_train_loss[4]))
            logger.info(
                "dev: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f"
                % (ave_dev_loss[0], ave_dev_loss[1], ave_dev_loss[2],
                   ave_dev_loss[3], ave_dev_loss[4]))
        elif ssbrae_encoder.trans and not ssbrae_encoder.para:
            logger.info(
                "train: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, trans loss:%f\n"
                % (ave_train_loss[0], ave_train_loss[1], ave_train_loss[2],
                   ave_train_loss[3], ave_train_loss[4]))
            logger.info(
                "dev: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, trans loss:%f"
                % (ave_dev_loss[0], ave_dev_loss[1], ave_dev_loss[2],
                   ave_dev_loss[3], ave_dev_loss[4]))

        checkpoint_path = os.path.join(FLAGS.train_dir,
                                       "ssbrae_encoder.epoch%d.ckpt" % i)
        #ssbrae_encoder.saver.save(ssbrae_encoder.sess, checkpoint_path, global_step=ssbrae_encoder.global_step)
        ssbrae_encoder.saver.save(ssbrae_encoder.sess, checkpoint_path)

    test_phrase_index = get_train_sequence(test_phrase_list,
                                           ssbrae_encoder.batch_size)
    num_batches = int(len(test_phrase_index) / ssbrae_encoder.batch_size)
    test_loss = []
    for j in range(num_batches):
        (src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para, src_para_weight, tar_para_weight, src_tar_trans, \
         tar_src_trans, src_tar_trans_weight, tar_src_trans_weight) = ssbrae_encoder.get_batch(src_phrase_list,
                                                                                               tar_phrase_list,
                                                                                               test_phrase_list,
                                                                                               test_phrase_index,
                                                                                               src_word_idx,
                                                                                               tar_word_idx, j)
        test_loss.append(
            ssbrae_encoder.ssbrae_predict_step(
                src_pos, tar_pos, src_neg, tar_neg, src_para, tar_para,
                src_para_weight, tar_para_weight, src_tar_trans, tar_src_trans,
                src_tar_trans_weight, tar_src_trans_weight))

    ave_test_loss = np.average(test_loss, axis=0)
    if ssbrae_encoder.para and ssbrae_encoder.trans:
        logger.info(
            "test: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f, trans loss:%f"
            % (ave_test_loss[0], ave_test_loss[1], ave_test_loss[2],
               ave_test_loss[3], ave_test_loss[4], ave_test_loss[5]))
    elif ssbrae_encoder.para and not ssbrae_encoder.trans:
        logger.info(
            "test: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, para loss:%f"
            % (ave_test_loss[0], ave_test_loss[1], ave_test_loss[2],
               ave_test_loss[3], ave_test_loss[4]))
    elif ssbrae_encoder.trans and not ssbrae_encoder.para:
        logger.info(
            "test: total loss:%f, l2 loss:%f, rec loss:%f, sem loss:%f, trans loss:%f"
            % (ave_test_loss[0], ave_test_loss[1], ave_test_loss[2],
               ave_test_loss[3], ave_test_loss[4]))
Example #15
def main():
    train_test = sys.argv[1]
    if train_test not in ["train", "predict"]:
        sys.stderr.write("train or predict\n")
        exit(1)
    config_name = sys.argv[2]
    forced_decode_data = "../gbrae/data/250w/phrase-table.filtered"
    phrase_data_path = "data/phrase.list"
    brae_config = BRAEConfig(config_name)
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (brae_config.dim,
                                                                                  brae_config.weight_rec,
                                                                                  brae_config.weight_sem,
                                                                                  brae_config.weight_l2,
                                                                                  brae_config.alpha,
                                                                                  brae_config.random_seed,
                                                                                  brae_config.batch_size,
                                                                                  brae_config.min_count,
                                                                                  brae_config.optimizer.param["lr"],)
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"
    if train_test == "train":
        start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
        pre_logger("brae_" + train_name)
        np.random.seed(brae_config.random_seed)
        if start_iter == 0:
            print "Load Dict ..."
            en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
            zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
            tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
            src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)
            print "Compiling Model ..."
            brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)
            print "Load All Data ..."
            src_phrases, tar_phrases, src_tar_pair = read_phrase_list(forced_decode_data, src_word_dict, tar_word_dict)
            src_train = [p[WORD_INDEX] for p in src_phrases]
            tar_train = [p[WORD_INDEX] for p in tar_phrases]
            print "Write Binary Data ..."
            with open(temp_model, 'wb') as fout:
                pickle.dump(src_train, fout)
                pickle.dump(tar_train, fout)
                pickle.dump(src_tar_pair, fout)
                pickle.dump(brae, fout)
                pickle.dump(np.random.get_state(), fout)
            if end_iter == 1:
                exit(1)
        else:
            with open(temp_model, 'rb') as fin:
                src_train = pickle.load(fin)
                tar_train = pickle.load(fin)
                src_tar_pair = pickle.load(fin)
                brae = pickle.load(fin)
                np.random.set_state(pickle.load(fin))
        brae.train(src_train, tar_train, src_tar_pair, brae_config, model_name, start_iter, end_iter)
        brae.save_model("%s.model" % model_name)
    elif train_test == "predict":
        num_process = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        brae_predict(phrase_data_path, train_name + ".pred", model_file="%s.model" % model_name, num_process=num_process)
    else:
        sys.stderr.write("train or predict\n")
        exit(1)
Example #16
def main(_):
    phrase_file = FLAGS.phrase_file
    src_para_file = FLAGS.src_para_file
    tar_para_file = FLAGS.tar_para_file
    trans_file = FLAGS.trans_file
    src_phrase_list, tar_phrase_list, bi_phrase_list, src_word_idx, tar_word_idx = prepare_data(phrase_file,
                                                                                                src_para_file,
                                                                                                tar_para_file,
                                                                                                trans_file)
    # src rae encoder
    src_config_name = FLAGS.src_config_name
    src_rae_config = BRAEConfig(src_config_name)
    src_embedding = WordEmbedding(src_word_idx, dim=50, name="src_embedding")
    sess = tf.Session()
    src_rae_encoder = RAEEncoder(src_rae_config.activation, src_embedding, src_rae_config.normalize,
                                 src_rae_config.weight_rec, src_rae_config.weight_embedding, src_rae_config.n_epoch,
                                 src_rae_config.max_src_len, src_rae_config.batch_size, src_rae_config.dropout,
                                 src_rae_config.optimizer_config, sess, name="rae_encoder")
    # tar rae encoder
    tar_config_name = FLAGS.tar_config_name
    tar_rae_config = BRAEConfig(tar_config_name)
    tar_embedding = WordEmbedding(tar_word_idx, dim=50, name="tar_embedding")
    tar_rae_encoder = RAEEncoder(tar_rae_config.activation, tar_embedding, tar_rae_config.normalize,
                                 tar_rae_config.weight_rec, tar_rae_config.weight_embedding, tar_rae_config.n_epoch,
                                 tar_rae_config.max_tar_len, tar_rae_config.batch_size, tar_rae_config.dropout,
                                 tar_rae_config.optimizer_config, sess, name="tar_rae_encoder")

    train_phrase_list = src_phrase_list[:-2 * src_rae_config.batch_size]
    valid_phrase_list = src_phrase_list[-2 * src_rae_config.batch_size: - src_rae_config.batch_size]
    test_phrase_list = src_phrase_list[- src_rae_config.batch_size:]

    logger.info("Now train the src rae encoder:\n")
    for i in range(src_rae_encoder.n_epoch):
        logger.info("Now train src rae epoch %d\n" % (i + 1))
        start_time = time.time()

        src_train_index = get_train_sequence(train_phrase_list, src_rae_encoder.batch_size)
        batch_number = int(len(src_train_index) / src_rae_encoder.batch_size)
        losses = []
        for j in range(batch_number):
            inputs = src_rae_encoder.get_batch(train_phrase_list, src_train_index, j, src_word_idx)
            loss = src_rae_encoder.train_step(inputs)
            logger.info("src rae epoch %d, step %d, loss: %f\n" % (i, j, loss))
            losses.append(loss)

        src_valid_index = get_train_sequence(valid_phrase_list, src_rae_encoder.batch_size)
        valid_batches = int(len(src_valid_index) / src_rae_encoder.batch_size)
        dev_loss = []
        for j in range(valid_batches):
            inputs = src_rae_encoder.get_batch(valid_phrase_list, src_valid_index, j, src_word_idx)
            dev_loss.append(src_rae_encoder.predict_step(inputs))

        use_time = time.time() - start_time
        logger.info("src rae epoch %d, time: %d, train loss:%f, development loss:%f\n"
                    % (i, use_time, sess.run(tf.reduce_mean(losses)), sess.run(tf.reduce_mean(dev_loss))))

        checkpoint_path = os.path.join(FLAGS.train_dir, "src_rae.epoch%d.ckpt" % i)
        src_rae_encoder.saver.save(src_rae_encoder.sess, checkpoint_path, global_step=src_rae_encoder.global_step)

    src_test_index = get_train_sequence(test_phrase_list, src_rae_encoder.batch_size)
    test_batches = int(len(src_test_index) / src_rae_encoder.batch_size)
    test_loss = []
    for j in range(test_batches):
        inputs = src_rae_encoder.get_batch(test_phrase_list, src_test_index, j, src_word_idx)
        test_loss.append(src_rae_encoder.predict_step(inputs))
    logger.info("src test loss : %f\n" % sess.run(tf.reduce_mean(test_loss)))
Example #17
    def __init__(self, entity_index, relation_index, entity_dim=100, k=100,
                 initializer=default_initializer, regularization_weight=0.0001):
        self.relation_num = len(relation_index)
        self.scorer = SingleLayerModel(entity_dim=entity_dim, relation_num=self.relation_num,
                                       k=k, initializer=UniformInitializer(scale=1 / np.sqrt(entity_dim * 2)))
        self.entity_embedding = WordEmbedding(entity_index, dim=entity_dim, initializer=initializer)
        self.regularization_weight = regularization_weight
        self.e1_index = T.lscalar()
        self.e2_index = T.lscalar()
        self.ec_index = T.lscalar()
        self.relation_index = T.lscalar()
        self.pos_score = self.scorer.score(self.entity_embedding[self.e1_index],
                                           self.entity_embedding[self.e2_index],
                                           self.relation_index)
        self.neg_score = self.scorer.score(self.entity_embedding[self.e1_index],
                                           self.entity_embedding[self.ec_index],
                                           self.relation_index)
        self.loss_max_margin =  T.maximum(0.0, self.neg_score - self.pos_score + 1.0)

        self.e1_index_batch = T.lvector()
        self.e2_index_batch = T.lvector()
        self.ec_index_batch = T.lvector()
        self.relation_index_batch = T.lvector()
        self.pos_score_batch = self.scorer.score_batch(self.entity_embedding[self.e1_index_batch],
                                                       self.entity_embedding[self.e2_index_batch],
                                                       self.relation_index_batch)
        self.neg_score_batch = self.scorer.score_batch(self.entity_embedding[self.e1_index_batch],
                                                       self.entity_embedding[self.ec_index_batch],
                                                       self.relation_index_batch)
        self.loss_max_margin_batch =  T.sum(T.maximum(0.0, self.neg_score_batch - self.pos_score_batch + 1.0))

        self.pos_score_relation = self.scorer.score_one_relation(self.entity_embedding[self.e1_index_batch],
                                                              self.entity_embedding[self.e2_index_batch],
                                                              self.relation_index)
        self.neg_score_relation = self.scorer.score_one_relation(self.entity_embedding[self.e1_index_batch],
                                                              self.entity_embedding[self.ec_index_batch],
                                                              self.relation_index)
        self.loss_max_margin_relation =  T.sum(T.maximum(0.0, self.neg_score_relation - self.pos_score_relation + 1.0))

        self.params = self.entity_embedding.params + self.scorer.params
        self.l2_norm = self.entity_embedding.l2_norm + self.scorer.l2_norm
        self.l2_loss = self.regularization_weight * self.l2_norm / 2
        sgd_optimizer = AdaDeltaOptimizer(lr=0.95, norm_lim=-1)

        self.loss = self.loss_max_margin + self.l2_loss
        updates = sgd_optimizer.get_update(self.loss, self.params)

        self.loss_batch = self.loss_max_margin_batch + self.l2_loss
        updates_batch = sgd_optimizer.get_update(self.loss_batch, self.params)

        grad_margin_relation = T.grad(self.loss_max_margin_relation, self.params)
        grad_l2 = T.grad(self.l2_loss, self.params)

        self.train_one_instance = theano.function(inputs=[self.e1_index, self.e2_index,
                                                          self.ec_index, self.relation_index],
                                                  outputs=[self.loss, self.loss_max_margin, self.l2_loss],
                                                  updates=updates)

        self.score_one_instance = theano.function(inputs=[self.e1_index, self.e2_index, self.relation_index],
                                                  outputs=[self.pos_score])

        self.train_batch_instance = theano.function(inputs=[self.e1_index_batch, self.e2_index_batch,
                                                            self.ec_index_batch, self.relation_index_batch],
                                                    outputs=[self.loss_batch, self.loss_max_margin_batch, self.l2_loss],
                                                    updates=updates_batch)

        self.score_batch_instance = theano.function(inputs=[self.e1_index_batch, self.e2_index_batch,
                                                            self.relation_index_batch],
                                                    outputs=self.pos_score_batch)

        self.grad_relation_margin = theano.function(inputs=[self.e1_index_batch, self.e2_index_batch,
                                                            self.ec_index_batch, self.relation_index],
                                                    outputs=[self.loss_max_margin_relation] + grad_margin_relation,
                                                    )

        self.forward_relation_margin = theano.function(inputs=[self.e1_index_batch, self.e2_index_batch,
                                                               self.ec_index_batch, self.relation_index],
                                                       outputs=[self.loss_max_margin_relation],
                                                       )

        self.grad_l2 = theano.function(inputs=[], outputs=[self.l2_loss] + grad_l2,)

        self.forward_l2 = theano.function(inputs=[], outputs=[self.l2_loss],)

        self.score_relation_instance = theano.function(inputs=[self.e1_index_batch, self.e2_index_batch,
                                                               self.relation_index],
                                                       outputs=self.pos_score_relation)
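A hypothetical use of the Theano functions compiled above, assuming `model` is an instance of this class and the integer ids are valid entity and relation indices:

import numpy as np

# single triple: entities 0 and 1, corrupted entity 2, relation 3
loss, margin_loss, l2_loss = model.train_one_instance(0, 1, 2, 3)
pos_score, = model.score_one_instance(0, 1, 3)

# batched triples: one index vector per argument
e1 = np.array([0, 4], dtype=np.int64)
e2 = np.array([1, 5], dtype=np.int64)
ec = np.array([2, 6], dtype=np.int64)
rel = np.array([3, 0], dtype=np.int64)
batch_loss, batch_margin, batch_l2 = model.train_batch_instance(e1, e2, ec, rel)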
Example #18
    def __init__(self,
                 key_index,
                 label_num,
                 pretrain_name=None,
                 encoder='lstm',
                 word_dim=300,
                 hidden='100_100',
                 dropout=0.5,
                 regularization_weight=0.0001,
                 optimizer_name='adagrad',
                 lr=0.1,
                 norm_lim=-1,
                 label2index_filename=None):
        self.label2index, self.index2label = self.load_label_index(
            label2index_filename, label_num)

        self.indexs = T.imatrix()  # (batch, max_len)
        self.golden = T.ivector()  # (batch, )
        self.max_len = T.iscalar()  # max length

        self.s1_mask = self.indexs[:, :self.max_len] > 0
        self.s1_mask = self.s1_mask * T.constant(1.0,
                                                 dtype=theano.config.floatX)

        if pretrain_name is None:
            self.embedding = WordEmbedding(
                key_index,
                dim=word_dim,
                initializer=UniformInitializer(scale=0.01))
        else:
            self.embedding = WordEmbedding(key_index,
                                           filename=pretrain_name,
                                           normalize=False,
                                           binary=True)
            assert self.embedding.dim == word_dim

        self.word_embeddings = self.embedding[self.indexs[:, :self.max_len]]

        if type(hidden) is str:
            hidden_dims = [int(hid) for hid in hidden.split('_')]
        else:
            hidden_dims = [hidden]

        if encoder == 'lstm':
            encoder_layer = LSTMEncoder(in_dim=word_dim,
                                        hidden_dim=hidden_dims[0],
                                        pooling='final',
                                        prefix="LSTM_",
                                        dropout=dropout)
        elif encoder == 'bilstm':
            encoder_layer = BiLSTMEncoder(in_dim=word_dim,
                                          hidden_dim=hidden_dims[0],
                                          pooling='final',
                                          prefix="BiLSTM_",
                                          bidirection_shared=True,
                                          dropout=dropout)
        elif encoder == 'recurrent':
            encoder_layer = RecurrentEncoder(in_dim=word_dim,
                                             hidden_dim=hidden_dims[0],
                                             pooling='final',
                                             prefix="Recurrent_",
                                             dropout=dropout)
        elif encoder == 'birecurrent':
            encoder_layer = BiRecurrentEncoder(in_dim=word_dim,
                                               hidden_dim=hidden_dims[0],
                                               pooling='final',
                                               prefix="BiRecurrent_",
                                               bidirection_shared=True,
                                               dropout=dropout)
        elif encoder == 'gru':
            encoder_layer = GRUEncoder(in_dim=word_dim,
                                       hidden_dim=hidden_dims[0],
                                       pooling='final',
                                       prefix="GRU_",
                                       dropout=dropout)
        elif encoder == 'bigru':
            encoder_layer = BiGRUEncoder(in_dim=word_dim,
                                         hidden_dim=hidden_dims[0],
                                         pooling='final',
                                         prefix="BiGRU_",
                                         bidirection_shared=True,
                                         dropout=dropout)
        elif encoder == 'cbow':
            encoder_layer = CBOWLayer(in_dim=word_dim, )
        elif encoder == 'cnn':
            encoder_layer = MultiFilterConvolutionLayer(
                in_dim=word_dim,
                hidden_dim=hidden_dims[0],
                pooling='max',
                prefix="ConvLayer_",
                kernel_sizes=CONV_FILTER_SIZES)
        else:
            raise NotImplementedError

        self.text_embedding = encoder_layer.forward_batch(
            self.word_embeddings, self.s1_mask)

        if len(hidden_dims) > 1:
            hidden_layer = MultiHiddenLayer(in_dim=encoder_layer.out_dim,
                                            hidden_dims=hidden_dims[1:],
                                            dropout=dropout,
                                            prefix='Full_Connected_Layer_')
            classifier_input = hidden_layer.forward_batch(self.text_embedding)
            classifier_input_dim = hidden_layer.out_dim
        else:
            classifier_input = self.text_embedding
            classifier_input_dim = encoder_layer.out_dim

        self.classifier = SoftmaxClassifier(classifier_input_dim,
                                            label_num,
                                            dropout=dropout)
        self.predict_loss = self.classifier.loss(classifier_input, self.golden)
        self.predict_prob = self.classifier.forward_batch(classifier_input)
        self.predict_label = T.argmax(self.predict_prob, axis=1)
        """Params in TextClassifier"""
        self.params = self.classifier.params + encoder_layer.params
        self.l2_norm = self.classifier.l2_norm + encoder_layer.l2_norm
        if len(hidden_dims) > 1:
            self.params += hidden_layer.params
            self.l2_norm += hidden_layer.l2_norm

        self.l2_loss = regularization_weight * self.l2_norm / 2
        self.loss = self.predict_loss + self.l2_loss
        """Opimizer and Loss"""
        if optimizer_name == 'adagrad':
            sgd_optimizer = AdaGradOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'adadelta':
            sgd_optimizer = AdaDeltaOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'sgd':
            sgd_optimizer = SGDOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'momentum':
            sgd_optimizer = SGDMomentumOptimizer(lr=lr, norm_lim=norm_lim)
        elif optimizer_name == 'adam':
            sgd_optimizer = AdamOptimizer(lr=lr, norm_lim=norm_lim)
        else:
            raise NotImplementedError

        self.train_indexs = T.ivector()
        self.train_data_x = shared_zero_matrix(shape=(5, 5),
                                               name="train_data_x",
                                               dtype=np.int32)
        self.train_data_y = shared_zero_matrix(shape=(5, ),
                                               name="train_data_y",
                                               dtype=np.int32)

        self.model_params = self.params + self.embedding.params
        """Theano Function"""
        if EMBEDDING_LR > 0:
            embedding_updates = SGDOptimizer(lr=EMBEDDING_LR,
                                             norm_lim=-1).get_update(
                                                 self.loss,
                                                 self.embedding.params)
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)
            updates.update(embedding_updates)
        elif EMBEDDING_LR < 0:
            # Optimize Embedding using Global Optimizer
            self.params += self.embedding.params
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)
        else:
            # Fix Embedding
            updates = sgd_optimizer.get_update(
                self.loss, self.params, norm_exc_params=self.embedding.params)

        self.train_batch = theano.function(
            inputs=[self.train_indexs, self.max_len],
            outputs=[self.loss, self.predict_loss, self.l2_loss],
            updates=updates,
            givens=[(self.indexs, self.train_data_x[self.train_indexs]),
                    (self.golden, self.train_data_y[self.train_indexs])])

        self.loss_batch = theano.function(
            inputs=[self.indexs, self.golden, self.max_len],
            outputs=[self.loss, self.predict_loss, self.l2_loss],
        )

        self.pred_prob_batch = theano.function(
            inputs=[self.indexs, self.max_len],
            outputs=[self.predict_prob],
        )

        self.pred_label_batch = theano.function(
            inputs=[self.indexs, self.max_len],
            outputs=[self.predict_label],
        )

        self.get_l2_loss = theano.function(
            inputs=[],
            outputs=[self.l2_loss, self.l2_norm],
        )