Example #1
def test_something(self):
    # TestCase method fragment: a smoke test checking that a binary word2vec
    # vocabulary map can be loaded and wrapped in a WordEmbedding without raising.
    bin_word_map = WordEmbedding.load_word2vec_word_map(
        "text.bin", binary=True, unicode_errors='replace')
    embedding = WordEmbedding(bin_word_map,
                              filename="text.bin",
                              unicode_errors='replace')
    self.assertEqual(True, True)
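To run this fragment on its own it has to live on a unittest.TestCase subclass, with WordEmbedding importable from the project; a minimal, hypothetical wrapper (the class name and the commented import path are assumptions, and the placeholder assertion mirrors the original):

import unittest

# Hypothetical import; adjust to wherever the project defines WordEmbedding.
# from word_embedding import WordEmbedding

class WordEmbeddingLoadTest(unittest.TestCase):

    def test_load_binary_word_map(self):
        bin_word_map = WordEmbedding.load_word2vec_word_map(
            "text.bin", binary=True, unicode_errors='replace')
        embedding = WordEmbedding(bin_word_map,
                                  filename="text.bin",
                                  unicode_errors='replace')
        # Placeholder assertion, as in the original snippet.
        self.assertEqual(True, True)

if __name__ == '__main__':
    unittest.main()
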
# Stdlib/third-party imports used below; project helpers such as WordEmbedding,
# BRAEConfig, pre_model, read_phrase_list, load_sub_data_pair and pre_logger are
# assumed to be imported from the surrounding package.
import sys
import pickle
import numpy as np

def main():
    config_name = sys.argv[1]
    forced_decode_data = "../gbrae/data/250w/tune_hyperparameter/tune.data"
    brae_config = BRAEConfig(config_name)
    train_data = "../gbrae/data/250w/tune_hyperparameter/train/tune.train"
    dev_data = "../gbrae/data/250w/tune_hyperparameter/dev/tune.dev"
    test_data = "../gbrae/data/250w/tune_hyperparameter/test/tune.test"
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
        brae_config.dim,
        brae_config.weight_rec,
        brae_config.weight_sem,
        brae_config.weight_l2,
        brae_config.alpha,
        brae_config.random_seed,
        brae_config.batch_size,
        brae_config.min_count,
        brae_config.optimizer.param["lr"],
    )
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"
    start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
    end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
    pre_logger("brae_" + train_name)
    np.random.seed(brae_config.random_seed)
    if start_iter == 0:
        print "Load Dict ..."
        en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
        zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
        tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name,
                                                             binary=True,
                                                             oov=True)
        src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name,
                                                             binary=True,
                                                             oov=True)
        print "Compiling Model ..."
        brae = pre_model(src_word_dict,
                         tar_word_dict,
                         brae_config,
                         verbose=True)
        print "Load All Data ..."
        src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
            forced_decode_data, src_word_dict, tar_word_dict)
        src_train = [p[WORD_INDEX] for p in src_phrases]
        tar_train = [p[WORD_INDEX] for p in tar_phrases]
        print "Write Binary Data ..."
        # Persist the full phrase records too: the phrase2id maps built after the
        # if/else below need src_phrases/tar_phrases even when resuming (start_iter > 0).
        with open(temp_model, 'wb') as fout:
            pickle.dump(src_train, fout)
            pickle.dump(tar_train, fout)
            pickle.dump(src_tar_pair, fout)
            pickle.dump(src_phrases, fout)
            pickle.dump(tar_phrases, fout)
            pickle.dump(brae, fout)
            pickle.dump(np.random.get_state(), fout)
        if end_iter == 1:
            exit(1)
    else:
        with open(temp_model, 'rb') as fin:
            src_train = pickle.load(fin)
            tar_train = pickle.load(fin)
            src_tar_pair = pickle.load(fin)
            src_phrases = pickle.load(fin)
            tar_phrases = pickle.load(fin)
            brae = pickle.load(fin)
            np.random.set_state(pickle.load(fin))
    src_phrase2id = dict()
    tar_phrase2id = dict()
    for i, phrase in enumerate(src_phrases):
        src_phrase2id[phrase[TEXT_INDEX]] = i
    for i, phrase in enumerate(tar_phrases):
        tar_phrase2id[phrase[TEXT_INDEX]] = i
    train_pair = load_sub_data_pair(train_data, src_phrase2id, tar_phrase2id)
    dev_pair = load_sub_data_pair(dev_data, src_phrase2id, tar_phrase2id)
    test_pair = load_sub_data_pair(test_data, src_phrase2id, tar_phrase2id)
    brae.tune_hyper_parameter(src_train,
                              tar_train,
                              train_pair,
                              dev_pair,
                              test_pair,
                              brae_config,
                              model_name,
                              start_iter=start_iter,
                              end_iter=end_iter)
    brae.save_model("%s.tune.model" % model_name)
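Both the fresh-start branch (start_iter == 0) and the resume branch above rely on pickle's stream behaviour: objects dumped one after another into a single file come back in the same order through repeated pickle.load calls. A minimal, self-contained sketch of that checkpoint pattern (the file name and the state dict are placeholders, not the script's real data):

import pickle
import numpy as np

checkpoint = "example.checkpoint.temp"  # placeholder path

# Write: dump several objects sequentially into one file.
state = {"iteration": 0}
with open(checkpoint, 'wb') as fout:
    pickle.dump(state, fout)
    pickle.dump(np.random.get_state(), fout)

# Read: load them back in exactly the order they were written.
with open(checkpoint, 'rb') as fin:
    state = pickle.load(fin)
    np.random.set_state(pickle.load(fin))
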
Example #3
# Stdlib imports used below; project helpers such as WordEmbedding, read_phrase_list,
# read_para_list, read_trans_list and clean_text are assumed to be imported from the
# surrounding package.
import sys
import pickle

def main():
    min_count = int(sys.argv[1])
    dim = 50
    '''
    forced_decode_data = "data/brae.train.data"
    src_count_path = "data/src.trans.data"
    tar_count_path = "data/tar.trans.data"
    tar_para_path = "data/tar.para.data"
    src_para_path = "data/src.para.data"
    gbrae_data_name = "model/gbrae.data.min.count.%d.pkl" % min_count
    gbrae_dict_name = "model/gbrae.dict.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "model/gbrae.phrase.text.dict.pkl"
    '''
    forced_decode_data = "data/250w/tune_hyperparameter/tune.data"
    src_count_path = "data/250w/tune_hyperparameter/tune.data"
    #tar_count_path = "data/250w/phrase-table.filtered"
    tar_para_path = "data/250w/enBP_alignPhraProb.xml"
    src_para_path = "data/250w/chBP_alignPhraProb.xml"
    gbrae_data_name = "data/250w/tune_hyperparameter/gbrae.data.tune.min.count.%d.pkl" % min_count
    gbrae_dict_name = "data/250w/tune_hyperparameter/train/gbrae.dict.tune.min.count.%d.pkl" % min_count
    gbrae_phrase_dict_name = "data/250w/tune_hyperparameter/gbrae.tune.phrase.text.dict.pkl"
    print "Load Word Dict ..."
    en_embedding_name = "data/embedding/en.token.dim%d.bin" % dim
    zh_embedding_name = "data/embedding/zh.token.dim%d.bin" % dim
    tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name,
                                                         binary=True,
                                                         oov=True)
    src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name,
                                                         binary=True,
                                                         oov=True)
    print "Load All Data ..."
    src_phrases, tar_phrases, src_tar_pair = read_phrase_list(
        forced_decode_data, src_word_dict, tar_word_dict)
    print "Load Para Data ..."
    src_phrases = read_para_list(src_para_path, src_phrases, src_word_dict)
    tar_phrases = read_para_list(tar_para_path, tar_phrases, tar_word_dict)
    print "Load Trans Data ..."
    src_phrases, tar_phrases = read_trans_list(src_count_path, src_phrases,
                                               tar_phrases, src_word_dict,
                                               tar_word_dict)
    # tar_phrases, src_phrases = read_trans_list(tar_count_path, tar_phrases, src_phrases,
    #                                            tar_word_dict, src_word_dict)
    src_phrase2id = dict()
    tar_phrase2id = dict()
    for i, phrase in enumerate(src_phrases):
        src_phrase2id[phrase[TEXT_INDEX]] = i
    for i, phrase in enumerate(tar_phrases):
        tar_phrase2id[phrase[TEXT_INDEX]] = i
    src_phrases = clean_text(src_phrases)
    tar_phrases = clean_text(tar_phrases)
    with open(gbrae_dict_name, 'wb') as fout:
        print "Write Word Dict ..."
        pickle.dump(src_word_dict, fout)
        pickle.dump(tar_word_dict, fout)
    with open(gbrae_data_name, 'wb') as fout:
        print "Write Source Phrases Data ..."
        pickle.dump(src_phrases, fout)
        print "Write Target Phrases Data ..."
        pickle.dump(tar_phrases, fout)
        pickle.dump(src_tar_pair, fout)
    with open(gbrae_phrase_dict_name, 'wb') as fout:
        print "Write Source Phrases Dictionary ..."
        pickle.dump(src_phrase2id, fout)
        print "Write Target Phrases Dictionary ..."
        pickle.dump(tar_phrase2id, fout)
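A consumer of these pickle files has to read the objects back in the same order they were written above; a minimal loading-side sketch (the hard-coded min_count stands in for the script's sys.argv[1]):

import pickle

min_count = 1  # placeholder for sys.argv[1] of the preprocessing script above
gbrae_dict_name = "data/250w/tune_hyperparameter/train/gbrae.dict.tune.min.count.%d.pkl" % min_count
gbrae_data_name = "data/250w/tune_hyperparameter/gbrae.data.tune.min.count.%d.pkl" % min_count

with open(gbrae_dict_name, 'rb') as fin:
    src_word_dict = pickle.load(fin)
    tar_word_dict = pickle.load(fin)

with open(gbrae_data_name, 'rb') as fin:
    src_phrases = pickle.load(fin)
    tar_phrases = pickle.load(fin)
    src_tar_pair = pickle.load(fin)
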
Example #4
# Stdlib/third-party imports used below; project helpers such as WordEmbedding,
# BRAEConfig, pre_model, read_phrase_list and brae_predict are assumed to be imported
# from the surrounding package.
import sys
import pickle
import numpy as np

def main():
    train_test = sys.argv[1]
    if train_test not in ["train", "predict"]:
        # sys.stderr is a file object, not callable: write the usage hint explicitly.
        sys.stderr.write("train or predict\n")
        exit(1)
    config_name = sys.argv[2]
    forced_decode_data = "../gbrae/data/250w/phrase-table.filtered"
    phrase_data_path = "data/phrase.list"
    brae_config = BRAEConfig(config_name)
    train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (brae_config.dim,
                                                                                  brae_config.weight_rec,
                                                                                  brae_config.weight_sem,
                                                                                  brae_config.weight_l2,
                                                                                  brae_config.alpha,
                                                                                  brae_config.random_seed,
                                                                                  brae_config.batch_size,
                                                                                  brae_config.min_count,
                                                                                  brae_config.optimizer.param["lr"],)
    model_name = "model/%s" % train_name
    temp_model = model_name + ".temp"
    if train_test == "train":
        start_iter = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        end_iter = int(sys.argv[4]) if len(sys.argv) > 4 else 26
        pre_logger("brae_" + train_name)
        np.random.seed(brae_config.random_seed)
        if start_iter == 0:
            print "Load Dict ..."
            en_embedding_name = "../gbrae/data/embedding/en.token.dim%d.bin" % brae_config.dim
            zh_embedding_name = "../gbrae/data/embedding/zh.token.dim%d.bin" % brae_config.dim
            tar_word_dict = WordEmbedding.load_word2vec_word_map(en_embedding_name, binary=True, oov=True)
            src_word_dict = WordEmbedding.load_word2vec_word_map(zh_embedding_name, binary=True, oov=True)
            print "Compiling Model ..."
            brae = pre_model(src_word_dict, tar_word_dict, brae_config, verbose=True)
            print "Load All Data ..."
            src_phrases, tar_phrases, src_tar_pair = read_phrase_list(forced_decode_data, src_word_dict, tar_word_dict)
            src_train = [p[WORD_INDEX] for p in src_phrases]
            tar_train = [p[WORD_INDEX] for p in tar_phrases]
            print "Write Binary Data ..."
            with open(temp_model, 'wb') as fout:
                pickle.dump(src_train, fout)
                pickle.dump(tar_train, fout)
                pickle.dump(src_tar_pair, fout)
                pickle.dump(brae, fout)
                pickle.dump(np.random.get_state(), fout)
            if end_iter == 1:
                exit(1)
        else:
            with open(temp_model, 'rb') as fin:
                src_train = pickle.load(fin)
                tar_train = pickle.load(fin)
                src_tar_pair = pickle.load(fin)
                brae = pickle.load(fin)
                np.random.set_state(pickle.load(fin))
        brae.train(src_train, tar_train, src_tar_pair, brae_config, model_name, start_iter, end_iter)
        brae.save_model("%s.model" % model_name)
    elif train_test == "predict":
        num_process = int(sys.argv[3]) if len(sys.argv) > 3 else 0
        brae_predict(phrase_data_path, train_name + ".pred", model_file="%s.model" % model_name, num_process=num_process)
    else:
        sys.stderr.write("train or predict\n")
        exit(1)
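The train_name string packs the key hyper-parameters into the model file name, so models trained with different settings never overwrite each other; a small sketch of the same formatting with made-up values (the numbers are illustrative, not values from any config file):

dim, weight_rec, weight_sem, weight_l2 = 50, 1e-3, 1e-3, 1e-4
alpha, random_seed, batch_size, min_count, lr = 0.5, 1993, 50, 1, 0.01
train_name = "dim%d_lrec%f_lsem%f_ll2%f_alpha%f_seed%d_batch%d_min%d_lr%f" % (
    dim, weight_rec, weight_sem, weight_l2, alpha, random_seed, batch_size, min_count, lr)
print(train_name)
# dim50_lrec0.001000_lsem0.001000_ll20.000100_alpha0.500000_seed1993_batch50_min1_lr0.010000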