def __init__(self, w2v_ph, model_root, strees=None, n_iters=1000,
             n_step2save=250, window=5, tolerance=0.03):
    """Build a parse-tree autoencoder execution frame.

    Args:
        w2v_ph: path to a pre-trained word2vec model file.
        model_root: directory where model checkpoints are stored.
        strees: optional list of syntax trees to train on (defaults to
            an empty list).
        n_iters: total number of training iterations.
        n_step2save: checkpoint interval, in iterations.
        window: context window size.
        tolerance: convergence tolerance.
    """
    # BUG FIX: the original signature used `strees=[]`. A mutable default
    # is created once and shared across every call, so one instance could
    # observe mutations made through another. Use the None sentinel idiom
    # instead; behavior for all existing callers is unchanged.
    if strees is None:
        strees = []
    _word2vec = Word2Vec()
    _word2vec.model_fromfile(w2v_ph)
    model = _ParseTreeAutoencoder(
        word2vec=_word2vec,
        strees=strees,
    )
    ExecFrame.__init__(
        self,
        model=model,
        model_root=model_root,
        n_iters=n_iters,
        n_step2save=n_step2save,
        window=window,
        tolerance=tolerance)
    # running compression/convergence rates — updated elsewhere during
    # training (only initialized here).
    self.c_rate = 0.0
    self.cur_c_rate = 0.0
# Validation/inference preprocessing for the attention summarization model.
TARGET_LEN = 40  # max length of the target (summary) side
pretrained_ckpt = 'attention/model_best_rouge1.ckpt'
device = 'cuda'  # NOTE(review): assumes a CUDA GPU is available — confirm

# read data
print('reading data...')
from _utils import read_jsonl
valid_X, valid_Y = read_jsonl(VALID_FILE_PATH)
print('done')

# load pretrained word embedding (raw=False: load the already-built table)
print('loading word embedding...')
from _word2vec import Word2Vec
word2vec = Word2Vec(EMBEDDING_SAVE_PATH, 300, raw=False)
embedding = word2vec.embedding
SOS_token = word2vec.word2idx['<SOS>']  # start-of-sentence id
EOS_token = word2vec.word2idx['<EOS>']  # end-of-sentence id
PAD_token = word2vec.word2idx['<PAD>']  # padding id
UNK_token = word2vec.word2idx['<UNK>']  # unknown-word id
print('done')

# transform sentences to index sequences
print('valid_X')
valid_X = word2vec.sent2idx(valid_X, INPUT_LEN)
# BUG FIX: the targets were padded/truncated with INPUT_LEN while
# TARGET_LEN was defined above but never used; the summary side must use
# TARGET_LEN (cf. the sibling script where len_X=40 and len_Y=30 differ).
valid_Y = word2vec.sent2idx(valid_Y, TARGET_LEN)

# convert them to dataset and dataloader
import torch
# Training-time preprocessing for the seq2seq model: read all splits, build
# the embedding/vocabulary over them, and persist the vocabulary for reuse.
CKPT_NAME = 'seq2seq/model.ckpt'  # where the trained checkpoint will live
device = 'cuda'  # NOTE(review): assumes a CUDA GPU is available — confirm

# read data
print('reading data...')
from _utils import read_jsonl
train_X, train_Y = read_jsonl(TRAIN_FILE_PATH)
valid_X, valid_Y = read_jsonl(VALID_FILE_PATH)
# second argument False: presumably "no labels" for the test split — the
# Y side is discarded; TODO confirm against _utils.read_jsonl
test_X, _ = read_jsonl(TEST_FILE_PATH, False)
print('done')

# load pretrained word embedding
print('loading word embedding...')
from _word2vec import Word2Vec
word2vec = Word2Vec(EMBEDDING_FILE_PATH, EMBEDDING_DIM)
# Build the vocabulary/embedding matrix over every split so no token seen
# later falls outside the table. MIN_DISCARD_LEN presumably filters rare
# words — TODO confirm semantics in _word2vec.make_embedding.
embedding = word2vec.make_embedding(
    [train_X, train_Y, valid_X, valid_Y, test_X], MIN_DISCARD_LEN)
SOS_token = word2vec.word2idx['<SOS>']  # start-of-sentence id
EOS_token = word2vec.word2idx['<EOS>']  # end-of-sentence id
PAD_token = word2vec.word2idx['<PAD>']  # padding id
UNK_token = word2vec.word2idx['<UNK>']  # unknown-word id
print('done')

# dump word2vec object so downstream scripts reload the exact same vocabulary
import pickle
with open(EMBEDDING_SAVE_PATH, 'wb') as f:
    tmp = {}
    tmp['embedding'] = word2vec.embedding
    tmp['word2idx'] = word2vec.word2idx
    # NOTE(review): chunk is truncated here — a pickle.dump(tmp, f)
    # presumably follows in the original file.
# NOTE(review): this chunk begins mid-dict — the opening `{ ...` of the args
# dict is outside the visible span. Continuation lines inside brackets are
# indentation-free in Python, so the reconstruction below stays valid once
# rejoined with the missing opening.
'len_X': 40,              # max source-sequence length
'len_Y': 30,              # max target-sequence length
'BATCH_SIZE': 64,
'model_name': sys.argv[3],
}
args = argparse.Namespace(**args)

# set random seed for reproducibility across python/numpy/torch (CPU + GPU)
random.seed(1003)
np.random.seed(1003)
torch.manual_seed(1003)
torch.cuda.manual_seed_all(1003)
torch.backends.cudnn.deterministic = True

# word2vec: one vocabulary per language (English / Chinese)
# NOTE(review): the '{}' placeholder is never filled via .format() — this
# looks like it passes the literal filename '{}_en.json'; confirm whether a
# .format(split) call was lost, or Word2Vec formats the template itself.
en, cn = Word2Vec(os.path.join(args.dir, '{}_en.json')), Word2Vec(
    os.path.join(args.dir, '{}_cn.json'))
en_BOS_token = en.word2idx['<BOS>']  # English begin-of-sentence id
en_EOS_token = en.word2idx['<EOS>']  # English end-of-sentence id
en_PAD_token = en.word2idx['<PAD>']  # English padding id
en_UNK_token = en.word2idx['<UNK>']  # English unknown-word id
cn_BOS_token = cn.word2idx['<BOS>']  # Chinese begin-of-sentence id
cn_EOS_token = cn.word2idx['<EOS>']  # Chinese end-of-sentence id
cn_PAD_token = cn.word2idx['<PAD>']  # Chinese padding id
cn_UNK_token = cn.word2idx['<UNK>']  # Chinese unknown-word id
# train, valid, test datas
#train_X, train_Y = read_data(os.path.join(args.dir, 'training.txt'))
#valid_X, valid_Y = read_data(os.path.join(args.dir, 'validation.txt'))
        # NOTE(review): the three statements below are the tail of a method
        # whose `def` starts before this chunk; the relative indentation of
        # the first `return` (two consecutive returns otherwise make the
        # second unreachable) is reconstructed — confirm against the full
        # file. Everything here is Python 2 (print statements).
        return bt.forward_update_vec()
    # fall-through path: the sentence vector is the root node's vector
    sentence_vec = bt.root.vector
    return sentence_vec


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) == 0:
        # usage: tree paths arrive on stdin, model paths as argv
        print 'cat stree_paths | ./cmd.py w2v_ph bae_ph'
        sys.exit(-1)
    stree_paths = sys.stdin.read().split()
    # args: word2vec model path, binary-autoencoder model path
    w2v_ph, bae_ph = args
    # load word2vec
    _word2vec = Word2Vec()
    _word2vec.model_fromfile(w2v_ph)
    # load bae (deserialized from file — trusted local input only)
    bae = obj_from_file(bae_ph)
    tree2vec = Tree2Vec(_word2vec, bae)
    for path in stree_paths:
        output = []
        with open(path) as f:
            strees = f.readlines()
        # used to recover the original valid sentences: records which input
        # line numbers produced a usable sentence vector
        valid_line_nos = []
        for no, stree in enumerate(strees):
            stree = stree.strip()
            #print 'parsing', stree
            sentence_vec = tree2vec.get_vec_from_stree(stree)
            if sentence_vec is not None:
                # NOTE(review): chunk is truncated here — the body of this
                # `if` continues past the visible span.