        '%(filename)-20s LINE %(lineno)-4d %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

init_logging()

# ----------------- 1. Process the data ---------------------------------------
data_dir = '/slfs1/users/zjz17/github/data/ptb_data/'
vocab_path = os.path.join(data_dir, 'ptb.vocab.txt')
train_path = os.path.join(data_dir, 'ptb.train.txt')
valid_path = os.path.join(data_dir, 'ptb.valid.txt')
test_path = os.path.join(data_dir, 'ptb.test.txt')

word2idx = read_dict(vocab_path)
ignore_label = word2idx.get('<pad>')
data_train, label_train = get_text_id(train_path, word2idx)
data_valid, label_valid = get_text_id(valid_path, word2idx)

# ----------------- 2. Parameter Definition ------------------------------------
num_buckets = 1
batch_size = 32

# network parameters
num_lstm_layer = 1
input_size = len(word2idx)
dropout = 0.0
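# Illustrative sketch only: `read_dict` and `get_text_id` used above are
# implemented elsewhere in this repo. The hypothetical helpers below show the
# behaviour assumed here (one token per vocab line; for language modelling the
# label is the input shifted by one position); they are not the actual code.
def _read_dict_sketch(vocab_path):
    word2idx = {'<pad>': 0}
    with open(vocab_path) as f:
        for line in f:
            word = line.strip()
            if word and word not in word2idx:
                word2idx[word] = len(word2idx)
    return word2idx

def _get_text_id_sketch(text_path, word2idx):
    data, label = [], []
    with open(text_path) as f:
        for line in f:
            ids = [word2idx.get(w, word2idx.get('<unk>', 0)) for w in line.split()]
            if len(ids) < 2:
                continue
            data.append(ids[:-1])   # input tokens
            label.append(ids[1:])   # next-token targets
    return data, label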
elif mode == 'static':
    use_word2vec = True
    fixed_embed = True
else:
    use_word2vec = True
    fixed_embed = False
word2vec_path = w2v_file
vocab_file = 'vocab.txt'

sentences, label = read_file(train_file)
sentences_padded = pad_sentences(sentences)
build_vocab(sentences_padded, vocab_file, size=None)
logging.info('total sentences lines: %d' % len(sentences_padded))

word2idx = read_dict(vocab_file)
logging.info('dict length: %d' % len(word2idx))

valid_sentences, valid_label = read_file(valid_file)
valid_sentences_padded = pad_sentences(valid_sentences)

train_data, train_label = get_text_id(sentences_padded, label, word2idx)
valid_data, valid_label = get_text_id(valid_sentences_padded, valid_label, word2idx)

print 'train data shape: ', train_data.shape
print 'example: ', train_label[0], '\t=>\t', train_data[0]
print 'valid data shape: ', valid_data.shape
print 'example: ', valid_label[0], '\t=>\t', valid_data[0]

# ---------------------- 2. Parameter Definition ----------------------------------------
batch_size = 50
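# Illustrative sketch only: `pad_sentences` used above is defined elsewhere in
# this repo. The hypothetical helper below shows the assumed behaviour
# (right-pad every tokenised sentence to the longest length); the '<pad>'
# token and the padding side are assumptions, not the actual implementation.
def _pad_sentences_sketch(sentences, pad_word='<pad>'):
    max_len = max(len(s) for s in sentences)
    return [s + [pad_word] * (max_len - len(s)) for s in sentences]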
def see_hidden_vectos(params_dir='optimal_params', params_prefix='couplet', epoch=20):
    _, arg_params, __ = mx.model.load_checkpoint(
        '%s/%s' % (params_dir, params_prefix), epoch)
    results = []
    # 'moban' (template) holds the test input patterns; note the second
    # assignment overrides the first.
    moban = [[5, 5, 5, 5, 5], [10, 10, 10, 10, 10], [15, 15, 15, 15, 15],
             [20, 20, 20, 20, 20], [25, 25, 25, 25, 25]]
    moban = [[5, 10, 15, 20, 25], [15, 15, 15, 15, 15], [10, 10, 10, 20, 25],
             [10, 20, 20, 20, 5], [13, 14, 15, 16, 17]]
    num = 50
    for i in range(len(moban)):
        results.append([])

    # parameter definition
    data_dir = '/slfs1/users/zjz17/github/data/sort'
    vocab_file = 'q3.vocab'
    enc_word2idx = read_dict(os.path.join(data_dir, vocab_file))
    dec_word2idx = read_dict(os.path.join(data_dir, vocab_file))
    num_lstm_layer = 1
    num_embed = 100
    num_hidden = 200
    num_label = len(dec_word2idx)
    batch_size = 1
    enc_input_size = len(enc_word2idx)
    dec_input_size = len(dec_word2idx)
    enc_dropout = 0.0
    dec_dropout = 0.0
    output_dropout = 0.2

    dg = DataGeneration(1000, 1, 1, 1)
    for i in range(len(moban)):
        lis = dg.generate_test_pairs(moban[i], num)
        for l in lis:
            enc_len = len(l)
            enc_data = mx.nd.array(np.array(l).reshape(1, enc_len) + 3)
            enc_mask = mx.nd.array(np.ones((enc_len, )).reshape(1, enc_len))
            beam = BeamSearch(
                num_lstm_layer=num_lstm_layer,
                enc_data=enc_data,
                enc_mask=enc_mask,
                enc_len=enc_len,
                enc_input_size=enc_input_size,
                dec_input_size=dec_input_size,
                num_hidden=num_hidden,
                num_embed=num_embed,
                num_label=num_label,
                batch_size=batch_size,
                arg_params=arg_params,
                eos=dec_word2idx.get('<EOS>'),
                unk=dec_word2idx.get('<UNK>'),
                pad=dec_word2idx.get('<PAD>'),
                ctx=mx.cpu(),
                enc_dropout=enc_dropout,
                dec_dropout=dec_dropout,
                output_dropout=output_dropout)
            v = beam.init_states_dict['dec_l0_init_c'].asnumpy()
            results[i].append(v)

    # stack the encoded hidden vectors of all examples, template by template
    ff = []
    for i in range(len(moban)):
        ff.append(results[i][0])
    for i in range(1, num):
        for j in range(len(moban)):
            ff[j] = np.concatenate((ff[j], results[j][i]))
    f = np.concatenate((ff[0], ff[1]))
    for i in range(2, len(moban)):
        f = np.concatenate((f, ff[i]))

    model = TSNE(n_components=3, random_state=0, learning_rate=500, n_iter=2000)
    x = model.fit_transform(f)
    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k']
    for i in range(len(moban)):
        tmp = range(i * 50, (i + 1) * 50)
        plt.scatter(x[tmp, 1], x[tmp, 0], s=20, marker='o',
                    color=colors[i], label='%s' % moban[i])
    '''
    Three-dimensional graph:
    from mpl_toolkits.mplot3d import Axes3D
    fig = plt.figure()
    ax = Axes3D(fig)
    for i in range(len(moban)):
        tmp = range(i * 50, (i + 1) * 50)
        ax.scatter(x[tmp, 0], x[tmp, 1], x[tmp, 2], s=20, marker='o',
                   color=colors[i], label='%s' % moban[i])
    '''
    plt.legend(loc='upper left')
    plt.title('Encoded Hidden Vector T-SNE Visualization')
    plt.savefig('ff.jpg')
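# Example invocation with the default arguments; 'optimal_params/couplet' and
# epoch 20 must point at an existing trained checkpoint on disk.
if __name__ == '__main__':
    see_hidden_vectos(params_dir='optimal_params', params_prefix='couplet', epoch=20)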
    default=0, type=int, help='which line you want to test')
parser.add_argument(
    '--mode', default='test', type=str,
    help='test one example or write example results into a file')
args = parser.parse_args()
print args

filename = args.filename
idx = args.idx
mode = args.mode

enc_word2idx = read_dict(os.path.join(data_dir, vocab_file))
dec_word2idx = read_dict(os.path.join(data_dir, vocab_file))
print 'encoder dict length:', len(enc_word2idx)
print 'decoder dict length:', len(dec_word2idx)

enc_data, dec_data = get_enc_dec_text_id(filename, enc_word2idx, dec_word2idx)
print 'enc_data length: ', len(enc_data),
print 'example:', enc_data[0]
print 'dec_data_length: ', len(dec_data)
print 'example:', dec_data[0]

# ------------------------------- Parameter Definition -------------------------------
# network parameters
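# Illustrative sketch only: `get_enc_dec_text_id` used above is implemented
# elsewhere in this repo. The hypothetical helper below shows the behaviour
# assumed here: one source/target pair per line, assumed tab-separated, each
# side mapped to word ids with '<UNK>' as the fallback. Not the actual code.
def _get_enc_dec_text_id_sketch(path, enc_word2idx, dec_word2idx):
    enc_data, dec_data = [], []
    with open(path) as f:
        for line in f:
            parts = line.strip().split('\t')
            if len(parts) != 2:
                continue
            src, tgt = parts
            enc_data.append([enc_word2idx.get(w, enc_word2idx.get('<UNK>')) for w in src.split()])
            dec_data.append([dec_word2idx.get(w, dec_word2idx.get('<UNK>')) for w in tgt.split()])
    return enc_data, dec_data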
Model = namedtuple("Model", ['executor', 'symbol']) logging.basicConfig(level = logging.DEBUG, format = '%(asctime)s %(message)s', datefmt = '%m-%d %H:%M:%S %p', filename = 'Log', filemode = 'w') logger = logging.getLogger() console = logging.StreamHandler() console.setLevel(logging.DEBUG) logger.addHandler(console) DEBUG = True # ----------------- 1. Process the data --------------------------------------- enc_word2idx = read_dict('../data/sort_test/vocab.txt') dec_word2idx = read_dict('../data/sort_test/vocab.txt') ignore_label = enc_word2idx.get('<PAD>') if DEBUG: print 'read_dict length:', len(enc_word2idx) enc_data, dec_data = get_enc_dec_text_id('../data/sort_test/tt.txt', enc_word2idx, dec_word2idx) enc_valid, dec_valid = get_enc_dec_text_id('../data/sort_test/tt.txt', enc_word2idx, dec_word2idx) if DEBUG: print 'enc_data length: ' , len(enc_data), enc_data[0:1] print 'dec_data_length: ' , len(dec_data), dec_data[0:1] print 'enc_valid_length: ', len(enc_valid), enc_valid[0:1] print 'dec_valid_length: ', len(dec_valid), dec_valid[0:1] # -----------------2. Params Defination ----------------------------------------