torch.manual_seed(123)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

'''
1. Data preparation
'''
data_dir = os.path.join(os.path.dirname(__file__), 'data')

en_train_path = os.path.join(data_dir, 'train.en')
en_val_path = os.path.join(data_dir, 'dev.en')
en_test_path = os.path.join(data_dir, 'test.en')

ja_train_path = os.path.join(data_dir, 'train.ja')
ja_val_path = os.path.join(data_dir, 'dev.ja')
ja_test_path = os.path.join(data_dir, 'test.ja')

en_vocab = Vocab()
ja_vocab = Vocab()

# Build the vocabularies from the training data only.
en_vocab.fit(en_train_path)
ja_vocab.fit(ja_train_path)

x_train = en_vocab.transform(en_train_path)
x_val = en_vocab.transform(en_val_path)
x_test = en_vocab.transform(en_test_path)

# Append an end-of-sequence token to every target sentence.
t_train = ja_vocab.transform(ja_train_path, eos=True)
t_val = ja_vocab.transform(ja_val_path, eos=True)
t_test = ja_vocab.transform(ja_test_path, eos=True)

def sort(x, t):
    lens = [len(i) for i in x]
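
# The body of sort() is cut off above. A minimal sketch of a plausible
# completion, assuming the helper orders source/target pairs by source
# length so that padded mini-batches waste less space (the exact ordering
# used in the original is an assumption):
def sort(x, t):
    lens = [len(i) for i in x]
    # Sort indices by source-sentence length, longest first.
    indices = sorted(range(len(lens)), key=lambda i: -lens[i])
    x = [x[i] for i in indices]
    t = [t[i] for i in indices]
    return x, t

# Hypothetical usage:
# x_train, t_train = sort(x_train, t_train)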
def main(args):
    print(args)

    ts = datetime.datetime.now().timestamp()
    logger = SummaryWriter(
        os.path.join('exp/qgen/', '{}_{}'.format(args.exp_name, ts)))
    logger.add_text('exp_name', args.exp_name)
    logger.add_text('args', str(args))

    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(args.seed)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    vocab = Vocab(os.path.join(args.data_dir, 'vocab.csv'), args.min_occ)
    category_vocab = CategoryVocab(
        os.path.join(args.data_dir, 'categories.csv'))

    load_vgg_features, load_resnet_features = False, False
    if args.visual_representation == 'vgg':
        load_vgg_features = True
    elif args.visual_representation == 'resnet-mlb':
        load_resnet_features = True

    data_loader = OrderedDict()
    splits = ['train', 'valid']
    for split in splits:
        file = os.path.join(args.data_dir, 'guesswhat.' + split + '.jsonl.gz')
        data_loader[split] = DataLoader(
            dataset=QuestionerDataset(
                split, file, vocab, category_vocab, True,
                load_vgg_features=load_vgg_features,
                load_resnet_features=load_resnet_features),
            batch_size=args.batch_size,
            shuffle=split == 'train',
            # collate_fn=QuestionerDataset.get_collate_fn(device),
            collate_fn=QuestionerDataset.collate_fn)

    model = QGen(len(vocab), args.word_embedding_dim,
                 args.num_visual_features, args.visual_embedding_dim,
                 args.hidden_size,
                 visual_representation=args.visual_representation,
                 query_tokens=vocab.answer_tokens).to(device)
    print(model)

    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=0)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)

    forward_kwargs_mapping = {
        'dialogue': 'source_dialogue',
        'dialogue_lengths': 'dialogue_lengths'
    }
    if load_vgg_features:
        forward_kwargs_mapping['visual_features'] = 'vgg_features'
    if load_resnet_features:
        forward_kwargs_mapping['visual_features'] = 'resnet_features'
    target_kwarg = 'target_dialogue'

    best_val_loss = 1e9
    for epoch in range(args.epochs):
        train_loss, _ = eval_epoch(model, data_loader['train'],
                                   forward_kwargs_mapping, target_kwarg,
                                   loss_fn, optimizer)
        valid_loss, _ = eval_epoch(model, data_loader['valid'],
                                   forward_kwargs_mapping, target_kwarg,
                                   loss_fn)

        # Checkpoint only when the validation loss improves.
        if valid_loss < best_val_loss:
            best_val_loss = valid_loss
            model.save(
                os.path.join('bin', 'qgen_{}_{}.pt'.format(args.exp_name, ts)))

        logger.add_scalar('train_loss', train_loss, epoch)
        logger.add_scalar('valid_loss', valid_loss, epoch)

        print(("Epoch {:2d}/{:2d} Train Loss {:07.4f} Valid Loss {:07.4f}"
               ).format(epoch, args.epochs, train_loss, valid_loss))
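
# eval_epoch() is not defined in this file. A minimal sketch under the
# assumption that it runs one pass over the loader, trains only when an
# optimizer is passed, and returns (mean loss, collected outputs); the
# real helper may differ in shape handling and return values:
def eval_epoch(model, loader, forward_kwargs_mapping, target_kwarg,
               loss_fn, optimizer=None):
    model.train(optimizer is not None)
    total_loss, total_items, outputs = 0.0, 0, []
    for batch in loader:
        # Map batch fields onto the model's forward() keyword arguments.
        kwargs = {k: batch[v] for k, v in forward_kwargs_mapping.items()}
        logits = model(**kwargs)
        target = batch[target_kwarg]
        loss = loss_fn(logits.view(-1, logits.size(-1)), target.view(-1))
        if optimizer is not None:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        total_loss += loss.item() * target.size(0)
        total_items += target.size(0)
        outputs.append(logits.detach())
    return total_loss / total_items, outputs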
def preprocess(field='body'):
    load_boilerpipe()
    binary = args.binary_html
    data_dir = args.data_dir
    max_vocab_size = args.max_vocab_size
    docs_dir = os.path.join(data_dir, 'docs')
    query_filepath = os.path.join(data_dir, 'query')
    train_filepath = os.path.join(data_dir, 'train.pointwise')
    test_filepath = os.path.join(data_dir, 'test.pointwise')
    vocab = Vocab(max_size=max_vocab_size)

    train_query_ids, train_doc_ids = get_query_doc_ids(train_filepath)
    test_query_ids, test_doc_ids = get_query_doc_ids(test_filepath)
    query_ids = train_query_ids | test_query_ids
    doc_ids = train_doc_ids | test_doc_ids
    print('total query: {}, total doc: {}'.format(len(query_ids), len(doc_ids)))

    query_dict = load_from_query_file(query_filepath)
    doc_dict = {}

    # Build the vocabulary from training queries and documents only.
    for qid in sorted(train_query_ids):
        for term in query_dict[qid].split():
            vocab.add(term)
    count = 0
    for docid in sorted(train_doc_ids):
        count += 1
        if count % 10000 == 0:
            print('processed {}0k docs'.format(count // 10000))
        loaded_html = load_from_html_cascade(
            os.path.join(docs_dir, docid + '.html'),
            binary=binary, field=[field])
        doc_dict[docid] = loaded_html[field]
        for term in doc_dict[docid]:
            vocab.add(term)
    vocab.build()
    vocab.save_to_file(os.path.join(data_dir, 'vocab'))

    # Encode all queries and documents, tracking the empty ones.
    empty_qid, empty_docid = set(), set()
    with open(os.path.join(data_dir, 'query.prep'), 'w') as fp:
        for qid in sorted(query_ids):
            qt = query_dict[qid].split()
            if len(qt) == 0:
                empty_qid.add(qid)
                continue
            fp.write('{}\t{}\n'.format(
                qid, ' '.join(map(str, vocab.encode(qt)))))
    with open(os.path.join(data_dir, 'docs.prep'), 'w') as fp:
        for docid in sorted(doc_ids):
            if docid in doc_dict:
                doc_text = doc_dict[docid]
            else:
                doc_text = load_from_html_cascade(
                    os.path.join(docs_dir, docid + '.html'),
                    binary=binary, field=[field])[field]
            if len(doc_text) == 0:
                empty_docid.add(docid)
                continue
            fp.write('{}\t{}\n'.format(
                docid, ' '.join(map(str, vocab.encode(doc_text)))))
    print('have {} empty queries and {} empty docs'.format(
        len(empty_qid), len(empty_docid)))

    # Drop samples whose query or document came out empty.
    filter_samples(train_filepath,
                   '{}.prep.{}'.format(*train_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
    filter_samples(test_filepath,
                   '{}.prep.{}'.format(*test_filepath.rsplit('.', 1)),
                   empty_qid, empty_docid)
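
# filter_samples() is defined elsewhere. A sketch of the likely behaviour,
# assuming each line of a .pointwise file starts with "qid\tdocid" (the
# exact field layout is an assumption):
def filter_samples(in_path, out_path, empty_qid, empty_docid):
    kept, dropped = 0, 0
    with open(in_path) as fin, open(out_path, 'w') as fout:
        for line in fin:
            qid, docid = line.split('\t')[:2]
            # Drop samples whose query or document was empty after parsing.
            if qid in empty_qid or docid in empty_docid:
                dropped += 1
                continue
            fout.write(line)
            kept += 1
    print('kept {} samples, dropped {}'.format(kept, dropped))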
parser.add_argument("--inter_alpha", type=float, default=0.1, help="adjust the penalty on intermediate labels") parser.add_argument("--corpus", type=str, default='raw', help="acd|raw") parser.add_argument("--mode", type=str, default='lstm', help="rnn|lstm") params, _ = parser.parse_known_args() if __name__ == '__main__': data = params.corpus print(data) assert data == 'acd_trees_128d' or data == 'acd_trees_512d' or data == 'raw' or data == 'acd_trees_512d_rand' train_data, dev_data, test_data = tr.simplified_data(0, 0, 0, data) print(len(train_data), len(dev_data), len(test_data)) print(train_data[0]) vocab = Vocab() train_sents = [t.get_words() for t in train_data] vocab.construct(list(itertools.chain.from_iterable(train_sents))) if params.mode == 'lstm': model = RNN_LSTM_Model(vocab, embed_size=embed_size).cuda() else: model = RNN_Model(vocab, embed_size=embed_size).cuda() loss_history = [] optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, dampening=0.0) for epoch in range(max_epochs): print("epoch = ", epoch)
max_conv_len = args.max_conversation_length
max_vocab_size = args.max_vocab_size
min_freq = args.min_vocab_frequency

print("Loading conversations...")
train = load_conversations(datasets_dir.joinpath("train.txt"))
valid = load_conversations(datasets_dir.joinpath("dev.txt"))
test = load_conversations(datasets_dir.joinpath("test.txt"))
print("#train=%d, #val=%d, #test=%d" % (len(train), len(valid), len(test)))

def to_pickle(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

vocab = Vocab(lang="zh")
for split_type, conversations in [('train', train), ('valid', valid),
                                  ('test', test)]:
    print(f'Processing {split_type} dataset...')
    split_data_dir = datasets_dir.joinpath(split_type)
    split_data_dir.mkdir(exist_ok=True)

    # Clip every conversation to the configured maximum length.
    conversation_length = [
        min(len(conv), max_conv_len) for conv in conversations
    ]
    sentences, sentence_length = pad_sentences(
        conversations,
        max_sentence_length=max_sent_len,
        max_conversation_length=max_conv_len)

    print('Saving preprocessed data at', split_data_dir)
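
# pad_sentences() is imported from elsewhere. A sketch of the expected
# contract, assuming conversations are lists of token lists and '<pad>'
# is the padding token (both assumptions):
def pad_sentences(conversations, max_sentence_length, max_conversation_length):
    all_sentences, all_lengths = [], []
    for conv in conversations:
        conv = conv[:max_conversation_length]
        padded, lengths = [], []
        for sent in conv:
            tokens = sent[:max_sentence_length]
            lengths.append(len(tokens))
            # Right-pad every sentence to a fixed width.
            padded.append(tokens + ['<pad>'] * (max_sentence_length - len(tokens)))
        all_sentences.append(padded)
        all_lengths.append(lengths)
    return all_sentences, all_lengths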
def test(args):
    vocab = Vocab()
    vocab.load(args.vocab, args.lowercase)
    vocab.add_special_token()
    sufvocab = Vocab()
    sufvocab.load(args.sufvocab, args.lowercase)
    sufvocab.add_special_token(['s>', '<UNK>'])
    pos2id = Vocab()
    pos2id.load(args.poslist)

    if args.gpu > -1:
        cuda.get_device(args.gpu).use()
        xp = cuda.cupy
    else:
        xp = np

    model = WordnnTagger.load(args.model)
    out_path = making_data(args.test_path, model.window)
    if args.gpu > -1:
        model.to_gpu()
    model.make_oov_vector(args.gpu > -1)

    # start evaluation
    n_data = 0
    n_correct = 0
    sum_loss = xp.zeros((), dtype=xp.float32)
    start = time.time()
    for tags, contexts in line_iter(out_path, args.minibatch, False):
        batch_ts = xp.array([pos2id[tag] for tag in tags], dtype=xp.int32)
        # Capitalization features must be computed before lowercasing.
        batch_caps = xp.array([[get_capf(word) for word in context]
                               for context in contexts], dtype=xp.int32)
        if args.lowercase:
            contexts = [[word.lower() for word in context]
                        for context in contexts]
        batch_xs = xp.array(
            [[vocab[word] for word in vocab.check_words(context)]
             for context in contexts], dtype=xp.int32)
        # maybe inefficient...
        batch_sufs = [[word[-2:] for word in context] for context in contexts]
        batch_sufs = xp.array(
            [[sufvocab[suf] for suf in sufvocab.check_words(sufs)]
             for sufs in batch_sufs], dtype=xp.int32)
        batch_features = [batch_xs, batch_sufs, batch_caps]

        cur_batch_size = batch_ts.shape[0]
        ys, loss = model(batch_features, batch_ts)
        sum_loss += loss.data * cur_batch_size
        pred_labels = ys.data.argmax(1)
        n_correct += sum(1 for j in range(cur_batch_size)
                         if pred_labels[j] == batch_ts[j])
        n_data += cur_batch_size
    end = time.time()

    accuracy = n_correct / n_data
    print('test loss : {}'.format(sum_loss))
    print('test accuracy : {}'.format(accuracy))
    print('(time to run : {})'.format(end - start))
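
# get_capf() computes the capitalization feature for a word. A sketch of
# one common encoding; the exact scheme used by this tagger is an
# assumption:
def get_capf(word):
    if word.isupper():
        return 3  # ALL CAPS
    if word[:1].isupper():
        return 2  # Initial capital
    if any(c.isupper() for c in word):
        return 1  # mIxed case
    return 0      # lowercase / no letters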
def main(argv=()):
    del argv  # Unused.
    vocab = Vocab()

    shp_p = tf.placeholder(tf.int32, shape=(2,))
    sen_batch_p = tf.placeholder(tf.int32, shape=(FLAGS.batch_size, None))
    mask_batch_p = tf.placeholder(tf.int32, shape=(FLAGS.batch_size, None))
    labels_batch_p = tf.placeholder(tf.int32, shape=(FLAGS.batch_size,))

    max_sampling = (FLAGS.sampling_mode == 'max')
    decoded_samples = model_sample(sen_batch_p, mask_batch_p, shp_p,
                                   labels_batch_p, max_sampling=max_sampling)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        coord = tf.train.Coordinator()
        saver.restore(sess, FLAGS.restore_ckpt_path)
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        for label in range(FLAGS.Nlabels):
            if FLAGS.flip_label:
                flip_label = 1 - label
            else:
                flip_label = label

            input_file = FLAGS.input_file.split(',')[label]
            with open(input_file, 'r') as f:
                input_sents = [sent.strip() for sent in f.readlines()]

            samples = []
            for it in range(len(input_sents) // FLAGS.batch_size + 1):
                labels_batch = np.array([0] * FLAGS.batch_size)
                sents = input_sents[it * FLAGS.batch_size:
                                    (it + 1) * FLAGS.batch_size]
                num_sents = len(sents)
                # Guard against an empty final slice: without it the
                # padding loop below would never terminate.
                if num_sents == 0:
                    break
                # Pad the final batch by cycling sentences from its start.
                while len(sents) < FLAGS.batch_size:
                    sents.extend(sents[:FLAGS.batch_size - len(sents)])
                sen_batch, mask_batch, shp = vocab.construct_batch(sents)
                out = sess.run(decoded_samples,
                               feed_dict={
                                   sen_batch_p: sen_batch,
                                   mask_batch_p: mask_batch,
                                   shp_p: shp,
                                   labels_batch_p: labels_batch
                               })
                # Keep only the samples that correspond to real inputs.
                for k in range(FLAGS.batch_size):
                    if k >= num_sents:
                        break
                    samples.append(vocab.convert_to_str(out[flip_label][k]))

            fname = '{}/{}_sample_{}.txt'.format(
                FLAGS.samples_dir, FLAGS.mdl_name, flip_label)
            with open(fname, 'w') as results_file:
                results_file.write('\n'.join(samples))

        coord.request_stop()
        coord.join(threads)
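
# Vocab.construct_batch() is defined elsewhere. A sketch of the contract
# implied by the feed_dict above, assuming it returns a token-id matrix,
# a 0/1 mask, and a [batch, maxlen] shape vector, with 0 as the padding
# id; word_to_id and unk_id are hypothetical attribute names:
def construct_batch(self, sents):
    tokenized = [s.split() for s in sents]
    max_len = max(len(t) for t in tokenized)
    ids = np.zeros((len(sents), max_len), dtype=np.int32)
    mask = np.zeros((len(sents), max_len), dtype=np.int32)
    for i, tokens in enumerate(tokenized):
        for j, tok in enumerate(tokens):
            ids[i, j] = self.word_to_id.get(tok, self.unk_id)
            mask[i, j] = 1
    return ids, mask, np.array([len(sents), max_len], dtype=np.int32)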
    sentences, sentence_length = pad_sentences(
        conv_sentences,
        max_sentence_length=max_sent_len,
        max_conversation_length=max_conv_len)

    # Every conversation must have one label per (truncated) sentence.
    for conv_len, label in zip(conversation_length, conv_labels):
        assert conv_len == len(label)

    print('Saving preprocessed data at', split_data_dir)
    to_pickle(conversation_length,
              split_data_dir.joinpath('conversation_length.pkl'))
    to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
    to_pickle(conv_labels, split_data_dir.joinpath('labels.pkl'))
    to_pickle(sentence_length,
              split_data_dir.joinpath('sentence_length.pkl'))
    to_pickle(iemocap.vids[split_type],
              split_data_dir.joinpath('video_id.pkl'))

    if split_type == 'train':
        print('Saving vocabulary...')
        vocab = Vocab(tokenizer)
        vocab.add_dataframe(conv_sentences)
        assert GLOVE_DIR != ""
        # Merge pretrained word vectors into the vocabulary.
        vocab.update(GLOVE_DIR, max_size=max_vocab_size, min_freq=min_freq)
        print('Vocabulary size: ', len(vocab))
        vocab.pickle(iemocap_dir.joinpath('word2id.pkl'),
                     iemocap_dir.joinpath('id2word.pkl'),
                     iemocap_dir.joinpath('word_emb.pkl'))
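
# vocab.update() pulls pretrained vectors into the vocabulary before the
# embedding matrix is pickled. A sketch of the GloVe alignment it implies,
# assuming the standard whitespace-separated "word v1 v2 ..." text format;
# load_glove is a hypothetical helper name:
def load_glove(glove_path, word2id, dim=300):
    # Random init so out-of-GloVe words still get a vector.
    emb = np.random.uniform(-0.1, 0.1, (len(word2id), dim))
    with open(glove_path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word = parts[0]
            if word in word2id:
                # Overwrite the random init with the pretrained vector.
                emb[word2id[word]] = np.asarray(parts[1:], dtype=np.float32)
    return emb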
args = parse_args()
logging.basicConfig(level=args.log_level)

train_data = "snli_1.0/snli_1.0_train.jsonl"
dev_data = "snli_1.0/snli_1.0_dev.jsonl"
test_data = "snli_1.0/snli_1.0_test.jsonl"

if args.no_cache or not os.path.exists("cache"):
    logging.info("Cache not found, reprocessing data.")
    train_sentence1, train_sentence2, train_labels = read(train_data, 550152)
    dev_sentence1, dev_sentence2, dev_labels = read(dev_data, 10000)
    test_sentence1, test_sentence2, test_labels = read(test_data, 10000)
    # Build the vocabulary from both sides of the training pairs.
    raw_data = chain(*train_sentence1, *train_sentence2)
    vocab = Vocab(raw_data)
    if not os.path.exists("cache"):
        os.makedirs("cache")
    pickle.dump(vocab, open("cache/vocab.p", "wb"))
    pickle.dump([train_sentence1, train_sentence2, train_labels],
                open("cache/train.p", "wb"))
    pickle.dump([test_sentence1, test_sentence2, test_labels],
                open("cache/test.p", "wb"))
    pickle.dump([dev_sentence1, dev_sentence2, dev_labels],
                open("cache/dev.p", "wb"))
else:
    logging.info("Loading data from the cache.")
    vocab = pickle.load(open("cache/vocab.p", "rb"))
    train_sentence1, train_sentence2, train_labels = pickle.load(
        open("cache/train.p", "rb"))
    dev_sentence1, dev_sentence2, dev_labels = pickle.load(
        open("cache/dev.p", "rb"))
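
# read() is defined elsewhere. A sketch of the likely parser, based on the
# public SNLI JSONL schema (sentence1 / sentence2 / gold_label fields);
# whitespace tokenization is an assumption, and `import json` is assumed:
def read(path, max_examples):
    sentence1, sentence2, labels = [], [], []
    with open(path) as f:
        for i, line in enumerate(f):
            if i >= max_examples:
                break
            example = json.loads(line)
            if example["gold_label"] == "-":
                continue  # skip pairs without annotator consensus
            sentence1.append(example["sentence1"].split())
            sentence2.append(example["sentence2"].split())
            labels.append(example["gold_label"])
    return sentence1, sentence2, labels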