Example #1
def main(args):
    # Load the arguments.
    model_dir = os.path.dirname(args.model_path)
    params = Dict2Obj(
        json.load(open(os.path.join(model_dir, "args.json"), "r")))

    # Config logging
    log_format = '%(levelname)-8s %(message)s'
    logfile = os.path.join(model_dir, 'eval.log')
    logging.basicConfig(filename=logfile,
                        level=logging.INFO,
                        format=log_format)
    logging.getLogger().addHandler(logging.StreamHandler())
    logging.info(json.dumps(args.__dict__))
    # Load vocabulary wrapper.
    vocab = load_vocab(params.vocab_path)

    # Build data loader
    logging.info("Building data loader...")

    # Load GloVe embedding.
    if params.use_glove:
        embedding = get_glove_embedding(params.embedding_name, 300, vocab)
    else:
        embedding = None

    # Processing input text
    logging.info("Processing input text...")
    text, length = process_text(args.text, vocab, max_length=20)
    d_text = text

    logging.info("Done")
    # Build the models
    logging.info('Creating IQ model...')
    model = Classifier(len(vocab),
                       embedding_dim=params.embedding_dim,
                       embedding=embedding,
                       hidden_dim=params.num_hidden_nodes,
                       output_dim=params.num_output_nodes,
                       num_layers=params.num_layers,
                       bidirectional=params.bidirectional,
                       dropout=params.dropout,
                       rnn_cell=params.rnn_cell)

    logging.info("Done")

    logging.info("Loading model.")
    model.load_state_dict(
        torch.load(args.model_path + "model-tf-" + args.state + ".pkl"))

    # Setup GPUs.
    if torch.cuda.is_available():
        logging.info("Using available GPU...")
        model.cuda()

    predict(model, d_text)
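Dict2Obj is not defined in these snippets; a minimal sketch of such a helper, assuming it simply exposes the loaded JSON keys as attributes (hypothetical, not necessarily the original project's version):

class Dict2Obj:
    # Hypothetical helper: wraps a dict so params.vocab_path reads like an attribute.
    def __init__(self, d):
        for key, value in d.items():
            setattr(self, key, Dict2Obj(value) if isinstance(value, dict) else value)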
Example #2
def pretrain(source_data_loader,
             test_data_loader,
             no_classes,
             embeddings,
             epochs=20,
             batch_size=128,
             cuda=False):

    classifier = Classifier()
    encoder = Encoder(embeddings)

    if cuda:
        classifier.cuda()
        encoder.cuda()
    ''' Jointly optimize both encoder and classifier '''
    encoder_params = filter(lambda p: p.requires_grad, encoder.parameters())
    optimizer = optim.Adam(
        list(encoder_params) + list(classifier.parameters()))

    # Use weights to compensate for class imbalance in the data
    c = [1] * len(no_classes)
    weights = torch.FloatTensor(len(no_classes))
    for i, (a, b) in enumerate(zip(c, no_classes)):
        weights[i] = 0 if b == 0 else a / b

    loss_fn = nn.CrossEntropyLoss(weight=Variable(weights))

    print('Training encoder and classifier')
    for e in range(epochs):

        # pretrain with whole source data -- use groups with DCD
        for sample in source_data_loader:
            x, y = Variable(sample[0]), Variable(sample[1])
            optimizer.zero_grad()

            if cuda:
                x, y = x.cuda(), y.cuda()

            output = model_fn(encoder, classifier)(x)

            loss = loss_fn(output, y)

            loss.backward()

            optimizer.step()

        print("Epoch", e, "Loss", loss.data[0], "Accuracy",
              eval_on_test(test_data_loader, model_fn(encoder, classifier)))

    return encoder, classifier
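The weight vector above amounts to inverse class-frequency weighting, where no_classes appears to hold the number of samples per class. A self-contained sketch of the same idea with hypothetical counts, written against current PyTorch (no Variable wrapper):

import torch
import torch.nn as nn

class_counts = [500, 50, 5]                        # hypothetical per-class sample counts
weights = torch.tensor([0.0 if c == 0 else 1.0 / c for c in class_counts])
loss_fn = nn.CrossEntropyLoss(weight=weights)      # rare classes contribute more to the loss

logits = torch.randn(8, len(class_counts))         # fake batch of classifier outputs
targets = torch.randint(len(class_counts), (8,))   # fake integer labels
print(loss_fn(logits, targets).item())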
Example #3
def main(args):
	vecs_builder = VecsBuilder(vecs_path='./glove/glove.6B.300d.txt')
	vecs = vecs_builder.get_data()

	train_dataset = Loader(args.max_length,vecs,'train')
	train_loader = DataLoader(train_dataset, batch_size = args.batch_size, num_workers = 5)
	val_dataset = Loader(args.max_length,vecs,'val')
	val_loader = DataLoader(val_dataset, batch_size = args.batch_size)
	model = Classifier(args.embed_dim, args.hidden_dim,args.num_classes,args.num_hidden_layers)

	if torch.cuda.is_available():
		print('Cuda Functioning..')
		model.cuda()

	best_acc = 0
	automated_log = open('models/automated_log.txt','w+')
	automated_log.write('Epochs'+'\t'+'Train-Loss'+'\t'+'Train-Accuracy'+'\t'+'Validation Loss'+'\t'+'Validation Accuracy\n')

	for epoch in tqdm(range(args.num_epochs)):
		train_loss,train_acc = train(model,train_loader)
		val_loss,val_acc = eval(model,val_loader)
		train_acc = train_acc/train_dataset.num_samples
		val_acc = val_acc/val_dataset.num_samples
		# print('Epoch : ',epoch)
		# print('Train Loss : ',train_loss)
		# print('Train Acc : ',train_acc)
		# print('Validation Loss : ',val_loss)
		# print('Validation Acc : ',val_acc)
		automated_log.write(str(epoch)+'\t'+str(train_loss)+'\t'+str(train_acc)+'\t'+str(val_loss)+'\t'+str(val_acc)+'\n')
		if epoch%10==0:
			model_name = 'models/model_'+str(epoch)+'.pkl'
			torch.save(model.state_dict(),model_name)
		if val_acc>best_acc:
			best_acc = val_acc
			best_model = 'best.pkl'
			torch.save(model.state_dict(),best_model)
			f = open('models/best.txt','w+')
			report = 'Epoch : '+str(epoch)+'\t Validation Accuracy : '+str(best_acc)
			f.write(report)
			f.close()
			print('Best Model Saved with Valdn Accuracy :',val_acc)
	automated_log.close()
Example #4
def pretrain(data, epochs=5, batch_size=128, cuda=False):

    X_s, y_s, _, _ = data

    test_dataloader = mnist_dataloader(train=False, cuda=cuda)

    classifier = Classifier()
    encoder = Encoder()

    if cuda:
        classifier.cuda()
        encoder.cuda()

    ''' Jointly optimize both encoder and classifier ''' 
    optimizer = optim.Adam(list(encoder.parameters()) + list(classifier.parameters()))
    loss_fn = nn.CrossEntropyLoss()
    
    for e in range(epochs):
        
        for _ in range(len(X_s) // batch_size):
            inds = torch.randperm(len(X_s))[:batch_size]

            x, y = Variable(X_s[inds]), Variable(y_s[inds])
            optimizer.zero_grad()

            if cuda:
                x, y = x.cuda(), y.cuda()

            y_pred = model_fn(encoder, classifier)(x)

            loss = loss_fn(y_pred, y)

            loss.backward()

            optimizer.step()

        print("Epoch", e, "Loss", loss.data[0], "Accuracy", eval_on_test(test_dataloader, model_fn(encoder, classifier)))
    
    return encoder, classifier
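model_fn and eval_on_test are not shown in Example #2 or Example #4; a minimal sketch of what such helpers could look like, assuming model_fn chains the encoder into the classifier and the test loader yields (inputs, labels) batches (hypothetical implementations):

import torch

def model_fn(encoder, classifier):
    # Hypothetical composition helper: the classifier consumes the encoder's features.
    return lambda x: classifier(encoder(x))

def eval_on_test(loader, model):
    # Hypothetical accuracy helper over (inputs, labels) batches.
    correct, total = 0, 0
    with torch.no_grad():
        for x, y in loader:
            correct += (model(x).argmax(dim=1) == y).sum().item()
            total += y.numel()
    return correct / max(total, 1)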
Example #5
    if data_list[stage] is not None:
        data_loader.set_data_list(data_list[stage])
    data_gen[stage] = DataGenerator(data_loader, generator_config[stage])

# - GPUs
os.environ['CUDA_VISIBLE_DEVICES'] = str(config['gpus'])
torch.backends.cudnn.enabled = True

# - model
model = Classifier(out_channels=2)
if args.checkpoint is not None:
    model.load_state_dict(torch.load(args.checkpoint))
    print('Load checkpoint:', args.checkpoint)

if torch.cuda.device_count() > 0:
    model = model.cuda()
model.zero_grad()

# - optimizer
optim = Optimizer(config['optimizer'])(model)
optim.zero_grad()

weight = torch.tensor([0.1, 0.99])
if torch.cuda.device_count() > 0:
    weight = weight.cuda()
criterion = torch.nn.CrossEntropyLoss(weight)


def F1_score(predis, labels):
    return 2 * torch.sum(predis * labels) / torch.sum(predis + labels)
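With 0/1 prediction and label tensors, the F1_score helper above reduces to the usual F1: 2 * sum(p * l) is 2 * TP and sum(p + l) is 2 * TP + FP + FN. A quick check on hypothetical tensors:

import torch

predis = torch.tensor([1., 1., 0., 0.])
labels = torch.tensor([1., 0., 1., 0.])
# TP = 1, FP = 1, FN = 1  ->  F1 = 2*1 / (2*1 + 1 + 1) = 0.5
print(2 * torch.sum(predis * labels) / torch.sum(predis + labels))  # tensor(0.5000)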
Example #6
def main(args):
    # Load the arguments.
    model_dir = os.path.dirname(args.model_path)
    params = Dict2Obj(
        json.load(open(os.path.join(model_dir, "args.json"), "r")))

    # Config logging
    log_format = '%(levelname)-8s %(message)s'
    logfile = os.path.join(model_dir, 'eval.log')
    logging.basicConfig(filename=logfile,
                        level=logging.INFO,
                        format=log_format)
    logging.getLogger().addHandler(logging.StreamHandler())
    logging.info(json.dumps(args.__dict__))
    # Load vocabulary wrapper.
    vocab = load_vocab(params.vocab_path)

    # Build data loader
    logging.info("Building data loader...")

    # Load GloVe embedding.
    if params.use_glove:
        embedding = get_glove_embedding(params.embedding_name, 300, vocab)
    else:
        embedding = None

    # Build data loader
    logging.info("Building data loader...")
    data_loader = get_loader(args.dataset,
                             args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers,
                             max_examples=args.max_examples)
    logging.info("Done")
    # Build the models
    logging.info('Creating a multi class classification model...')
    model = Classifier(len(vocab),
                       embedding_dim=params.embedding_dim,
                       embedding=embedding,
                       hidden_dim=params.num_hidden_nodes,
                       output_dim=params.num_output_nodes,
                       num_layers=params.num_layers,
                       bidirectional=params.bidirectional,
                       dropout=params.dropout,
                       rnn_cell=params.rnn_cell)

    logging.info("Done")

    logging.info("Loading model.")
    model.load_state_dict(
        torch.load(args.model_path + "model-tf-" + args.state + ".pkl"))

    # Setup GPUs.
    if torch.cuda.is_available():
        logging.info("Using available GPU...")
        model.cuda()
    scores, gts, preds = evaluate(model, data_loader, vocab, args, params)

    # Print and save the scores.
    print(scores)
    with open(os.path.join(model_dir, args.results_path), 'w') as results_file:
        json.dump(scores, results_file)
    with open(os.path.join(model_dir, args.preds_path), 'w') as preds_file:
        json.dump(preds, preds_file)
    with open(os.path.join(model_dir, args.gts_path), 'w') as gts_file:
        json.dump(gts, gts_file)
Example #7
    classifier_pt = torch.load('classifier.pt')
    cla.load_state_dict(classifier_pt)
    cla.eval()

    for method in methods:
        print(method)
        model = MADVAE(args)
        model_pt = torch.load(
            f'../pretrained_model/{method}/params.pt')
        model.load_state_dict(model_pt)
        model.eval()

        if torch.cuda.is_available():
            print("Using CUDA")
            model = model.cuda()
            cla = cla.cuda()

        results = {}

        for norm in norms:
            total, total_inbds, adv, adv_inb = accuracy(cla, model, norms=[norm], suffix=f"_{method}")
            _, adv_old, _ = accuracy_paper(cla, model, norms=[norm], suffix=f"_{method}")
            results[f'{norm}'] = [adv.item(), adv_inb.item(), adv_old.item()]

        total, total_inbds, adv, adv_inb = accuracy(cla, model, suffix=f"_{method}")
        _, adv_old, _ = accuracy_paper(cla, model, suffix=f"_{method}")
        results['all'] = [adv.item(), adv_inb.item(), adv_old.item()]

        with open(f'./results/accuracy_{method}.txt', 'w') as f:
            json.dump(results, f)
Example #8
# Calculate output of image discriminator (PatchGAN)
patch = int(opt.img_size / (2**4))
patch = (1, patch, patch)
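# For a hypothetical opt.img_size of 32, the patch shape is (1, 2, 2): the PatchGAN
# discriminator downsamples by 2**4, and the MSE adversarial targets are usually built
# to match this shape, e.g. valid = torch.ones(batch_size, *patch) and
# fake = torch.zeros(batch_size, *patch).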

generator = Generator(opt.latent_dim, opt.channels, opt.img_size,
                      opt.n_residual_blocks)
discriminator = Discriminator(opt.channels)
classifier = Classifier(opt.channels, opt.img_size, opt.n_classes)

generator = nn.DataParallel(generator)
generator.cuda()
discriminator = nn.DataParallel(discriminator)
discriminator.cuda()
classifier = nn.DataParallel(classifier)
classifier.cuda()

adversarial_loss = torch.nn.MSELoss().cuda()
task_loss = torch.nn.CrossEntropyLoss().cuda()

generator.apply(weights_init_normal)
discriminator.apply(weights_init_normal)
classifier.apply(weights_init_normal)

os.makedirs("data", exist_ok=True)

train_source = get_cifar10(train=True)
train_target = get_stl10(split='train')

optimizer_G = torch.optim.Adam(itertools.chain(generator.parameters(),
                                               classifier.parameters()),
Example #9
def main():
  args = parser.parse_args()

  # model
  model = Classifier(args.channels)
  optimizer = optim.SGD(
    model.parameters(), lr=0.05, momentum=0.9, weight_decay=0.0001, nesterov=True)
  scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, args.epoch)

  if args.gpu is not None:
    model.cuda(args.gpu)

  # dataset
  raw_loader = torch.utils.data.DataLoader(
    Dataset(os.path.join(DATA_DIR, 'raw')),
    args.batch // 2, shuffle=True, drop_last=True)
  noised_loader = torch.utils.data.DataLoader(
    Dataset(os.path.join(DATA_DIR, 'noised_tgt')),
    args.batch // 2, shuffle=True, drop_last=True)

  # train
  for epoch in range(args.epoch):
    loss = 0
    accuracy = 0
    count = 0

    for x0, x1 in zip(noised_loader, raw_loader):
      if args.gpu is not None:
        x0 = x0.cuda(args.gpu)
        x1 = x1.cuda(args.gpu)

      # train
      model.train()

      x = torch.cat((x0, x1), dim=0)  # @UndefinedVariable
      t = torch.zeros((x.shape[0], 2), device=x.device).float()  # @UndefinedVariable

      t[:x0.shape[0], 0] = 1
      t[x0.shape[0]:, 1] = 1

      x, t = mixup(x, t)
      y = model(x)
      e = (-1 * nn.functional.log_softmax(y, dim=1) * t).sum(dim=1).mean()

      optimizer.zero_grad()
      e.backward()
      optimizer.step()

      # validate
      model.eval()

      with torch.no_grad():
        y0 = (model(x0).max(dim=1)[1] == 0).float()
        y1 = (model(x1).max(dim=1)[1] == 1).float()

      a = torch.cat((y0, y1), dim=0).mean()  # @UndefinedVariable

      loss += float(e) * len(x)
      accuracy += float(a) * len(x)
      count += len(x)

    print('[{}] lr={:.7f}, loss={:.4f}, accuracy={:.4f}'.format(
      epoch, float(optimizer.param_groups[0]['lr']), loss / count, accuracy / count),
      flush=True)

    scheduler.step()

    snapshot = {'channels': args.channels, 'model': model.state_dict()}
    torch.save(snapshot, '{}.tmp'.format(args.file))
    os.rename('{}.tmp'.format(args.file), args.file)
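mixup is not defined in this snippet; a minimal sketch of a typical implementation with the same call signature (inputs plus soft targets), drawing the mixing weight from a Beta distribution (an assumption, not the original code):

import torch

def mixup(x, t, alpha=0.2):
    # Convex-combine inputs and soft targets with a random Beta(alpha, alpha) weight.
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(x.shape[0], device=x.device)
    return lam * x + (1 - lam) * x[perm], lam * t + (1 - lam) * t[perm]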
Example #10
source_loader = torch.utils.data.DataLoader(source_dataset_train, batch_size = batch_size, shuffle = True)
target_loader = torch.utils.data.DataLoader(target_dataset_train, batch_size = batch_size, shuffle = True)
s_test_loader = torch.utils.data.DataLoader(source_dataset_test, batch_size = batch_size, shuffle = True)
t_test_loader = torch.utils.data.DataLoader(target_dataset_test, batch_size = batch_size, shuffle = True)
total_steps = total_epochs*len(source_loader)
'''Define the network architecture'''
feature_extractor = Extractor()
class_classifier = Classifier()
class_criterion = nn.NLLLoss()
optimizer = optim.SGD([{'params': feature_extractor.parameters()},
                            {'params': class_classifier.parameters()}], lr=lr, momentum=momentum)

if torch.cuda.is_available():
    feature_extractor = feature_extractor.cuda()
    class_classifier = class_classifier.cuda()
    class_criterion = class_criterion.cuda()

def train(f,c,source,target,optimizer,step):
    result = []
    source_data, source_label = source
    target_data, target_label = target
    # torchvision.utils.save_image(source_data,'mnist.png')
    # torchvision.utils.save_image(target_data, 'mnist_M.png')
    size = min((source_data.shape[0], target_data.shape[0]))
    # print(size)
    source_data, source_label = source_data[0:size, :, :, :], source_label[0:size]
    target_data, target_label = target_data[0:size, :, :, :], target_label[0:size]
    # Adaptation factor schedule (as in DANN): gamma ramps from 0 to 1 with training progress p.
    p = float(step)/total_steps
    gamma = 2 / (1 + np.exp(-10 * p)) - 1
    if torch.cuda.is_available():
Example #11
def main():
    global args
    # Parse commands from ArgumentParser
    args = parser.parse_args()
    # Our text field for imdb data
    TEXT = torchtext.data.Field(lower=True)
    # Our label field for imdb data
    LABEL = torchtext.data.Field(sequential=False)
    # Load GloVE embeddings
    orig_embeddings = torch.load(args.data_folder + 'all_orig_emb.pt')
    total_words = len(orig_embeddings)
    # Load shared words and all GloVE words
    with open(args.data_folder + "shared_words.txt", "r") as file:
        shared_words = file.read().split('\n')
    with open(args.data_folder + "glove_words.txt", "r") as file:
        glove_words = file.read().split('\n')
    # Recreate GloVE_dict
    glove_dict = {}
    for i, word in enumerate(glove_words):
        glove_dict[word] = orig_embeddings[i]

    # Load IMDB dataset with standard splits and restrictions identical to paper
    train, test = torchtext.datasets.IMDB.splits(
        TEXT,
        LABEL,
        filter_pred=lambda ex: ex.label != 'neutral' and len(ex.text) <= 400)

    # Both loops go through the words of the train and test datasets, find words without
    # GloVe vectors, and replace them with <unk>
    for i in range(len(train)):
        review = train.examples[i].text
        for j, word in enumerate(review):
            if word not in glove_dict:
                review[j] = '<unk>'
    for i in range(len(test)):
        review = test.examples[i].text
        for j, word in enumerate(review):
            if word not in glove_dict:
                review[j] = '<unk>'

    # Build modified vocabulary
    TEXT.build_vocab(train)
    LABEL.build_vocab(train)

    # Create iterators over train and test set
    train_iter, test_iter = torchtext.data.BucketIterator.splits(
        (train, test), batch_size=args.batch_size, repeat=False, device=-1)

    # If we want to use baseline GloVE embeddings
    if args.embedding_type == 'baseline':
        # Initialize embedding
        comp_embedding = np.random.uniform(
            -0.25, 0.25, (len(TEXT.vocab), args.embedding_size))
        # For each vocab word, replace embedding vector with GloVE vector
        for word in shared_words:
            comp_embedding[TEXT.vocab.stoi[word]] = glove_dict[word]
        # Initialize Classifier with our GloVe embedding
        base_c = Classifier(torch.FloatTensor(comp_embedding), args.batch_size)
        # Put model into CUDA memory if using GPU
        if use_gpu:
            base_c = base_c.cuda()
        # Initialize Optimizer
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      base_c.parameters()),
                               lr=args.lr)
        # Define Loss function
        loss_func = nn.NLLLoss()

    else:
        '''
        Note: the model in the paper is different because the authors only store the source
        dictionaries, which makes their model smaller than a normal classifier (a major point
        of the paper). In my formulation the model has the same size as a standard one, but the
        two are fundamentally equivalent, except that the authors must preprocess the data
        (convert words into codes) whereas I simply build an embedding layer of vocabulary size,
        as with GloVe vectors. Either way the accuracy should be the same, which is the main
        point of this sentiment classification task: to check whether the code embeddings still
        give the same level of accuracy.
        '''
        # Initialize embedding
        code_embedding = torch.FloatTensor(
            np.random.uniform(-0.25, 0.25,
                              (len(TEXT.vocab), args.embedding_size)))
        # Load best model for code embedding generation
        model = Code_Learner(args.embedding_size, args.M, args.K)
        model = torch.load(args.model_file)
        # Put model into CUDA memory if using GPU
        if use_gpu:
            code_embedding = code_embedding.cuda()
            model = model.cuda()
        # For all words in vocab
        for i in range(len(TEXT.vocab)):
            # Try to see if it has a corresponding glove_vector
            try:
                glove_vec = glove_dict[TEXT.vocab.itos[i]]
                if use_gpu:
                    glove_vec = glove_vec.cuda()
                # If so, then generate our own embedding for the word using our model
                code_embedding[i] = model(glove_vec, training=False)
            # The word doesn't have a GloVE vector, keep it randomly initialized
            except KeyError:
                pass
        base_c = Classifier(torch.FloatTensor(code_embedding.cpu()),
                            args.batch_size)
        # Put model into CUDA memory if using GPU
        if use_gpu:
            base_c = base_c.cuda()
        # Initialize Optimizer
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      base_c.parameters()),
                               lr=args.lr)
        # Define Loss function
        loss_func = nn.NLLLoss()

    classifier_train(args.epochs, base_c, optimizer, loss_func, train_iter,
                     test_iter, args.embedding_type)
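The 'baseline' branch above copies pretrained GloVe vectors into a randomly initialized embedding matrix for every shared vocabulary word. A self-contained sketch of that step with hypothetical toy data:

import numpy as np
import torch

vocab_itos = ['<unk>', 'movie', 'great']                    # hypothetical vocabulary
glove_dict = {'movie': torch.randn(300), 'great': torch.randn(300)}

emb = np.random.uniform(-0.25, 0.25, (len(vocab_itos), 300))
for i, word in enumerate(vocab_itos):
    if word in glove_dict:                                  # OOV words keep the random init
        emb[i] = glove_dict[word].numpy()
emb = torch.FloatTensor(emb)                                # ready for an embedding layer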
Example #12
    PATH = values["classifier"]

    transform = transforms.Compose([transforms.RandomAffine(degrees=15, scale=(0.9, 1.0), fillcolor=256), transforms.Grayscale(),  transforms.Resize(227), transforms.RandomHorizontalFlip(0.5), transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])

    path = values["datasetFolder"]+"/SKETCHES_TRAINING"

    trainset = torchvision.datasets.ImageFolder(root=path, transform=transform)

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=4, shuffle=True, num_workers=4)

    classes = set(values["classes"])

    net = Classifier()
    net.to(device)
    if use_cuda:
        net.cuda()

    # Set up loss function and optimiser

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), lr=0.0001, momentum=0.9)

    epoch = 0

    running_loss = 1.0

    # Simple training for 500 epochs

    for epoch in range(500):
        
        running_loss = 0.0
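        # (The snippet is cut off here.) A typical inner loop for the criterion and optimizer
        # defined above, assuming trainloader yields (inputs, labels) batches; a sketch, not
        # the original code:
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()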
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--word-dim', type=int, default=300, help='size of word embeddings')
    parser.add_argument('--hidden-dim', type=int, default=300, help='number of hidden units per layer')
    parser.add_argument('--num-layers', type=int, default=1, help='number of layers in BiLSTM')
    parser.add_argument('--att-dim', type=int, default=350, help='number of attention unit')
    parser.add_argument('--att-hops', type=int, default=4, help='number of attention hops, for multi-hop attention model')
    parser.add_argument('--clf-hidden-dim', type=int, default=512, help='hidden (fully connected) layer size for classifier MLP')
    parser.add_argument('--clip', type=float, default=0.5, help='clip to prevent the too large grad in LSTM')
    parser.add_argument('--lr', type=float, default=.001, help='initial learning rate')
    parser.add_argument('--weight-decay', type=float, default=1e-5, help='weight decay rate per batch')
    parser.add_argument('--dropout', type=float, default=0.3)
    parser.add_argument('--max-epoch', type=int, default=8)
    parser.add_argument('--seed', type=int, default=666)
    parser.add_argument('--cuda', action='store_true', default=True)
    parser.add_argument('--optimizer', default='adam', choices=['adam', 'sgd'])
    parser.add_argument('--batch-size', type=int, default=32, help='batch size for training')
    parser.add_argument('--penalization-coeff', type=float, default=0.1, help='the penalization coefficient')
    parser.add_argument('--fix-word-embedding', action='store_true')


    parser.add_argument('--model-type', required=True, choices=['sa', 'avgblock', 'hard'])
    parser.add_argument('--data-type', required=True, choices=['age2', 'dbpedia', 'yahoo'])
    parser.add_argument('--data', required=True, help='pickle file obtained by dataset dump')
    parser.add_argument('--save-dir', type=str, required=True, help='path to save the final model')
    parser.add_argument('--block-size', type=int, default=-1, help='block size only when model-type is avgblock')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    random.seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)
    #######################################
    # a simple log file, the same content as stdout
    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)-8s %(message)s')
    logFormatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s')
    rootLogger = logging.getLogger()
    fileHandler = logging.FileHandler(os.path.join(args.save_dir, 'stdout.log'))
    fileHandler.setFormatter(logFormatter)
    rootLogger.addHandler(fileHandler)
    ########################################
    for k, v in vars(args).items():
        logging.info(k+':'+str(v))

    #####################################################################
    if args.data_type == 'age2':
        data = AGE2(datapath=args.data, batch_size=args.batch_size)
        num_classes = 5
    elif args.data_type == 'dbpedia':
        data = DBpedia(datapath=args.data, batch_size=args.batch_size)
        num_classes = 14
    elif args.data_type == 'yahoo':
        data = Yahoo(datapath=args.data, batch_size=args.batch_size)
        num_classes = 10
    else:
        raise Exception('Invalid argument data-type')
    #####################################################################
    if args.model_type == 'avgblock':
        assert args.block_size > 0
    #####################################################################


    tic = time.time()
    model = Classifier(
        dictionary=data,
        dropout=args.dropout,
        num_words=data.num_words,
        num_layers=args.num_layers,
        hidden_dim=args.hidden_dim,
        word_dim=args.word_dim,
        att_dim=args.att_dim,
        att_hops=args.att_hops,
        clf_hidden_dim=args.clf_hidden_dim,
        num_classes=num_classes,
        model_type=args.model_type,
        block_size=args.block_size,
    )
    print('It takes %.2f sec to build the model.' % (time.time() - tic))
    logging.info(model)

    model.word_embedding.weight.data.set_(data.weight)
    if args.fix_word_embedding:
        model.word_embedding.weight.requires_grad = False
    if args.cuda:
        model = model.cuda()
    ''' count parameters
    num_params = sum(np.prod(p.size()) for p in model.parameters())
    num_embedding_params = np.prod(model.word_embedding.weight.size())
    print('# of parameters: %d' % num_params)
    print('# of word embedding parameters: %d' % num_embedding_params)
    print('# of parameters (excluding word embeddings): %d' % (num_params - num_embedding_params))
    '''
    if args.optimizer == 'adam':
        optimizer_class = optim.Adam
    elif args.optimizer == 'sgd':
        optimizer_class = optim.SGD
    else:
        raise Exception('For other optimizers, please add them yourself; supported ones are SGD and Adam.')
    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optimizer_class(params=params, lr=args.lr, weight_decay=args.weight_decay)
    scheduler = lr_scheduler.ReduceLROnPlateau(optimizer=optimizer, mode='max', factor=0.5, patience=10, verbose=True)
    criterion = nn.CrossEntropyLoss()
    # Identity matrix for each batch
    I = Variable(torch.eye(args.att_hops).unsqueeze(0).expand(args.batch_size, -1, -1))
    if args.cuda:
        I = I.cuda()
    trpack = {
            'model': model,
            'params': params, 
            'criterion': criterion, 
            'optimizer': optimizer,
            'I': I,
            }

    train_summary_writer = tensorboard.FileWriter(
        logdir=os.path.join(args.save_dir, 'log', 'train'), flush_secs=10)
    valid_summary_writer = tensorboard.FileWriter(
        logdir=os.path.join(args.save_dir, 'log', 'valid'), flush_secs=10)
    tsw, vsw = train_summary_writer, valid_summary_writer

    logging.info('number of train batches: %d' % data.train_num_batch)
    validate_every = data.train_num_batch // 10
    best_valid_accuracy = 0
    iter_count = 0
    tic = time.time()

    for epoch_num in range(args.max_epoch):
        for batch_iter, train_batch in enumerate(data.train_minibatch_generator()):
            progress = epoch_num + batch_iter / data.train_num_batch 
            iter_count += 1

            train_loss, train_accuracy = train_iter(args, train_batch, **trpack)
            add_scalar_summary(tsw, 'loss', train_loss, iter_count)
            add_scalar_summary(tsw, 'acc', train_accuracy, iter_count)

            if (batch_iter + 1) % (data.train_num_batch // 100) == 0:
                tac = (time.time() - tic) / 60
                print('   %.2f minutes\tprogress: %.2f' % (tac, progress))
            if (batch_iter + 1) % validate_every == 0:
                correct_sum = 0
                for valid_batch in data.dev_minibatch_generator():
                    correct, supplements = eval_iter(args, model, valid_batch)
                    correct_sum += unwrap_scalar_variable(correct)
                valid_accuracy = correct_sum / data.dev_size 
                scheduler.step(valid_accuracy)
                add_scalar_summary(vsw, 'acc', valid_accuracy, iter_count)
                logging.info('Epoch %.2f: valid accuracy = %.4f' % (progress, valid_accuracy))
                if valid_accuracy > best_valid_accuracy:
                    correct_sum = 0
                    for test_batch in data.test_minibatch_generator():
                        correct, supplements = eval_iter(args, model, test_batch)
                        correct_sum += unwrap_scalar_variable(correct)
                    test_accuracy = correct_sum / data.test_size
                    best_valid_accuracy = valid_accuracy
                    model_filename = ('model-%.2f-%.4f-%.4f.pkl' % (progress, valid_accuracy, test_accuracy))
                    model_path = os.path.join(args.save_dir, model_filename)
                    torch.save(model.state_dict(), model_path)
                    print('Saved the new best model to %s' % model_path)
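The identity matrix I above is used for the self-attention penalization term (the --penalization-coeff option): redundant attention hops are discouraged by penalizing the Frobenius norm of A @ A^T - I. A minimal sketch of that term with hypothetical shapes:

import torch

batch_size, att_hops, seq_len = 32, 4, 50             # hypothetical shapes
A = torch.softmax(torch.randn(batch_size, att_hops, seq_len), dim=2)
I = torch.eye(att_hops).unsqueeze(0).expand(batch_size, -1, -1)

penalty = torch.norm(A @ A.transpose(1, 2) - I, dim=(1, 2)) ** 2   # Frobenius norm per example
loss_extra = 0.1 * penalty.mean()                      # 0.1 matches --penalization-coeff above
print(loss_extra.item())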