# Imports these examples rely on; itemDataset, ToTensor, collate_fn,
# collate_fn1, RNN, RNNC, train, and test are project-local helpers that the
# snippets assume but do not show.
import argparse
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms


def get_data(batch_size):
    """Build train/valid DataLoaders over the SemEval CQA-QL training XMLs."""
    train_file = [
        './data/semeval/training_data/SemEval2015-Task3-CQA-QL-dev-reformatted-excluding-2016-questions-cleansed.xml',
        './data/semeval/training_data/SemEval2015-Task3-CQA-QL-test-reformatted-excluding-2016-questions-cleansed.xml',
        './data/semeval/training_data/SemEval2015-Task3-CQA-QL-train-reformatted-excluding-2016-questions-cleansed.xml',
        './data/semeval/training_data/SemEval2016-Task3-CQA-QL-dev.xml',
        './data/semeval/training_data/SemEval2016-Task3-CQA-QL-test.xml',
        './data/semeval/training_data/SemEval2016-Task3-CQA-QL-train-part1.xml',
        './data/semeval/training_data/SemEval2016-Task3-CQA-QL-train-part2.xml'
    ]
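    # Note: the validation file below (the 2016 test set) also appears in
    # train_file above, so the validation split overlaps the training data.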
    test_file = [
        './data/semeval/training_data/SemEval2016-Task3-CQA-QL-test.xml'
    ]

    #train_dataset = itemDataset( file_name=train_file,vocab='./datapiece/vocab_4096.model',transform=transforms.Compose([ToTensor()]))
    train_dataset = itemDataset(file_name=train_file,
                                vocab='./data/vocab',
                                cate='./data/cate',
                                transform=transforms.Compose([ToTensor()]))
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=16,
                                  collate_fn=collate_fn)

    #valid_dataset = itemDataset( file_name=test_file,vocab='./datapiece/vocab_4096.model',transform=transforms.Compose([ToTensor()]))
    valid_dataset = itemDataset(file_name=test_file,
                                vocab='./data/vocab',
                                cate='./data/cate',
                                transform=transforms.Compose([ToTensor()]))
    valid_dataloader = DataLoader(valid_dataset,
                                  batch_size=batch_size,
                                  shuffle=False,
                                  num_workers=16,
                                  collate_fn=collate_fn)

    dataloader = {}
    dataloader['train'] = train_dataloader
    dataloader['valid'] = valid_dataloader

    length = {}
    length['train'] = len(train_dataset)
    length['valid'] = len(valid_dataset)

    return dataloader, length
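
A minimal sketch of how the returned dicts can be consumed, assuming the function above is in scope:

dataloader, length = get_data(batch_size=32)
print('train samples:', length['train'], '| valid samples:', length['valid'])
for batch in dataloader['train']:
    pass  # each batch is whatever collate_fn produces from itemDataset samples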
Example #2
def get_data(train_file, eval_file, batch_size, maxlen, vocab, embedding):
    train_dataset = itemDataset(file_name=train_file, mode='train',
                                vocab=vocab, embedding=embedding, maxlen=maxlen)
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                                  shuffle=True, num_workers=16,
                                  collate_fn=collate_fn)

    eval_dataset = itemDataset(file_name=eval_file, mode='eval',
                               vocab=vocab, embedding=embedding, maxlen=maxlen)
    eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size,
                                 shuffle=False,  # keep evaluation order fixed
                                 num_workers=16, collate_fn=collate_fn)

    return {
        'train': train_dataloader,
        'eval': eval_dataloader,
    }
Example #3
def get_data(test_file, batch_size, vocab, embedding, maxlen):
    test_dataset = itemDataset(file_name=test_file,
                               mode='test',
                               vocab=vocab,
                               embedding=embedding,
                               maxlen=maxlen)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=batch_size,
                                 shuffle=False,
                                 num_workers=16,
                                 collate_fn=collate_fn)

    return test_dataloader
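
Every example hands a collate_fn to DataLoader, but none defines one. A minimal sketch of what such a function could look like, assuming each sample is a (token_ids, label) pair of tensors with varying sequence length:

def collate_fn(batch):
    # hypothetical padding collate: pad to the longest sequence in the batch
    # and keep the true lengths around for the RNN
    seqs, labels = zip(*batch)
    lengths = torch.tensor([len(s) for s in seqs])
    padded = torch.nn.utils.rnn.pad_sequence(list(seqs), batch_first=True)
    return padded, lengths, torch.stack(labels)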
Example #4
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--gpu', default=0, type=int)

    parser.add_argument('--load_from', required=True, type=str)
    parser.add_argument('--model', required=True, type=str)

    args = parser.parse_args()

    checkpoint = torch.load(args.load_from)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("loading data")
    test_data = itemDataset('./data/test.json',
                            mode='test',
                            transform=transforms.Compose([ToTensor()]))
    print('--', args.model, '--')
    # shuffle=False so predictions stay aligned with the test-set order
    if args.model == 'birnn':
        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=12,
                                 collate_fn=collate_fn)
    elif args.model == 'birnn_co':
        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=12,
                                 collate_fn=collate_fn1)

    print("setting model")
    if args.model == 'birnn':
        model = RNN(test_data.token, checkpoint['args'])
    elif args.model == 'birnn_co':
        model = RNNC(test_data.token, checkpoint['args'])
    else:
        raise ValueError('unknown model: {0}'.format(args.model))
    model.load_state_dict(checkpoint['model'])
    model = model.to(device)
    print(model)

    test(args, model, test_loader, device)
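
test() is not shown here. A minimal sketch of an evaluation loop consistent with the call above, assuming batches unpack as (inputs, lengths, labels) and the model returns logits:

def test(args, model, test_loader, device):
    model.eval()
    correct = total = 0
    with torch.no_grad():
        for inputs, lengths, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            pred = model(inputs, lengths).argmax(dim=-1)
            correct += (pred == labels).sum().item()
            total += labels.size(0)
    print('accuracy: {:.4f}'.format(correct / max(total, 1)))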
Example #5
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--batch_size', default=256, type=int)
    parser.add_argument('--dropout', default=0, type=float)
    parser.add_argument('--epoch', default=20, type=int)
    parser.add_argument('--gpu', default=0, type=int)

    parser.add_argument('--word_dim', default=100, type=int)
    parser.add_argument('--hidden_dim', default=128, type=int)
    # type=bool is a trap: bool('False') is True, so parse booleans explicitly
    parser.add_argument('--batch_first', default=True, type=lambda s: s.lower() in ('true', '1'))
    parser.add_argument('--bidirectional', default=True, type=lambda s: s.lower() in ('true', '1'))
    parser.add_argument('--num_layer', default=2, type=int)

    parser.add_argument('--learning_rate', default=0.001, type=float)
    parser.add_argument('--mode', required=True, type=str)
    parser.add_argument('--model', required=True, type=str)

    args = parser.parse_args()

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("loading data")
    train_data = itemDataset('./data/train.json',
                             mode=args.mode,
                             transform=transforms.Compose([ToTensor()]))
    test_data = itemDataset('./data/test.json',
                            mode='test',
                            transform=transforms.Compose([ToTensor()]))

    if args.model == 'birnn':
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=12,
                                  collate_fn=collate_fn)
        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=12,
                                 collate_fn=collate_fn)
    elif args.model == 'birnn_co':
        train_loader = DataLoader(train_data,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  num_workers=12,
                                  collate_fn=collate_fn1)
        test_loader = DataLoader(test_data,
                                 batch_size=args.batch_size,
                                 shuffle=True,
                                 num_workers=12,
                                 collate_fn=collate_fn1)

    print("setting model")
    if args.model == 'birnn':
        model = RNN(train_data.token, args)
    elif args.model == 'birnn_co':
        model = RNNC(train_data.token, args)
    else:
        raise ValueError('unknown model: {0}'.format(args.model))
    args.model += 'neer'  # the save directory below uses the suffixed name
    model = model.to(device)
    print(model)
    os.makedirs('./save_model/{0}'.format(args.model), exist_ok=True)
    #for name,d in model.named_parameters():
    #	print(name,d.requires_grad)

    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=args.learning_rate)

    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=0)

    train(args, model, train_loader, test_loader, criterion, optimizer, device)
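
train() is likewise not shown. A sketch of an epoch loop consistent with the call above and with the checkpoint format Example #4 loads (checkpoint['args'], checkpoint['model']); the batch unpacking and checkpoint path are assumptions:

def train(args, model, train_loader, test_loader, criterion, optimizer, device):
    for epoch in range(args.epoch):
        model.train()
        total_loss = 0.0
        for inputs, lengths, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs, lengths), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print('epoch {0}: train loss {1:.4f}'.format(epoch, total_loss))
        # (a validation pass over test_loader could go here)
        torch.save({'args': args, 'model': model.state_dict()},
                   './save_model/{0}/checkpoint_{1}.pkl'.format(args.model, epoch))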