Example #1
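    # Thin wrapper constructor: restores a pickled model from `path` when given,
    # otherwise builds a fresh Transformer from the language's embedding vectors
    # via make_model, then attaches an Adam optimizer and moves the model to GPU.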
    def __init__(self,
                 path,
                 lang,
                 n_layers=3,
                 d_model=128,
                 head=4,
                 d_ff=512,
                 dropout=0.2,
                 lr=0.001,
                 max_len=30):
        super(VanillaTransformer, self).__init__()
        if path:
            self.model = torch.load(path)
        else:
            self.model = make_model(lang.vectors,
                                    N=n_layers,
                                    d_model=d_model,
                                    d_ff=d_ff,
                                    h=head,
                                    dropout=dropout)
        self.lang = lang
        self.opt = torch.optim.Adam(self.model.parameters(), lr=lr)
        self.max_len = min(30, max_len)
        self.model.cuda()
        self.name = 'VanillaTransformer'
Example #2
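# Restores a checkpoint saved as a dict: rebuilds the architecture from the
# vocabulary sizes, then loads the weights plus the stored batch size and epoch.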
def loadModel(PATH, SRC, TGT):
    state = torch.load(PATH)

    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab))
    model.load_state_dict(state['state_dict'])

    batchSize = state['batchSize']
    epoch = state['epoch']

    return model, batchSize, epoch
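
A checkpoint with this layout can be produced by a matching save helper. Below is a minimal sketch that assumes only the three keys loadModel reads; the name saveModel is hypothetical, not from the original project:

import torch

def saveModel(PATH, model, batchSize, epoch):
    # Persist exactly the keys that loadModel above expects.
    torch.save({
        'state_dict': model.state_dict(),
        'batchSize': batchSize,
        'epoch': epoch
    }, PATH)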
Example #3
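    # Hierarchical document encoder: an LSTM (w2s) builds sentence vectors,
    # positional encoding plus a Transformer stack (s2d) contextualizes them
    # across the document, and attention (add_att) feeds a linear classifier.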
    def __init__(self, config):
        super(TransEncoder, self).__init__(config)

        self.config = config

        self.w2s = SequentialRepr(config,
                                  input_dim=config.embed_dim, mode="lstm")
        self.pe = PositionalEncoding(config.hidden_dim, config.dropout)

        self.s2d = make_model(N=config.num_layers,
                              d_model=config.hidden_dim, dropout=config.dropout)

        self.layer_norm = LayerNorm(config.hidden_dim)

        self.add_att = AttNet(config, config.hidden_dim)
        self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)
Example #4
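    # Variant of the encoder in Example #3: separate sentence-level (satt) and
    # document-level (datt) attention layers, plus dropout before the classifier.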
    def __init__(self, config):
        super(TransEncoder, self).__init__(config)

        self.sent_repr_dim = config.hidden_dim
        self.w2s = SequentialRepr(config,
                                  input_dim=config.embed_dim, mode="lstm")
        # self.w2s_tl = SequentialRepr(config,\
        # input_dim = config.embed_dim, mode = "lstm")

        # self.s2d = SequentialRepr(config,
        #  input_dim = config.hidden_dim, mode = "lstm")

        self.pe = PositionalEncoding(self.sent_repr_dim, config.dropout)
        self.s2d = make_model(N=config.num_layers,
                              d_model=self.sent_repr_dim, dropout=config.dropout)

        self.satt_layer = AttNet(config, config.hidden_dim)
        self.datt_layer = AttNet(config, config.hidden_dim * 2)

        self.dropout = nn.Dropout(p=config.dropout)
        self.fc = nn.Linear(config.hidden_dim * 2, config.num_classes)
Example #5
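    # Loads train/dev CSVs into iterators, builds the model on Config.device,
    # and applies Xavier-normal init to every non-embedding weight matrix.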
    PATH = 'data_balance.csv'
    start_time = time.time()
    print("Loading data...")

    train_data = build_dataset('./train.csv')
    dev_data = build_dataset('./test.csv')

    #print(train_data)
    train_iter = build_iterator(train_data)
    dev_iter = build_iterator(dev_data)
    test_iter = dev_iter
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

    # train
    model = make_model().to(Config.device)
    #model=ff().to('cuda')
    print('init')
    '''
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    '''
    for name, w in model.named_parameters():
        if 'embedding' not in name:
            if len(w.size()) < 2:
                continue
            if 'weight' in name:
                nn.init.xavier_normal_(w)
Example #6
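# Hyperparameters forwarded to make_model below; the lambda_attention /
# lambda_distance / distance_matrix_kernel names suggest a Molecule Attention
# Transformer-style make_model rather than the plain NMT one.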
model_params = {
    'd_model': args.dmodel,
    'N': args.nstacklayers,
    'h': args.heads,
    'N_dense': args.Ndense,
    'lambda_attention': args.lattn,
    'lambda_distance': args.ldist,
    'leaky_relu_slope': 0.1,
    'dense_output_nonlinearity': 'relu',
    'distance_matrix_kernel': 'exp',
    'dropout': args.dropout,
    'aggregation_type': 'mean'
}

print('Making Model')

model = make_model(**model_params)
if args.pretrain:
    print(f'Loading pretrained weights from: {args.pretrain}')
    pretrained_state_dict = torch.load(args.pretrain)
    model_state_dict = model.state_dict()
    for name, param in pretrained_state_dict.items():
        if 'generator' in name:
            continue
        if isinstance(param, torch.nn.Parameter):
            param = param.data
        model_state_dict[name].copy_(param)
param_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('Number of parameters:', param_count)

if args.wandb:
    wandb.watch(model, 'all')
Example #7
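# Tail of run_epoch: resets the interval timer and token counter, then returns
# the epoch's average loss per token.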
            start = time.time()
            tokens = 0

    return total_loss / total_tokens


if __name__ == "__main__":
    dataset = NMTDataset.load_dataset_and_make_vectorizer(
        # "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_100000.csv"
        "/home/liuxd/home/NLP/PyTorchNLPBook/code4model/data/translation2019zh_train-df_70w.csv"
    )
    src_vocab_size = len(dataset.get_vectorizer().source_vocab)
    tgt_vocab_size = len(dataset.get_vectorizer().target_vocab)
    padding_idx = dataset.get_vectorizer().target_vocab.lookup_token('<MASK>')
    criterion = LabelSmoothing(size=tgt_vocab_size,
                               padding_idx=padding_idx,
                               smoothing=0.1)
    criterion.cuda()
    model = make_model(src_vocab_size, tgt_vocab_size, 6)
    model.cuda()
    model_opt = NoamOpt(
        model.src_embed[0].d_model, 1, 8000,
        torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98),
                         eps=1e-9))
    loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)

    # train
    model.train()
    for epoch in range(10):
        data_iter = generate_nmt_batches(dataset, 16, device="cuda")
        run_epoch(data_iter, model, loss_compute)
Example #8
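    # Evaluation script: rebuilds the gloss-to-text Transformer with the
    # CLI-specified shape, loads a CPU checkpoint, and scores it on the test split.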
    test_dataset = DL.SNLT_Dataset(split='test', gloss=True)
    test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    src_vocab = len(train_dataset.gloss_dictionary.idx2word)
    trg_vocab = len(train_dataset.dictionary.idx2word)

    device = 'cpu'
    model_cp = args.model
    N_blocks = args.n_blocks
    d_model = args.d_model
    d_ff = args.d_ff
    att_heads = args.att_heads

    model = tf.make_model(src_vocab,
                          trg_vocab,
                          N=N_blocks,
                          d_model=d_model,
                          d_ff=d_ff,
                          h=att_heads)
    model.load_state_dict(
        torch.load(model_cp, map_location=torch.device(device)))

    score_model(model,
                test_loader,
                device,
                train_dataset.dictionary,
                verbose=True)

    file_path = './models/G2T/NLL/bs128_NLL/generated_corpus.txt'
    #write_corpus(pred_corpus, file_path)
Example #9
        #return loss.data[0] * norm  # TODO
        return loss.item() * norm


# Train the simple copy task.
device = "cuda"
nrof_epochs = 20
batch_size = 32
V = 11  # vocabulary size
sequence_len = 15  # length of the generated sequences
#nrof_batch_train_epoch = 20    # batches per epoch during training
#nrof_batch_valid_epoch = 5     # batches per epoch during validation
nrof_batch_train_epoch = 30  # batches per epoch during training
nrof_batch_valid_epoch = 10  # batches per epoch during validation
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2)
optimizer = torch.optim.Adam(model.parameters(),
                             lr=0,
                             betas=(0.9, 0.98),
                             eps=1e-9)
model_opt = NoamOpt(model.src_embed[0].d_model, 1, 400, optimizer)
if device == "cuda":
    model.cuda()

for epoch in range(nrof_epochs):
    print(f"\nepoch {epoch}")
    print("train...")
    model.train()
    data_iter = data_gen(V, sequence_len, batch_size, nrof_batch_train_epoch,
                         device)
    loss_compute = SimpleLossCompute(model.generator, criterion, model_opt)
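    # The excerpt stops before the epoch is actually run. A sketch of the usual
    # continuation in the Annotated Transformer recipe (assumes run_epoch and
    # data_gen behave as in Example #7; not the original's exact lines):
    run_epoch(data_iter, model, loss_compute)

    print("valid...")
    model.eval()
    valid_iter = data_gen(V, sequence_len, batch_size, nrof_batch_valid_epoch,
                          device)
    print(run_epoch(valid_iter, model,
                    SimpleLossCompute(model.generator, criterion, None)))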
Example #10
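    # Tail of the loadModel helper from Example #2, followed by model, optimizer,
    # and criterion setup.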
    batchSize = state['batchSize']

    epoch = state['epoch']

    return model, batchSize, epoch


"""**Initialize model, optimizer, criterion and iterators**"""

if (loadPreTrain or justEvaluate):
    print("Loading pre-trained network")
    model, BATCH_SIZE, previousEpochNb = loadModel(modelSavePath, SRC, TGT)
else:
    print("initializing network")
    model = transformer.make_model(len(SRC.vocab), len(TGT.vocab), N=6)

model_opt = transformer.NoamOpt(
    model.src_embed[0].d_model, 1, 2000,
    torch.optim.Adam(filter(lambda x: x.requires_grad, model.parameters()),
                     lr=0.001,
                     betas=(0.9, 0.98),
                     eps=1e-8))
model.cuda()
#criterion = transformer.LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)
criterion = nn.CrossEntropyLoss()
#criterion.cuda()

print("Initializing iterators")
#train_iter = MyIterator(train, batch_size=BATCH_SIZE, device = device,
#                        repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
Example #11
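    # Chooses end-to-end vs gloss-to-text training, then builds loaders, a
    # label-smoothing criterion, the model, and a Noam-wrapped Adam optimizer.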
    print('Training end to end model')

else:
    import transformer as tf
    print('Training gloss to text model')
    src_vocab = len(train_dataset.gloss_dictionary.idx2word)

trg_vocab = len(train_dataset.dictionary.idx2word)


train_loader = DataLoader(train_dataset, batch_size=args.b_size, shuffle=True, num_workers=args.workers)
dev_loader = DataLoader(dev_dataset, batch_size=args.b_size, shuffle=True, num_workers=args.workers)
test_loader = DataLoader(test_dataset, batch_size=1)

criterion = tf.LabelSmoothing(size=trg_vocab, padding_idx=0, smoothing=0.0)
model = tf.make_model(src_vocab, trg_vocab, N=args.n_blocks, d_model=args.d_model, d_ff=args.d_ff, h=args.att_heads)

if args.checkpoint is not None:
    model.load_state_dict(torch.load(args.checkpoint))
    print('Loaded state_dict to the model before starting train')

model.to(device)
model_opt = tf.NoamOpt(args.d_model, 1, 2000,
                       torch.optim.Adam(model.parameters(), lr=args.learning_rate, betas=(0.9, 0.98), eps=1e-9))
if __name__ == '__main__':
    mp.set_start_method('spawn')

    train_losses = []
    dev_losses = []
    best_loss = None
Example #12
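# Inference entry point: loads SentencePiece models and saved vocabularies,
# rebuilds the Transformer from a checkpoint, then greedy-decodes validation
# batches, printing source, translation, and target side by side.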
def main():
	args = parser.parse_args()

	# load dataset
	#sent_pairs = load_dataset_aihub()
	sent_pairs = load_dataset_aihub(path='data/')
	#random.seed(100)
	#random.shuffle(sent_pairs)

	# make dataloader with dataset
	# FIXME: RuntimeError: Internal: unk is not defined.
	inp_lang, out_lang = get_sentencepiece(src_prefix, trg_prefix)
	log.info('loaded input sentencepiece model: {}'.format(src_prefix))
	log.info('loaded output sentencepiece model: {}'.format(trg_prefix))

	# split train/valid sentence pairs
	n_train = int(len(sent_pairs) * 0.8)
	valid_sent_pairs = sent_pairs[n_train:]
	log.info('valid_sent_pairs: {}'.format(len(valid_sent_pairs)))

	# these are used for defining tokenize method and some reserved words
	SRC = KRENField(pad_token='<pad>')
	TRG = KRENField(pad_token='<pad>')

	# load SRC/TRG
	if not os.path.exists('spm/{}.model'.format(src_prefix)) or \
		not os.path.exists('spm/{}.model'.format(trg_prefix)):
		# build vocabulary
		SRC.build_vocab(train.src)
		TRG.build_vocab(train.trg)
		torch.save(SRC.vocab, 'spm/{}.spm'.format(src_prefix), pickle_module=dill)
		torch.save(TRG.vocab, 'spm/{}.spm'.format(trg_prefix), pickle_module=dill)
		log.info('input vocab was created and saved: spm/{}.spm'.format(src_prefix))
		log.info('output vocab was created and saved: spm/{}.spm'.format(trg_prefix))
	else:
		src_vocab = torch.load('spm/{}.spm'.format(src_prefix), pickle_module=dill)
		trg_vocab = torch.load('spm/{}.spm'.format(trg_prefix), pickle_module=dill)
		SRC.vocab = src_vocab
		TRG.vocab = trg_vocab
		log.info('input vocab was loaded: spm/{}.spm'.format(src_prefix))
		log.info('output vocab was loaded: spm/{}.spm'.format(trg_prefix))

	SRC.tokenize = inp_lang.EncodeAsIds
	TRG.tokenize = out_lang.EncodeAsIds
	SRC.detokenize = inp_lang.DecodeIds
	TRG.detokenize = out_lang.DecodeIds

	# make dataloader from KRENDataset
	#train, valid, test = KRENDataset.splits(sent_pairs, (SRC, TRG), inp_lang, out_lang, encoding_type='ids')
	train, valid, test = KRENDataset.splits(sent_pairs, (SRC, TRG), inp_lang, out_lang, encoding_type='pieces')
	valid_iter = MyIterator(valid, batch_size=100, device=0,
							repeat=False, sort_key=lambda x: (len(x.src), len(x.trg)),
							batch_size_fn=batch_size_fn, train=False)
	#encoding_decoding_test(SRC, sent_pairs[0][0])
	#encoding_decoding_test(TRG, sent_pairs[0][1])


	# fix torch randomness
	fix_torch_randomness()

	# define input/output size
	args.inp_n_words = src_vocab_size
	args.out_n_words = trg_vocab_size
	log.info('inp_n_words: {} out_n_words: {}'.format(args.inp_n_words, args.out_n_words))

	# define model
	if args.small_model:
		model = make_model(
			args.inp_n_words, 
			args.out_n_words,
			dropout=args.dropout)
	else:
		model = make_model(
			args.inp_n_words, 
			args.out_n_words,
			N=N,
			d_model=args.d_model,
			d_ff=args.d_ff,
			h=args.h,
			dropout=args.dropout)

	#model_name_full_path = './models/model-tmp.bin'
	model_name_full_path = args.modelnm
	checkpoint = torch.load(model_name_full_path)
	state_dict = checkpoint['state_dict']
	model.load_state_dict(state_dict)
	model.cuda()
	
	model.eval()
	for i, batch in enumerate(valid_iter):
		src = batch.src.transpose(0, 1)[:1]
		src_mask = (src != SRC.vocab.stoi["<pad>"]).unsqueeze(-2)
		print(SRC.detokenize(src.numpy()[0].tolist()))
		print("Input::", end="\t")
		for i in range(src.size(1)):
			sym = SRC.vocab.itos[src[0, i].data.item()]
			if sym == "</s>": break
			print(sym, end=" ")
		print('')
		
		out = greedy_decode(model, src.cuda(), src_mask.cuda(), 
							max_len=60, start_symbol=TRG.vocab.stoi["<s>"])
		print("Translation with TRG:", end="\t")
		for i in range(1, out.size(1)):
			sym = TRG.vocab.itos[out[0, i].data.item()]
			if sym == "</s>": break
			print(sym, end=" ")
		print('')

		print("Translation with tokenize:", end="\t")
		out_list = []
		for i in range(1, out.size(1)):
			sym = out[0, i].data.item()
			if sym == TRG.vocab.stoi['</s>']:
				break
			out_list.append(sym)
		print(TRG.detokenize(out_list))

		print("Target:", end="\t")
		for i in range(1, batch.trg.size(0)):
			sym = TRG.vocab.itos[batch.trg.data[i, 0].item()]
			if sym == "</s>": break
			print(sym, end=" ")
		print('')
		print('---------------')
Example #13
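    # highwayNet: a trajectory-prediction network whose constructor wires up one
    # of several decoders (Transformer via tsf.make_model, seq2seq LSTM,
    # attention LSTM, or the legacy maneuver-based LSTM) from config flags.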
    def __init__(self, params, newFeats=0, behavFeats=0):
        super(highwayNet, self).__init__()

        self.newFeats = newFeats
        self.behavFeats = behavFeats
        self.att_weights = None
        self.use_spatial_attention = True

        ## Unpack arguments
        self.params = params

        ## Use gpu flag
        self.use_cuda = params.use_cuda

        # Flag for maneuver based (True) vs uni-modal decoder (False)
        self.use_maneuvers = params.use_maneuvers
        if params.use_grid == 2:
            self.use_grid_soc = True
            self.use_grid = False
        else:
            self.use_grid = params.use_grid
            self.use_grid_soc = False

        # Transformer architecture related
        self.use_transformer = params.use_transformer
        self.teacher_forcing_ratio = 0.0  # Set to 0: we overfit otherwise

        # RNN-LSTM Seq2seq architecture related
        self.use_bidir = params.use_bidir
        # NB: seq2seq uses a bidir encoder (always)
        self.use_seq2seq = params.use_seq2seq
        if self.use_seq2seq:
            self.use_bidir = True

        # RNN-LSTM with Attention architecture related
        self.use_attention = params.use_attention
        if self.use_attention:
            self.use_bidir = True

        # Flag for train mode (True) vs test-mode (False)
        self.train_flag = params.train_flag
        if self.train_flag is False:
            self.teacher_forcing_ratio = 0.0

        ## Sizes of network layers
        self.encoder_size = params.encoder_size
        self.decoder_size = params.decoder_size
        self.in_length = params.in_length
        self.out_length = params.out_length
        self.grid_size = params.grid_size
        self.soc_conv_depth = params.soc_conv_depth
        self.conv_3x1_depth = params.conv_3x1_depth
        self.dyn_embedding_size = params.dyn_embedding_size
        self.input_embedding_size = params.input_embedding_size
        self.num_lat_classes = params.num_lat_classes
        self.num_lon_classes = params.num_lon_classes
        self.soc_embedding_size = ((
            (params.grid_size[0] - 4) + 1) // 2) * self.conv_3x1_depth

        ## Define network weights
        # TRANSFORMER
        if self.use_transformer:
            src_feats = 2 + self.newFeats  # (X,Y) point or (X,Y,A/V)
            tgt_feats = 2  # (X,Y) point
            tgt_params = 5  # 5 params for bivariate Gaussian distrib

            if self.use_grid or self.use_grid_soc:
                src_ngrid = self.in_length  # with soc
            else:
                src_ngrid = 0  # without soc

            if self.use_maneuvers:
                d_lon = self.num_lon_classes
                d_lat = self.num_lat_classes
            else:
                d_lon = 0
                d_lat = 0

            if self.use_grid_soc:
                self.transformer = tsf.make_model(
                    src_feats,
                    tgt_feats,
                    tgt_params=tgt_params,
                    src_ngrid=src_ngrid,
                    src_lon=d_lon,
                    src_lat=d_lat,
                    src_soc_emb_size=self.soc_embedding_size,
                    src_grid=self.params.grid_size)
            else:
                self.transformer = tsf.make_model(
                    src_feats,
                    tgt_feats,
                    tgt_params=tgt_params,
                    src_ngrid=src_ngrid,
                    src_lon=d_lon,
                    src_lat=d_lat,
                    src_grid=self.params.grid_size)
            print("TRANSFORMER:", self.transformer)
            self.batch = tsf.Batch()

        # Input embedding layer
        self.n_feats = 2 + self.newFeats
        self.ip_emb = torch.nn.Linear(self.n_feats, self.input_embedding_size)

        # Spatial Attention Path pipeline: a specific LSTM encoder + 2xConv/pool to process behavioral features
        if self.newFeats > 0 and self.use_spatial_attention:
            self.use_bidir = False  # just to keep the code simple
            # Same pipeline as the social pooling one, but with its own weights
            self.ip_behav_emb = torch.nn.Linear(self.n_feats,
                                                self.input_embedding_size)
            self.enc_behav_lstm = torch.nn.LSTM(self.input_embedding_size,
                                                self.encoder_size, 1)
            # Spatial Attention layer
            self.op_att1 = torch.nn.Linear(self.encoder_size, 10)
            self.op_att2 = torch.nn.Linear(10, 1)

        # Encoder LSTM
        if self.use_bidir:
            self.enc_lstm = torch.nn.LSTM(self.input_embedding_size,
                                          self.encoder_size,
                                          1,
                                          bidirectional=True)
            self.encoder_ndir = 2
        else:
            self.enc_lstm = torch.nn.LSTM(self.input_embedding_size,
                                          self.encoder_size, 1)
            self.encoder_ndir = 1

        # Vehicle dynamics embedding
        self.dyn_emb = torch.nn.Linear(self.encoder_size,
                                       self.dyn_embedding_size)

        # Convolutional social pooling layer and social embedding layer
        self.soc_conv = torch.nn.Conv2d(self.encoder_size, self.soc_conv_depth,
                                        3)
        self.soc_conv_3x1 = torch.nn.Conv2d(self.soc_conv_depth,
                                            self.conv_3x1_depth, (3, 1))
        self.soc_maxpool = torch.nn.MaxPool2d((2, 1), padding=(1, 0))

        # FC social pooling layer (for comparison):
        # self.soc_fc = torch.nn.Linear(self.soc_conv_depth * self.grid_size[0] * self.grid_size[1], (((params.grid_size[0]-4)+1)//2)*self.conv_3x1_depth)

        if self.use_seq2seq or self.use_attention:  # Decoder seq2seq LSTM (Attention builds on top of seq2seq)
            if self.use_maneuvers:
                self.proj_seq2seq = torch.nn.Linear(
                    self.soc_embedding_size +
                    self.encoder_ndir * self.dyn_embedding_size +
                    self.num_lat_classes + self.num_lon_classes,
                    self.decoder_size)
            else:
                self.proj_seq2seq = torch.nn.Linear(
                    self.soc_embedding_size +
                    self.encoder_ndir * self.dyn_embedding_size,
                    self.decoder_size)

            if self.use_seq2seq:
                self.num_layers = 2  # XXX
            else:
                self.num_layers = 1  # XXX
            self.dec_seq2seq = torch.nn.LSTM(self.decoder_size,
                                             self.decoder_size,
                                             num_layers=self.num_layers)
        elif self.use_transformer is False:  # Legacy Decoder LSTM
            if self.use_maneuvers:
                self.dec_lstm = torch.nn.LSTM(
                    self.soc_embedding_size + self.dyn_embedding_size +
                    self.num_lat_classes + self.num_lon_classes,
                    self.decoder_size)
            else:
                self.dec_lstm = torch.nn.LSTM(
                    self.soc_embedding_size + self.dyn_embedding_size,
                    self.decoder_size)

        if self.use_attention:
            self.attn_densor1 = torch.nn.Linear(
                self.encoder_ndir * self.encoder_size + self.decoder_size, 10)
            self.attn_densor2 = torch.nn.Linear(10, 1)

        # Output layers:
        if self.use_transformer is False:
            self.op = torch.nn.Linear(self.decoder_size, 5)

        self.op_lat = torch.nn.Linear(
            self.soc_embedding_size + self.dyn_embedding_size,
            self.num_lat_classes)
        self.op_lon = torch.nn.Linear(
            self.soc_embedding_size + self.dyn_embedding_size,
            self.num_lon_classes)

        # Activations:
        self.leaky_relu = torch.nn.LeakyReLU(0.1)
        self.relu = torch.nn.ReLU()
        self.softmax = torch.nn.Softmax(dim=1)
Example #14
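    # English-Chinese NMT training setup: train/dev/test DataLoaders sharing one
    # collate_fn, a Transformer sized from the CLI arguments, and a TensorBoard
    # writer handed to train().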
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=args.batch_size,
                                  shuffle=True,
                                  collate_fn=collate_fn)
    dev_dataset = NMTDataset(os.path.join(args.data, 'dev.en'),
                             os.path.join(args.data, 'dev.zh'), src_sp, trg_sp)
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=args.test_batch_size,
                                shuffle=False,
                                collate_fn=collate_fn)
    test_dataset = NMTDataset(os.path.join(args.data, 'test.en'),
                              os.path.join(args.data, 'test.zh'), src_sp,
                              trg_sp)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=args.test_batch_size,
                                 shuffle=False,
                                 collate_fn=collate_fn)

    model = make_model(src_vocab=train_dataset.src_vocabs_size,
                       tgt_vocab=train_dataset.trg_vocabs_size,
                       N=args.layers,
                       d_model=args.d_model,
                       d_ff=args.d_ff,
                       h=args.heads,
                       dropout=args.dropout)
    model = model.cuda()
    print('total #parameters: {}'.format(
        sum(p.numel() for p in model.parameters())))
    writer = SummaryWriter(args.output_dir)

    train(train_dataloader, dev_dataloader, model, args, writer, trg_sp)
Example #15
File: train.py Project: jinkilee/LaH
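# Distributed training entry point: NCCL process group, per-GPU shard of the
# training pairs, apex AMP + DDP wrapping, and checkpointing from the rank-0 GPU.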
def main():
    args = parser.parse_args()

    if args.multi_gpu:
        ngpus_per_node = torch.cuda.device_count()
    else:
        ngpus_per_node = 1

    args.world_size = ngpus_per_node

    global best_acc1
    args.gpu = args.local_rank
    torch.cuda.set_device(args.gpu)

    dist.init_process_group(backend='nccl',
                            init_method='env://',
                            world_size=args.world_size,
                            rank=args.gpu)

    # load dataset
    #sent_pairs = load_dataset_aihub(path='data/')
    sent_pairs = load_dataset_aihub()
    print_log('GPU#{} seeding with {}'.format(args.gpu, args.gpu))

    # make dataloader with dataset
    # FIXME: RuntimeError: Internal: unk is not defined.
    inp_lang, out_lang = get_sentencepiece(src_prefix, trg_prefix)
    print_log('loaded input sentencepiece model: {}'.format(src_prefix))
    print_log('loaded output sentencepiece model: {}'.format(trg_prefix))

    # split train/valid sentence pairs
    n_train = int(len(sent_pairs) * 0.8)
    n_split = int(n_train * 1. / args.world_size)
    print_log(n_split * args.gpu, n_split * (args.gpu + 1))

    train_sent_pairs = sent_pairs[:n_train]
    print_log('train_sent_pairs before split: {}'.format(
        len(train_sent_pairs)))

    # split train dataset by GPU
    train_sent_pairs = train_sent_pairs[n_split * args.gpu:n_split *
                                        (args.gpu + 1)]
    train_sent_pairs = sorted(train_sent_pairs,
                              key=lambda x: (len(x[0]), len(x[1])))
    print_log('train_sent_pairs after split: {} --> GPU:{}'.format(
        len(train_sent_pairs), args.gpu))

    valid_sent_pairs = sent_pairs[n_train:]
    print_log('valid_sent_pairs: {}'.format(len(valid_sent_pairs)))

    # these are used for defining tokenize method and some reserved words
    SRC = KRENField(pad_token='<pad>')
    TRG = KRENField(pad_token='<pad>')

    SRC.decode = inp_lang.DecodeIds
    TRG.decode = out_lang.DecodeIds
    SRC.encode = inp_lang.EncodeAsIds
    TRG.encode = out_lang.EncodeAsIds

    # load SRC/TRG
    if not os.path.exists('spm/{}.model'.format(src_prefix)) or \
     not os.path.exists('spm/{}.model'.format(trg_prefix)):
        # build vocabulary
        SRC.build_vocab(train.src)
        TRG.build_vocab(train.trg)
        torch.save(SRC.vocab,
                   'spm/{}.spm'.format(src_prefix),
                   pickle_module=dill)
        torch.save(TRG.vocab,
                   'spm/{}.spm'.format(trg_prefix),
                   pickle_module=dill)
        print_log(
            'input vocab was created and saved: spm/{}.spm'.format(src_prefix))
        print_log('output vocab was created and saved: spm/{}.spm'.format(
            trg_prefix))
    else:
        src_vocab = torch.load('spm/{}.spm'.format(src_prefix),
                               pickle_module=dill)
        trg_vocab = torch.load('spm/{}.spm'.format(trg_prefix),
                               pickle_module=dill)
        SRC.vocab = src_vocab
        TRG.vocab = trg_vocab
        print_log('input vocab was loaded: spm/{}.spm'.format(src_prefix))
        print_log('output vocab was loaded: spm/{}.spm'.format(trg_prefix))

    # make dataloader from KRENDataset
    train, valid, test = KRENDataset.splits(sent_pairs, (SRC, TRG),
                                            inp_lang,
                                            out_lang,
                                            encoding_type='pieces')

    # output -> ['<s>', '▁', 'Central', '▁Asian', '▁c', 'u', 'is', ... '▁yesterday', '.', '</s>']
    train_iter = MyIterator(train,
                            batch_size=args.train_batch_size,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=True)
    valid_iter = MyIterator(valid,
                            batch_size=args.valid_batch_size,
                            device=0,
                            repeat=False,
                            sort_key=lambda x: (len(x.src), len(x.trg)),
                            batch_size_fn=batch_size_fn,
                            train=False)
    # fix torch randomness
    fix_torch_randomness()

    # define input/output size
    args.inp_n_words = src_vocab_size
    args.out_n_words = trg_vocab_size
    print_log('inp_n_words: {} out_n_words: {}'.format(args.inp_n_words,
                                                       args.out_n_words))

    # define model
    model = make_model(args.inp_n_words,
                       args.out_n_words,
                       N=N,
                       d_model=args.d_model,
                       d_ff=args.d_ff,
                       h=args.h,
                       dropout=args.dropout)
    print_log('number of model parameters: {}'.format(
        get_number_of_params(model)))
    model.cuda()
    optimizer = get_std_opt(model, args.fp16)

    # initialize model and optimizer for amp
    model, optimizer = amp.initialize(
        model,
        optimizer,
        opt_level=args.opt_level,
        #keep_batchnorm_fp32=args.keep_batchnorm_fp32,
        #loss_scale=args.loss_scale
    )
    #optimizer.optimizer = opt

    if args.fp16:
        model = DDP(model, delay_allreduce=True)
    else:
        model = DDP(model, device_ids=[args.gpu])

    # define model
    criterion = LabelSmoothing(size=args.out_n_words,
                               padding_idx=0,
                               smoothing=0.1)
    criterion.cuda()

    # initial best loss
    best_val_loss = np.inf

    # initialize visdom graph
    #vis_train = Visdom()
    #vis_valid = Visdom()

    #train_loss_list = []
    #valid_loss_list = []

    if args.gpu == 0:
        randidx = '{}'.format(np.random.randint(0, 10000)).zfill(4)
        model_name = 'transformer-s{}-t{}-b{}-n{}-md{}-ff{}-h{}-r{}.bin'.format(
            args.inp_n_words,  # s  : source vocab count
            args.out_n_words,  # t  : target vocab count
            args.train_batch_size,  # b  : batch size
            args.N,  # n  : number of layers
            args.d_model,  # md : d_model
            args.d_ff,  # ff : d_ff
            args.h,  # h  : hidden size
            randidx)  # r  : random number
    else:
        model_name = 'a.bin'
    print_log('model name to be saved: {}'.format(
        os.path.join(args.model_path, model_name)))

    for epoch in range(args.epochs):
        train_losses = train_epoch((rebatch(pad_id, b) for b in train_iter),
                                   model, criterion, optimizer, args.gpu,
                                   epoch, args.fp16)
        valid_loss = valid_epoch((rebatch(pad_id, b) for b in valid_iter),
                                 model, criterion, optimizer, epoch, args.fp16)

        sum_of_weight = sum(
            [p[1].data.sum() for p in model.named_parameters()])
        print_log('GPU{} -> sum_of_weight={:.4f}'.format(
            args.gpu, sum_of_weight))

        if args.gpu == 0:
            if valid_loss >= best_val_loss:
                print_log('Try again. Current best is still {:.4f} (<= {:.4f})'.
                          format(best_val_loss, valid_loss))
            else:
                print_log('New record: from {:.4f} to {:.4f}'.format(
                    best_val_loss, valid_loss))
                best_val_loss = valid_loss
                save_model(args,
                           model,
                           optimizer,
                           epoch,
                           valid_loss,
                           model_name=model_name)

        # blocking processes
        torch.distributed.barrier()
Example #16
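# Full train/test driver: label-smoothed training with a Noam schedule and
# periodic checkpointing; in test mode, greedy decoding followed by BLEU scoring.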
def main(args):
    src, tgt = load_data(args.path)

    src_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    src_vocab.load(os.path.join(args.path, 'vocab.en'))
    tgt_vocab = Vocab(init_token='<sos>',
                      eos_token='<eos>',
                      pad_token='<pad>',
                      unk_token='<unk>')
    tgt_vocab.load(os.path.join(args.path, 'vocab.de'))

    sos_idx = 0
    eos_idx = 1
    pad_idx = 2
    max_length = 50

    src_vocab_size = len(src_vocab)
    tgt_vocab_size = len(tgt_vocab)

    # Set hyper parameter
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = make_model(src_vocab_size, tgt_vocab_size).to(device)
    optimizer = get_std_opt(model)
    criterion = LabelSmoothing(size=tgt_vocab_size,
                               padding_idx=pad_idx,
                               smoothing=0.1)
    train_criterion = SimpleLossCompute(model.generator, criterion, optimizer)
    valid_criterion = SimpleLossCompute(model.generator, criterion, None)
    print('Using device:', device)

    if not args.test:
        train_loader = get_loader(src['train'],
                                  tgt['train'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size,
                                  shuffle=True)
        valid_loader = get_loader(src['valid'],
                                  tgt['valid'],
                                  src_vocab,
                                  tgt_vocab,
                                  batch_size=args.batch_size)

        best_loss = float('inf')
        for epoch in range(args.epochs):
            train_total_loss, valid_total_loss = 0.0, 0.0
            start = time.time()
            total_tokens = 0
            tokens = 0

            model.train()
            # Train
            for src_batch, tgt_batch in train_loader:
                src_batch = torch.tensor(src_batch).to(device)
                tgt_batch = torch.tensor(tgt_batch).to(device)
                batch = Batch(src_batch, tgt_batch, pad_idx)

                prediction = model(batch.src, batch.trg, batch.src_mask,
                                   batch.trg_mask)
                loss = train_criterion(prediction, batch.trg_y, batch.ntokens)

                train_total_loss += loss
                total_tokens += batch.ntokens
                tokens += batch.ntokens

            # Valid
            model.eval()
            for src_batch, tgt_batch in valid_loader:
                src_batch = torch.tensor(src_batch).to(device)
                tgt_batch = torch.tensor(tgt_batch).to(device)
                batch = Batch(src_batch, tgt_batch, pad_idx)

                prediction = model(batch.src, batch.trg, batch.src_mask,
                                   batch.trg_mask)
                loss = valid_criterion(prediction, batch.trg_y, batch.ntokens)
                valid_total_loss += loss
                total_tokens += batch.ntokens
                tokens += batch.ntokens

            if valid_total_loss.item() < best_loss:
                best_loss = valid_total_loss.item()
                best_model_state = model.state_dict()
                best_optimizer_state = optimizer.optimizer.state_dict()

            elapsed = time.time() - start
            print(
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + "|| [" +
                str(epoch) + "/" + str(args.epochs) + "], train_loss = " +
                str(train_total_loss.item()) + ", valid_loss = " +
                str(valid_total_loss.item()) + ", Tokens per Sec = " +
                str(tokens.item() / elapsed))
            tokens = 0
            start = time.time()

            if epoch % 100 == 0:
                # Save model
                torch.save(
                    {
                        'epoch': args.epochs,
                        'model_state_dict': best_model_state,
                        'optimizer_state': best_optimizer_state,
                        'loss': best_loss
                    }, args.model_dir + "/intermediate.pt")
                print("Model saved")

        # Save model
        torch.save(
            {
                'epoch': args.epochs,
                'model_state_dict': best_model_state,
                'optimizer_state': best_optimizer_state,
                'loss': best_loss
            }, args.model_dir + "/best.pt")
        print("Model saved")
    else:
        # Load the model
        checkpoint = torch.load(args.model_dir + "/" + args.model_name,
                                map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.optimizer.load_state_dict(checkpoint['optimizer_state'])
        model.eval()
        print("Model loaded")

        # Test
        test_loader = get_loader(src['test'],
                                 tgt['test'],
                                 src_vocab,
                                 tgt_vocab,
                                 batch_size=args.batch_size)

        pred = []

        for src_batch, tgt_batch in test_loader:
            src_batch = torch.tensor(src_batch).to(device)
            tgt_batch = torch.tensor(tgt_batch).to(device)
            batch = Batch(src_batch, tgt_batch, pad_idx)

            # Get pred_batch
            memory = model.encode(batch.src, batch.src_mask)
            pred_batch = (torch.ones(src_batch.size(0), 1)
                          .fill_(sos_idx).type_as(batch.src.data).to(device))
            for i in range(max_length - 1):
                out = model.decode(
                    memory, batch.src_mask, Variable(pred_batch),
                    Variable(
                        Batch.make_std_mask(pred_batch,
                                            pad_idx).type_as(batch.src.data)))
                prob = model.generator(out[:, -1])
                prob.index_fill_(1,
                                 torch.tensor([sos_idx, pad_idx]).to(device),
                                 -float('inf'))
                _, next_word = torch.max(prob, dim=1)

                pred_batch = torch.cat(
                    [pred_batch, next_word.unsqueeze(-1)], dim=1)
            eos_column = (torch.ones(src_batch.size(0), 1)
                          .fill_(eos_idx).type_as(batch.src.data).to(device))
            pred_batch = torch.cat([pred_batch, eos_column], dim=1)

            # every sentence in pred_batch should start with the <sos> token (index: 0) and end with the <eos> token (index: 1).
            # every <pad> token (index: 2) should come after the <eos> token (index: 1).
            # example of pred_batch:
            # [[0, 5, 6, 7, 1],
            #  [0, 4, 9, 1, 2],
            #  [0, 6, 1, 2, 2]]
            pred += seq2sen(pred_batch.tolist(), tgt_vocab)

        with open('results/pred.txt', 'w', encoding='utf-8') as f:
            for line in pred:
                f.write('{}\n'.format(line))

        os.system(
            'bash scripts/bleu.sh results/pred.txt multi30k/test.de.atok')
Example #17
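    # Tail of a copy-task data generator (random sequences with a fixed start
    # column), followed by a tiny model whose parameters are all initialized to
    # one, apparently for deterministic debugging.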
    for i in range(nbatches):
        data = torch.from_numpy(np.random.randint(1, V, size=(batch, 10)))
        print(data)
        data = data.type(torch.LongTensor)
        data[:, 0] = 1
        src = Variable(data, requires_grad=False)
        tgt = Variable(data, requires_grad=False)
        yield Batch(src, tgt, 0)


# Train the simple copy task.
V = 11
MODEL_SIZE = 10
Heads = 2
criterion = LabelSmoothing(size=V, padding_idx=0, smoothing=0.0)
model = make_model(V, V, N=2, h=Heads, d_model=MODEL_SIZE, dropout=0.0)

for p in model.parameters():
    nn.init.ones_(p)

model_opt = NoamOpt(
    model.src_embed[0].d_model,
    1,
    400,
    torch.optim.Adam(model.parameters(), lr=0, betas=(0.9, 0.98), eps=1e-9),
)

nepoch = 5  # 10
batch_size = 2  # 30
nbatches = 2  # 20
nbatches_eval = 5
Example #18
	def __init__(self, params):
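		# Variant of the highwayNet constructor in Example #13: no behavioral
		# features; adds a grid embedding plus extra conv/pool layers at the end.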
		super(highwayNet, self).__init__()

		## Unpack arguments
		self.params = params

		## Use gpu flag
		self.use_cuda = params.use_cuda

		# Flag for maneuver based (True) vs uni-modal decoder (False)
		self.use_maneuvers = params.use_maneuvers
		if params.use_grid == 2:
			self.use_grid_soc = True
			self.use_grid = False
		else:
			self.use_grid = params.use_grid
			self.use_grid_soc = False

		# Transformer architecture related
		self.use_transformer = params.use_transformer
		self.teacher_forcing_ratio = 0.0 # TODO ultimately set it in [0.9; 1.0]

		# RNN-LSTM Seq2seq architecture related
		self.use_bidir = params.use_bidir
		# NB: seq2seq uses a bidir encoder (always)
		self.use_seq2seq = params.use_seq2seq
		if self.use_seq2seq:
			self.use_bidir = True

		# RNN-LSTM with Attention architecture related
		self.use_attention = params.use_attention
		if self.use_attention:
			self.use_bidir = True

		# Flag for train mode (True) vs test-mode (False)
		self.train_flag = params.train_flag
		if self.train_flag is False:
			self.teacher_forcing_ratio = 0.0

		## Sizes of network layers
		self.encoder_size = params.encoder_size
		self.decoder_size = params.decoder_size
		self.in_length = params.in_length
		self.out_length = params.out_length
		self.grid_size = params.grid_size
		self.soc_conv_depth = params.soc_conv_depth
		self.conv_3x1_depth = params.conv_3x1_depth
		self.dyn_embedding_size = params.dyn_embedding_size
		self.input_embedding_size = params.input_embedding_size
		self.num_lat_classes = params.num_lat_classes
		self.num_lon_classes = params.num_lon_classes
		self.soc_embedding_size = (((params.grid_size[0]-4)+1)//2)*self.conv_3x1_depth

		## Define network weights
		# TRANSFORMER
		if self.use_transformer:
			src_feats = tgt_feats = 2 # (X,Y) point
			tgt_params = 5 # 5 params for bivariate Gaussian distrib

			if self.use_grid or self.use_grid_soc:
				src_ngrid = self.in_length # with soc
			else:
				src_ngrid = 0 # without soc

			if self.use_maneuvers:
				d_lon = self.num_lon_classes
				d_lat = self.num_lat_classes
			else:
				d_lon = 0
				d_lat = 0

			if self.use_grid_soc:
				self.transformer = tsf.make_model(src_feats, tgt_feats,
					tgt_params=tgt_params,
					src_ngrid=src_ngrid,
					src_lon=d_lon, src_lat=d_lat,
					src_soc_emb_size=self.soc_embedding_size)
			else:
				self.transformer = tsf.make_model(src_feats, tgt_feats,
					tgt_params=tgt_params,
					src_ngrid=src_ngrid,
					src_lon=d_lon, src_lat=d_lat)
			print("TRANSFORMER:", self.transformer)
			self.batch = tsf.Batch()

		# Input embedding layer
		self.ip_emb = torch.nn.Linear(2,self.input_embedding_size)

		# Encoder LSTM
		if self.use_bidir:
			self.enc_lstm = torch.nn.LSTM(self.input_embedding_size, self.encoder_size, 1, bidirectional=True)
			self.encoder_ndir = 2
		else:
			self.enc_lstm = torch.nn.LSTM(self.input_embedding_size, self.encoder_size, 1)
			self.encoder_ndir = 1

		# Vehicle dynamics embedding
		self.dyn_emb = torch.nn.Linear(self.encoder_size,self.dyn_embedding_size)

		# Convolutional social pooling layer and social embedding layer
		self.soc_conv = torch.nn.Conv2d(self.encoder_size,self.soc_conv_depth,3)
		self.conv_3x1 = torch.nn.Conv2d(self.soc_conv_depth, self.conv_3x1_depth, (3,1))
		self.soc_maxpool = torch.nn.MaxPool2d((2,1),padding = (1,0))
		self.grid_emb = torch.nn.Linear(5, self.input_embedding_size)

		# FC social pooling layer (for comparison):
		# self.soc_fc = torch.nn.Linear(self.soc_conv_depth * self.grid_size[0] * self.grid_size[1], (((params.grid_size[0]-4)+1)//2)*self.conv_3x1_depth)

		if self.use_seq2seq or self.use_attention:  # Decoder seq2seq LSTM (Attention builds on top of seq2seq)
			if self.use_maneuvers:
				self.proj_seq2seq = torch.nn.Linear(self.encoder_ndir * self.soc_embedding_size + self.encoder_ndir * self.dyn_embedding_size + self.num_lat_classes + self.num_lon_classes, self.decoder_size)
			else:
				self.proj_seq2seq = torch.nn.Linear(self.encoder_ndir * self.soc_embedding_size + self.encoder_ndir * self.dyn_embedding_size, self.decoder_size)

			if self.use_seq2seq:
				self.num_layers = 2 # XXX
			else:
				self.num_layers = 1 # XXX
			self.dec_seq2seq = torch.nn.LSTM(self.decoder_size, self.decoder_size, num_layers=self.num_layers)
		elif self.use_transformer is False: # Legacy Decoder LSTM
			if self.use_maneuvers:
				self.dec_lstm = torch.nn.LSTM(self.soc_embedding_size + self.dyn_embedding_size + self.num_lat_classes + self.num_lon_classes, self.decoder_size)
			else:
				self.dec_lstm = torch.nn.LSTM(self.soc_embedding_size + self.dyn_embedding_size, self.decoder_size)

		if self.use_attention:
			self.attn_densor1 = torch.nn.Linear(self.encoder_ndir * self.encoder_size + self.decoder_size, 10)
			self.attn_densor2 = torch.nn.Linear(10, 1)

		# Output layers:
		if self.use_transformer is False:
			self.op = torch.nn.Linear(self.decoder_size,5)

		self.op_lat = torch.nn.Linear(self.soc_embedding_size + self.dyn_embedding_size, self.num_lat_classes)
		self.op_lon = torch.nn.Linear(self.soc_embedding_size + self.dyn_embedding_size, self.num_lon_classes)

		# Activations:
		self.leaky_relu = torch.nn.LeakyReLU(0.1)
		self.relu = torch.nn.ReLU()
		self.softmax = torch.nn.Softmax(dim=1)

		#self.conv1 = torch.nn.Conv2d(self.in_length, 64, 3) # => [64, 11, 1]
		self.conv1 = torch.nn.Conv2d(16, 64, 3) # => [64, 11, 1]
		self.conv2 = torch.nn.Conv2d(64, 16, (3, 1))  # => [16, 9, 1]
		self.maxpool = torch.nn.MaxPool2d((2,1),padding = (1,0)) # => [16, 5, 1]
		self.proj_grid = nn.Linear(self.encoder_size, self.soc_embedding_size)