Example #1
File: main.py Project: steven0129/TextCNN
def train(**kwargs):
    for k_, v_ in kwargs.items():
        setattr(options, k_, v_)

    training_set = TextDataset(path='data/train/train.csv', model='wordvec/skipgram.bin', max_length=options.max_length, word_dim=options.word_dim)
    training_loader = Data.DataLoader(dataset=training_set, batch_size=options.batch_size, shuffle=True, drop_last=True)
    model = TextCNN(options.word_dim, options.max_length, training_set.encoder.classes_.shape[0])

    if torch.cuda.is_available():
        model.cuda()

    optimizer = optim.Adam(model.parameters(), lr=options.learning_rate)

    for epoch in tqdm(range(options.epochs)):
        loss_sum = 0
        
        for data, label in tqdm(training_loader):
            if torch.cuda.is_available():
                data = data.cuda()
                label = label.cuda()

            out = model(data)
            
            loss = criteration(out, autograd.Variable(label.squeeze().long()))
            loss_sum += loss.item() / options.batch_size
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        tqdm.write(f'epoch {epoch + 1}: loss = {loss_sum/len(training_set.data)}')
        model.save(f'checkpoints/loss-{loss_sum/len(training_set.data)}.pt')
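train() above relies on module-level objects that are not shown in this excerpt (the options namespace, the loss object named criteration, and the imports). A minimal sketch of that assumed context, with hypothetical defaults, just to make the call sites concrete:

# Hypothetical module-level context assumed by train(); the Options container
# and its default values are guesses, not the project's actual configuration.
import torch
from torch import autograd, nn, optim
import torch.utils.data as Data
from tqdm import tqdm

class Options:
    max_length = 64        # assumed
    word_dim = 300         # assumed
    batch_size = 128       # assumed
    learning_rate = 1e-3   # assumed
    epochs = 10            # assumed

options = Options()
criteration = nn.CrossEntropyLoss()  # the loss object train() refers to as `criteration`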
Example #2
def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.test_batch,
                             drop_last=False,
                             shuffle=False,
                             collate_fn=synth_collate_fn,
                             pin_memory=True)

    t2m = Text2Mel().to(DEVICE)
    ssrn = SSRN().to(DEVICE)

    mname = type(t2m).__name__
    ckpt = sorted(
        glob.glob(os.path.join(args.logdir, mname, '{}-*k.pth'.format(mname))))
    state = torch.load(ckpt[-1])
    t2m.load_state_dict(state['model'])
    args.global_step = state['global_step']

    mname = type(ssrn).__name__
    ckpt = sorted(
        glob.glob(os.path.join(args.logdir, mname, '{}-*k.pth'.format(mname))))
    state = torch.load(ckpt[-1])
    ssrn.load_state_dict(state['model'])

    print('All of models are loaded.')

    t2m.eval()
    ssrn.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(t2m, ssrn, test_loader, args.test_batch)
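This example (and several of the ones that follow) restores the most recent checkpoint by lexicographically sorting glob results, which matches step order only if the step count in the file name is zero-padded. A small self-contained helper that makes that assumption explicit:

import glob
import os

def latest_checkpoint(logdir, mname):
    """Return the newest checkpoint, assuming names like 'Text2Mel-020k.pth'.
    Lexicographic order equals numeric order only when the step count is
    zero-padded; otherwise '9k' would sort after '10k'."""
    paths = sorted(glob.glob(os.path.join(logdir, mname, '{}-*k.pth'.format(mname))))
    if not paths:
        raise FileNotFoundError('no checkpoints under ' + os.path.join(logdir, mname))
    return paths[-1]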
Example #3
def main():

    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch, drop_last=False,
                             shuffle=False, collate_fn=synth_collate_fn, pin_memory=True)

    t2m = Text2Mel().to(DEVICE)
    ssrn = SSRN().to(DEVICE)
    
    ckpt = pd.read_csv(os.path.join(args.logdir, t2m.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, t2m.name, ckpt.models.loc[0]))
    t2m.load_state_dict(state['model'])
    args.global_step = state['global_step']

    ckpt = pd.read_csv(os.path.join(args.logdir, ssrn.name, 'ckpt.csv'), sep=',', header=None)
    ckpt.columns = ['models', 'loss']
    ckpt = ckpt.sort_values(by='loss', ascending=True)
    state = torch.load(os.path.join(args.logdir, ssrn.name, ckpt.models.loc[0]))
    ssrn.load_state_dict(state['model'])

    print('All of models are loaded.')

    t2m.eval()
    ssrn.eval()
    
    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    return synthesize(t2m=t2m, ssrn=ssrn, data_loader=test_loader, batch_size=args.test_batch)
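One detail worth noting in this variant: after sort_values, ckpt.models.loc[0] selects by the original row label, i.e. the first line of ckpt.csv, not necessarily the lowest-loss entry. A hedged sketch that selects positionally instead, assuming the same two-column 'models,loss' layout:

import os
import pandas as pd
import torch

def load_best_state(logdir, model_name):
    """Sketch: load the checkpoint with the smallest loss recorded in ckpt.csv."""
    ckpt = pd.read_csv(os.path.join(logdir, model_name, 'ckpt.csv'),
                       sep=',', header=None, names=['models', 'loss'])
    best = ckpt.sort_values(by='loss', ascending=True).iloc[0]  # positional, not label-based
    return torch.load(os.path.join(logdir, model_name, best['models']))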
Example #4
def main():
    testset = TextDataset(args.testset, args.lang, args.ref_path)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.test_batch,
                             drop_last=False,
                             shuffle=False,
                             collate_fn=synth_collate_fn,
                             pin_memory=True)

    model = DCTTS(args).to(DEVICE)

    ckpt = sorted(
        glob.glob(
            os.path.join(args.logdir, args.model_name,
                         '{}-*k.pth'.format(args.model_name))))
    state = torch.load(ckpt[-1])
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']

    print('All of models are loaded.')

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
        os.makedirs(os.path.join(args.sampledir, 'f0'))
    synthesize(model, test_loader, args.test_batch)
Example #5
def main():
    testset = TextDataset(args.testset)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.test_batch,
                             drop_last=False,
                             shuffle=False,
                             collate_fn=synth_collate_fn,
                             pin_memory=True)

    model = Tacotron().to(DEVICE)

    model_path = sorted(
        glob.glob(os.path.join(args.logdir, model.name,
                               'model-*.tar')))[-1]  # latest model
    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']

    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()

    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))
    synthesize(model, test_loader, args.test_batch)
Example #6
def get_data(self):
    self.convert = TextConverter(self.args.txt,
                                 max_vocab=self.args.max_vocab)
    dataset = TextDataset(self.args.txt, self.args.len,
                          self.convert.text_to_arr)
    self.train_loader = DataLoader(dataset,
                                   self.args.batch_size,
                                   shuffle=True,
                                   num_workers=self.args.num_workers)
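The TextDataset used here is not shown; a minimal character-level dataset compatible with this call signature (a text path, a sequence length, and an encoding function) might look like the sketch below. It is an assumption for illustration, not the project's class:

import numpy as np
import torch
from torch.utils.data import Dataset

class CharTextDataset(Dataset):
    """Hypothetical stand-in for TextDataset(txt_path, seq_len, text_to_arr)."""
    def __init__(self, txt_path, seq_len, text_to_arr):
        with open(txt_path, encoding='utf-8') as f:
            arr = np.asarray(text_to_arr(f.read()))   # encode characters as ints
        n_seq = len(arr) // seq_len
        self.data = torch.from_numpy(arr[:n_seq * seq_len].reshape(n_seq, seq_len))

    def __len__(self):
        return self.data.size(0)

    def __getitem__(self, idx):
        x = self.data[idx]
        y = torch.roll(x, -1)   # next-character targets
        return x, y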
Example #7
def main(load_model='latest', synth_mode='test'):
    """
    main function

    :param load_model: String. {best, latest, <model_path>}
    :param synth_mode: {'test', 'synthesize'}

    """
    assert os.path.exists(args.testset), 'Test sentence path is wrong.'

    model = TPGST().to(DEVICE)

    testset = TextDataset(args.testset, args.ref_path)
    test_loader = DataLoader(dataset=testset, batch_size=args.test_batch, drop_last=False,
                            shuffle=False, collate_fn=text_collate_fn, pin_memory=True)
    
    if load_model.lower() == 'best':
        ckpt = pd.read_csv(os.path.join(args.logdir, model.name, 'ckpt.csv'), sep=',', header=None)
        ckpt.columns = ['models', 'loss']
        model_path = ckpt.sort_values(by='loss', ascending=True).models.loc[0]
        model_path = os.path.join(args.logdir, model.name, model_path)
    elif 'pth.tar' in load_model:
        model_path = load_model
    else:
        model_path = sorted(glob.glob(os.path.join(args.logdir, model.name, 'model-*.tar')))[-1] # latest model
    state = torch.load(model_path)
    model.load_state_dict(state['model'])
    args.global_step = state['global_step']

    print('The model is loaded. Step: {}'.format(args.global_step))

    model.eval()
    
    if not os.path.exists(os.path.join(args.sampledir, 'A')):
        os.makedirs(os.path.join(args.sampledir, 'A'))

    if synth_mode == 'test':
        ref_synthesize(model, test_loader, args.test_batch)
    elif synth_mode == 'style':
        style_synthesize(model, test_loader, args.test_batch)
    elif synth_mode == 'tp':
        tp_synthesize(model, test_loader, args.test_batch)
    elif synth_mode == 'fix':
        fixed_synthesize(model, test_loader, args.test_batch)
Example #8
bptt = 8
batch_size = 32

train = [
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube1120_hq.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube1120.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube700.txt'
]

test = [
    '/media/lytic/STORE/ru_open_stt_wav/text/asr_calls_2_val.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/buriy_audiobooks_2_val.txt',
    '/media/lytic/STORE/ru_open_stt_wav/text/public_youtube700_val.txt'
]

train = TextDataset(train, labels, batch_size)
test = TextDataset(test, labels, batch_size)

test.shuffle(0)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4, weight_decay=1e-5)
scheduler = StepLR(optimizer, step_size=10000, gamma=0.99)

for epoch in range(20):

    model.train()

    hidden = model.step_init(batch_size)

    err = AverageMeter('loss')
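The loss here is tracked with AverageMeter, which is not part of the excerpt; a hedged sketch in the style of the common PyTorch ImageNet-example utility, matching the single-argument call above:

class AverageMeter:
    """Hedged sketch of a running-average tracker; the name-only constructor
    mirrors the call AverageMeter('loss') above."""
    def __init__(self, name):
        self.name = name
        self.reset()

    def reset(self):
        self.val = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n

    @property
    def avg(self):
        return self.sum / max(self.count, 1)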
Example #9
torch.cuda.manual_seed(LUCKY_NUM)
np.random.seed(LUCKY_NUM)
# initialize matplotlib and CUDA
# plt.ion()
torch.cuda.set_device(config.deviceID)
# set the work path
PATH = config.path
if not os.path.isdir(PATH):
    os.makedirs(PATH)
# Parameters used in the net
ERROR_PER = config.ERROR_PER
NE = config.ne  # number of ensemble
GAMMA = config.GAMMA
T = config.T
# Load data and initialize enn net
text = TextDataset()
# Set the loss function
criterion = torch.nn.MSELoss()
INFO = {
    "train len": config.train_len,
    "shrink len": config.shrink_len,
    "window step": config.window_step,
    "Error per": config.ERROR_PER,
    "input dim": config.input_dim,
    "hid dim": config.hid_dim,
    "num layer": config.num_layer,
    "number of ensemble": config.ne,
    "T": config.T,
    "batch size": config.batch_size,
    "epoch": config.epoch,
    "GAMMA": config.GAMMA,
Example #10
    ### Rebuild dictionary
    print('Build word2idx ... ', end='')

    word2idx = {}
    for k, v in word2vec.wv.vocab.items():
        word2idx[k] = v.index
    word2vec.wv.syn0[word2idx['<pad>']] = np.zeros(embedding_dim)
    pickle.dump(word2idx, open('_word2vec.pkl', 'wb'))

    print('Done !')

    ### Load dataset
    print('Load dataset ... ', end='')

    d_train = TextDataset(word2idx, fp_train_labeled, train=True)
    d_val = TextDataset(word2idx, fp_train_labeled, train=True, val=True)

    train_loader = DataLoader(d_train, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(d_val, batch_size=batch_size, shuffle=False)

    ### Train model
    print('Train LSTM ... ')

    model = LSTMClassifier(embedding_dim, hidden_dim, num_layers, batch_size)
    model.init_weights()
    model.embedding.weight = torch.nn.Parameter(torch.Tensor(word2vec.wv.syn0))
    model.embedding.weight.requires_grad = False
    model.cuda()
    print(model)
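This snippet uses the pre-4.0 gensim API (wv.vocab, wv.syn0). For reference, a hedged equivalent under gensim >= 4.0, where those attributes were renamed; the model path is hypothetical and a '<pad>' entry is assumed to exist in the vocabulary:

import pickle
import numpy as np
from gensim.models import Word2Vec

word2vec = Word2Vec.load('word2vec.model')   # hypothetical path
embedding_dim = word2vec.wv.vector_size

word2idx = dict(word2vec.wv.key_to_index)    # formerly built from wv.vocab / v.index
word2vec.wv.vectors[word2idx['<pad>']] = np.zeros(embedding_dim)  # formerly wv.syn0
with open('_word2vec.pkl', 'wb') as f:
    pickle.dump(word2idx, f)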
Example #11
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f)

    # Get the directory paths
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])
    summary_dir = os.path.join(config['training']['summary_dir'],
                               config['experiment_name'])

    # Create the directories if they do not already exist
    if not os.path.exists(ckpt_dir):
        logging.info('Creating checkpoint directory: `%s`.' % ckpt_dir)
        os.makedirs(ckpt_dir)
    if not os.path.exists(summary_dir):
        logging.info('Creating summary directory: `%s`.' % summary_dir)
        os.makedirs(summary_dir)

    # Check for conflicting configurations
    safe_copy_config(config, FLAGS.force_overwrite)

    # Init summary writer
    summary_writer = SummaryWriter(summary_dir)

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading train and valid data.')
    train_data = TextDataset(config['data']['train'],
                             vocab=vocab,
                             max_length=config['training']['max_length'])
    valid_data = TextDataset(config['data']['valid'],
                             vocab=vocab,
                             max_length=config['training']['max_length'])

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Setup model optimizers
    optimizer_in = torch.optim.Adam(inference_network.parameters(),
                                    lr=config['training']['learning_rate'])
    optimizer_gm = torch.optim.Adam(generative_model.parameters(),
                                    lr=config['training']['learning_rate'])

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        epoch = checkpoint['epoch']
        t = checkpoint['t']
        best_loss = checkpoint['best_loss']
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
        optimizer_in.load_state_dict(checkpoint['optimizer_in'])
        optimizer_gm.load_state_dict(checkpoint['optimizer_gm'])
    else:
        logging.info('No existing checkpoint found.')
        epoch = 0
        t = 0
        best_loss = float('inf')

    # Start train
    weight = torch.ones(len(vocab))
    weight[vocab.unk_idx] = config['training']['unk_weight']
    if torch.cuda.is_available():
        weight = weight.cuda()
    while epoch < config['training']['epochs']:
        logging.info('Starting epoch - %i.' % epoch)

        inference_network.train()
        generative_model.train()

        # Training step
        logging.info('Start train step.')
        train_loader = DataLoader(
            dataset=train_data,
            batch_size=config['training']['batch_size'],
            shuffle=True,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init train summaries
        train_nll = 0.0
        train_kl = 0.0
        train_loss = 0.0

        for batch in train_loader:

            optimizer_in.zero_grad()
            optimizer_gm.zero_grad()

            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
            x_hat = word_dropout(x, config['training']['word_dropout_rate'],
                                 vocab.unk_idx)
            logp, _ = generative_model(z, x_hat, lengths)

            # Obtain current value of the annealing constant with beta trick
            beta = get_beta(config, epoch)

            # Compute annealed loss
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:,:length].contiguous().view(-1)
            nll = F.nll_loss(logp, target,
                             ignore_index=vocab.pad_idx,
                             weight=weight,
                             size_average=False)
            loss = nll + beta * kl

            # Update summaries
            train_nll += nll.data
            train_kl += kl.data
            train_loss += loss.data

            # Backpropagate gradients
            batch_size = config['training']['batch_size']
            loss /= batch_size
            kl /= batch_size
            nll /= batch_size
            loss.backward()
            optimizer_in.step()
            optimizer_gm.step()

            # Log
            if not t % config['training']['log_frequency']:
                # Note: logged train loss only for a single batch - see
                # tensorboard for summary over epochs
                line = 'Iteration: %i - Loss: %0.4f. - KL: %0.4f - NLL: %0.4f'
                logging.info(line % (t, loss.data, kl.data, nll.data))

                # Print a greedy sample
                z_k, _ = inference_network(x, lengths)
                _, sample = generative_model(z_k)
                example = [vocab.id2word(int(x)) for x in sample[0]]
                try:
                    T = example.index(vocab.eos_token)
                    example = example[:T]
                except ValueError:
                    pass
                example = ' '.join(example)
                logging.info('Example - `%s`' % example)

            t += 1

        # Validation step
        logging.info('Start valid step.')
        valid_loader = DataLoader(
            dataset=valid_data,
            batch_size=config['training']['batch_size'],
            shuffle=False,
            num_workers=cpu_count(),
            pin_memory=torch.cuda.is_available())

        # Init valid summaries
        valid_nll = 0.0
        valid_kl = 0.0
        valid_loss = 0.0

        for batch in valid_loader:

            x = batch['input']
            target = batch['target']
            lengths = batch['lengths']
            if torch.cuda.is_available():
                x = x.cuda()
                target = target.cuda()
                lengths = lengths.cuda()

            # Forward pass of inference network
            z, kl = inference_network(x, lengths)

            # Teacher forcing
            logp, _ = generative_model(z, x, lengths)

            # Compute annealed loss
            length = logp.shape[1]
            logp = logp.view(-1, len(vocab))
            target = target[:,:length].contiguous().view(-1)
            nll = F.nll_loss(logp, target, ignore_index=vocab.pad_idx,
                             size_average=False)
            loss = nll + kl

            # Update summaries
            valid_nll += nll.data
            valid_kl += kl.data
            valid_loss += loss.data

        # Normalize losses
        train_nll /= len(train_data)
        train_kl /= len(train_data)
        train_loss /= len(train_data)
        valid_nll /= len(valid_data)
        valid_kl /= len(valid_data)
        valid_loss /= len(valid_data)

        # Tensorboard logging
        summary_writer.add_scalar("elbo/train", train_loss.data, epoch)
        summary_writer.add_scalar("kl/train", train_kl.data, epoch)
        summary_writer.add_scalar("nll/train", train_nll.data, epoch)
        summary_writer.add_scalar("elbo/val", valid_loss.data, epoch)
        summary_writer.add_scalar("kl/val", valid_kl.data, epoch)
        summary_writer.add_scalar("nll/val", valid_nll.data, epoch)

        # Save checkpoint
        is_best = valid_loss < best_loss
        best_loss = min(valid_loss, best_loss)
        save_checkpoint({
            'epoch': epoch + 1,
            't': t,
            'best_loss': best_loss,
            'state_dict_in': inference_network.state_dict(),
            'state_dict_gm': generative_model.state_dict(),
            'optimizer_in': optimizer_in.state_dict(),
            'optimizer_gm': optimizer_gm.state_dict()
        }, is_best, ckpt)

        epoch += 1
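The annealing constant above comes from get_beta, which is not shown in this excerpt; one common choice is a linear KL warm-up, sketched below with a hypothetical kl_warmup_epochs config key:

def get_beta(config, epoch):
    """Hedged sketch of a linear KL-annealing schedule: ramp beta from 0 to 1
    over a warm-up period. 'kl_warmup_epochs' is a hypothetical config entry."""
    warmup = config['training'].get('kl_warmup_epochs', 10)
    return min(1.0, epoch / float(warmup))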
Example #12
def train_iters(ae_model, dis_model):
    if args.use_albert:
        tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny",
                                                  do_lower_case=True)
    elif args.use_tiny_bert:
        tokenizer = AutoTokenizer.from_pretrained(
            "google/bert_uncased_L-2_H-256_A-4", do_lower_case=True)
    elif args.use_distil_bert:
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased', do_lower_case=True)
    # tokenizer = BertTokenizer.from_pretrained(args.PRETRAINED_MODEL_NAME, do_lower_case=True)
    tokenizer.add_tokens('[EOS]')
    bos_id = tokenizer.convert_tokens_to_ids(['[CLS]'])[0]

    ae_model.bert_encoder.resize_token_embeddings(len(tokenizer))
    #print("[CLS] ID: ", bos_id)

    print("Load trainData...")
    if args.load_trainData and os.path.exists('./{}_trainData.pkl'.format(
            args.task)):
        with open('./{}_trainData.pkl'.format(args.task), 'rb') as f:
            trainData = pickle.load(f)
    else:
        trainData = TextDataset(batch_size=args.batch_size,
                                id_bos='[CLS]',
                                id_eos='[EOS]',
                                id_unk='[UNK]',
                                max_sequence_length=args.max_sequence_length,
                                vocab_size=0,
                                file_list=args.train_file_list,
                                label_list=args.train_label_list,
                                tokenizer=tokenizer)
        with open('./{}_trainData.pkl'.format(args.task), 'wb') as f:
            pickle.dump(trainData, f)

    add_log("Start train process.")

    ae_model.train()
    dis_model.train()
    ae_model.to(device)
    dis_model.to(device)
    '''
    Fixing or distilling BERT encoder
    '''
    if args.fix_first_6:
        print("Try fixing first 6 bertlayers")
        for layer in range(6):
            for param in ae_model.bert_encoder.encoder.layer[layer].parameters():
                param.requires_grad = False
    elif args.fix_last_6:
        print("Try fixing last 6 bertlayers")
        for layer in range(6, 12):
            for param in ae_model.bert_encoder.encoder.layer[layer].parameters():
                param.requires_grad = False

    if args.distill_2:
        print("Get result from layer 2")
        for layer in range(2, 12):
            for param in ae_model.bert_encoder.encoder.layer[layer].parameters():
                param.requires_grad = False

    ae_optimizer = NoamOpt(
        ae_model.d_model, 1, 2000,
        torch.optim.Adam(ae_model.parameters(),
                         lr=0,
                         betas=(0.9, 0.98),
                         eps=1e-9))
    dis_optimizer = torch.optim.Adam(dis_model.parameters(), lr=0.0001)

    #ae_criterion = get_cuda(LabelSmoothing(size=args.vocab_size, padding_idx=args.id_pad, smoothing=0.1))
    ae_criterion = LabelSmoothing(size=ae_model.bert_encoder.config.vocab_size,
                                  padding_idx=0,
                                  smoothing=0.1).to(device)
    dis_criterion = nn.BCELoss(reduction='mean')

    history = {'train': []}

    for epoch in range(args.epochs):
        print('-' * 94)
        epoch_start_time = time.time()
        total_rec_loss = 0
        total_dis_loss = 0

        train_data_loader = DataLoader(trainData,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       collate_fn=trainData.collate_fn,
                                       num_workers=4)
        num_batch = len(train_data_loader)
        trange = tqdm(enumerate(train_data_loader),
                      total=num_batch,
                      desc='Training',
                      file=sys.stdout,
                      position=0,
                      leave=True)

        for it, data in trange:
            batch_sentences, tensor_labels, tensor_src, tensor_src_mask, tensor_tgt, tensor_tgt_y, tensor_tgt_mask, tensor_ntokens = data

            tensor_labels = tensor_labels.to(device)
            tensor_src = tensor_src.to(device)
            tensor_tgt = tensor_tgt.to(device)
            tensor_tgt_y = tensor_tgt_y.to(device)
            tensor_src_mask = tensor_src_mask.to(device)
            tensor_tgt_mask = tensor_tgt_mask.to(device)

            # Forward pass
            latent, out = ae_model.forward(tensor_src, tensor_tgt,
                                           tensor_src_mask, tensor_tgt_mask)

            # Loss calculation
            loss_rec = ae_criterion(
                out.contiguous().view(-1, out.size(-1)),
                tensor_tgt_y.contiguous().view(-1)) / tensor_ntokens.data

            ae_optimizer.optimizer.zero_grad()
            loss_rec.backward()
            ae_optimizer.step()

            latent = latent.detach()
            next_latent = latent.to(device)

            # Classifier
            dis_lop = dis_model.forward(next_latent)
            loss_dis = dis_criterion(dis_lop, tensor_labels)

            dis_optimizer.zero_grad()
            loss_dis.backward()
            dis_optimizer.step()

            total_rec_loss += loss_rec.item()
            total_dis_loss += loss_dis.item()

            trange.set_postfix(total_rec_loss=total_rec_loss / (it + 1),
                               total_dis_loss=total_dis_loss / (it + 1))

            if it % 100 == 0:
                add_log(
                    '| epoch {:3d} | {:5d}/{:5d} batches | rec loss {:5.4f} | dis loss {:5.4f} |'
                    .format(epoch, it, num_batch, loss_rec, loss_dis))

                print(id2text_sentence(tensor_tgt_y[0], tokenizer, args.task))
                generator_text = ae_model.greedy_decode(
                    latent, max_len=args.max_sequence_length, start_id=bos_id)
                print(id2text_sentence(generator_text[0], tokenizer,
                                       args.task))

                # Save model
                #torch.save(ae_model.state_dict(), args.current_save_path / 'ae_model_params.pkl')
                #torch.save(dis_model.state_dict(), args.current_save_path / 'dis_model_params.pkl')

        history['train'].append({
            'epoch': epoch,
            'total_rec_loss': total_rec_loss / len(trange),
            'total_dis_loss': total_dis_loss / len(trange)
        })

        add_log('| end of epoch {:3d} | time: {:5.2f}s |'.format(
            epoch, (time.time() - epoch_start_time)))
        # Save model
        torch.save(ae_model.state_dict(),
                   args.current_save_path / 'ae_model_params.pkl')
        torch.save(dis_model.state_dict(),
                   args.current_save_path / 'dis_model_params.pkl')

    print("Save in ", args.current_save_path)
    return
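train_iters wraps Adam in NoamOpt, the warm-up learning-rate schedule from "Attention Is All You Need"; the class itself is not shown. A sketch consistent with the calls made above (.step() and .optimizer.zero_grad()), in the style of the Annotated Transformer implementation:

class NoamOpt:
    """Hedged sketch of a Noam learning-rate wrapper; the signature matches the
    call NoamOpt(d_model, factor, warmup, optimizer) used above."""
    def __init__(self, model_size, factor, warmup, optimizer):
        self.optimizer = optimizer
        self.model_size = model_size
        self.factor = factor
        self.warmup = warmup
        self._step = 0

    def rate(self, step):
        # lr = factor * d_model^-0.5 * min(step^-0.5, step * warmup^-1.5)
        return self.factor * (self.model_size ** -0.5 *
                              min(step ** -0.5, step * self.warmup ** -1.5))

    def step(self):
        self._step += 1
        lr = self.rate(self._step)
        for group in self.optimizer.param_groups:
            group['lr'] = lr
        self.optimizer.step()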
Example #13
    embedding_dim = 128
    hidden_dim = 128
    num_layers = 2
    batch_size = 128

    ### Load dictionary
    print('Loading Dictionary ... ', end='')

    word2idx = pickle.load(open(fp_word2idx, 'rb'))

    print('Done !')

    ### Load data
    print('Loading Data ... ', end='')

    d_test = TextDataset(word2idx, fp_test, train=False)
    test_loader = DataLoader(d_test, batch_size=batch_size, shuffle=False)

    print('Done !')

    ### Load model
    print('Loading Model ... ', end='')

    model = LSTMClassifier(embedding_dim, hidden_dim, num_layers, batch_size)
    model.cuda()
    model.load_state_dict(torch.load(fp_model))

    print('Done !')

    ### Predict
    print('Predict ... ', end='')
Example #14
        if n in pretrained_state_dict:
            w = pretrained_state_dict[n]
            p.data.copy_(w.data)
    model = cuda(model)
    print('loaded pretrained ckpt')

optimizer = AdamW(
    optimizer_params(model),
    lr=args.lr,
    weight_decay=args.wd,
    eps=args.eps,
)
criterion = nn.CrossEntropyLoss()
best_loss = float('inf')

train_ds = TextDataset(f'amazon/{args.src}_train.csv', args.src_p)
valid_ds = TextDataset(f'amazon/{args.src}_valid.csv', args.src_p)
test_ds = TextDataset(f'amazon/{args.src}_test.csv', args.src_p)

if args.train:
    for epoch in range(1, args.epochs + 1):
        train_loss = train(train_ds)
        valid_loss = valid(valid_ds)
        if valid_loss < best_loss:
            best_loss = valid_loss
            torch.save(model.state_dict(), args.ckpt)
        print(f'epoch: {epoch} | '
              f'train loss: {train_loss:.6f} | '
              f'valid loss: {valid_loss:.6f}')

model.load_state_dict(torch.load(args.ckpt))
Example #15
    parser.add_argument('--output', type=str, default='./result')
    parser.add_argument('--model', type=str, required=True)
    parser.add_argument('--batchsize', type=int, default=5)
    parser.add_argument('--manga_name', type=str, default=None)
    parser.add_argument('--visualize', action='store_true')
    return parser.parse_args()

if __name__ == '__main__':
    # Good formatting when printing the APs for each class and mAP
    pp = PrettyPrinter()
    
    args = get_args()

    obj = torch.load(args.model)
    model = obj['model']

    if args.manga_name is not None:
        args.output = os.path.join(args.output, args.manga_name)
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    if args.visualize:
        mytransforms = MyTransform()
        test_dataset = TextDataset(args.root, model_type='ssd-fork', transforms=None, specific_manga=args.manga_name)
    else:
        mytransforms = None
        test_dataset = TextDataset(args.root, model_type='ssd-fork', transforms=MyTransform(), specific_manga=args.manga_name)
    test_loader = DataLoader(test_dataset, batch_size=args.batchsize, shuffle=False, collate_fn=my_collate_fn, num_workers=4, pin_memory=True)

    evaluate(test_loader, model, args.visualize, args.output, mytransforms)
Example #16
        print('Vocabulary has been loaded from {}'.format(args.vocab_file))
    if args.tokenized == 1:
        corpus = Corpus_tok(path, args.train, args.valid, args.test, load_vocab=args.load_vocab, vocab_file=args.vocab_file)
    else: 
        corpus = Corpus(path, args.train, args.valid, args.test, load_vocab=args.load_vocab, vocab_file=args.vocab_file)
    torch.save(corpus, fn)
    if args.save_vocab:
        with open('{}/{}'.format(path, args.vocab_file), 'wb') as f:
            torch.save([corpus.vocabulary.word2idx, corpus.vocabulary.idx2word], f)

vocab_sz = len(corpus.vocabulary)  

# Produce dataloaders
if args.tokenized == 1:
    print("Producing train dataloader...")
    train_loader = TextDataset(path, args.train, corpus.vocabulary)
    dlt = DataLoader(train_loader, batch_size=args.bs, drop_last=True)
    train_data = SortingTextDataLoader(dlt)
    print("Num sentences train loader:", len(train_loader))
    print("Producing val dataloader...")
    valid_loader = TextDataset(path, args.valid, train_loader.vocabulary)
    dlv = DataLoader(valid_loader, batch_size=args.bs, drop_last=True)
    valid_data = SortingTextDataLoader(dlv)
    print("Num sentences valid loader:", len(valid_loader))
    print("Producing test dataloader...")
    test_loader = TextDataset(path, args.test, valid_loader.vocabulary)
    dlte = DataLoader(test_loader, batch_size=args.bs, drop_last=True)
    test_data = SortingTextDataLoader(dlte)
    corpus.vocabulary = test_loader.vocabulary
    print("Num sentences test loader:", len(test_loader))
else:
Example #17
def main(_):
    # Set up logging
    configure_logging(FLAGS.debug_log)

    # Load configuration
    with open(FLAGS.config, 'r') as f:
        config = yaml.load(f)

    # Get the checkpoint path
    ckpt_dir = os.path.join(config['training']['ckpt_dir'],
                            config['experiment_name'])

    # Load vocab and datasets
    logging.info('Loading the vocabulary.')
    with open(config['data']['vocab'], 'r') as f:
        vocab = Vocab.load(f)
    logging.info('Loading test data.')
    test_data = TextDataset(config['data']['test'],
                            vocab=vocab,
                            max_length=config['training']['max_length'])
    test_loader = DataLoader(dataset=test_data,
                             batch_size=config['training']['batch_size'],
                             shuffle=False,
                             num_workers=cpu_count(),
                             pin_memory=torch.cuda.is_available())

    # Initialize models
    logging.info('Initializing the inference network and generative model.')
    inference_network = RNNTextInferenceNetwork(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        encoder_kwargs=config['model']['encoder'],
        normalizing_flow_kwargs=config['model']['normalizing_flow'])
    generative_model = RNNTextGenerativeModel(
        dim=config['model']['dim'],
        vocab_size=len(vocab),
        max_length=config['training']['max_length'],
        sos_idx=vocab.sos_idx,
        **config['model']['generator'])
    if torch.cuda.is_available():
        inference_network = inference_network.cuda()
        generative_model = generative_model.cuda()

    # Restore
    ckpt = os.path.join(ckpt_dir, 'model.pt.best')
    if os.path.exists(ckpt):
        logging.info('Model checkpoint detected at: `%s`. Restoring.' % ckpt)
        checkpoint = torch.load(ckpt)
        inference_network.load_state_dict(checkpoint['state_dict_in'])
        generative_model.load_state_dict(checkpoint['state_dict_gm'])
    else:
        logging.error('No model checkpoint found. Terminating.')
        sys.exit(1)

    # Init test summaries
    test_nll = 0.0
    test_kl = 0.0
    test_loss = 0.0
    test_suml2p = 0.0
    test_n = 0.0

    # Evaluate
    inference_network.eval()
    generative_model.eval()

    for batch in test_loader:

        x = batch['input']
        target = batch['target']
        lengths = batch['lengths']
        if torch.cuda.is_available():
            x = x.cuda()
            target = target.cuda()
            lengths = lengths.cuda()

        # Forward pass of inference network
        z, kl = inference_network(x, lengths)

        # Teacher forcing
        logp, _ = generative_model(z, x, lengths)

        # Compute loss
        length = logp.shape[1]
        logp = logp.view(-1, len(vocab))
        target = target[:, :length].contiguous().view(-1)
        nll = F.nll_loss(logp,
                         target,
                         ignore_index=vocab.pad_idx,
                         size_average=False)
        loss = nll + kl
        l2p, n = suml2p(logp, target, vocab.pad_idx)

        # Update summaries
        test_nll += nll.data
        test_kl += kl.data
        test_loss += loss.data
        test_suml2p += l2p.data
        test_n += n

    # Normalize losses
    test_nll /= len(test_data)
    test_kl /= len(test_data)
    test_loss /= len(test_data)
    H = -test_suml2p / test_n
    test_perplexity = 2**H

    # Log output
    logging.info('NLL: %0.4f' % test_nll)
    logging.info('KL: %0.4f' % test_kl)
    logging.info('ELBO: %0.4f' % test_loss)
    logging.info('Perplexity: %0.4f' % test_perplexity)
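The perplexity here is computed from suml2p, which is not included in the excerpt; from the way H = -test_suml2p / test_n and 2 ** H are used, it plausibly returns the sum of base-2 log-probabilities of the non-padding targets together with their count. A hedged sketch of such a helper:

import math
import torch

def suml2p(logp, target, pad_idx):
    """Hedged sketch: sum of base-2 log-probabilities of the non-padding
    targets plus the token count. Assumes `logp` holds natural-log
    probabilities of shape (N, vocab) and `target` has shape (N,)."""
    mask = target != pad_idx
    rows = torch.arange(target.size(0), device=target.device)
    chosen = logp[rows, target]               # ln p(target_i)
    l2p = (chosen[mask] / math.log(2)).sum()  # convert nats to bits
    return l2p, int(mask.sum())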
Example #18
def text_corpus_to_words(text_corpus):
    words = []
    for t in text_corpus:
        splits = t.split(' ')
        words.extend([s for s in splits if len(s) > 2])
    return words


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        type=str,
                        help="path to config with parameters")
    args = parser.parse_args()
    cfg = config_from_file(args.config)

    dataset = TextDataset(cfg)

    word_sets = dict()
    pos_words = text_corpus_to_words(dataset.pos_text_corpus)
    neg_words = text_corpus_to_words(dataset.neg_text_corpus)
    for name, words in zip(['pos', 'neg'], [pos_words, neg_words]):

        plt.hist(words)
        plt.title(f'hist of {name} words')
        plt.show()

        counter = Counter(words)
        word_sets[name] = (set(words), counter)
        print(f'{name} descriptive stat:')
        most_common = counter.most_common()
        print(f'        most frequent words in {name}: ')
Example #19
torch.manual_seed(args.seed)

if torch.cuda.is_available():
    torch.cuda.set_device(args.gpu)

# Create the configuration
config = Config(sentence_max_size=50,
                batch_size=args.batch_size,
                word_num=11000,
                label_num=args.label_num,
                learning_rate=args.lr,
                cuda=args.gpu,
                epoch=args.epoch,
                out_channel=args.out_channel)

training_set = TextDataset(path='data/train')

training_iter = data.DataLoader(dataset=training_set,
                                batch_size=config.batch_size,
                                num_workers=2)

model = DPCNN(config)
embeds = nn.Embedding(config.word_num, config.word_embedding_dimension)

if torch.cuda.is_available():
    model.cuda()
    embeds = embeds.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=config.lr)
Example #20
def eval_iters(ae_model, dis_model):
    # tokenizer = BertTokenizer.from_pretrained(args.PRETRAINED_MODEL_NAME, do_lower_case=True)
    if args.use_albert:
        tokenizer = BertTokenizer.from_pretrained("clue/albert_chinese_tiny",
                                                  do_lower_case=True)
    elif args.use_tiny_bert:
        tokenizer = AutoTokenizer.from_pretrained(
            "google/bert_uncased_L-2_H-256_A-4", do_lower_case=True)
    elif args.use_distil_bert:
        tokenizer = DistilBertTokenizer.from_pretrained(
            'distilbert-base-uncased', do_lower_case=True)
    tokenizer.add_tokens('[EOS]')
    bos_id = tokenizer.convert_tokens_to_ids(['[CLS]'])[0]
    ae_model.bert_encoder.resize_token_embeddings(len(tokenizer))

    print("[CLS] ID: ", bos_id)

    # if args.task == 'news_china_taiwan':
    eval_file_list = [
        args.data_path + 'test.0',
        args.data_path + 'test.1',
    ]
    eval_label_list = [
        [0],
        [1],
    ]

    if args.eval_positive:
        eval_file_list = eval_file_list[::-1]
        eval_label_list = eval_label_list[::-1]

    print("Load testData...")

    testData = TextDataset(batch_size=args.batch_size,
                           id_bos='[CLS]',
                           id_eos='[EOS]',
                           id_unk='[UNK]',
                           max_sequence_length=args.max_sequence_length,
                           vocab_size=0,
                           file_list=eval_file_list,
                           label_list=eval_label_list,
                           tokenizer=tokenizer)

    dataset = testData
    eval_data_loader = DataLoader(dataset,
                                  batch_size=1,
                                  shuffle=False,
                                  collate_fn=dataset.collate_fn,
                                  num_workers=4)

    num_batch = len(eval_data_loader)
    trange = tqdm(enumerate(eval_data_loader),
                  total=num_batch,
                  desc='Training',
                  file=sys.stdout,
                  position=0,
                  leave=True)

    gold_ans = [''] * num_batch

    add_log("Start eval process.")
    ae_model.to(device)
    dis_model.to(device)
    ae_model.eval()
    dis_model.eval()

    total_latent_lst = []

    for it, data in trange:
        batch_sentences, tensor_labels, tensor_src, tensor_src_mask, tensor_tgt, tensor_tgt_y, tensor_tgt_mask, tensor_ntokens = data

        tensor_labels = tensor_labels.to(device)
        tensor_src = tensor_src.to(device)
        tensor_tgt = tensor_tgt.to(device)
        tensor_tgt_y = tensor_tgt_y.to(device)
        tensor_src_mask = tensor_src_mask.to(device)
        tensor_tgt_mask = tensor_tgt_mask.to(device)

        print("------------%d------------" % it)
        print(id2text_sentence(tensor_tgt_y[0], tokenizer, args.task))
        print("origin_labels", tensor_labels.cpu().detach().numpy()[0])

        latent, out = ae_model.forward(tensor_src, tensor_tgt, tensor_src_mask,
                                       tensor_tgt_mask)
        generator_text = ae_model.greedy_decode(
            latent, max_len=args.max_sequence_length, start_id=bos_id)
        print(id2text_sentence(generator_text[0], tokenizer, args.task))

        # Define target label
        target = torch.FloatTensor([[1.0]]).to(device)
        if tensor_labels[0].item() > 0.5:
            target = torch.FloatTensor([[0.0]]).to(device)
        print("target_labels", target)

        modify_text, latent_lst = fgim_attack(dis_model,
                                              latent,
                                              target,
                                              ae_model,
                                              args.max_sequence_length,
                                              bos_id,
                                              id2text_sentence,
                                              None,
                                              gold_ans[it],
                                              tokenizer,
                                              device,
                                              task=args.task,
                                              save_latent=args.save_latent)
        if args.save_latent != -1:
            total_latent_lst.append(latent_lst)

        add_output(modify_text)

        if it >= args.save_latent_num:
            break

    print("Save log in ", args.output_file)

    if args.save_latent == -1:
        return

    folder = './latent_{}/'.format(args.task)
    if not os.path.exists(folder):
        os.mkdir(folder)

    if args.save_latent == 0:  # full
        prefix = 'full'
    elif args.save_latent == 1:  # first 6 layer
        prefix = 'first_6'
    elif args.save_latent == 2:  # last 6 layer
        prefix = 'last_6'
    elif args.save_latent == 3:  # get second layer
        prefix = 'distill_2'

    total_latent_lst = np.asarray(total_latent_lst)
    if args.eval_negative:
        save_label = 0
    else:
        save_label = 1
    with open(folder + '{}_{}.pkl'.format(prefix, save_label), 'wb') as f:
        pickle.dump(total_latent_lst, f)

    print("Save laten in ", folder + '{}_{}.pkl'.format(prefix, save_label))
Example #21
def get_data(convert):
    dataset = TextDataset(opt.txt, opt.len, convert.text_to_arr)
    return DataLoader(dataset,
                      opt.batch_size,
                      shuffle=True,
                      num_workers=opt.num_workers)
Example #22
if __name__ == '__main__':
    args = get_args()

    model, loss_func = build_fork_model_and_loss_function(args.n_classes)
    model.to(device)
    loss_func.to(device)

    optim = torch.optim.SGD(model.parameters(),
                            lr=0.001,
                            momentum=0.9,
                            weight_decay=5e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=6, gamma=0.1)

    # create dataloader
    dataset = TextDataset(args.root,
                          model_type='ssd-fork',
                          transforms=MyTransform())

    # according to fork paper
    test_size = 927
    # test_size = 20
    indices = list(range(len(dataset)))
    # split data set to training and testing
    train_set = torch.utils.data.Subset(dataset, indices[:-test_size])
    test_set = torch.utils.data.Subset(dataset, indices[-test_size:])

    train_dataloader = DataLoader(train_set,
                                  batch_size=args.batchsize,
                                  collate_fn=partial(my_collate_fn),
                                  shuffle=True,
                                  num_workers=4)
Example #23
model = cuda(model)
optimizer = AdamW(
    optimizer_params(model),
    lr=args.lr,
    weight_decay=args.wd,
    eps=args.eps,
)
criterion = nn.CrossEntropyLoss(ignore_index=-1)
best_loss = float('inf')

train_ds_list = []
valid_ds_list = []
test_ds_list = []

if args.src_p > 0:
    train_ds_list.append(TextDataset('train-labels2.csv', args.src_p))
    valid_ds_list.append(TextDataset(f'valid-labels2.csv', args.src_p))
    test_ds_list.append(TextDataset(f'valid-labels2.csv', args.src_p))

if args.trg_p > 0:
    train_ds_list.append(TextDataset(f'train-labels2.csv', args.trg_p))
    valid_ds_list.append(TextDataset(f'valid-labels2.csv', args.trg_p))
    test_ds_list.append(TextDataset(f'valid-labels2.csv', args.trg_p))

if args.train:
    for epoch in range(1, args.epochs + 1):
        print(args.src_p, args.trg_p)
        print(train_ds_list[0].__len__())
        #print(train_ds_list[0].__getitem__(10))
        train_loss = train(train_ds_list)
        valid_loss = test(valid_ds_list)