Example #1
def train(data_fpath, index_fpath):
    dataset = TextData(data_fpath, index_fpath)
    dataloader = DataLoader(
        dataset,
        batch_size=16,
        shuffle=True,
        num_workers=4,
        collate_fn=collate_text
    )

    criterion = nn.BCEWithLogitsLoss()

    print('number of tokens {}'.format(dataset.ntoks))

    m = TextClassifier(dataset.ntoks)
    optimizer = optim.Adam(
        m.parameters(),
        lr=0.0001
    )

    for eidx in range(10):
        for bidx, batch in enumerate(dataloader):
            optimizer.zero_grad()
            #print(bidx, batch['transcriptions_one_hot'])

            out = m(batch['transcriptions_one_hot'])
            loss = criterion(out, batch['labels'])
            print('epoch:  {}\tstep: {}\tloss: {:.4f}'.format(eidx, bidx, loss.item()))
            print('pred', (torch.sigmoid(out).detach() > 0.5).numpy().astype(int))
            print('true', batch['labels'].numpy().astype(int))
            loss.backward()
            optimizer.step()
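Neither collate_text nor the structure of each dataset item is shown in this example. A minimal sketch of such a collate function, under the assumption that each item is a dict holding a (seq_len, ntoks) one-hot tensor and a label tensor, might look like this:

import torch
from torch.nn.utils.rnn import pad_sequence

def collate_text(samples):
    # Pad variable-length one-hot sequences to the batch maximum and stack the labels.
    seqs = [s['transcriptions_one_hot'] for s in samples]    # each (seq_len, ntoks)
    labels = torch.stack([s['labels'] for s in samples])     # (batch_size, n_labels)
    padded = pad_sequence(seqs, batch_first=True)            # (batch_size, max_len, ntoks)
    return {'transcriptions_one_hot': padded, 'labels': labels}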
Example #2
def main(args):
    data = TextData(args.source)
    if args.checkpoint:
        rnn = load_crnn(args.checkpoint)
    else:
        rnn = CharRNN(in_out_size=data.num_classes, state_size=args.state)
    opt = Adagrad(rnn, 0.1, stateful=True, clip=5)

    setup_plot()

    sequence_pairs = list(data.get_seqs(25))
    print('Training on {}:\n'
          '- {} total chars\n'
          '- {} unique chars\n'
          '- {} sequences of length 25'.format(args.source, data.tot_chars,
                                               data.num_classes,
                                               len(sequence_pairs)))
    opt.train(sequence_pairs,
              epochs=40,
              callback=partial(callback, data=data, start=time.time()),
              callback_every=4321,
              epoch_callback=epoch_callback)

    plt.savefig('plots/{}.png'.format(basename(args.source)))
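The command-line parser that produces args is not part of this snippet. A hypothetical parser exposing the three attributes the function reads (source, checkpoint, state) could be wired up like this:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('source', help='path to the training text file')
    parser.add_argument('--checkpoint', default=None, help='resume from a saved CharRNN checkpoint')
    parser.add_argument('--state', type=int, default=100, help='hidden state size of the CharRNN')
    main(parser.parse_args())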
Example #3
'''
TextCNN news text classification
'''

import numpy as np
from data import TextData
from public import TCNNConfig, plt_model
from keras import Input
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.models import Sequential, load_model, Model
from keras.layers import Dense, Dropout
from keras.layers import Embedding, Conv1D, MaxPool1D, concatenate, Flatten
from keras.optimizers import Adam

# Read in the training data
td = TextData()
(x_data, y_data, z_data), (x_labels, y_labels, z_labels) = td.load_idata()
word2id = td.word2id
cat2id = td.cat2id
num_classes = len(set(x_labels))
vocab_size = len(word2id)

# Pad every text to the same fixed length
x_data = pad_sequences(x_data, TCNNConfig.seq_length)
x_labels = to_categorical(x_labels, num_classes=num_classes)

y_data = pad_sequences(y_data, TCNNConfig.seq_length)
y_labels = to_categorical(y_labels, num_classes=num_classes)


def text_cnn(CNNConfig):
Example #4
    def setUp(cls):
        cls.goblet = TextData('data/goblet_book.txt')
        cls.rnn = CharRNN(in_out_size=cls.goblet.num_classes, state_size=100)
Example #5
	def train(self, train, test, num_epochs, path):

		## Putting data into loaders
		train_loader = DataLoader(
			TextData(train),
			batch_size=self.params['batch_size'],
			num_workers=1,
			shuffle=True)

		test_loader = DataLoader(
			TextData(test),
			batch_size=self.params['batch_size'],
			num_workers=1,
			shuffle=True)

		## Begin training
		for epoch in range(num_epochs):
			## Train cycle
			self.model.train()
			train_loss = 0
			train_batches = 0

			for X, y in tqdm.tqdm(train_loader):
				X, y = X.cuda(), y.cuda()
				self.model.opt.zero_grad()

				## Forward
				logits = self.model(X)
				loss = self.criterion(logits, y)

				## Backward
				loss.backward()
				self.model.opt.step()

				## Measure stats
				train_loss += loss.item()
				train_batches += 1

				## Log train loss for each batch so we get a good graph
				wandb.log({'train_loss':loss.item()})

			## Testing cycle
			self.model.eval()
			test_loss = 0
			test_acc = 0
			test_batches = 0

			for X, y in tqdm.tqdm(test_loader):
				X, y = X.cuda(), y.cuda()

				## Forward
				with torch.no_grad():
					logits = self.model(X)
					loss = self.criterion(logits, y)

				## Measure stats
				test_loss += loss.item()
				probs = torch.softmax(logits, dim=-1)
				test_acc += self.accuracy(probs, y)
				test_batches += 1

			## Report statistics
			avg_train_loss = train_loss / train_batches
			avg_test_loss = test_loss / test_batches
			avg_test_acc = test_acc / test_batches

			print(f'Train loss: {avg_train_loss}, Test loss: {avg_test_loss}, Test acc: {avg_test_acc}')

			wandb.log({
				'test_loss':avg_test_loss,
				'test_acc':avg_test_acc})

			## Save model each epoch
			torch.save(self.model.state_dict(), path / f'model{epoch}.pt')

		## Save model parameters at the end
		dictToJson(self.params, path / 'model_params.json')
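The two project-specific helpers the trainer relies on, self.accuracy and dictToJson, are defined elsewhere in that codebase. Hypothetical equivalents consistent with how they are called above:

import json

def accuracy(probs, y):
    # Fraction of samples whose argmax prediction matches the integer label.
    preds = probs.argmax(dim=-1)
    return (preds == y).float().mean().item()

def dictToJson(d, path):
    # Dump a plain dict of parameters to a JSON file.
    with open(path, 'w') as f:
        json.dump(d, f, indent=2)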