def text_transformer(n_gram, window_size):
    """
    Build the text transformation pipeline for SCD models.

    :param n_gram: 'c1' selects single-character tokens, anything else character 2-grams
    :param window_size: size of the overlapping n-gram windows the index sequence is cut into
    :return: the composed transformer
    """
    # The two branches differ only in the tokenizer step.
    if n_gram == 'c1':
        tokenizer = ltransforms.Character()
    else:
        tokenizer = ltransforms.Character2Gram()
    # end if
    return transforms.Compose([
        ltransforms.ToLower(),
        tokenizer,
        ltransforms.ToIndex(start_ix=0),
        ltransforms.ToNGram(n=window_size, overlapse=True),
        ltransforms.Reshape((-1, window_size)),
        # Clamp indices to the vocabulary size configured for this n-gram setting.
        ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram] - 1)
    ])
def tweet_transformer(lang, n_gram, voc=None):
    """
    Build the tweet preprocessing pipeline.

    :param lang: language key into settings.voc_sizes[n_gram]
    :param n_gram: 'c1' selects single-character tokens, anything else character 2-grams
    :param voc: optional pre-built token-to-index vocabulary; a fresh dict is used when None
    :return: the composed transformer
    """
    token_to_ix = dict() if voc is None else voc
    # Tokenizer is the only difference between the two n-gram settings.
    if n_gram == 'c1':
        tokenizer = ltransforms.Character()
    else:
        tokenizer = ltransforms.Character2Gram()
    # end if
    return transforms.Compose([
        # Strip URLs before any tokenization.
        ltransforms.RemoveRegex(
            regex=r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
        ltransforms.ToLower(),
        tokenizer,
        ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
        ltransforms.ToLength(length=settings.min_length),
        # Clamp indices to the per-language vocabulary size.
        ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram][lang] - 1)
    ])
def text_transformer_cnn(window_size, n_gram, token_to_ix):
    """
    Build the text transformation pipeline for CNNSCD.

    :param window_size: length the indexed sequence is padded/cut to
    :param n_gram: 'c1' selects single-character tokens, anything else character 2-grams
    :param token_to_ix: pre-built token-to-index vocabulary
    :return: the composed transformer
    """
    if n_gram == 'c1':
        tokenizer = ltransforms.Character()
    else:
        tokenizer = ltransforms.Character2Gram()
    # end if
    # NOTE(review): Reshape((-1)) passes the int -1, not the tuple (-1,), and
    # max_id here has no "- 1" unlike the sibling transformers — both preserved
    # as-is; confirm against the ltransforms API whether these are intended.
    return ltransforms.Compose([
        ltransforms.ToLower(),
        tokenizer,
        ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
        ltransforms.ToLength(length=window_size),
        ltransforms.Reshape((-1)),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
    ])
# end if # Load model and voc model.load_state_dict(torch.load(open(args.model, 'rb'))) if args.cuda: model.cuda() # end if voc = torch.load(open(args.voc, 'rb')) # Eval model.eval() if args.n_gram == 'c1': transforms = ltransforms.Compose([ ltransforms.ToLower(), ltransforms.Character(), ltransforms.ToIndex(start_ix=1, token_to_ix=voc), ltransforms.ToLength(length=window_size), ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram]) ]) else: transforms = ltransforms.Compose([ ltransforms.ToLower(), ltransforms.Character2Gram(), ltransforms.ToIndex(start_ix=1, token_to_ix=voc), ltransforms.ToLength(length=window_size), ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram]) ]) # end if # Validation losses
def train_ccsaa(fold=0, ccsaa_epoch=100, text_length=20, n_gram='c1', dataset_size=100, dataset_start=0, cuda=True,
                save=False, save_dir='.'):
    """
    Train CCSAA and return the trained model with its vocabulary.

    :param fold: cross-validation fold to train/test on
    :param ccsaa_epoch: patience — stop after this many epochs without test-accuracy improvement
    :param text_length: length of the character windows fed to the model
    :param n_gram: 'c1' selects single-character tokens, anything else character 2-grams
    :param dataset_size: size of the dataset slice to load
    :param dataset_start: start offset of the dataset slice
    :param cuda: run on GPU when True
    :param save: when True, reload a previously saved model/voc if present, and save the best model at the end
    :param save_dir: root directory for saved models
    :return: (trained model, token-to-index vocabulary)
    """
    # Save path — note this is a *nested* directory: save_dir/<size>/<start>
    save_path = os.path.join(save_dir, str(int(dataset_size)), str(int(dataset_start)))

    # Transforms
    if n_gram == 'c1':
        transform = transforms.Compose([
            transforms.Character(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    else:
        transform = transforms.Compose([
            transforms.Character2Gram(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    # end if

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors)
    if cuda:
        model.cuda()
    # end if

    # Short-circuit: reload a previously trained model + vocabulary if both files exist
    if save and os.path.exists(
            os.path.join(save_path, u"ccsaa." + str(fold) + u".pth")) and os.path.exists(
            os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth")):
        model.load_state_dict(
            torch.load(
                open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"), 'rb')))
        voc = torch.load(
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"), 'rb'))
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=settings.ccsaa_lr, momentum=settings.ccsaa_momentum)

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Fail count (epochs without improvement)
    fail_count = 0

    # Epoch — effectively unbounded; early-stopping below is the real exit
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Training pass for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape into windows of text_length characters
            inputs = inputs.view(-1, text_length)

            # One target (the author label) per window
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # Test pass
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified word
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best (the first 10 epochs are a warm-up and never count)
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if

        # Early stopping once patience is exhausted
        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir if not exists
        if not os.path.exists(save_path):
            # BUGFIX: save_path is nested (save_dir/<size>/<start>); os.mkdir
            # raised FileNotFoundError when the parent directory was missing.
            os.makedirs(save_path)
        # end if

        # Save model
        torch.save(
            model.state_dict(),
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"), 'wb'))

        # Save voc (transforms[1] is the ToIndex step in both pipeline variants)
        torch.save(
            transform.transforms[1].token_to_ix,
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"), 'wb'))
    # end if

    return model, transform.transforms[1].token_to_ix