def text_transformer_cnn(window_size, n_gram, token_to_ix):
    """
    Get text transformer for CNNSCD
    :param window_size:
    :param n_gram:
    :param token_to_ix:
    :return:
    """
    if n_gram == 'c1':
        return ltransforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=window_size),
            ltransforms.Reshape((-1)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
        ])
    else:
        return ltransforms.Compose([
            ltransforms.ToLower(),
            ltransforms.Character2Gram(),
            ltransforms.ToIndex(start_ix=1, token_to_ix=token_to_ix),
            ltransforms.ToLength(length=window_size),
            ltransforms.Reshape((-1)),
            ltransforms.MaxIndex(max_id=settings.voc_sizes[n_gram])
        ])
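
# Usage sketch (illustrative, not part of the original module): torchlanguage
# transforms are callables applied to raw text, as in the dataset usage in the
# other scripts. The window size and the pre-built `voc` mapping below are
# assumptions for the example, not values from the original experiments.
#
#   voc = {u'a': 1, u'b': 2}  # pre-built token-to-index mapping (assumption)
#   transformer = text_transformer_cnn(window_size=740, n_gram='c1', token_to_ix=voc)
#   indices = transformer(u"An example sentence to index.")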
    args.n_samples,
    args.k,
    verbose=args.verbose
)

# CNN Glove Feature Selector
cgfs = models.cgfs(pretrained=True, n_gram=2, n_features=60)

# Remove last linear layer
cgfs.linear2 = echotorch.nn.Identity()

# Transformer
transformer = transforms.Compose([
    transforms.GloveVector(),
    transforms.ToNGram(n=2, overlapse=True),
    transforms.Reshape((-1, 1, 2, 300)),
    transforms.FeatureSelector(cgfs, 60, to_variable=True),
    transforms.Reshape((1, -1, 60)),
    transforms.Normalize(mean=-4.56512329954, std=0.911449706065)
])

# Reuters C50 dataset
reutersloader = torch.utils.data.DataLoader(
    datasets.ReutersC50Dataset(root=args.dataset, download=True, n_authors=args.n_authors, transform=transformer),
    batch_size=1,
    shuffle=False
)

# Print authors
xp.write(u"Authors : {}".format(reutersloader.dataset.authors), log_level=0)
# Load model and voc
model.load_state_dict(torch.load(open(args.model, 'rb')))
if args.cuda:
    model.cuda()
# end if
voc = torch.load(open(args.voc, 'rb'))

# Eval
model.eval()

if args.n_gram == 'c1':
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character(),
        ltransforms.ToIndex(start_ix=1, token_to_ix=voc),
        ltransforms.ToLength(length=window_size),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
    ])
else:
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character2Gram(),
        ltransforms.ToIndex(start_ix=1, token_to_ix=voc),
        ltransforms.ToLength(length=window_size),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram])
    ])
# end if

# Validation losses
validation_total = 0
def train_ccsaa(fold=0, ccsaa_epoch=100, text_length=20, n_gram='c1', dataset_size=100, dataset_start=0, cuda=True,
                save=False, save_dir='.'):
    """
    Train CCSAA
    :param fold:
    :param ccsaa_epoch:
    :param text_length:
    :param n_gram:
    :param dataset_size:
    :param dataset_start:
    :param cuda:
    :param save:
    :param save_dir:
    :return:
    """
    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)), str(int(dataset_start)))

    # Transforms
    if n_gram == 'c1':
        transform = transforms.Compose([
            transforms.Character(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    else:
        transform = transforms.Compose([
            transforms.Character2Gram(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    # end if

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size,
        dataset_start=dataset_start
    )
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors
    )
    if cuda:
        model.cuda()
    # end if

    # Load saved model and vocabulary if they exist
    if save and os.path.exists(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth")) \
            and os.path.exists(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth")):
        model.load_state_dict(torch.load(open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"), 'rb')))
        voc = torch.load(open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"), 'rb'))
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(), lr=settings.ccsaa_lr, momentum=settings.ccsaa_momentum)

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get training data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs: one author label per character window
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs: one author label per character window
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified windows
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0

        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if
        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir (and intermediates) if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save model
        torch.save(model.state_dict(), open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"), 'wb'))

        # Save voc
        torch.save(
            transform.transforms[1].token_to_ix,
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"), 'wb')
        )
    # end if

    return model, transform.transforms[1].token_to_ix
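
# Usage sketch (illustrative): training one fold and saving the model and
# vocabulary. All argument values below are example assumptions, not values
# taken from the original experiments.
#
#   model, voc = train_ccsaa(fold=0, ccsaa_epoch=40, text_length=20,
#                            n_gram='c1', dataset_size=100, dataset_start=0,
#                            cuda=True, save=True, save_dir='./outputs')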
input_sparsity = 0.1
w_sparsity = 0.1
input_scaling = 0.5
n_test = 20
n_samples = 5000
leaky_rate = 0.1
reservoir_size = 400

# Argument
args = tools.functions.argument_parser_training_model()

# Transformer
if args.lang == 'en' or args.lang == 'fr':
    transformer = transforms.Compose([
        # transforms.RemoveLines(),
        transforms.GloveVector(model=tools.settings.lang_models[socket.gethostname()][args.lang])
    ])
else:
    transformer = transforms.Compose([
        # transforms.RemoveLines(),
        transforms.Token(model=tools.settings.lang_spacy_models[args.lang], lang=tools.settings.lang_models_lang[args.lang]),
        transforms.GensimModel(model_path=tools.settings.lang_models[socket.gethostname()][args.lang])
    ])
# end if

# Samples average
samples_average = np.array([])
best_acc = 0
# Experiment settings
spectral_radius = 0.95
input_sparsity = 0.1
w_sparsity = 0.1
input_scaling = 0.5
n_test = 20
n_samples = 30
leaky_rate = 0.1

# Argument
args = tools.functions.argument_parser_training_model()

# Transformer
transformer = transforms.Compose([
    transforms.RemoveLines(),
    transforms.GloveVector(model=tools.settings.lang_models[args.lang])
])

# Results
parameter_averages = np.zeros(n_test)
parameter_max = np.zeros(n_test)

# For each reservoir size value
index = 0
for rc_reservoir_size in np.linspace(200, 1000, n_test):
    # Round
    reservoir_size = int(math.floor(rc_reservoir_size))

    # Log
    print(u"Reservoir size : {}".format(reservoir_size))
parser.add_argument("--text-length", type=int, help="Text length", default=20) parser.add_argument("--batch-size", type=int, help="Batch-size", default=64) parser.add_argument("--no-cuda", action='store_true', default=False, help="Enables CUDA training") args = parser.parse_args() # Use CUDA? args.cuda = not args.no_cuda and torch.cuda.is_available() # Transforms if args.n_gram == 'c1': transform = transforms.Compose([ transforms.Character(), transforms.ToIndex(start_ix=0), transforms.ToNGram(n=args.text_length, overlapse=True), transforms.Reshape((-1, args.text_length)) ]) else: transform = transforms.Compose([ transforms.Character2Gram(), transforms.ToIndex(start_ix=0), transforms.ToNGram(n=args.text_length, overlapse=True), transforms.Reshape((-1, args.text_length)) ]) # end if # Dataset dataset = datasets.ReutersC50Dataset(download=True, n_authors=15, transform=transform)
# Eval. dataset
reutersloader_val = torch.utils.data.DataLoader(
    torchlanguage.utils.CrossValidation(reuters_dataset, k=10, train=False),
    batch_size=1,
    shuffle=False
)

# 10-CV
for k in np.arange(args.fold, 10):
    # Model
    if model_type == 'linear':
        # Transformer
        transformer = transforms.Compose([
            transforms.GloveVector(),
            transforms.ToNGram(n=n_gram, overlapse=True),
            transforms.Reshape((-1, n_gram * 300))
        ])

        # Set transformer
        reuters_dataset.transform = transformer

        # Ridge regression model
        model = etnn.RRCell(n_gram * 300, n_authors)
    elif model_type == 'cgfs':
        # CNN Glove Feature Selector
        cgfs, transformer = cgfs_selector.load_cgfs(fold=k)

        # Set transformer
        reuters_dataset.transform = transformer
input_sparsity = 0.1
w_sparsity = 0.1
input_scaling = 0.5
n_test = 10
n_samples = 2
n_epoch = 100
text_length = 20

# Argument
args = tools.functions.argument_parser_training_model()

# Transforms
transform = transforms.Compose([
    transforms.Character(),
    transforms.ToIndex(start_ix=0),
    transforms.MaxIndex(max_id=83),
    transforms.ToNGram(n=text_length, overlapse=True),
    transforms.Reshape((-1, text_length))
])

# Author identification training dataset
dataset_train = dataset.AuthorIdentificationDataset(root="./data/", download=True, transform=transform, problem=1, lang='en')

# Author identification test dataset
dataset_valid = dataset.AuthorIdentificationDataset(root="./data/", download=True, transform=transform, problem=1, train=False, lang='en')

# Cross validation
dataloader_train = torch.utils.data.DataLoader(torchlanguage.utils.CrossValidation(dataset_train), batch_size=1, shuffle=True)
dataloader_valid = torch.utils.data.DataLoader(torchlanguage.utils.CrossValidation(dataset_valid, train=False), batch_size=1, shuffle=True)

# Author to idx
threshold_list = np.linspace(-1.0, 2.0, n_thresholds)

# Settings
# training_samples = 110000
# test_samples = 11000
training_samples = 10000
cnn_window_size = 740

# Argument parser
args = functions.argument_parser_training_model()

if args.n_gram == 'c1':
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character(),
        ltransforms.ToIndex(start_ix=0),
        ltransforms.Reshape((-1)),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram] - 1)
    ])
else:
    transforms = ltransforms.Compose([
        ltransforms.ToLower(),
        ltransforms.Character2Gram(),
        ltransforms.ToIndex(start_ix=0),
        ltransforms.Reshape((-1)),
        ltransforms.MaxIndex(max_id=settings.voc_sizes[args.n_gram] - 1)
    ])
# end if

# Style change detection dataset, training set
pan18loader_train = torch.utils.data.DataLoader(dataset.SCDPartsDataset(
# Argument
args = tools.functions.argument_parser_training_model()

# CNN Glove Feature Selector
cgfs = models.cgfs(pretrained=True, n_gram=2, n_features=60)

# Remove last linear layer
cgfs.linear2 = echotorch.nn.Identity()

# Transformer
transformer = transforms.Compose([
    transforms.RemoveLines(),
    transforms.GloveVector(model=tools.settings.lang_models[args.lang]),
    transforms.ToNGram(n=2, overlapse=True),
    transforms.Reshape((-1, 1, 2, 300)),
    transforms.FeatureSelector(cgfs, 60, to_variable=True),
    transforms.Reshape((1, -1, 60)),
    # transforms.Statistics()
    # transforms.Normalize(mean=-4.56512329954, std=0.911449706065)
    # transforms.NormalizeDim(mean=True, std=False, dim=2)
])

# For each problem
for problem in np.arange(1, 3):
    # Author identification training dataset
    pan18loader_training = torch.utils.data.DataLoader(
        dataset.AuthorIdentificationDataset(root="./data/", download=True, transform=transformer, problem=problem, lang=args.lang),
        batch_size=1,
        shuffle=True
    )

    # Author identification test dataset
    n_classes=15
)
feature_selector.load_state_dict(torch.load(open(args.feature_selector, 'rb')))
feature_selector.linear = etnn.Identity()
if args.cuda:
    feature_selector.cuda()
# end if
feature_selector_voc = torch.load(open(args.feature_selector_voc, 'rb'))

# Transforms
transform = transforms.Compose([
    transforms.Character(),
    transforms.ToIndex(token_to_ix=feature_selector_voc),
    transforms.MaxIndex(max_id=83),
    transforms.ToNGram(n=20, overlapse=True),
    transforms.Reshape((-1, 20)),
    transforms.ToCUDA(),
    transforms.FeatureSelector(model=feature_selector, n_features=150, to_variable=True),
    transforms.ToCPU(),
    transforms.Normalize(mean=-5.08, std=0.3294)
])

# Results
parameter_averages = np.zeros(n_test)
parameter_max = np.zeros(n_test)

# For each leaky rate value
index = 0
for leaky_rate in np.linspace(0.6, 1.0, n_test):
    # Log