def run(settings):
    model = pick_model(settings.model)()
    X_train, X_valid, Y_train, Y_valid = load_dataset(settings.data)
    print('Training the model')
    model.train(X_train, Y_train)
    print('Prediction accuracy on training set: {}'.format(
        model.test(X_train, Y_train)))
    print('Prediction accuracy on validation set: {}'.format(
        model.test(X_valid, Y_valid)))
    save_dir = 'saves/{}'.format(settings.name)
    save_file = '{}/save.model'.format(save_dir)
    os.makedirs(save_dir, exist_ok=True)
    model.save(save_file)
    print('Saved model in {}'.format(save_file))
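# A minimal usage sketch (illustrative, not from the repo): run() only needs an
# object exposing `model`, `data`, and `name` attributes, so a SimpleNamespace
# can stand in for parsed CLI arguments; the values below are hypothetical.
from types import SimpleNamespace

if __name__ == '__main__':
    run(SimpleNamespace(model='logistic', data='data/train.csv', name='baseline'))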
def build(dataset_name: str, format: str, config: Config) -> Iterator[BuildReport]:
    """
    Builds the desired dataset(s) by name in the specified format(s).

    Args:
        dataset_name (str): The name of the dataset (e.g., "police-shootings"),
            or "*" to build every available dataset.
        format (str): The desired output format, e.g., "python" or "json".
        config (Config): The configuration information.

    Yields:
        BuildReport: Information about each built dataset.
    """
    formats = parse_format(format)
    if dataset_name == "*":
        dataset_names = get_all_datasets()
    else:
        dataset_names = [dataset_name]
    for dataset_name in dataset_names:
        dataset = load_dataset(dataset_name)
        for format in formats:
            yield build_dataset(dataset, format, config)
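# A minimal usage sketch (hypothetical Config values; assumes parse_format
# accepts a comma-separated list of formats): because build() is a generator,
# reports arrive as each dataset finishes rather than after all builds complete.
config = Config()
for report in build("*", "json,python", config):
    print(report)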
        # end if
    # end for
    return base_w
# end create_matrices

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset()

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# Last space
last_space = dict()

# Create W matrices
base_w = create_matrices(
    int(args.get_space()['reservoir_size'][-1]),
    float(args.get_space()['w_sparsity'][-1]),
    int(args.get_space()['n_layers'][-1]))

# Iterate
for space in param_space:
    # Params
    # end if
    return esn_wv, esn_c3
# end create_esn_models

####################################################
# Main function
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
sfgram_dataset_wv, sfgram_loader_train_wv, sfgram_loader_test_wv = dataset.load_dataset(
    args.author, 'wv')
sfgram_dataset_c3, sfgram_loader_train_c3, sfgram_loader_test_c3 = dataset.load_dataset(
    args.author, 'c3')

# Print authors
xp.write(u"Author : {}".format(sfgram_dataset_wv.author), log_level=0)
xp.write(u"Texts : {}".format(len(sfgram_dataset_wv.texts)), log_level=0)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
""" return confusion_matrix[0, 0] / (confusion_matrix[0, 0] + confusion_matrix[0, 1]) # end compute_accuracy #################################################### # Main function #################################################### # Parse args args, use_cuda, param_space, xp = argument_parsing.parser_esn_training() # Load from directory sfgram_dataset, sfgram_loader_train, sfgram_loader_test = dataset.load_dataset( args.author, args.transformer[0][0][0]) # Print authors xp.write(u"Author : {}".format(sfgram_dataset.author), log_level=0) xp.write(u"Texts : {}".format(len(sfgram_dataset.texts)), log_level=0) # W index w_index = 0 # Last space last_space = dict() # Iterate for space in param_space: # Params reservoir_size, w_sparsity, leak_rate, input_scaling, \
import torch.utils.data
from torch.autograd import Variable
import echotorch.nn as etnn
import echotorch.utils
from tools import argument_parsing, dataset, functions, features
import matplotlib.pyplot as plt

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size, sentence_level=True)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# First params
w = functions.manage_w(xp, args, args.keep_w)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
import echotorch.utils
from tools import argument_parsing, dataset, functions, features
import matplotlib.pyplot as plt
import torch.nn.functional as F

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size, shuffle=False)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# First params
w = functions.manage_w(xp, args, args.keep_w)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
def train_ccsaa(fold=0,
                ccsaa_epoch=100,
                text_length=20,
                n_gram='c1',
                dataset_size=100,
                dataset_start=0,
                cuda=True,
                save=False,
                save_dir='.'):
    """
    Train CCSAA
    :param fold: Cross-validation fold to use
    :param ccsaa_epoch: Number of epochs without improvement before early stopping
    :param text_length: Length of the character n-gram windows
    :param n_gram: N-gram type ('c1' for single characters, otherwise 2-grams)
    :param dataset_size: Number of documents to load
    :param dataset_start: Index of the first document to load
    :param cuda: Use CUDA?
    :param save: Save (and reload) the trained model?
    :param save_dir: Directory where models are saved
    :return: The trained model and its vocabulary (token to index)
    """
    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)), str(int(dataset_start)))

    # Transforms
    if n_gram == 'c1':
        transform = transforms.Compose([
            transforms.Character(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    else:
        transform = transforms.Compose([
            transforms.Character2Gram(),
            transforms.ToIndex(start_ix=0),
            transforms.ToNGram(n=text_length, overlapse=True),
            transforms.Reshape((-1, text_length))
        ])
    # end if

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.CrossEntropyLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CCSAA(
        text_length=text_length,
        vocab_size=settings.ccsaa_voc_size,
        embedding_dim=settings.ccsaa_embedding_dim,
        n_classes=settings.n_authors)
    if cuda:
        model.cuda()
    # end if

    # Load saved model and vocabulary if they exist
    if save and os.path.exists(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth")) \
            and os.path.exists(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth")):
        model.load_state_dict(
            torch.load(open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"), 'rb')))
        voc = torch.load(open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"), 'rb'))
        return model, voc
    # end if

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=settings.ccsaa_lr,
                          momentum=settings.ccsaa_momentum)

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get training data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # Reshape
            inputs = inputs.view(-1, text_length)

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified words
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0

        # print(u"Epoch {}, train loss {}, test loss {}, accuracy {}".format(
        #     epoch, training_loss / training_total, test_loss / test_total, accuracy))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if
        if fail_count > ccsaa_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir (and parents) if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save model
        torch.save(
            model.state_dict(),
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".pth"), 'wb'))

        # Save vocabulary
        torch.save(
            transform.transforms[1].token_to_ix,
            open(os.path.join(save_path, u"ccsaa." + str(fold) + u".voc.pth"), 'wb'))
    # end if

    return model, transform.transforms[1].token_to_ix
# end train_ccsaa
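# A minimal usage sketch (illustrative values): train the character-level CCSAA
# model on fold 0 and keep the learned vocabulary for encoding later inputs.
model, voc = train_ccsaa(fold=0,
                         ccsaa_epoch=100,
                         text_length=20,
                         n_gram='c1',
                         dataset_size=100,
                         dataset_start=0,
                         cuda=True,
                         save=True,
                         save_dir='models')
print(u"Vocabulary size : {}".format(len(voc)))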
def train_cgfs(fold=0,
               cgfs_epoch=100,
               n_gram='c3',
               dataset_size=100,
               dataset_start=0,
               cuda=True,
               save=False,
               save_dir='.'):
    """
    Train a CGFS selector
    :param fold: Cross-validation fold to use
    :param cgfs_epoch: Number of epochs without improvement before early stopping
    :param n_gram: N-gram type (selects the CGFS output dimension)
    :param dataset_size: Number of documents to load
    :param dataset_start: Index of the first document to load
    :param cuda: Use CUDA?
    :param save: Save (and reload) the trained model?
    :param save_dir: Directory where models are saved
    :return: The trained model and its feature statistics ('mean' and 'std')
    """
    # Global
    global stats_sum, stats_sd

    # Save path
    save_path = os.path.join(save_dir, str(int(dataset_size)), str(int(dataset_start)))

    # Word embedding
    transform = torchlanguage.transforms.Compose([
        torchlanguage.transforms.GloveVector(model='en_vectors_web_lg'),
        torchlanguage.transforms.ToNGram(n=3),
        torchlanguage.transforms.Reshape((-1, 3, settings.glove_embedding_dim))
    ])

    # Load from directory
    reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
        dataset_size=dataset_size, dataset_start=dataset_start)
    reutersc50_dataset.transform = transform

    # Loss function
    loss_function = nn.NLLLoss()

    # Set fold
    reuters_loader_train.dataset.set_fold(fold)
    reuters_loader_test.dataset.set_fold(fold)

    # Model
    model = torchlanguage.models.CGFS(
        n_gram=3,
        n_authors=settings.n_authors,
        n_features=settings.cgfs_output_dim[n_gram])
    if cuda:
        model.cuda()
    # end if

    # FS files (must match the names used when saving below)
    fs_file = os.path.join(save_path, u"cgfs." + str(fold) + u".pth")
    fs_info_file = os.path.join(save_path, u"cgfs.info." + str(fold) + u".pth")

    # Load saved model and statistics if they exist
    if save and os.path.exists(fs_file):
        print(fs_file)
        model.load_state_dict(torch.load(open(fs_file, 'rb')))
        return model, torch.load(open(fs_info_file, 'rb'))
    # end if

    # Best model
    best_acc = 0.0
    best_model = model.state_dict()

    # Optimizer
    optimizer = optim.SGD(model.parameters(),
                          lr=settings.cgfs_lr,
                          momentum=settings.cgfs_momentum)

    # Fail count
    fail_count = 0

    # Epoch
    for epoch in range(10000):
        # Total losses
        training_loss = 0.0
        training_total = 0.0
        test_loss = 0.0
        test_total = 0.0

        # Get training data for this fold
        for i, data in enumerate(reuters_loader_train):
            # Inputs and labels
            inputs, labels, time_labels = data

            # View
            inputs = inputs.view((-1, 1, 3, settings.glove_embedding_dim))

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs = Variable(inputs), Variable(outputs)
            if cuda:
                inputs, outputs = inputs.cuda(), outputs.cuda()
            # end if

            # Zero grad
            model.zero_grad()

            # Compute output
            log_probs = model(inputs)

            # Loss
            loss = loss_function(log_probs, outputs)

            # Backward and step
            loss.backward()
            optimizer.step()

            # Add
            training_loss += loss.data[0]
            training_total += 1.0
        # end for

        # Counters
        total = 0.0
        success = 0.0
        doc_total = 0.0
        doc_success = 0.0

        # Statistics
        stats_sum = 0.0
        stats_sd = 0.0

        # For each test sample
        for i, data in enumerate(reuters_loader_test):
            # Inputs and labels
            inputs, labels, time_labels = data

            # View
            inputs = inputs.view((-1, 1, 3, settings.glove_embedding_dim))

            # Outputs
            outputs = torch.LongTensor(inputs.size(0)).fill_(labels[0])

            # To variable
            inputs, outputs, labels = Variable(inputs), Variable(outputs), Variable(labels)
            if cuda:
                inputs, outputs, labels = inputs.cuda(), outputs.cuda(), labels.cuda()
            # end if

            # Forward
            model_outputs = model(inputs)
            loss = loss_function(model_outputs, outputs)

            # Take the max as predicted
            _, predicted = torch.max(model_outputs.data, 1)

            # Add to correctly classified words
            success += (predicted == outputs.data).sum()
            total += predicted.size(0)

            # Add loss
            test_loss += loss.data[0]
            test_total += 1.0

            # Normalized
            y_predicted = echotorch.utils.max_average_through_time(model_outputs, dim=0)

            # Compare
            if torch.equal(y_predicted, labels):
                doc_success += 1.0
            # end if
            doc_total += 1.0
        # end for

        # Accuracy
        accuracy = success / total * 100.0
        doc_accuracy = doc_success / doc_total * 100.0

        # Print and save loss
        print(u"Epoch {}, training total {}, train loss {}, test total {}, test loss {}, "
              u"accuracy {}, doc accuracy {} (mean {}, std {})".format(
                  epoch, training_total, training_loss, test_total, test_loss,
                  accuracy, doc_accuracy, stats_sum / test_total, stats_sd / test_total))

        # Save if best
        if accuracy > best_acc and epoch > 10:
            best_acc = accuracy
            best_model = model.state_dict()
            fail_count = 0
        elif epoch > 10:
            fail_count += 1
        # end if

        # Fail
        if fail_count > cgfs_epoch:
            break
        # end if
    # end for

    # Load best
    model.load_state_dict(best_model)

    # Save
    if save:
        # Create dir (and parents) if not exists
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        # end if

        # Save model
        torch.save(model.state_dict(), open(fs_file, 'wb'))

        # Save info
        torch.save({'mean': stats_sum, 'std': stats_sd}, open(fs_info_file, 'wb'))
    # end if

    return model, {'mean': stats_sum, 'std': stats_sd}
# end train_cgfs
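# A minimal usage sketch (illustrative values): train the CGFS feature selector
# on fold 0 and reuse its normalization statistics downstream.
model, info = train_cgfs(fold=0,
                         cgfs_epoch=100,
                         n_gram='c3',
                         dataset_size=100,
                         dataset_start=0,
                         cuda=True,
                         save=True,
                         save_dir='models')
print(u"Feature mean : {}, std : {}".format(info['mean'], info['std']))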
args.add_argument(command="--name",
                  name="name",
                  type=str,
                  help="Experiment's name",
                  extended=False,
                  required=True)
args.add_argument(command="--description",
                  name="description",
                  type=str,
                  help="Experiment's description",
                  extended=False,
                  required=True)
args.add_argument(command="--output",
                  name="output",
                  type=str,
                  help="Experiment's output directory",
                  required=True,
                  extended=False)
args.add_argument(command="--n-samples",
                  name="n_samples",
                  type=int,
                  help="Number of different reservoirs to test",
                  default=1,
                  extended=False)
args.add_argument(command="--verbose",
                  name="verbose",
                  type=int,
                  help="Verbose level",
                  default=2,
                  extended=False)

# Parse arguments
args.parse()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size, sentence_level=False, n_authors=args.n_authors)

# Dataset start
reutersc50_dataset.set_start(0)

# Experiment
xp = nsNLP.tools.ResultManager(
    args.output,
    args.name,
    args.description,
    args.get_space(),
    1,
    args.k,
import numpy as np
import torch.utils.data
from torch.autograd import Variable
import echotorch.nn as etnn
import echotorch.utils
from tools import argument_parsing, dataset, functions, features, cenc_selector, settings

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_test = dataset.load_dataset(
    args.dataset_size)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# First params
w = functions.manage_w(xp, args, args.keep_w)

# W index
w_index = 0

# Last space
last_space = dict()

# Iterate
for space in param_space:
import torch

####################################################
# Main
####################################################

# Fix random seeds for reproducibility
np.random.seed(1)
tf.set_random_seed(1)
torch.manual_seed(1)

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_training()

# Load from directory (swap the dev and test loaders if requested)
if args.inverse_dev_test:
    reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
        args.dataset_size, k=args.k, n_authors=args.n_authors)
else:
    reutersc50_dataset, reuters_loader_train, reuters_loader_test, reuters_loader_dev = dataset.load_dataset(
        args.dataset_size, k=args.k, n_authors=args.n_authors)
# end if

# Disable CUDA
if not args.cuda:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# end if

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# Last space
last_space = dict()
import generators as G
import math
from tools import load_glove_embeddings as gle
import os

####################################################
# Main
####################################################

os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_training()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
    dataset_size=args.dataset_size, n_authors=args.n_authors, k=10)

# Print authors
xp.write(u"Authors : {}".format(reutersc50_dataset.authors), log_level=0)

# Last space
last_space = dict()

# Iterate
for space in param_space:
    # Params
    hidden_size, cell_size, feature, lang, dataset_start, window_size, learning_window, \
        embedding_size, rnn_type, num_layers, dropout, output_dropout = functions.get_params(space)

    # Load GloVe
    word2index, embedding_matrix = gle.load_glove_embeddings(
    # end if
    return esn_c3
# end create_esn_models

####################################################
# Main function
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_esn_training()

# Load from directory
sfgram_dataset_wv, sfgram_loader_train_wv, sfgram_loader_dev_wv, sfgram_loader_test_wv = dataset.load_dataset(
    args.author, 'wv', remove_authors=args.remove_authors)
sfgram_dataset_c3, sfgram_loader_train_c3, sfgram_loader_dev_c3, sfgram_loader_test_c3 = dataset.load_dataset(
    args.author, 'c3', remove_authors=args.remove_authors)

# Print authors
xp.write(u"Author : {}".format(sfgram_dataset_wv.author), log_level=0)
xp.write(u"Texts : {}".format(len(sfgram_dataset_wv.texts)), log_level=0)

# W index
w_index = 0

# Last space
last_space = dict()

# Threshold
n_threshold = 200
import argparse
import torchlanguage.transforms
import codecs

####################################################
# Main
####################################################

# Parse args
parser = argparse.ArgumentParser()
parser.add_argument("--datadir", type=str)
parser.add_argument("--k", type=int, default=10)
args = parser.parse_args()

# Load from directory
reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
    k=args.k, n_authors=15)

# For each fold
for k in range(args.k):
    # Fold paths
    fold_dir = os.path.join(args.datadir, u"k{}".format(k))
    fold_train_dir = os.path.join(fold_dir, u"train")
    fold_test_dir = os.path.join(fold_dir, u"test")

    # Create directories
    try:
        os.mkdir(fold_dir)
        os.mkdir(fold_train_dir)
        os.mkdir(fold_test_dir)
    except OSError:
        pass
                  type=str,
                  help="Experiment's output directory",
                  required=True,
                  extended=False)
args.add_argument(command="--verbose",
                  name="verbose",
                  type=int,
                  help="Verbose level",
                  default=2,
                  extended=False)

# Parse arguments
args.parse()

# Load from directory
sfgram_dataset, sfgram_loader_train, sfgram_loader_test = dataset.load_dataset(
    args.author, '')

# Experiment
xp = nsNLP.tools.ResultManager(
    args.output,
    args.name,
    args.description,
    args.get_space(),
    1,
    args.k,
    verbose=args.verbose
)

# Average
average_k_fold = np.array([])
""" return confusion_matrix[0, 0] / (confusion_matrix[0, 0] + confusion_matrix[0, 1]) # end compute_accuracy #################################################### # Main function #################################################### # Parse args args, use_cuda, param_space, xp = argument_parsing.parser_esn_training() # Load from directory sfgram_dataset, sfgram_loader_train, sfgram_loader_dev, sfgram_loader_test = dataset.load_dataset( args.author, args.transformer[0][0][0], remove_authors=args.remove_authors) # Print authors xp.write(u"Author : {}".format(sfgram_dataset.author), log_level=0) xp.write(u"Texts : {}".format(len(sfgram_dataset.texts)), log_level=0) # W index w_index = 0 # Last space last_space = dict() # Iterate for space in param_space: # Params reservoir_size, w_sparsity, leak_rate, input_scaling, \
import os
import matplotlib.pyplot as plt

####################################################
# Main
####################################################

# Parse args
args, use_cuda, param_space, xp = argument_parsing.parser_training()

# Load from directory (swap the dev and test loaders if requested)
if args.inverse_dev_test:
    reutersc50_dataset, reuters_loader_train, reuters_loader_dev, reuters_loader_test = dataset.load_dataset(
        args.dataset_size,
        k=args.k,
        n_authors=args.n_authors,
        features=args.precomputed_features)
else:
    reutersc50_dataset, reuters_loader_train, reuters_loader_test, reuters_loader_dev = dataset.load_dataset(
        args.dataset_size,
        k=args.k,
        n_authors=args.n_authors,
        features=args.precomputed_features)
# end if

# Disable CUDA
if not args.cuda:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
# end if