def eval_on_dev():
    """Split train into train and dev and fit.

    Holds out 20% of the training data as a dev set, trains with
    inverse-frequency class weights, and reports the best ROC AUC.

    Returns:
        Number of epochs at which dev ROC AUC peaked.
    """
    model, config = get_model()

    # read the training set as integer token sequences
    provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')),
        cfg.get('data', 'tokenizer_pickle'))
    xs, ys = provider.load_as_int_seqs()

    # carve off 20% for validation (fixed seed for reproducibility)
    xs_tr, xs_dev, ys_tr, ys_dev = train_test_split(
        xs, ys, test_size=0.20, random_state=2020)

    # bag-of-words matrices of shape (n_samples, vocab_size)
    xs_tr = utils.sequences_to_matrix(xs_tr, config['input_vocab_size'])
    xs_dev = utils.sequences_to_matrix(xs_dev, config['input_vocab_size'])

    train_loader = make_data_loader(
        xs_tr, torch.tensor(ys_tr), cfg.getint('model', 'batch'), 'train')
    val_loader = make_data_loader(
        xs_dev, torch.tensor(ys_dev), cfg.getint('model', 'batch'), 'dev')

    # class weights: n / (2 * per-class count) to counter label imbalance
    counts = torch.bincount(torch.tensor(ys_tr))
    weights = len(ys_tr) / (2.0 * counts)
    print('class weights:', weights)

    best_roc_auc, optimal_epochs = fit(
        model, train_loader, val_loader, weights,
        cfg.getint('model', 'epochs'))
    print('best roc %.4f after %d epochs\n' % (best_roc_auc, optimal_epochs))

    return optimal_epochs
def eval_on_test(n_epochs):
    """Train on training set and evaluate on test.

    Args:
        n_epochs: how many epochs to train for (typically the optimum
            found by eval_on_dev).
    """
    model, config = get_model()

    tokenizer_pickle = cfg.get('data', 'tokenizer_pickle')

    # train and test splits as integer token sequences
    train_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'train')), tokenizer_pickle)
    test_provider = DatasetProvider(
        os.path.join(base, cfg.get('data', 'test')), tokenizer_pickle)
    xs_tr, ys_tr = train_provider.load_as_int_seqs()
    xs_te, ys_te = test_provider.load_as_int_seqs()

    # bag-of-words matrices of shape (n_samples, vocab_size)
    xs_tr = utils.sequences_to_matrix(xs_tr, config['input_vocab_size'])
    xs_te = utils.sequences_to_matrix(xs_te, config['input_vocab_size'])

    train_loader = make_data_loader(
        xs_tr, torch.tensor(ys_tr), cfg.getint('model', 'batch'), 'train')
    test_loader = make_data_loader(
        xs_te, torch.tensor(ys_te), cfg.getint('model', 'batch'), 'dev')

    # class weights: n / (2 * per-class count) to counter label imbalance
    counts = torch.bincount(torch.tensor(ys_tr))
    weights = len(ys_tr) / (2.0 * counts)

    fit(model, train_loader, test_loader, weights, n_epochs)
def main():
    """My main main"""
    dp = data.DatasetProvider(
        os.path.join(base, cfg.get('data', 'cuis')),
        os.path.join(base, cfg.get('data', 'codes')),
        cfg.get('args', 'cui_vocab_size'),
        cfg.get('args', 'code_vocab_size'))

    in_seqs, out_seqs = dp.load_as_sequences()

    # 80/20 split with a fixed seed for reproducibility
    tr_in_seqs, val_in_seqs, tr_out_seqs, val_out_seqs = train_test_split(
        in_seqs, out_seqs, test_size=0.20, random_state=2020)
    print('loaded %d training and %d validation samples' % \
        (len(tr_in_seqs), len(val_in_seqs)))

    # report the longest input and output sequences in the training split
    max_cui_seq_len = max(len(s) for s in tr_in_seqs)
    max_code_seq_len = max(len(s) for s in tr_out_seqs)
    print('longest cui sequence:', max_cui_seq_len)
    print('longest code sequence:', max_code_seq_len)

    # inputs are padded to a fixed length; outputs become multi-hot vectors
    max_len = cfg.getint('args', 'max_len')
    out_dim = len(dp.output_tokenizer.stoi)
    batch_size = cfg.getint('model', 'batch')
    train_loader = make_data_loader(
        utils.pad_sequences(tr_in_seqs, max_len=max_len),
        utils.sequences_to_matrix(tr_out_seqs, out_dim),
        batch_size, 'train')
    val_loader = make_data_loader(
        utils.pad_sequences(val_in_seqs, max_len=max_len),
        utils.sequences_to_matrix(val_out_seqs, out_dim),
        batch_size, 'dev')

    model = TransformerEncoder(
        input_vocab_size=len(dp.input_tokenizer.stoi),
        output_vocab_size=out_dim,
        d_model=cfg.getint('model', 'd_model'),
        d_inner=cfg.getint('model', 'd_inner'),
        n_layers=cfg.getint('model', 'n_layers'),
        n_head=cfg.getint('model', 'n_head'),
        d_k=cfg.getint('model', 'd_k'),
        d_v=cfg.getint('model', 'd_v'),
        dropout=cfg.getfloat('model', 'dropout'),
        max_len=max_len)

    best_loss, optimal_epochs = fit(
        model, train_loader, val_loader, cfg.getint('model', 'epochs'))
    print('best loss %.4f after %d epochs' % (best_loss, optimal_epochs))
def main():
    """My main main"""
    dp = data.DatasetProvider(
        os.path.join(base, cfg.get('data', 'cuis')),
        os.path.join(base, cfg.get('data', 'codes')),
        cfg.get('args', 'cui_vocab_size'),
        cfg.get('args', 'code_vocab_size'))

    in_seqs, out_seqs = dp.load_as_sequences()

    # 90/10 split with a fixed seed for reproducibility
    tr_in_seqs, val_in_seqs, tr_out_seqs, val_out_seqs = train_test_split(
        in_seqs, out_seqs, test_size=0.10, random_state=2020)
    print('loaded %d training and %d validation samples' % \
        (len(tr_in_seqs), len(val_in_seqs)))

    # report the longest input and output sequences in the training split
    longest_in = max(len(s) for s in tr_in_seqs)
    print('longest cui sequence:', longest_in)
    longest_out = max(len(s) for s in tr_out_seqs)
    print('longest code sequence:', longest_out)

    # inputs stay as variable-length sequences (bag of embeddings);
    # outputs become multi-hot vectors over the code vocabulary
    out_dim = len(dp.output_tokenizer.stoi)
    batch_size = cfg.getint('model', 'batch')
    train_loader = make_data_loader(
        tr_in_seqs,
        utils.sequences_to_matrix(tr_out_seqs, out_dim),
        batch_size, 'train')
    val_loader = make_data_loader(
        val_in_seqs,
        utils.sequences_to_matrix(val_out_seqs, out_dim),
        batch_size, 'dev')

    model = BagOfEmbeddings(
        input_vocab_size=len(dp.input_tokenizer.stoi),
        output_vocab_size=out_dim,
        embed_dim=cfg.getint('model', 'embed'),
        hidden_units=cfg.getint('model', 'hidden'),
        dropout_rate=cfg.getfloat('model', 'dropout'))

    best_loss, optimal_epochs = fit(
        model, train_loader, val_loader, cfg.getint('model', 'epochs'))
    print('best loss %.4f after %d epochs' % (best_loss, optimal_epochs))
def data_dense():
    """Data to feed into code prediction model.

    Loads a pretrained bag-of-words model from disk and uses it to map
    the train and test sets into dense representations.

    Returns:
        (x_train, y_train, x_test, y_test) where the x's are dense
        feature matrices produced by get_dense_representations and the
        y's are the labels loaded from disk.
    """
    train_data = os.path.join(base, cfg.get('data', 'train'))
    test_data = os.path.join(base, cfg.get('data', 'test'))

    # load model configuration; 'with' ensures the pickle file handle is
    # closed (the original opened it and never closed it)
    with open(cfg.get('data', 'config_pickle'), 'rb') as pkl:
        config = pickle.load(pkl)

    # instantiate model and load pretrained parameters
    model = bow.BagOfWords(**config, save_config=False)
    state_dict = torch.load(cfg.get('data', 'model_file'))
    model.load_state_dict(state_dict)
    model.eval()  # inference mode (e.g. disables dropout)

    # training set: int sequences -> bag-of-words matrix -> dense vectors
    train_data_provider = DatasetProvider(
        train_data, cfg.get('data', 'tokenizer_pickle'))
    x_train, y_train = train_data_provider.load_as_int_seqs()
    x_train = utils.sequences_to_matrix(x_train, config['input_vocab_size'])
    x_train = get_dense_representations(model, x_train)

    # test set: same pipeline as the training set
    test_data_provider = DatasetProvider(
        test_data, cfg.get('data', 'tokenizer_pickle'))
    x_test, y_test = test_data_provider.load_as_int_seqs()
    x_test = utils.sequences_to_matrix(x_test, config['input_vocab_size'])
    x_test = get_dense_representations(model, x_test)

    return x_train, y_train, x_test, y_test