def acronyms_finetune(args):
    args.git_hash = get_git_revision_hash()
    render_args(args)

    prev_args, bsg_model, vocab, _ = restore_model(args.bsg_experiment)

    # Load Data
    data_dir = '../eval/eval_data/minnesota/'
    sense_fp = os.path.join(data_dir, 'sense_inventory_ii')
    lfs, lf_sf_map, sf_lf_map = parse_sense_df(sense_fp)
    df = pd.read_csv(os.path.join(data_dir, 'preprocessed_dataset_window_{}.csv'.format(prev_args.window)))
    df['target_lf_idx'] = df['sf'].combine(df['target_lf'], lambda sf, lf: target_lf_index(lf, sf_lf_map[sf]))
    prev_N = df.shape[0]
    df = df[df['target_lf_idx'] > -1]
    print('Removed {} examples for which the target LF is not exactly in the sense inventory ii'.format(
        prev_N - df.shape[0]))

    sfs = df['sf'].unique().tolist()
    used_sf_lf_map = defaultdict(list)
    dominant_sfs = set()

    for sf in sfs:
        subset_df = df[df['sf'] == sf]
        used_target_idxs = subset_df['target_lf_idx'].unique()
        if len(used_target_idxs) == 1:
            dominant_sfs.add(sf)
        else:
            for lf_idx in used_target_idxs:
                used_sf_lf_map[sf].append(sf_lf_map[sf][lf_idx])

    prev_N = df.shape[0]
    df = df[~df['sf'].isin(dominant_sfs)]
    print('Removing {} examples from {} SFs because they have only 1 sense associated with'
          ' them after preprocessing'.format(prev_N - df.shape[0], len(dominant_sfs)))

    df['used_target_lf_idx'] = df['sf'].combine(df['target_lf'], lambda sf, lf: target_lf_index(lf, used_sf_lf_map[sf]))

    sf_tokenized_lf_map = {}
    for sf, lf_list in used_sf_lf_map.items():
        sf_tokenized_lf_map[sf] = list(map(lf_tokenizer, lf_list))

    train_df, test_df = train_test_split(df, random_state=1992, test_size=0.2)
    train_batcher = AcronymBatcherLoader(train_df, batch_size=args.batch_size)
    test_batcher = AcronymBatcherLoader(test_df, batch_size=args.batch_size)

    render_test_statistics(test_df, used_sf_lf_map)

    # Create model experiments directory or clear it if it already exists
    weights_dir = os.path.join('../acronyms', 'weights', args.experiment)
    if os.path.exists(weights_dir):
        print('Clearing out previous weights in {}'.format(weights_dir))
        rmtree(weights_dir)
    os.mkdir(weights_dir)
    results_dir = os.path.join(weights_dir, 'results')
    os.mkdir(results_dir)
    os.mkdir(os.path.join(results_dir, 'confusion'))

    model = AcronymExpander(bsg_model)

    # Instantiate Adam optimizer
    trainable_params = filter(lambda x: x.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(trainable_params, lr=args.lr)
    loss_func = nn.CrossEntropyLoss()

    best_weights = model.state_dict()
    best_epoch = 1
    lowest_test_loss = run_test_epoch(args, test_batcher, model, loss_func, vocab, sf_tokenized_lf_map)

    model.train()  # enable training mode (e.g. dropout); trainable parameters already require gradients
    for epoch in range(1, args.epochs + 1):
        sleep(0.1)  # Make sure logging is synchronous with tqdm progress bar
        print('Starting Epoch={}'.format(epoch))

        train_loss = run_train_epoch(args, train_batcher, model, loss_func, optimizer, vocab, sf_tokenized_lf_map)
        test_loss = run_test_epoch(args, test_batcher, model, loss_func, vocab, sf_tokenized_lf_map)

        losses_dict = {
            'train': train_loss,
            'test_loss': test_loss
        }

        checkpoint_fp = os.path.join(weights_dir, 'checkpoint_{}.pth'.format(epoch))
        save_checkpoint(args, model, optimizer, vocab, losses_dict, checkpoint_fp=checkpoint_fp)

        # Only snapshot the weights when this epoch achieves the lowest test loss so far
        lowest_test_loss = min(lowest_test_loss, test_loss)
        if lowest_test_loss == test_loss:
            best_weights = model.state_dict()
            best_epoch = epoch

        if args.debug:
            break

    print('Loading weights from epoch {} to perform error analysis'.format(best_epoch))
    model.load_state_dict(best_weights)
    losses_dict['test_loss'] = lowest_test_loss
    checkpoint_fp = os.path.join(weights_dir, 'checkpoint_best.pth')
    save_checkpoint(args, model, optimizer, vocab, losses_dict, checkpoint_fp=checkpoint_fp)
    error_analysis(test_batcher, model, used_sf_lf_map, loss_func, vocab, results_dir=results_dir)
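
# Usage sketch (illustrative only): a minimal command-line driver for acronyms_finetune.
# The flag names below are assumptions inferred from the attributes the function reads off
# `args` (bsg_experiment, experiment, batch_size, lr, epochs, debug); the repository's real
# entry-point script may define them differently.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Fine-tune a pre-trained BSG model for acronym expansion')
    parser.add_argument('--experiment', default='acronym-finetune', help='name for this fine-tuning run')
    parser.add_argument('--bsg_experiment', default='baseline', help='pre-trained BSG experiment to restore')
    parser.add_argument('--batch_size', default=32, type=int)
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--epochs', default=5, type=int)
    parser.add_argument('-debug', default=False, action='store_true')
    acronyms_finetune(parser.parse_args())
# --- end usage sketch ---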
parser.add_argument('-combine_phrases', default=False, action='store_true')
parser.add_argument('-section2vec', default=False, action='store_true')
parser.add_argument('--epochs', default=4, type=int)
parser.add_argument('--lr', default=0.001, type=float)
parser.add_argument('--window', default=5, type=int)
parser.add_argument('-use_pretrained', default=False, action='store_true')

# Model Hyperparameters
parser.add_argument('--encoder_hidden_dim', default=64, type=int, help='hidden dimension for encoder')
parser.add_argument('--encoder_input_dim', default=64, type=int, help='embedding dimensions for encoder')
parser.add_argument('--hinge_loss_margin', default=1.0, type=float, help='reconstruction margin')
parser.add_argument('--latent_dim', default=100, type=int, help='z dimension')

args = parser.parse_args()
args.git_hash = get_git_revision_hash()
render_args(args)

# Load Data
debug_str = '_mini' if args.debug else ''
phrase_str = '_phrase' if args.combine_phrases else ''
ids_infile = os.path.join(args.data_dir, 'ids{}{}.npy'.format(debug_str, phrase_str))
print('Loading data from {}...'.format(ids_infile))
with open(ids_infile, 'rb') as fd:
    ids = np.load(fd)

# Load Vocabulary
vocab_infile = '../preprocess/data/vocab{}{}.pk'.format(debug_str, phrase_str)
print('Loading vocabulary from {}...'.format(vocab_infile))
with open(vocab_infile, 'rb') as fd:
    vocab = pickle.load(fd)
def run_evaluation(args, acronym_model, dataset_loader, restore_func, train_frac=0.0):
    """
    :param args: argparse instance specifying evaluation configuration (including pre-trained model path)
    :param acronym_model: PyTorch model to rank candidate acronym expansions (an instance of a model from ./modules/)
    :param dataset_loader: function to load the acronym expansion dataset (either CASI or Reverse Substitution MIMIC)
    :param restore_func: function to load pre-trained model weights (different for BSG and LMC)
    :param train_frac: fraction of the dataset reserved for fine-tuning (e.g. ~0.8). The default of 0.0 means the
    entire dataset is used as a test set for evaluation
    :return: dictionary of evaluation metrics (log loss, accuracy, and the outputs of the error analysis)
    """
    args.git_hash = get_git_revision_hash()
    render_args(args)

    if args.lm_type == 'bsg':
        prev_args, lm, token_vocab, _ = restore_func(args.lm_experiment, ckpt=args.ckpt)
        metadata_vocab = None
        prev_args.metadata = None
    else:
        prev_args, lm, token_vocab, metadata_vocab, _, _, _ = restore_func(args.lm_experiment, ckpt=args.ckpt)

    train_batcher, test_batcher, train_df, test_df, sf_lf_map = dataset_loader(
        prev_args, train_frac=train_frac, batch_size=args.batch_size)
    args.metadata = prev_args.metadata

    # Construct smoothed empirical probabilities of metadata conditioned on LF ~ p(metadata|LF)
    lf_metadata_counts = extract_smoothed_metadata_probs(metadata=args.metadata)

    casi_dir = os.path.join(home_dir, 'shared_data', 'casi')
    canonical_lfs = pd.read_csv(os.path.join(casi_dir, 'labeled_sf_lf_map.csv'))
    canonical_sf_lf_map = dict(canonical_lfs.groupby('target_lf_sense')['target_label'].apply(list))

    sf_tokenized_lf_map = defaultdict(list)
    prev_vocab_size = token_vocab.size()
    for sf, lf_list in sf_lf_map.items():
        token_vocab.add_token(sf.lower())
        for lf in lf_list:
            canonical_lf_arr = list(set(canonical_sf_lf_map[lf]))
            assert len(canonical_lf_arr) == 1
            canonical_lf = canonical_lf_arr[0]
            tokens = lf_tokenizer(canonical_lf, token_vocab)
            sf_tokenized_lf_map[sf].append(tokens)
            for t in tokens:
                token_vocab.add_token(t)

    new_vocab_size = token_vocab.size()
    print('Added {} tokens to vocabulary from LF targets and SFs.'.format(new_vocab_size - prev_vocab_size))

    render_test_statistics(test_df, sf_lf_map)

    if lf_metadata_counts is not None:
        if args.dataset == 'mimic':
            train_lf_metadata_counts, val_lf_metadata_counts = split_marginals(lf_metadata_counts)
        else:
            train_lf_metadata_counts = lf_metadata_counts
            val_lf_metadata_counts = _generate_marginals(test_df)
        render_dominant_section_accuracy(train_lf_metadata_counts, val_lf_metadata_counts, sf_lf_map)

    # Create model experiments directory or clear it if it already exists
    weights_dir = os.path.join(home_dir, 'weights', 'acronyms', args.experiment)
    if os.path.exists(weights_dir):
        print('Clearing out previous weights in {}'.format(weights_dir))
        rmtree(weights_dir)
    os.mkdir(weights_dir)
    results_dir = os.path.join(weights_dir, 'results')
    os.mkdir(results_dir)
    os.mkdir(os.path.join(results_dir, 'confusion'))

    model = acronym_model(args, lm, token_vocab).to(args.device)

    # Instantiate Adam optimizer
    trainable_params = filter(lambda x: x.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(trainable_params, lr=args.lr)
    loss_func = nn.CrossEntropyLoss()

    best_weights = model.state_dict()
    best_epoch = 1
    lowest_test_loss, highest_test_acc = run_test_epoch(
        args, test_batcher, model, loss_func, token_vocab, metadata_vocab, sf_tokenized_lf_map, sf_lf_map,
        lf_metadata_counts)
    metrics = analyze(args, test_batcher, model, sf_lf_map, loss_func, token_vocab, metadata_vocab,
                      sf_tokenized_lf_map, lf_metadata_counts, results_dir=results_dir)
    metrics['log_loss'] = lowest_test_loss
    metrics['accuracy'] = highest_test_acc
    if args.epochs == 0:
        return metrics

    # Make sure it's calculating gradients
    for epoch in range(1, args.epochs + 1):
        sleep(0.1)  # Make sure logging is synchronous with tqdm progress bar
        print('Starting Epoch={}'.format(epoch))

        _ = run_train_epoch(args, train_batcher, model, loss_func, optimizer, token_vocab, metadata_vocab,
                            sf_tokenized_lf_map, sf_lf_map, lf_metadata_counts)
        test_loss, test_acc = run_test_epoch(args, test_batcher, model, loss_func, token_vocab, metadata_vocab,
                                             sf_tokenized_lf_map, sf_lf_map, lf_metadata_counts)
        analyze(args, test_batcher, model, sf_lf_map, loss_func, token_vocab, metadata_vocab, sf_tokenized_lf_map,
                lf_metadata_counts, results_dir=results_dir)

        lowest_test_loss = min(lowest_test_loss, test_loss)
        highest_test_acc = max(highest_test_acc, test_acc)
        if lowest_test_loss == test_loss:
            best_weights = model.state_dict()
            best_epoch = epoch

    print('Loading weights from epoch {} to perform error analysis'.format(best_epoch))
    model.load_state_dict(best_weights)
    metrics = analyze(args, test_batcher, model, sf_lf_map, loss_func, token_vocab, metadata_vocab,
                      sf_tokenized_lf_map, lf_metadata_counts, results_dir=results_dir)
    metrics['log_loss'] = lowest_test_loss
    metrics['accuracy'] = highest_test_acc
    return metrics
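
# Usage sketch (illustrative only): one way run_evaluation might be wired up. The names
# `AcronymExpanderLM`, `load_casi`, and `restore_bsg_model` are placeholders for the
# repository's ranking model, CASI dataset loader, and BSG restore function; the real
# identifiers may differ. `args` is assumed to carry the fields read above (lm_type,
# lm_experiment, ckpt, batch_size, dataset, device, experiment, lr, epochs).
#
#   metrics = run_evaluation(
#       args,
#       acronym_model=AcronymExpanderLM,   # hypothetical model class from ./modules/
#       dataset_loader=load_casi,          # hypothetical CASI loader
#       restore_func=restore_bsg_model,    # hypothetical BSG checkpoint restorer
#       train_frac=0.0)                    # full dataset as test set (pair with args.epochs == 0 for zero-shot evaluation)
#   print('accuracy={}, log_loss={}'.format(metrics['accuracy'], metrics['log_loss']))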