def test(config):
    print('config.best_model_path', config.best_model_path)
    model = torch.load(config.best_model_path)

    test_samples_path = os.path.join(config.setup_dir, 'test_samples.jsonl')
    batcher = Batcher(input_file=test_samples_path)

    # a lookup table of torch.Tensor objects, keyed by user/paper ID.
    bert_lookup = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'bert_lookup.pkl'))

    predictions = centroid_scibert.generate_predictions(
        config, model, batcher, bert_lookup)
    prediction_filename = config.test_save(predictions, 'test.predictions.jsonl')
    print('prediction filename', prediction_filename)

    map_score = float(centroid_scibert.eval_map_file(prediction_filename))
    hits_at_1 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 1))
    hits_at_3 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 3))
    hits_at_5 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 5))
    hits_at_10 = float(centroid_scibert.eval_hits_at_k_file(prediction_filename, 10))

    score_lines = [
        [config.name, text, data] for text, data in [
            ('MAP', map_score),
            ('Hits@1', hits_at_1),
            ('Hits@3', hits_at_3),
            ('Hits@5', hits_at_5),
            ('Hits@10', hits_at_10)
        ]
    ]
    config.test_save(score_lines, 'test.scores.tsv')
def infer(config):
    experiment_dir = Path(config['experiment_dir']).resolve()
    model = utils.load_pkl(config['tfidf_model'])
    dataset = Dataset(**config['dataset'])

    paperids = list(model.bow_archives_by_paperid.keys())
    paperidx_by_id = {paperid: index for index, paperid in enumerate(paperids)}

    score_file_path = experiment_dir.joinpath(config['name'] + '-scores.csv')

    bids_by_forum = expertise.utils.get_bids_by_forum(dataset)
    submission_ids = list(dataset.submission_ids)
    reviewer_ids = list(dataset.reviewer_ids)
    # samples = expertise.utils.format_bid_labels(submission_ids, bids_by_forum)

    scores = {}
    max_score = 0.0
    for paperid, userid in itertools.product(submission_ids, reviewer_ids):
        # label = data['label']
        if userid not in scores:
            # bow_archive is a list of BOWs, one per document in the reviewer's archive.
            if userid in model.bow_archives_by_userid and len(
                    model.bow_archives_by_userid[userid]) > 0:
                bow_archive = model.bow_archives_by_userid[userid]
            else:
                bow_archive = [[]]

            # model.index[bow_archive] scores each archive document against every
            # indexed paper; keep the best score per paper across the archive.
            best_scores = np.amax(model.index[bow_archive], axis=0)
            scores[userid] = best_scores

            user_max_score = max(best_scores)
            if user_max_score > max_score:
                max_score = user_max_score

    print('max score', max_score)

    with open(score_file_path, 'w') as w:
        for userid, user_scores in scores.items():
            for paperidx, paper_score in enumerate(user_scores):
                paperid = paperids[paperidx]
                score = paper_score / max_score
                w.write('{0},{1},{2:.3f}'.format(paperid, userid, score))
                w.write('\n')

    return config
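# Usage sketch (illustrative, not part of this module): each row of the
# '<name>-scores.csv' file written by infer() is "paperid,userid,score", with
# scores normalized by the global maximum. The helper below is a hypothetical
# convenience for reading that file back into a dict keyed by (paperid, userid).
import csv

def read_affinity_scores(score_file_path):
    """Hypothetical helper: return {(paperid, userid): score} from a scores CSV."""
    affinity = {}
    with open(score_file_path) as f:
        for paperid, userid, score in csv.reader(f):
            affinity[(paperid, userid)] = float(score)
    return affinity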
def setup(config):
    print('starting setup')

    dataset = Dataset(**config.dataset)
    bids_by_forum = utils.get_bids_by_forum(dataset)

    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids, dev_set_ids, test_set_ids) = utils.split_ids(
        list(dataset.submission_ids), seed=config.random_seed)

    def fold_reader(id):
        fold_file = f'{id}.jsonl'
        fold_path = os.path.join(config.kp_setup_dir, 'folds', fold_file)
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))
    train_samples_path = os.path.join(config.setup_dir, 'train_samples.jsonl')
    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))
    dev_samples_path = os.path.join(config.setup_dir, 'dev_samples.jsonl')
    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))
    test_samples_path = os.path.join(config.setup_dir, 'test_samples.jsonl')
    utils.dump_jsonl(test_samples_path, test_samples)

    # features_dir = './scibert_features/akbc19/setup/archives-features/'
    features_dir = config.bert_features_dir
    archive_features_dir = os.path.join(features_dir, 'archives-features')
    submission_features_dir = os.path.join(features_dir, 'submissions-features')
def setup(config):
    print('starting setup')

    setup_dir = os.path.join(config.experiment_dir, 'setup')
    if not os.path.exists(setup_dir):
        os.mkdir(setup_dir)

    dataset = Dataset(**config.dataset)

    vocab = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl'))

    (train_set_ids, dev_set_ids, test_set_ids) = utils.split_ids(
        list(dataset.submission_ids), seed=config.random_seed)

    def fold_reader(id):
        fold_file = f'{id}.jsonl'
        fold_path = os.path.join(config.bpr_samples, fold_file)
        return utils.jsonl_reader(fold_path)

    train_folds = [fold_reader(i) for i in train_set_ids]
    dev_folds = [fold_reader(i) for i in dev_set_ids]
    test_folds = [fold_reader(i) for i in test_set_ids]

    train_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                     for data in itertools.chain(*train_folds))
    train_samples_path = os.path.join(setup_dir, 'train_samples.jsonl')
    utils.dump_jsonl(train_samples_path, train_samples)

    dev_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                   for data in itertools.chain(*dev_folds))
    dev_samples_path = os.path.join(setup_dir, 'dev_samples.jsonl')
    utils.dump_jsonl(dev_samples_path, dev_samples)

    test_samples = (data_to_sample(data, vocab, config.max_num_keyphrases)
                    for data in itertools.chain(*test_folds))
    test_samples_path = os.path.join(setup_dir, 'test_samples.jsonl')
    utils.dump_jsonl(test_samples_path, test_samples)

    return config
def setup_bert_kps_lookup(config):
    print('starting setup')

    # features_dir = config.bert_features_dir
    archive_features_dir = os.path.join(
        config.experiment_dir, 'setup', 'archives-features')
    submission_features_dir = os.path.join(
        config.experiment_dir, 'setup', 'submissions-features')

    textrank_kps = utils.load_pkl(
        os.path.join(config.setup_dir, 'textrank_kps_by_id.pkl'))

    bert_lookup = {}

    for target_dir in [archive_features_dir, submission_features_dir]:
        for filename in os.listdir(target_dir):
            print(filename)
            item_id = filename.replace('.npy', '')
            filepath = os.path.join(target_dir, filename)
            archives = np.load(filepath)

            document_kps = textrank_kps[item_id]

            kps_seen = []
            kp_features = []
            for document in archives:
                features = document['features']
                for feature in features:
                    if feature['token'] in document_kps and feature['token'] not in kps_seen:
                        kps_seen.append(feature['token'])
                        kp_features.append(feature['layers'][-1]['values'])

            # keep at most max_num_keyphrases vectors, zero-padding if needed.
            kp_features = kp_features[:config.max_num_keyphrases]
            while len(kp_features) < config.max_num_keyphrases:
                kp_features.append(np.zeros(config.bert_dim))

            result = np.array(kp_features)
            bert_lookup[item_id] = torch.Tensor(result)

    return bert_lookup
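# Sketch (assumption): train() and test() below load the lookup from
# '<kp_setup_dir>/bert_lookup.pkl', so the dict returned by
# setup_bert_kps_lookup() presumably needs to be pickled to that path first.
# A minimal way to do that with the standard library, if no utils helper exists:
import pickle

def save_bert_kps_lookup(config):
    """Hypothetical helper: build and persist the BERT keyphrase lookup."""
    bert_lookup = setup_bert_kps_lookup(config)
    lookup_path = os.path.join(config.kp_setup_dir, 'bert_lookup.pkl')
    with open(lookup_path, 'wb') as f:
        pickle.dump(bert_lookup, f)
    return lookup_path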
def train(config):
    for train_subdir in ['dev_scores', 'dev_predictions']:
        train_subdir_path = os.path.join(config.train_dir, train_subdir)
        if not os.path.exists(train_subdir_path):
            os.mkdir(train_subdir_path)

    vocab_path = os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl')
    vocab = utils.load_pkl(vocab_path)

    torch.manual_seed(config.random_seed)

    train_samples_path = os.path.join(config.setup_dir, 'train_samples.jsonl')
    dev_samples_path = os.path.join(config.setup_dir, 'dev_samples.jsonl')

    print('reading train samples from ', train_samples_path)
    batcher = Batcher(input_file=train_samples_path)
    batcher_dev = Batcher(input_file=dev_samples_path)

    model = centroid_scibert.Model(config, vocab)
    if config.use_cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(),
                           lr=config.learning_rate,
                           weight_decay=config.l2penalty)

    # Stats
    best_map = 0
    sum_loss = 0.0

    # a lookup table of torch.Tensor objects, keyed by user/paper ID.
    bert_lookup = utils.load_pkl(
        os.path.join(config.kp_setup_dir, 'bert_lookup.pkl'))

    print('Begin Training')

    # Training loop
    for counter, batch in enumerate(
            batcher.batches(batch_size=config.batch_size)):
        batch_source = []
        batch_pos = []
        batch_neg = []

        for data in batch:
            batch_source.append(bert_lookup[data['source_id']])
            batch_pos.append(bert_lookup[data['positive_id']])
            batch_neg.append(bert_lookup[data['negative_id']])

        print('num_batches: {}'.format(counter))
        optimizer.zero_grad()

        loss_parameters = (torch.stack(batch_source),
                           torch.stack(batch_pos),
                           torch.stack(batch_neg))

        loss = model.compute_loss(*loss_parameters)
        loss.backward()

        # torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
        optimizer.step()

        # Question: is this if block just for monitoring?
        if counter % 100 == 0:
            this_loss = loss.cpu().data.numpy()
            sum_loss += this_loss

            print('Processed {} batches, Loss of batch {}: {}. Average loss: {}'.format(
                counter, counter, this_loss, sum_loss / (counter / 100)))

        if counter % config.eval_every == 0:
            # is this reset needed?
            batcher_dev.reset()

            predictions = centroid_scibert.generate_predictions(
                config, model, batcher_dev, bert_lookup)
            prediction_filename = config.train_save(
                predictions,
                'dev_predictions/dev.predictions.{}.jsonl'.format(counter))
            print('prediction filename', prediction_filename)

            map_score = float(
                centroid_scibert.eval_map_file(prediction_filename))
            hits_at_1 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 1))
            hits_at_3 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 3))
            hits_at_5 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 5))
            hits_at_10 = float(
                centroid_scibert.eval_hits_at_k_file(prediction_filename, 10))

            score_lines = [
                [config.name, counter, text, data] for text, data in [
                    ('MAP', map_score),
                    ('Hits@1', hits_at_1),
                    ('Hits@3', hits_at_3),
                    ('Hits@5', hits_at_5),
                    ('Hits@10', hits_at_10)
                ]
            ]
            config.train_save(score_lines,
                              'dev_scores/dev.scores.{}.tsv'.format(counter))

            if map_score > best_map:
                best_map = map_score
                best_model_path = os.path.join(
                    config.train_dir,
                    'model_{}_{}.torch'.format(config.name, 'best'))
                torch.save(model, best_model_path)
                config.best_model_path = best_model_path
                config.best_map_score = best_map
                config.hits_at_1 = hits_at_1
                config.hits_at_3 = hits_at_3
                config.hits_at_5 = hits_at_5
                config.hits_at_10 = hits_at_10
                config.save_config()
                config.train_save(score_lines, 'dev.scores.best.tsv')

        if counter == config.num_minibatches:
            break
def train(config):
    train_dir = os.path.join(config.experiment_dir, 'train')
    if not os.path.isdir(train_dir):
        os.mkdir(train_dir)

    for train_subdir in ['dev_scores', 'dev_predictions']:
        train_subdir_path = os.path.join(train_dir, train_subdir)
        if not os.path.exists(train_subdir_path):
            os.mkdir(train_subdir_path)

    vocab_file = os.path.join(config.kp_setup_dir, 'textrank_vocab.pkl')
    vocab = utils.load_pkl(vocab_file)

    torch.manual_seed(config.random_seed)

    batcher = Batcher(input_file=os.path.join(
        config.experiment_dir, 'setup', 'train_samples.jsonl'))
    batcher_dev = Batcher(input_file=os.path.join(
        config.experiment_dir, 'setup', 'dev_samples.jsonl'))

    model = centroid.Model(config, vocab)
    if config.use_cuda:
        model = model.cuda()

    optimizer = optim.Adam(model.parameters(),
                           lr=config.learning_rate,
                           weight_decay=config.l2penalty)

    # Stats
    best_map = 0
    sum_loss = 0.0

    print('Begin Training')

    # Training loop
    for counter, batch in enumerate(
            batcher.batches(batch_size=config.batch_size)):
        batch_source = []
        batch_pos = []
        batch_neg = []
        batch_source_lens = []
        batch_pos_lens = []
        batch_neg_lens = []

        for data in batch:
            batch_source.append(np.asarray(data['source']))
            batch_pos.append(np.asarray(data['positive']))
            batch_neg.append(np.asarray(data['negative']))
            batch_source_lens.append(
                np.asarray(data['source_length'], dtype=np.float32))
            batch_pos_lens.append(
                np.asarray(data['positive_length'], dtype=np.float32))
            batch_neg_lens.append(
                np.asarray(data['negative_length'], dtype=np.float32))

        print('num_batches: {}'.format(counter))
        optimizer.zero_grad()

        loss_parameters = (np.asarray(batch_source),
                           np.asarray(batch_pos),
                           np.asarray(batch_neg),
                           np.asarray(batch_source_lens, dtype=np.float32),
                           np.asarray(batch_pos_lens, dtype=np.float32),
                           np.asarray(batch_neg_lens, dtype=np.float32))

        loss = model.compute_loss(*loss_parameters)
        loss.backward()

        # torch.nn.utils.clip_grad_norm(model.parameters(), config.clip)
        optimizer.step()

        # Question: is this if block just for monitoring?
        if counter % 100 == 0:
            this_loss = loss.cpu().data.numpy()
            sum_loss += this_loss

            print('Processed {} batches, Loss of batch {}: {}. Average loss: {}'.format(
                counter, counter, this_loss, sum_loss / (counter / 100)))

        if counter % config.eval_every == 0:
            # is this reset needed?
            batcher_dev.reset()

            predictions = centroid.generate_predictions(
                config, model, batcher_dev)
            prediction_filename = os.path.join(
                train_dir,
                'dev_predictions/dev.predictions.{}.jsonl'.format(counter))
            utils.dump_jsonl(prediction_filename, predictions)
            print('prediction filename', prediction_filename)

            map_score = float(centroid.eval_map_file(prediction_filename))
            hits_at_1 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 1))
            hits_at_3 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 3))
            hits_at_5 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 5))
            hits_at_10 = float(
                centroid.eval_hits_at_k_file(prediction_filename, 10))

            score_lines = [
                [config.name, counter, text, data] for text, data in [
                    ('MAP', map_score),
                    ('Hits@1', hits_at_1),
                    ('Hits@3', hits_at_3),
                    ('Hits@5', hits_at_5),
                    ('Hits@10', hits_at_10)
                ]
            ]
            dev_scores_file = os.path.join(
                train_dir, 'dev_scores/dev.scores.{}.tsv'.format(counter))
            utils.dump_csv(dev_scores_file, score_lines)

            if map_score > best_map:
                best_map = map_score
                best_model_path = os.path.join(
                    train_dir, 'model_{}_{}.torch'.format(config.name, 'best'))
                torch.save(model, best_model_path)
                config.update(best_model_path=best_model_path)

                best_scores_file = os.path.join(train_dir, 'dev.scores.best.tsv')
                utils.dump_csv(best_scores_file, score_lines)

        if counter == config.num_minibatches:
            return config

    return config
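# Pipeline sketch (assumption; the actual driver script is not shown here).
# Both setup() and train() take and return the experiment config, so running
# one experiment end to end presumably chains them, roughly like this:
def run_experiment(config):
    """Hypothetical driver: chain setup and training on a shared config."""
    config = setup(config)
    config = train(config)
    # followed by test(config) for the SciBERT variant,
    # or infer(config) for the TF-IDF variant.
    return config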