def main(arguments):
    """Evaluate a trained SDAE sentence encoder on SentEval transfer tasks.

    Args:
        arguments: list of CLI argument strings (e.g. ``sys.argv[1:]``).

    Side effects: configures the root logger to also write to ``--log_file``
    and prints the SentEval results dict to stdout.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from", type=str)
    # BUG FIX: help text previously said "File to log to" (copy-paste error).
    parser.add_argument("--dictionary", help="File to load dictionary from", type=str,
                        default='/misc/vlgscratch4/BowmanGroup/awang/data/wikipedia/wiki_lower_small.txt.dict.pkl')
    parser.add_argument("--emb_file", help="File to load pretrained embeddings from",
                        type=str, default='')

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list",
                        type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=64)
    args = parser.parse_args(arguments)

    # Log to stderr and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    # Set params for SentEval.
    # BUG FIX: honor --use_pytorch instead of hard-coding usepytorch=True.
    params_senteval = {'usepytorch': bool(args.use_pytorch),
                       'task_path': PATH_TO_DATA,
                       'batch_size': args.batch_size}
    params_senteval = dotdict(params_senteval)

    # Build model; pretrained embeddings are optional (empty --emb_file disables them).
    use_preemb = bool(args.emb_file)
    model, model_options, worddict, wv_embs = \
        sdae.load_model(saveto=args.model_file, dictionary=args.dictionary,
                        embeddings=args.emb_file, reload_=True,
                        use_preemb=use_preemb)
    params_senteval.encoder = model
    params_senteval.model_options = model_options
    params_senteval.worddict = worddict
    params_senteval.wv_embs = wv_embs

    se = senteval.SentEval(params_senteval, batcher, prepare)
    tasks = args.tasks.split(',')
    results = se.eval(tasks)
    print(results)
def main(arguments):
    """Evaluate a trained FastSent model on the Quora / Reasoning SentEval tasks.

    Args:
        arguments: list of CLI argument strings (e.g. ``sys.argv[1:]``).

    Side effects: configures the root logger to also write to ``--log_file``.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File containing trained model", type=str)
    parser.add_argument("--small", help="Use small training data if available",
                        type=int, default=1)
    parser.add_argument("--lower", help="Lower case data", type=int, default=0)
    args = parser.parse_args(arguments)

    # Set params for SentEval.
    # BUG FIX: honor --use_pytorch instead of hard-coding usepytorch=True.
    params_senteval = {'usepytorch': bool(args.use_pytorch),
                       'task_path': PATH_TO_DATA,
                       'batch_size': 512}
    params_senteval = dotdict(params_senteval)

    # Set up logger: stderr plus the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    # FastSent models are loaded via the library loader, not raw pickle.
    params_senteval.encoder = FastSent.load(args.model_file)
    se = senteval.SentEval(params_senteval, batcher, prepare)

    # Only this subset is evaluated here; the full SentEval task list
    # (MR, CR, SUBJ, MPQA, SST, TREC, SICK*, MRPC, STS14, SQuAD, Quora)
    # was previously kept around as dead code and has been removed.
    tasks = ['Quora', 'Reasoning']
    se.eval(tasks, small=args.small, lower=args.lower)
# NOTE(review): fragment mangled onto one line. The enclosing function that
# defines `results`, `writer`, and the STS-Benchmark Pearson scores starts
# before this view, and the final statement is cut off mid-call, so the code
# is left unchanged (reformatted and commented only).

# BUG?(review): the format string reuses index {0} twice, so the dev Pearson
# score is written to both slots and `stsbenchmark_test_pear` is ignored —
# presumably this should be "{0:.4f}/{1:.4f}". Left as-is (incomplete view).
results.append("{0:.4f}/{0:.4f}".format(stsbenchmark_dev_pear, stsbenchmark_test_pear))
writer.writerow(results)

"""
Evaluation of trained model on Transfer Tasks (SentEval)
"""

# define transfer tasks
# Language switch: Chinese ABSA only, otherwise Spanish ABSA + Spanish STS.
transfer_tasks = ['ABSA_CH'] if params.lang == 'CH' else ['ABSA_SP', 'STS_SP']

# define senteval params
# Can choose to use MLP instead
params_senteval = dotdict({'usepytorch': True, 'task_path': PATH_TO_DATA,
                           'seed': 1111, 'kfold': 5})

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    # We map cuda to the current cuda device
    # this only works when we set params.gpu_id = 0
    map_locations = {}
    for d in range(4):
        if d != params.gpu_id:
            map_locations['cuda:{}'.format(d)] = "cuda:{}".format(params.gpu_id)
    # collect number of epochs trained in directory
    # NOTE(review): statement is truncated here in the visible source — the
    # iterable argument to filter() continues beyond this view.
    model_files = filter(lambda s: params.outputmodelname + '-' in s and 'encoder' not in s,
# NOTE(review): fragment mangled onto one line. The `def batcher(...)` header
# (and the loop over sentences in the batch) enclosing the first statements is
# outside this view, so the code is left unchanged; indentation of the leading
# fragment is a best guess — confirm against the full file.
    # Average the word vectors of the sentence (bag-of-words encoder).
    sentvec = []
    for word in sent:
        if word in params.word_vec:
            sentvec.append(params.word_vec[word])
    # Fall back to the '.' vector when no word of the sentence is in vocab,
    # so np.mean never sees an empty list.
    if not sentvec:
        sentvec.append(params.word_vec['.'])
    sentvec = np.mean(sentvec, 0)
    embeddings.append(sentvec)
    # Stack per-sentence vectors into a (batch, dim) array.
    embeddings = np.vstack(embeddings)
    return embeddings

# Set params for SentEval
params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': False, 'kfold': 5}
params_senteval = dotdict(params_senteval)

# set gpu device
torch.cuda.set_device(0)

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    se = senteval.SentEval(params_senteval, batcher, prepare)
    transfer_tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST', 'TREC', 'MRPC',
                      'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
                      'STS14']
    results = se.eval(transfer_tasks)
# define transfer tasks if params.dis: transfer_tasks = ['DIS'] elif params.pdtb: transfer_tasks = ['PDTB_IMEX'] # 'PDTB_EX' elif params.dat: transfer_tasks = ['DAT'] else: transfer_tasks = ['MR', 'CR', 'SUBJ', 'MPQA', 'SST', 'TREC', 'SICKRelatedness', 'SICKEntailment', 'MRPC', 'STS14'] # define senteval params if params.mlp: # keep nhid the same as DisSent model (otherwise we can try 1024) params_senteval = dotdict({'usepytorch': True, 'task_path': PATH_TO_DATA, 'seed': 1111, 'kfold': 5, 'classifier': 'MLP', 'nhid': 512, 'bilinear': params.bilinear}) else: params_senteval = dotdict({'usepytorch': True, 'task_path': PATH_TO_DATA, 'seed': 1111, 'kfold': 5, 'bilinear': params.bilinear}) # Set up logger logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) if __name__ == "__main__": # We map cuda to the current cuda device # this only works when we set params.gpu_id = 0 map_locations = {} for d in range(4): if d != params.gpu_id:
def main(arguments):
    """Evaluate a trained ConvSent encoder on SentEval transfer tasks.

    Args:
        arguments: list of CLI argument strings (e.g. ``sys.argv[1:]``).

    Side effects: configures the root logger to also write to ``--log_file``
    and prints the SentEval results dict to stdout.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)

    # Logistics
    parser.add_argument("--cuda", help="CUDA id to use", type=int, default=0)
    parser.add_argument("--use_pytorch", help="1 to use PyTorch", type=int, default=1)
    parser.add_argument("--log_file", help="File to log to", type=str)
    parser.add_argument("--model_file", help="File to load model from", type=str)
    parser.add_argument("--dict_file", help="File to load dict from", type=str)

    # Task options
    parser.add_argument("--tasks", help="Tasks to evaluate on, as a comma separated list",
                        type=str)
    parser.add_argument("--max_seq_len", help="Max sequence length", type=int, default=40)

    # Model options
    parser.add_argument("--batch_size", help="Batch size to use", type=int, default=32)

    # Classifier options
    parser.add_argument("--cls_batch_size", help="Batch size to use for classifier",
                        type=int, default=32)
    args = parser.parse_args(arguments)

    # Log to stderr and additionally to the requested file.
    logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
    fileHandler = logging.FileHandler(args.log_file)
    logging.getLogger().addHandler(fileHandler)

    # Set params for SentEval.
    # BUG FIX: honor --use_pytorch instead of hard-coding usepytorch=True.
    params_senteval = {
        'usepytorch': bool(args.use_pytorch),
        'task_path': PATH_TO_DATA,
        'max_seq_len': args.max_seq_len,
        'batch_size': args.batch_size
    }
    params_senteval['classifier'] = {
        'nhid': 0,
        'optim': 'adam',
        'batch_size': args.cls_batch_size,
        'tenacity': 5,
        'epoch_size': 4
    }
    params_senteval = dotdict(params_senteval)

    # Load the vocabulary and append an explicit pad token at the next free id.
    with open(args.dict_file, 'rb') as fh:
        data = pkl.load(fh)
    word2idx = data[0]
    word2idx['<pad>'] = len(word2idx)
    n_words = len(word2idx)

    # Load model sized to the (padded) vocabulary.
    params_senteval.encoder = convsent.load_model(args.model_file, n_words=n_words)
    params_senteval.word2idx = word2idx

    se = senteval.SentEval(params_senteval, batcher, prepare)
    tasks = args.tasks.split(',')
    results = se.eval(tasks)
    print(results)