def cross_validation_main(args):
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)
    task = task_dict[task_names[0]]
    args.task_names = task_names
    training_docs = task.training_docs()

    # First pass: score every training doc, keep the 100 best.
    all_cross_validation_results, item_scores = mc_cross_validation(
        args, training_docs, task)
    best_k = largest_indices(item_scores, 100)[0]
    print(best_k)
    print(item_scores)

    # Second pass: re-score within the top 100, keep the 32 best.
    all_cross_validation_results, item_scores = mc_cross_validation(
        args, training_docs, task, best_k.tolist())
    best_k = largest_indices(item_scores, 32)[0]
    print(best_k)
    print(item_scores)

    train_subset = []
    for i in range(len(training_docs)):
        if i in best_k:
            train_subset.append(training_docs[i])

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)
    accuracy = task.evaluate(task.validation_docs(),
                             lm,
                             args.provide_description,
                             args.num_fewshot,
                             train_doc=train_subset)
    print("final accuracy: ", accuracy)
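# For reference, a minimal sketch of the `largest_indices` helper used by
# cross_validation_main above, inferred purely from its call sites (it must
# return a tuple whose first element is an index array of the n best-scoring
# items); the real helper may differ.
import numpy as np


def largest_indices(scores, n):
    scores = np.asarray(scores)
    top = np.argpartition(scores, -n)[-n:]   # indices of the n largest scores, unordered
    top = top[np.argsort(-scores[top])]      # order from highest to lowest score
    return (top,)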
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        if not task.has_validation_docs():
            docs = task.training_docs()
        else:
            docs = task.validation_docs()

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in zip(range(args.num_examples), docs):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    provide_description=args.provide_description,
                    num_fewshot=args.num_fewshot,
                )
                f.write(ctx + "\n")
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = {}
    for task_name, task in task_dict.items():
        if not task.has_validation_docs():
            continue
        result = task.evaluate(
            docs=task.validation_docs(),
            lm=lm,
            provide_description=args.provide_description,
            num_fewshot=args.num_fewshot,
        )
        results[task_name] = result

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
def main():
    args = parse_args()
    if os.path.exists(args.output_path):
        print(f"Output path {args.output_path} exists!!!")
        return
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    lm = models.get_model(args.model)

    train_args = simple_parse_args_string(args.train_args)
    model_args = simple_parse_args_string(args.model_args)
    if train_args:
        train_args.update(model_args)
        train_args["seed"] = args.seed

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit, train_args,
                                 args.model_args, args.seed)
    results["args"] = args.__dict__

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # Log the results file to Comet, reusing an existing experiment when the
    # task results reference one.
    for task, task_res in results.items():
        if task not in task_names:
            continue
        if "train_args" not in task_res:
            experiment = comet_ml.Experiment(
                api_key=os.environ.get('COMET_API_KEY'),
                project_name=os.environ.get('COMET_PROJECT', "few-shot"),
                workspace=os.environ.get('COMET_WORKSPACE', "yuvalkirstain"),
            )
            experiment.log_asset(args.output_path)
        else:
            experiment = comet_ml.ExistingExperiment(
                api_key=os.environ.get('COMET_API_KEY'),
                previous_experiment=task_res["train_args"]["previous_experiment"])
            experiment.log_asset(args.output_path)
def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    os.system("rm test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")

    def ll_fn(reqs):
        for ctx, cont in reqs:
            if len(ctx) == 0:
                continue
            # space convention
            assert ctx[-1] != " "
            assert cont[0] == " " or ctx[-1] == "\n"

        res = []
        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    def ll_perp_fn(reqs):
        for (string,) in reqs:
            assert isinstance(string, str)

        res = []
        random.seed(42)
        for _ in reqs:
            res.append(-random.random())

        return res

    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10
    e1 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
    assert e1 == e2
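# A minimal sketch of how the test above could be parametrized over every
# registered task with pytest; `tasks.TASK_REGISTRY` and its shape (a mapping
# from task name to task class) are assumptions about the surrounding codebase.
import pytest


@pytest.mark.parametrize("taskname,task_class", tasks.TASK_REGISTRY.items())
def test_evaluator_all_tasks(taskname, task_class):
    test_evaluator(taskname, task_class)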
def main():
    args = parse_args()
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(args.seed)

        iters = []
        for set in args.sets.split(","):
            if set == "train" and task.has_training_docs():
                docs = task.training_docs()
            if set == "val" and task.has_validation_docs():
                docs = task.validation_docs()
            if set == "test" and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)

        docs = join_iters(iters)

        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (
                zip(range(args.num_examples), docs)
                if args.num_examples > 0
                else enumerate(docs)
            ):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
                    description=description,
                )
                f.write(ctx + "\n")
def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + model + '_' +
            model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        # Wrap each task so the custom few-shot description is used.
        custom_task_dict = {
            k: CustomDescTask(v, desc) for k, v in task_dict.items()
        }

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot,
                                     limit)

        dumped = json.dumps(results, indent=2)

        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []
        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values

        print(writer.dumps())
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' +
            args.model_args.replace('=', '-').replace(',', '_') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Metric", "Value"]

    values = []
    for k, dic in results.items():
        for m, v in dic.items():
            values.append([k, m, '%.4f' % v])
            k = ""
    writer.value_matrix = values

    print(writer.dumps())
def run_eval(self, eval_tasks=None):
    was_training = self.model.training
    self.model.eval()
    in_micro_batches = self.model.micro_batches  # store input microbatches - we need to set to 1 during eval
    self.model.micro_batches = 1
    if eval_tasks is None:
        eval_tasks = [
            "lambada",
            "piqa",
            "hellaswag",
            "winogrande",
            "mathqa",
            "pubmedqa",
        ]
    results = evaluator.evaluate(lm=self,
                                 task_dict=tasks.get_task_dict(eval_tasks),
                                 provide_description=False,
                                 num_fewshot=0,
                                 limit=None,
                                 bootstrap_iters=2).get('results')
    if was_training:
        self.model.train()
    self.model.micro_batches = in_micro_batches
    return results
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    def ll_fn(reqs):
        for ctx, cont in reqs:
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'

        res = []
        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    lm.loglikelihood = ll_fn

    evaluator.evaluate(lm, task_dict, False, 0, 10)
def main():
    args = parse_args()
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(args.seed)

        iters = []
        for set in args.sets.split(","):
            if set == 'train' and task.has_training_docs():
                docs = task.training_docs()
            if set == 'val' and task.has_validation_docs():
                docs = task.validation_docs()
            if set == 'test' and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)

        docs = join_iters(iters)

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (zip(range(args.num_examples), docs)
                           if args.num_examples > 0 else enumerate(docs)):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    provide_description=args.provide_description,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd)
                f.write(ctx + "\n")
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)
    if args.cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' +
            args.model_args.replace('=', '-').replace(',', '_') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # TODO: fall back to test docs
    task_dict_items = [(name, task) for name, task in task_dict.items()
                       if task.has_validation_docs()]

    results = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with
    # bigger memory, we can always modify this plumbing to support that, but I didn't want to include it just
    # yet because overengineering is bad (or we could make it write the requests to disk and then read them
    # back out again - probably using an sqlite db because of all the moving parts we have).

    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable
    docs = {}

    # Get lists of each type of request
    for task_name, task in task_dict_items:
        for doc_id, doc in enumerate(
                itertools.islice(task.validation_docs(), 0, args.limit)):
            docs[(task_name, doc_id)] = doc

            ctx = task.fewshot_context(
                doc=doc,
                provide_description=args.provide_description,
                num_fewshot=args.num_fewshot,
            )

            reqs = task.construct_requests(doc, ctx)
            for i, req in enumerate(reqs):
                requests[req.type].append(req)
                # i: index in requests for a single task instance
                # doc_id: unique id that we can get back to a doc using `docs`
                requests_origin[req.type].append((i, task_name, doc, doc_id))

    # All responses for each (task, doc)
    process_res_queue = collections.defaultdict(list)

    # Execute each type of request
    for reqtype, reqs in requests.items():
        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
        # only in index. We could implement some kind of caching, but that would be more of a bandaid
        # solution. We could also implement some kind of autogrouping here; they should end up next to each other.
        resps = getattr(lm, reqtype)([req.args for req in reqs])

        resps = [
            x if req.index is None else x[req.index]
            for x, req in zip(resps, reqs)
        ]

        for resp, (i, task_name, doc, doc_id) in zip(resps,
                                                     requests_origin[reqtype]):
            process_res_queue[(task_name, doc_id)].append((i, resp))

    vals = collections.defaultdict(list)

    # Unpack results, sort back into order, and return control to Task
    for (task_name, doc_id), requests in process_res_queue.items():
        requests.sort(key=lambda x: x[0])
        requests = [x[1] for x in requests]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, requests)
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)

    # Aggregate results
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        results[task_name][metric] = task.aggregation()[metric](items)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
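# A minimal sketch (not part of the original plumbing) of the sanity check the
# TODO above asks for: materialize `validation_docs()` twice and verify the
# ordering is deterministic before trusting (task_name, doc_id) as a stable key.
import itertools


def check_validation_docs_stable(task, limit=100):
    first = list(itertools.islice(task.validation_docs(), limit))
    second = list(itertools.islice(task.validation_docs(), limit))
    assert first == second, "validation_docs() ordering is not deterministic"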
start = time.time()
t.train(train_dataset.get_samples())
print(f"Train fn compiled in {time.time() - start:.06}s")

start = time.time()
for val_set in val_sets.values():
    t.eval(val_set.get_samples())
print(f"Eval fn compiled in {time.time() - start:.06}s")

wandb.init(project='mesh-transformer-jax', entity="eleutherai",
           name=params["name"], config=params)

eval_task_dict = tasks.get_task_dict(eval_tasks)

while True:
    loss, last_loss = t.train(train_dataset.get_samples())
    wandb.log({'train/loss': loss, 'train/last_loss': last_loss}, step)

    if (step % ckpt_every == 0 and step) or step == total_steps:
        t.save(step, bucket, model_dir,
               aux={"train_loader": train_dataset.get_state()},
               init=False,
               delete_old=step % keep_every != 0)

        if step == total_steps:
            print("training completed!")
t = build_model(params, tpu_name, region, preemptible)
adaptor = EvalHarnessAdaptor(t, seq, total_batch, shrink=pe != "fixed")

step, aux = t.load(bucket, model_dir)
t.move()

results = evaluator.evaluate(
    adaptor,
    tasks.get_task_dict([
        "lambada",
        "piqa",
        "hellaswag",
        "winogrande",
        "mathqa",
        "pubmedqa",
        # "boolq",
        # "cb",
        # "copa",
        # "multirc",
        # "record",
        # "wic",
        # "wsc",
    ]),
    False,
    0,
    None)

dumped = json.dumps(results, indent=2)
print(dumped)

results = evaluator.evaluate(adaptor, tasks.get_task_dict([
def main():
    args = parse_args()

    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(
        args.model_args, {
            'batch_size': args.batch_size,
            'device': args.device
        })

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' + args.model_args.replace(
                '=', '-').replace(',', '_').replace('/', '-') + '.db')

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)

    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []
    for k, dic in results["results"].items():
        version = results["versions"][k]
        for m, v in dic.items():
            if m.endswith("_stderr"):
                continue
            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]
                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
            else:
                values.append([k, version, m, '%.4f' % v, '', ''])
            k = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # todo: make latex table look good
    # print(latex_writer.dumps())

    print(
        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
    )
    print(md_writer.dumps())
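# Illustrative only: how the cache filename above is derived from the CLI
# arguments. The model and model_args values here are examples, not defaults.
model = "gpt2"
model_args = "pretrained=EleutherAI/gpt-neo-1.3B"
cache_path = ('lm_cache/' + model + '_' +
              model_args.replace('=', '-').replace(',', '_').replace('/', '-') + '.db')
# cache_path == 'lm_cache/gpt2_pretrained-EleutherAI-gpt-neo-1.3B.db'
# Sanitizing '=', ',' and '/' keeps one flat, filesystem-safe cache file per
# (model, model_args) combination.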
def run_eval(
    self,
    eval_tasks=None,
    num_fewshot=0,
    bootstrap_iters=2,
    description_dict=None,
    use_cache=True,
    name="neox",
    limit=None
):
    was_training = self.model.training
    self.model.eval()
    in_micro_batches = (
        self.model.micro_batches
    )  # store input microbatches - we need to set to 1 during eval, but want to return to its original value after
    self.model.micro_batches = 1
    if eval_tasks is None:
        eval_tasks = [
            "lambada",
            "piqa",
            "hellaswag",
            "winogrande",
            "mathqa",
            "pubmedqa",
        ]

    # **HACK INCOMING**:
    # first get task dict on local main rank
    # the tasks are downloaded *as they are initialized*, and the downloads don't like multithreading.
    # so we download them once on the local main rank, wait, and then initialize them on all other ranks, which *should* load from the cache.
    if self.is_local_main:
        task_dict = tasks.get_task_dict(eval_tasks)
    # torch barrier
    if torch.distributed.is_initialized():
        torch.distributed.barrier()
    task_dict = tasks.get_task_dict(eval_tasks)

    lm = self
    if use_cache:
        # TODO(jon-tow): Append a subset of `neox_args` to the cache database
        # name arg to distinguish model runs that use different configurations.
        lm = base.CachingLM(lm, 'lm_cache/' + name + '.db')

    results = evaluator.evaluate(
        lm=lm,
        task_dict=tasks.get_task_dict(eval_tasks),
        description_dict=description_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
    )

    results["config"] = {
        "model": name,
        "model_args": dataclasses.asdict(self.neox_args),
        "num_fewshot": num_fewshot,
        "batch_size": self.batch_size,
        "device": str(self.device),
        "no_cache": not use_cache,
        "limit": limit,
        "bootstrap_iters": bootstrap_iters,
        "description_dict": description_dict,
    }

    if was_training:
        self.model.train()
    self.model.micro_batches = in_micro_batches
    return results
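# A minimal sketch of how the dict returned by run_eval might be consumed from
# a training loop; `trainer` (the object defining run_eval above), `step`, and
# `eval_interval` are illustrative names and not part of the original code.
def maybe_eval(trainer, step, eval_interval=1000):
    if step % eval_interval != 0:
        return
    eval_results = trainer.run_eval(eval_tasks=["lambada", "piqa"], num_fewshot=0)
    # evaluator.evaluate returns a dict with a "results" sub-dict of per-task metrics
    for task_name, metrics in eval_results["results"].items():
        print(step, task_name, metrics)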