def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    # Start from a clean cache file so the two evaluation passes below hit a cold and then a warm cache.
    if os.path.exists("test_cache.db"):
        os.remove("test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")

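    # Stub implementations of the LM interface: they check the harness's whitespace
    # convention (the leading space belongs to the continuation, not the context) and
    # return deterministic, seeded pseudo-random scores so two passes can be compared.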
    def ll_fn(reqs):
        for ctx, cont in reqs:
            if len(ctx) == 0:
                continue
            # space convention
            assert ctx[-1] != " "
            assert cont[0] == " " or ctx[-1] == "\n"

        res = []

        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    def ll_perp_fn(reqs):
        for (string, ) in reqs:
            assert isinstance(string, str)

        res = []
        random.seed(42)
        for _ in reqs:
            res.append(-random.random())

        return res

    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10
    e1 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
    assert e1 == e2
Example #2
def main():
    args = parse_args()

    if os.path.exists(args.output_path):
        print(f"Output path {args.output_path} exists!!!")
        return

    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    lm = models.get_model(args.model)

    train_args = simple_parse_args_string(args.train_args)
    model_args = simple_parse_args_string(args.model_args)

    if train_args:
        train_args.update(model_args)
        train_args["seed"] = args.seed

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit, train_args,
                                 args.model_args, args.seed)

    results["args"] = args.__dict__
    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

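    # Log the results file to Comet: tasks without recorded train_args get a fresh
    # experiment, while tasks that carry train_args are attached to the experiment
    # that produced the model.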
    for task, task_res in results.items():
        if task not in task_names:
            continue
        if "train_args" not in task_res:
            experiment = comet_ml.Experiment(
                api_key=os.environ.get('COMET_API_KEY'),
                project_name=os.environ.get('COMET_PROJECT', "few-shot"),
                workspace=os.environ.get('COMET_WORKSPACE', "yuvalkirstain"),
            )
            experiment.log_asset(args.output_path)
        else:
            experiment = comet_ml.ExistingExperiment(
                api_key=os.environ.get('COMET_API_KEY'),
                previous_experiment=task_res["train_args"]["previous_experiment"],
            )
            experiment.log_asset(args.output_path)
Example #3
def main():
    lm = DryrunLM()
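    # DryrunLM makes no real predictions; it only accumulates lm.tokencost, so
    # evaluating against it measures how many tokens each task would send to the API.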

    task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc"
    values = []
    for taskname in task_list.split(","):
        lm.tokencost = 0
        evaluator.evaluate(
            lm=lm,
            task_dict={taskname: tasks.get_task(taskname)()},
            num_fewshot=0,
            limit=None,
            bootstrap_iters=10,
            description_dict=None,
        )

        print(taskname, lm.tokencost)
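        # Convert the token count into an approximate cost per engine; the multipliers
        # are per-1K-token prices for Ada, Babbage, Curie and Davinci respectively.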
        values.append([
            taskname,
            lm.tokencost,
            lm.tokencost / 1000 * 0.0008,
            lm.tokencost / 1000 * 0.0012,
            lm.tokencost / 1000 * 0.006,
            lm.tokencost / 1000 * 0.06,
        ])
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"]

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
    values.append([
        "**Total**",
        totcost,
        totcost / 1000 * 0.0008,
        totcost / 1000 * 0.0012,
        totcost / 1000 * 0.006,
        totcost / 1000 * 0.06,
    ])

    writer.value_matrix = values

    print(writer.dumps())
Example #4
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

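    # Replace the dummy model's loglikelihood with a stub that enforces the
    # context/continuation whitespace convention and returns seeded random scores.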
    def ll_fn(reqs):
        for ctx, cont in reqs:
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'

        res = []

        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    lm.loglikelihood = ll_fn
    evaluator.evaluate(lm, task_dict, False, 0, 10)
Example #5
def main():
    lm = DryrunLM()
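    # Same dry-run approach as above: lm.tokencost accumulates the tokens each task
    # would consume, and the 0.06/1K multiplier converts that into a Davinci cost.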

    values = []
    for taskname in list(tasks.TASK_REGISTRY.keys()):
        lm.tokencost = 0
        evaluator.evaluate(lm, {taskname: tasks.get_task(taskname)()}, False,
                           0, None)

        print(taskname, lm.tokencost)
        values.append([taskname, lm.tokencost, lm.tokencost / 1000 * 0.06])
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Tokens", "Davinci Cost"]

    values.sort(key=lambda x: -x[1])
    totcost = sum([x[1] for x in values])
    values.append(["**Total**", totcost, totcost / 1000 * 0.06])

    writer.value_matrix = values

    print(writer.dumps())
Example #6
def main():

    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

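    # Cache completed requests on disk; the filename encodes the model name and its
    # argument string so runs with different configurations do not share entries.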
    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' +
            args.model_args.replace('=', '-').replace(',', '_') + '.db')
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Metric", "Value"]

    values = []

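    # Blank the task name after its first row so the markdown table groups all of a
    # task's metrics under a single heading.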
    for k, dic in results.items():
        for m, v in dic.items():
            values.append([k, m, '%.4f' % v])
            k = ""
    writer.value_matrix = values

    print(writer.dumps())
Example #7
def main():
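    # Note: model, model_args, no_cache, task, fewshot_descriptions, num_fewshot and
    # limit are not defined in this excerpt; presumably they are module-level settings
    # in the full script.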
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + model + '_' +
            model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

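    # Evaluate the same task once per candidate description by wrapping it in
    # CustomDescTask, so different few-shot prompt wordings can be compared on
    # identical data.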
    for desc in fewshot_descriptions:
        custom_task_dict = {
            k: CustomDescTask(v, desc)
            for k, v in task_dict.items()
        }

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot,
                                     limit)

        dumped = json.dumps(results, indent=2)

        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []

        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values

        print(writer.dumps())
Example #8
    def run_eval(self, eval_tasks=None):
        was_training = self.model.training
        self.model.eval()
        in_micro_batches = self.model.micro_batches  # store input microbatches - we need to set to 1 during eval
        self.model.micro_batches = 1
        if eval_tasks is None:
            eval_tasks = ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"]
        results = evaluator.evaluate(lm=self,
                                     task_dict=tasks.get_task_dict(eval_tasks),
                                     provide_description=False,
                                     num_fewshot=0,
                                     limit=None,
                                     bootstrap_iters=2).get('results')
        if was_training:
            self.model.train()
        self.model.micro_batches = in_micro_batches
        return results
Example #9
def main():

    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.cache:
        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
Example #10
    t = build_model(params, tpu_name, region, preemptible)
    adaptor = EvalHarnessAdaptor(t, seq, total_batch, shrink=pe != "fixed")

    step, aux = t.load(bucket, model_dir)
    t.move()

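    # The trailing positional arguments follow the older evaluate() signature used
    # elsewhere in these examples: provide_description=False, num_fewshot=0, limit=None.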
    results = evaluator.evaluate(
        adaptor,
        tasks.get_task_dict([
            "lambada",
            "piqa",
            "hellaswag",
            "winogrande",
            "mathqa",
            "pubmedqa",
            # "boolq",
            # "cb",
            # "copa",
            # "multirc",
            # "record",
            # "wic",
            # "wsc",
        ]),
        False,
        0,
        None)
    dumped = json.dumps(results, indent=2)
    print(dumped)

    results = evaluator.evaluate(adaptor,
                                 tasks.get_task_dict([
                                     "lambada_cloze",
Example #11
def main():

    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(
        args.model_args, {
            'batch_size': args.batch_size,
            'device': args.device
        })

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' + args.model_args.replace(
                '=', '-').replace(',', '_').replace('/', '-') + '.db')
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []

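    # Metrics may carry a matching "<metric>_stderr" entry from bootstrapping; pair
    # them on one row and blank repeated task/version cells so rows group cleanly.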
    for k, dic in results["results"].items():
        version = results["versions"][k]
        for m, v in dic.items():
            if m.endswith("_stderr"): continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]

                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
            else:
                values.append([k, version, m, '%.4f' % v, '', ''])
            k = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # todo: make latex table look good
    # print(latex_writer.dumps())

    print(
        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
    )
    print(md_writer.dumps())
Example #12
    def run_eval(
        self,
        eval_tasks=None,
        num_fewshot=0,
        bootstrap_iters=2,
        description_dict=None,
        use_cache=True,
        name="neox",
        limit=None
    ):
        was_training = self.model.training
        self.model.eval()
        in_micro_batches = (
            self.model.micro_batches
        )  # store input microbatches - we need to set to 1 during eval, but want to return to its original value after
        self.model.micro_batches = 1
        if eval_tasks is None:
            eval_tasks = [
                "lambada",
                "piqa",
                "hellaswag",
                "winogrande",
                "mathqa",
                "pubmedqa",
            ]

        # **HACK INCOMING**:
        # first get task dict on local main rank
        # the tasks are downloaded *as they are initialized*, and the downloads don't like multithreading.
        # so we download them once on the local main rank, wait, and then initialize them on all other ranks, which *should* load from the cache.
        if self.is_local_main:
            task_dict = tasks.get_task_dict(eval_tasks)
        # torch barrier
        if torch.distributed.is_initialized():
            torch.distributed.barrier()
        task_dict = tasks.get_task_dict(eval_tasks)

        lm = self
        if use_cache:
            # TODO(jon-tow): Append a subset of `neox_args` to the cache database
            # name arg to distinguish model runs that use different configurations.
            lm = base.CachingLM(lm, 'lm_cache/' + name + '.db')

        results = evaluator.evaluate(
            lm=lm,
            task_dict=task_dict,  # reuse the dict built above instead of re-initializing the tasks
            description_dict=description_dict,
            num_fewshot=num_fewshot,
            limit=limit,
            bootstrap_iters=bootstrap_iters,
        )

        results["config"] = {
            "model": name,
            "model_args": dataclasses.asdict(self.neox_args),
            "num_fewshot": num_fewshot,
            "batch_size": self.batch_size,
            "device": str(self.device),
            "no_cache": not use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
            "description_dict": description_dict
        }

        if was_training:
            self.model.train()
        self.model.micro_batches = in_micro_batches
        return results
Example #13
            print(f"step {step} done")

        if step % val_every == 0:
            for name, val_set in val_sets.items():
                val_loss = []
                for i, _ in tqdm(
                        zip(val_set.sample_once(), range(val_batches)),
                        desc=f"validation for step {step}, set {name}",
                        total=val_batches):
                    val_loss.append(t.eval(i))
                val_loss = np.array(val_loss).mean()
                print(
                    f"validation loss for step {step}, set {name}: {val_loss}")

                wandb.log({f'val/loss_{name}': float(val_loss)}, step)

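            # Run the eval harness on the current weights (provide_description=False,
            # num_fewshot=0, limit=None) and flatten task/metric pairs for wandb logging.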
            results = evaluator.evaluate(adaptor, eval_task_dict, False, 0,
                                         None)

            flat_results = {}

            for task_name, task_res in results.items():
                for metric_name, metric_res in task_res.items():
                    flat_results[f"{task_name}/{metric_name}"] = float(
                        metric_res)

            dumped = json.dumps(results, indent=2)
            print(f"step {step} val results: {dumped}")
            wandb.log(flat_results, step)
        step += 1