Example #1
def cross_validation_main(args):
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)
    task = task_dict[task_names[0]]
    args.task_names = task_names
    training_docs = task.training_docs()

    all_cross_validation_results, item_scores = mc_cross_validation(
        args, training_docs, task)
    best_k = largest_indices(item_scores, 100)[0]
    print(best_k)
    print(item_scores)
    all_cross_validation_results, item_scores = mc_cross_validation(
        args, training_docs, task, best_k.tolist())
    best_k = largest_indices(item_scores, 32)[0]
    print(best_k)
    print(item_scores)

    train_subset = []
    for i in range(len(training_docs)):
        if i in best_k:
            train_subset.append(training_docs[i])
    lm = models.get_model(args.model).create_from_arg_string(args.model_args)
    accuracy = task.evaluate(task.validation_docs(),
                             lm,
                             args.provide_description,
                             args.num_fewshot,
                             train_doc=train_subset)
    print("final accuracy: ", accuracy)
Example #2
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)
    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        if not task.has_validation_docs():
            docs = task.training_docs()
        else:
            docs = task.validation_docs()
        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in zip(range(args.num_examples), docs):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    provide_description=args.provide_description,
                    num_fewshot=args.num_fewshot,
                )
                f.write(ctx + "\n")
Example #3
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)
    results = {}
    for task_name, task in task_dict.items():
        if not task.has_validation_docs():
            continue
        result = task.evaluate(
            docs=task.validation_docs(),
            lm=lm,
            provide_description=args.provide_description,
            num_fewshot=args.num_fewshot,
        )
        results[task_name] = result

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
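Most of these scripts begin with a parse_args() helper that the page does not reproduce. Judging only from the attributes Example #3 reads off args, a minimal argparse sketch could look like the following; the flag names and defaults are assumptions, not the project's actual CLI:

import argparse

def parse_args():
    # Hypothetical reconstruction based on the args.* attributes used in Example #3.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", required=True)
    parser.add_argument("--model_args", default="")
    parser.add_argument("--tasks", default="all_tasks")
    parser.add_argument("--provide_description", action="store_true")
    parser.add_argument("--num_fewshot", type=int, default=0)
    parser.add_argument("--seed", type=int, default=1234)
    parser.add_argument("--output_path", default=None)
    return parser.parse_args()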
Example #4
def main():
    args = parse_args()

    if os.path.exists(args.output_path):
        print(f"Output path {args.output_path} exists!!!")
        return

    random.seed(args.seed)
    np.random.seed(args.seed)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    lm = models.get_model(args.model)

    train_args = simple_parse_args_string(args.train_args)
    model_args = simple_parse_args_string(args.model_args)

    if train_args:
        train_args.update(model_args)
        train_args["seed"] = args.seed

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit, train_args,
                                 args.model_args, args.seed)

    results["args"] = args.__dict__
    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    for task, task_res in results.items():
        if task not in task_names:
            continue
        if "train_args" not in task_res:
            experiment = comet_ml.Experiment(
                api_key=os.environ.get('COMET_API_KEY'),
                project_name=os.environ.get('COMET_PROJECT', "few-shot"),
                workspace=os.environ.get('COMET_WORKSPACE', "yuvalkirstain"),
            )
            experiment.log_asset(args.output_path)
        else:
            experiment = comet_ml.ExistingExperiment(
                api_key=os.environ.get('COMET_API_KEY'),
                previous_experiment=task_res["train_args"]
                ["previous_experiment"])
            experiment.log_asset(args.output_path)
Example #5
def test_evaluator(taskname, task_class):
    task_dict = tasks.get_task_dict([taskname])

    os.system("rm test_cache.db")
    lm = base.CachingLM(models.get_model("dummy")(), "test_cache.db")

    def ll_fn(reqs):
        for ctx, cont in reqs:
            if len(ctx) == 0:
                continue
            # space convention
            assert ctx[-1] != " "
            assert cont[0] == " " or ctx[-1] == "\n"

        res = []

        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    def ll_perp_fn(reqs):
        for (string, ) in reqs:
            assert isinstance(string, str)

        res = []
        random.seed(42)
        for _ in reqs:
            res.append(-random.random())

        return res

    lm.loglikelihood = ll_fn
    lm.loglikelihood_rolling = ll_perp_fn

    limit = 10
    e1 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )
    e2 = evaluator.evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=0,
        limit=limit,
        bootstrap_iters=10,
        description_dict=None,
    )

    # check that caching is working
    assert e1 == e2
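The (taskname, task_class) signature suggests this test is driven by a runner that parametrizes over the task registry. A hypothetical pytest driver, reusing the tasks module and the test_evaluator function above and assuming the registry is exposed as tasks.TASK_REGISTRY:

import pytest

# Hypothetical: run the caching test above once per registered task.
# TASK_REGISTRY is an assumed attribute name; `tasks` and `test_evaluator`
# come from the module shown above.
@pytest.mark.parametrize("taskname,task_class", list(tasks.TASK_REGISTRY.items()))
def test_evaluator_caching(taskname, task_class):
    test_evaluator(taskname, task_class)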
Example #6
def main():
    args = parse_args()
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    description_dict = {}
    if args.description_dict_path:
        with open(args.description_dict_path, "r") as f:
            description_dict = json.load(f)

    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(args.seed)

        iters = []

        for set in args.sets.split(","):
            if set == "train" and task.has_training_docs():
                docs = task.training_docs()
            if set == "val" and task.has_validation_docs():
                docs = task.validation_docs()
            if set == "test" and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)

        docs = join_iters(iters)

        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in (
                zip(range(args.num_examples), docs)
                if args.num_examples > 0
                else enumerate(docs)
            ):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd,
                    description=description,
                )
                f.write(ctx + "\n")
Example #7
def main():
    random.seed(42)
    np.random.seed(42)

    lm = models.get_model(model).create_from_arg_string(model_args)

    if limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + model + '_' +
            model_args.replace('=', '-').replace(',', '_') + '.db')

    task_dict = tasks.get_task_dict([task])

    for desc in fewshot_descriptions:
        custom_task_dict = {
            k: CustomDescTask(v, desc)
            for k, v in task_dict.items()
        }

        results = evaluator.evaluate(lm, custom_task_dict, True, num_fewshot,
                                     limit)

        dumped = json.dumps(results, indent=2)

        print('Description:', desc)
        print(dumped)

        # MAKE TABLE
        from pytablewriter import MarkdownTableWriter

        writer = MarkdownTableWriter()
        writer.headers = ["Task", "Metric", "Value"]

        values = []

        for k, dic in results.items():
            for m, v in dic.items():
                values.append([k, m, '%.4f' % v])
                k = ""
        writer.value_matrix = values

        print(writer.dumps())
Example #8
def main():

    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' +
            args.model_args.replace('=', '-').replace(',', '_') + '.db')
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter

    writer = MarkdownTableWriter()
    writer.headers = ["Task", "Metric", "Value"]

    values = []

    for k, dic in results.items():
        for m, v in dic.items():
            values.append([k, m, '%.4f' % v])
            k = ""
    writer.value_matrix = values

    print(writer.dumps())
Example #9
def run_eval(self, eval_tasks=None):
    was_training = self.model.training
    self.model.eval()
    in_micro_batches = self.model.micro_batches  # store input microbatches - we need to set to 1 during eval
    self.model.micro_batches = 1
    if eval_tasks is None:
        eval_tasks = ["lambada", "piqa", "hellaswag", "winogrande", "mathqa", "pubmedqa"]
    results = evaluator.evaluate(lm=self,
                                 task_dict=tasks.get_task_dict(eval_tasks),
                                 provide_description=False,
                                 num_fewshot=0,
                                 limit=None,
                                 bootstrap_iters=2).get('results')
    if was_training:
        self.model.train()
    self.model.micro_batches = in_micro_batches
    return results
Example #10
def test_evaluator(taskname, Task):
    task_dict = tasks.get_task_dict([taskname])
    lm = models.get_model('dummy')()

    def ll_fn(reqs):
        for ctx, cont in reqs:
            # space convention
            assert ctx[-1] != ' '
            assert cont[0] == ' ' or ctx[-1] == '\n'

        res = []

        random.seed(42)
        for _ in reqs:
            res.append((-random.random(), False))

        return res

    lm.loglikelihood = ll_fn
    evaluator.evaluate(lm, task_dict, False, 0, 10)
Example #11
def main():
    args = parse_args()
    np.random.seed(args.seed)

    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)
    os.makedirs(args.output_base_path, exist_ok=True)
    for task_name, task in task_dict.items():
        rnd = random.Random()
        rnd.seed(args.seed)

        iters = []

        for set in args.sets.split(","):
            if set == 'train' and task.has_training_docs():
                docs = task.training_docs()
            if set == 'val' and task.has_validation_docs():
                docs = task.validation_docs()
            if set == 'test' and task.has_test_docs():
                docs = task.test_docs()
            iters.append(docs)

        docs = join_iters(iters)

        with open(os.path.join(args.output_base_path, task_name), "w") as f:
            for i, doc in zip(
                    range(args.num_examples),
                    docs) if args.num_examples > 0 else enumerate(docs):
                f.write(EXAMPLE_DIVIDER.format(i=i))
                ctx = task.fewshot_context(
                    doc=doc,
                    provide_description=args.provide_description,
                    num_fewshot=args.num_fewshot,
                    rnd=rnd)
                f.write(ctx + "\n")
Example #12
def main():

    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)

    if args.cache:
        lm = base.CachingLM(lm, 'lm_cache/' + args.model + '_' + args.model_args.replace('=', '-').replace(',', '_') + '.db')
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description, args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
Example #13
def main():
    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(args.model_args)
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    # TODO: fall back to test docs
    task_dict_items = [(name, task) for name, task in task_dict.items()
                       if task.has_validation_docs()]

    results = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    # if we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger memory,
    # we can always modify this plumbing to support that, but I didn't want to include it just yet because overengineering is bad
    # (or we could make it write the requests to disk and then read them back out again - probably using an sqlite db because of all the moving parts we have)

    # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable

    docs = {}

    # get lists of each type of request
    for task_name, task in task_dict_items:
        for doc_id, doc in enumerate(
                itertools.islice(task.validation_docs(), 0, args.limit)):
            docs[(task_name, doc_id)] = doc

            ctx = task.fewshot_context(
                doc=doc,
                provide_description=args.provide_description,
                num_fewshot=args.num_fewshot,
            )

            reqs = task.construct_requests(doc, ctx)

            for i, req in enumerate(reqs):
                requests[req.type].append(req)
                # i: index in requests for a single task instance
                # doc_id: unique id that we can get back to a doc using `docs`
                requests_origin[req.type].append((i, task_name, doc, doc_id))

    # all responses for each (task, doc)
    process_res_queue = collections.defaultdict(list)

    # execute each type of request
    for reqtype, reqs in requests.items():
        # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing
        # only in index. We could implement some kind of caching, but that would be more of a bandaid
        # solution. we could also implement some kind of autogrouping here; they should end up next to each other.

        resps = getattr(lm, reqtype)([req.args for req in reqs])

        resps = [
            x if req.index is None else x[req.index]
            for x, req in zip(resps, reqs)
        ]

        for resp, (i, task_name, doc, doc_id) in zip(resps,
                                                     requests_origin[reqtype]):
            process_res_queue[(task_name, doc_id)].append((i, resp))

    vals = collections.defaultdict(list)

    # unpack results and sort back in order and return control to Task
    for (task_name, doc_id), requests in process_res_queue.items():
        requests.sort(key=lambda x: x[0])
        requests = [x[1] for x in requests]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, requests)
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)

    # aggregate results
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        results[task_name][metric] = task.aggregation()[metric](items)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)
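Example #13 re-implements the request/response plumbing by hand and relies on a request object with .type, .args, and .index attributes. The real class is not shown here; a minimal stand-in consistent with how the loop uses it (an illustrative assumption, not the harness's definition) would be:

from dataclasses import dataclass
from typing import Any, Optional, Tuple

@dataclass
class Request:
    # Hypothetical shape inferred from Example #13's usage.
    type: str                      # LM method to call, e.g. "loglikelihood"
    args: Tuple[Any, ...]          # positional arguments passed to that method
    index: Optional[int] = None    # optional index into a multi-valued response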
Example #14
    start = time.time()
    t.train(train_dataset.get_samples())
    print(f"Train fn compiled in {time.time() - start:.06}s")

    start = time.time()
    for val_set in val_sets.values():
        t.eval(val_set.get_samples())
    print(f"Eval fn compiled in {time.time() - start:.06}s")

    wandb.init(project='mesh-transformer-jax',
               entity="eleutherai",
               name=params["name"],
               config=params)

    eval_task_dict = tasks.get_task_dict(eval_tasks)

    while True:
        loss, last_loss = t.train(train_dataset.get_samples())
        wandb.log({'train/loss': loss, 'train/last_loss': last_loss}, step)

        if (step % ckpt_every == 0 and step) or step == total_steps:
            t.save(step,
                   bucket,
                   model_dir,
                   aux={"train_loader": train_dataset.get_state()},
                   init=False,
                   delete_old=step % keep_every != 0)

            if step == total_steps:
                print("training completed!")
Example #15
    t = build_model(params, tpu_name, region, preemptible)
    adaptor = EvalHarnessAdaptor(t, seq, total_batch, shrink=pe != "fixed")

    step, aux = t.load(bucket, model_dir)
    t.move()

    results = evaluator.evaluate(
        adaptor,
        tasks.get_task_dict([
            "lambada",
            "piqa",
            "hellaswag",
            "winogrande",
            "mathqa",
            "pubmedqa",
            # "boolq",
            # "cb",
            # "copa",
            # "multirc",
            # "record",
            # "wic",
            # "wsc",
        ]),
        False,
        0,
        None)
    dumped = json.dumps(results, indent=2)
    print(dumped)

    results = evaluator.evaluate(adaptor,
                                 tasks.get_task_dict([
Example #16
def main():

    args = parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)

    lm = models.get_model(args.model).create_from_arg_string(
        args.model_args, {
            'batch_size': args.batch_size,
            'device': args.device
        })

    if args.limit:
        print(
            "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT."
        )

    if not args.no_cache:
        lm = base.CachingLM(
            lm, 'lm_cache/' + args.model + '_' + args.model_args.replace(
                '=', '-').replace(',', '_').replace('/', '-') + '.db')
    if args.tasks == "all_tasks":
        task_names = tasks.ALL_TASKS
    else:
        task_names = args.tasks.split(",")
    task_dict = tasks.get_task_dict(task_names)

    results = evaluator.evaluate(lm, task_dict, args.provide_description,
                                 args.num_fewshot, args.limit)

    dumped = json.dumps(results, indent=2)
    print(dumped)
    if args.output_path:
        with open(args.output_path, "w") as f:
            f.write(dumped)

    # MAKE TABLE
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []

    for k, dic in results["results"].items():
        version = results["versions"][k]
        for m, v in dic.items():
            if m.endswith("_stderr"): continue

            if m + "_stderr" in dic:
                se = dic[m + "_stderr"]

                values.append([k, version, m, '%.4f' % v, '±', '%.4f' % se])
            else:
                values.append([k, version, m, '%.4f' % v, '', ''])
            k = ""
            version = ""
    md_writer.value_matrix = values
    latex_writer.value_matrix = values

    # todo: make latex table look good
    # print(latex_writer.dumps())

    print(
        f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}"
    )
    print(md_writer.dumps())
Example #17
    def run_eval(
        self,
        eval_tasks=None,
        num_fewshot=0,
        bootstrap_iters=2,
        description_dict=None,
        use_cache=True,
        name="neox",
        limit=None
    ):
        was_training = self.model.training
        self.model.eval()
        in_micro_batches = (
            self.model.micro_batches
        )  # store input microbatches - we need to set to 1 during eval, but want to return to its original value after
        self.model.micro_batches = 1
        if eval_tasks is None:
            eval_tasks = [
                "lambada",
                "piqa",
                "hellaswag",
                "winogrande",
                "mathqa",
                "pubmedqa",
            ]

        # **HACK INCOMING**:
        # first get task dict on local main rank
        # the tasks are downloaded *as they are initialized*, and the downloads don't like multithreading.
        # so we download them once on the local main rank, wait, and then initialize them on all other ranks, which *should* load from the cache.
        if self.is_local_main:
            task_dict = tasks.get_task_dict(eval_tasks)
        # torch barrier
        if torch.distributed.is_initialized():
            torch.distributed.barrier()
        task_dict = tasks.get_task_dict(eval_tasks)

        lm = self
        if use_cache:
            # TODO(jon-tow): Append a subset of `neox_args` to the cache database
            # name arg to distinguish model runs that use different configurations.
            lm = base.CachingLM(lm, 'lm_cache/' + name + '.db')

        results = evaluator.evaluate(
            lm=lm,
            task_dict=tasks.get_task_dict(eval_tasks),
            description_dict=description_dict,
            num_fewshot=num_fewshot,
            limit=limit,
            bootstrap_iters=bootstrap_iters,
        )

        results["config"] = {
            "model": name,
            "model_args": dataclasses.asdict(self.neox_args),
            "num_fewshot": num_fewshot,
            "batch_size": self.batch_size,
            "device": str(self.device),
            "no_cache": not use_cache,
            "limit": limit,
            "bootstrap_iters": bootstrap_iters,
            "description_dict": description_dict
        }

        if was_training:
            self.model.train()
        self.model.micro_batches = in_micro_batches
        return results
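Example #17's run_eval is normally invoked from the surrounding training or evaluation driver. A hedged usage sketch, assuming adaptor is an instance of the class that defines the method above:

import json

# Hypothetical call site; constructing the adaptor is outside the scope of this page.
results = adaptor.run_eval(
    eval_tasks=["lambada", "piqa"],
    num_fewshot=0,
    use_cache=False,
)
print(json.dumps(results, indent=2))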