Пример #1
0
def test_api(api, sss_or_tss=True):
    """Run a fixed sequence of queries against ``api`` and print the results.

    Args:
        api: benchmark API object exposing the query methods called below.
        sss_or_tss: when truthy, "90" is passed as the second argument of the
            meta-info query; when falsy, "200" is passed and the
            ``str2matrix`` conversion is exercised as well.
    """
    block = "{:}\n"
    print("{:} start testing the api : {:}".format(time_string(), api))
    api.clear_params(12)
    api.reload(index=12)

    # Information string for one architecture.
    print(api.query_info_str_by_arch(1113))

    # Query by index, first without and then with an explicit dataset.
    print(block.format(api.query_by_index(113)))
    print(block.format(api.query_by_index(113, "cifar100")))

    hp = "90" if sss_or_tss else "200"
    print(block.format(api.query_meta_info_by_index(115, hp)))

    for dataset in ("cifar10", "cifar100", "ImageNet16-120"):
        for xset in ("train", "test", "valid"):
            # The result is unpacked but not used any further.
            best_index, highest_accuracy = api.find_best(dataset, xset)
        print("")
    params = api.get_net_param(12, "cifar10", None)

    # Build the network from its config and load the first stored weights.
    config = api.get_net_config(12, "cifar10")
    print(block.format(config))
    network = get_cell_based_tiny_net(config)
    first_state = next(iter(params.values()))
    network.load_state_dict(first_state)

    # Cost and latency queries.
    print(block.format(api.get_cost_info(12, "cifar10")))
    print(block.format(api.get_latency(12, "cifar10")))
    for index in (13, 15, 19, 200):
        api.get_latency(index, "cifar10")

    # Statistics over the architectures.
    stats = api.statistics("cifar100", "12")
    print("{:} statistics results : {:}\n".format(time_string(), stats))

    # Let the api display one architecture.
    api.show(123)

    # Combined cost and performance information.
    print(block.format(api.get_more_info(1234, "cifar10")))
    print("{:} finish testing the api : {:}".format(time_string(), api))

    if not sss_or_tss:
        arch_str = "|nor_conv_3x3~0|+|nor_conv_3x3~0|avg_pool_3x3~1|+|skip_connect~0|nor_conv_3x3~1|skip_connect~2|"
        matrix = api.str2matrix(arch_str)
        print("Compute the adjacency matrix of {:}".format(arch_str))
        print(matrix)
    simulated = api.simulate_train_eval(123, "cifar10")
    print("simulate_train_eval : {:}\n\n".format(simulated))
Пример #2
0
def visualize_curve(api_dict, vis_save_dir):
    """Plot accuracy-over-time curves for "tss" and "sss" on cifar10.

    The figure has two side-by-side axes and is written to
    ``full-curve.png`` inside ``vis_save_dir``.

    Args:
        api_dict: mapping from search-space name to the object handed to
            ``query_performance`` as its first argument.
        vis_save_dir: path-like object supporting ``resolve``, ``mkdir`` and
            the ``/`` operator; created if it does not exist.
    """
    vis_save_dir = vis_save_dir.resolve()
    vis_save_dir.mkdir(parents=True, exist_ok=True)

    dpi, width, height = 250, 5000, 2000
    figsize = width / float(dpi), height / float(dpi)
    LabelSize, LegendFontsize = 28, 28

    def sub_plot_fn(ax, search_space, dataset):
        """Draw one curve per algorithm of ``search_space`` on ``ax``."""
        key = (dataset, search_space)
        max_time = x_axis_s[key]
        alg2data = fetch_data(search_space=search_space, dataset=dataset)
        total_tickets = 200
        # Evenly spaced query times in [0, int(max_time)).
        time_tickets = [
            float(i) / total_tickets * int(max_time)
            for i in range(total_tickets)
        ]
        ax.set_xlim(0, max_time)
        ax.set_ylim(y_min_s[key], y_max_s[key])
        for tick in ax.get_xticklabels():
            tick.set_rotation(25)
            tick.set_fontsize(LabelSize - 6)
        for tick in ax.get_yticklabels():
            tick.set_fontsize(LabelSize - 6)
        ax.xaxis.set_major_formatter(major_formatter)
        for alg, xdata in alg2data.items():
            accuracies = []
            for ticket in time_tickets:
                # Only the first returned value is plotted.
                accuracy, _ = query_performance(
                    api_dict[search_space], xdata["data"], dataset, ticket)
                accuracies.append(accuracy)
            print("{:} plot alg : {:10s} on {:}".format(
                time_string(), alg, search_space))
            ax.plot(
                time_tickets,
                accuracies,
                c=xdata["color"],
                linestyle=xdata["linestyle"],
                label="{:}".format(alg),
            )
        # Labels and title do not depend on the algorithm, so they are set
        # once here (and also when there is nothing to plot).
        ax.set_xlabel("Estimated wall-clock time", fontsize=LabelSize)
        ax.set_ylabel("Test accuracy", fontsize=LabelSize)
        ax.set_title(
            r"Results on {:} over {:}".format(name2label[dataset],
                                              spaces2latex[search_space]),
            fontsize=LabelSize,
        )
        ax.legend(loc=4, fontsize=LegendFontsize)

    fig, axs = plt.subplots(1, 2, figsize=figsize)
    sub_plot_fn(axs[0], "tss", "cifar10")
    sub_plot_fn(axs[1], "sss", "cifar10")
    save_path = (vis_save_dir / "full-curve.png").resolve()
    fig.savefig(save_path, dpi=dpi, bbox_inches="tight", format="png")
    print("{:} save into {:}".format(time_string(), save_path))
    plt.close("all")
Пример #3
0
def train_shared_cnn(
    xloader,
    shared_cnn,
    controller,
    criterion,
    scheduler,
    optimizer,
    epoch_str,
    print_freq,
    logger,
):
    """Train ``shared_cnn`` for one pass over ``xloader``.

    For every batch an architecture is sampled from ``controller`` (without
    gradients), installed into the shared network, and one optimizer step is
    taken with gradient-norm clipping at 5.

    Args:
        xloader: iterable of ``(inputs, targets)`` batches with a length.
        shared_cnn: wrapped network; ``shared_cnn.module.update_arch`` is
            called with the sampled architecture.
        controller: callable returning a 3-tuple whose last item is the
            sampled architecture.
        criterion: loss function applied to ``(logits, targets)``.
        scheduler: object whose ``update(None, fraction)`` is called per step.
        optimizer: optimizer of the shared network.
        epoch_str: epoch label used in log lines.
        print_freq: log every ``print_freq`` steps (and on the last step).
        logger: object with a ``log(str)`` method.

    Returns:
        Tuple of the running averages ``(loss, top-1, top-5)``.
    """
    data_time = AverageMeter()
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1s = AverageMeter()
    top5s = AverageMeter()
    num_batches = len(xloader)
    tick = time.time()

    shared_cnn.train()
    controller.eval()

    for step, (inputs, targets) in enumerate(xloader):
        scheduler.update(None, 1.0 * step / num_batches)
        targets = targets.cuda(non_blocking=True)
        # Time spent waiting for the batch.
        data_time.update(time.time() - tick)

        with torch.no_grad():
            _, _, sampled_arch = controller()

        optimizer.zero_grad()
        shared_cnn.module.update_arch(sampled_arch)
        _, logits = shared_cnn(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(shared_cnn.parameters(), 5)
        optimizer.step()

        # Update the running statistics, weighted by batch size.
        prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
        batch_size = inputs.size(0)
        losses.update(loss.item(), batch_size)
        top1s.update(prec1.item(), batch_size)
        top5s.update(prec5.item(), batch_size)

        # Time for the whole step, then restart the clock.
        batch_time.update(time.time() - tick)
        tick = time.time()

        is_last = step + 1 == num_batches
        if step % print_freq == 0 or is_last:
            header = (
                "*Train-Shared-CNN* " + time_string() +
                " [{:}][{:03d}/{:03d}]".format(epoch_str, step, num_batches))
            timing = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time)
            metrics = "[Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=losses, top1=top1s, top5=top5s)
            logger.log(" ".join((header, timing, metrics)))
    return losses.avg, top1s.avg, top5s.avg
Пример #4
0
def filter_indexes(xlist, mode, save_dir, seeds):
    """Select the architecture indexes the current process has to handle.

    An index is kept when ``mode == "cover"``, or when the checkpoint file
    ``arch-<index>-seed-<seed>.pth`` is missing in ``save_dir`` for at least
    one seed.  When both ``SLURM_PROCID`` and ``SLURM_NTASKS`` are present in
    the environment, the kept indexes are split into ``ntasks`` contiguous
    shares, only the share of this process is returned, and the CUDA device
    is set to ``proc_id % torch.cuda.device_count()``.

    The shares are half-open slices, so a share may be empty: this happens
    when nothing survives the filtering or when there are more tasks than
    indexes.  In those cases no index is handed to two tasks and no
    IndexError is raised.

    Args:
        xlist: sized iterable of integer architecture indexes.
        mode: "cover" keeps every index; any other value filters by file.
        save_dir: path object supporting ``/``; results must support
            ``exists()``.
        seeds: iterable of integer seeds.

    Returns:
        List of indexes for this process.
    """
    if mode == "cover":
        all_indexes = list(xlist)
    else:
        all_indexes = [
            index for index in xlist
            if any(
                not (save_dir / "arch-{:06d}-seed-{:04d}.pth".format(
                    index, seed)).exists()
                for seed in seeds)
        ]
    print("{:} [FILTER-INDEXES] : there are {:}/{:} architectures in total".
          format(time_string(), len(all_indexes), len(xlist)))

    SLURM_PROCID, SLURM_NTASKS = "SLURM_PROCID", "SLURM_NTASKS"
    if SLURM_PROCID in os.environ and SLURM_NTASKS in os.environ:  # run on the slurm
        proc_id = int(os.environ[SLURM_PROCID])
        ntasks = int(os.environ[SLURM_NTASKS])
        assert 0 <= proc_id < ntasks, "invalid proc_id {:} vs ntasks {:}".format(
            proc_id, ntasks)
        total = len(all_indexes)
        # scales[i] is the first position of share i; scales[ntasks] == total.
        scales = [int(float(i) / ntasks * total)
                  for i in range(ntasks)] + [total]
        for i in range(ntasks):
            start, stop = scales[i], scales[i + 1]
            # Shown as an inclusive (first, last) pair, or "empty".
            srange = (start, stop - 1) if stop > start else "empty"
            print("  -->> {:2d}/{:02d} : {:}".format(i, ntasks, srange))
        all_indexes = all_indexes[scales[proc_id]:scales[proc_id + 1]]
        # set the device id
        device = proc_id % torch.cuda.device_count()
        torch.cuda.set_device(device)
        print("  set the device id = {:}".format(device))
    print(
        "{:} [FILTER-INDEXES] : after filtering there are {:} architectures in total"
        .format(time_string(), len(all_indexes)))
    return all_indexes
Пример #5
0
def search_valid(xloader, network, criterion, extra_info, print_freq, logger):
    """Evaluate ``network`` over ``xloader`` without gradients.

    The network is put in eval mode and ``change_key("search_mode",
    "search")`` is applied to it before the pass.

    Args:
        xloader: iterable of ``(inputs, targets)`` batches with a length.
        network: callable returning ``(logits, expected_flop)``.
        criterion: loss function applied to ``(logits, targets)``.
        extra_info: label inserted into the per-step log lines.
        print_freq: log every ``print_freq`` steps (and on the last step).
        logger: object with a ``log(str)`` method.

    Returns:
        Tuple of the running averages ``(loss, top-1, top-5)``.
    """
    data_time = AverageMeter()
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    num_batches = len(xloader)

    network.eval()
    network.apply(change_key("search_mode", "search"))
    tick = time.time()
    with torch.no_grad():
        for i, (inputs, targets) in enumerate(xloader):
            # Time spent waiting for the batch.
            data_time.update(time.time() - tick)
            targets = targets.cuda(non_blocking=True)

            logits, expected_flop = network(inputs)
            loss = criterion(logits, targets)

            # Update the running statistics, weighted by batch size.
            prec1, prec5 = obtain_accuracy(logits.data,
                                           targets.data,
                                           topk=(1, 5))
            batch_size = inputs.size(0)
            losses.update(loss.item(), batch_size)
            top1.update(prec1.item(), batch_size)
            top5.update(prec5.item(), batch_size)

            # Time for the whole step, then restart the clock.
            batch_time.update(time.time() - tick)
            tick = time.time()

            if i % print_freq == 0 or (i + 1) == num_batches:
                header = ("**VALID** " + time_string() +
                          " [{:}][{:03d}/{:03d}]".format(extra_info, i,
                                                         num_batches))
                timing = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                    batch_time=batch_time, data_time=data_time)
                metrics = "Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})".format(
                    loss=losses, top1=top1, top5=top5)
                shape = "Size={:}".format(list(inputs.size()))
                logger.log(" ".join((header, timing, metrics, shape)))

    summary = " **VALID** Prec@1 {top1.avg:.2f} Prec@5 {top5.avg:.2f} Error@1 {error1:.2f} Error@5 {error5:.2f} Loss:{loss:.3f}".format(
        top1=top1,
        top5=top5,
        error1=100 - top1.avg,
        error5=100 - top5.avg,
        loss=losses.avg,
    )
    logger.log(summary)

    return losses.avg, top1.avg, top5.avg
Пример #6
0
def check_unique_arch(meta_file):
    """Print how many architectures of a meta file are valid and unique.

    Uniqueness is reported three times, passing ``None``, ``False`` and
    ``True`` as the ``consider_zero`` argument of ``to_unique_str``.

    Args:
        meta_file: location of the meta file; converted with ``str`` before
            being handed to ``API``.
    """
    api = API(str(meta_file))
    arch_strs = deepcopy(api.meta_archs)
    xarchs = [CellStructure.str2structure(text) for text in arch_strs]

    def get_unique_matrix(archs, consider_zero):
        """Group ``archs`` by unique string; return (matrix, ids, count)."""
        keys = [arch.to_unique_str(consider_zero) for arch in archs]
        print("{:} create unique-string ({:}/{:}) done".format(
            time_string(), len(set(keys)), len(keys)))
        # Positions of the architectures sharing each unique string.
        groups = dict()
        for position, key in enumerate(keys):
            groups.setdefault(key, list()).append(position)
        # Boolean matrix with True wherever two architectures share a string.
        sm_matrix = torch.eye(len(archs)).bool()
        for members in groups.values():
            for row in members:
                for col in members:
                    sm_matrix[row, col] = True
        unique_ids = [-1] * len(archs)
        unique_num = 0
        for row in range(len(unique_ids)):
            if unique_ids[row] >= 0:
                # Already labelled through an earlier member of its group.
                continue
            neighbours = sm_matrix[row].nonzero().view(-1).tolist()
            for nghb in neighbours:
                assert unique_ids[nghb] == -1, "impossible"
                unique_ids[nghb] = unique_num
            unique_num += 1
        return sm_matrix, unique_ids, unique_num

    num_valid = sum(arch.check_valid() for arch in xarchs)
    print("There are {:} valid-archs".format(num_valid))

    reports = (
        (None, "{:} There are {:} unique architectures (considering nothing)."),
        (False, "{:} There are {:} unique architectures (not considering zero)."),
        (True, "{:} There are {:} unique architectures (considering zero)."),
    )
    for consider_zero, template in reports:
        sm_matrix, uniqueIDs, unique_num = get_unique_matrix(
            xarchs, consider_zero)
        print(template.format(time_string(), unique_num))
Пример #7
0
def main(xargs, api):
    """Run regularized evolution under a time budget and log the best arch.

    Args:
        xargs: configuration namespace; the attributes read here are
            ``rand_seed``, ``search_space``, ``time_budget``, ``ea_cycles``,
            ``ea_population``, ``ea_sample_size``, ``use_proxy`` and
            ``dataset``.  It is also handed to ``prepare_logger``.
        api: benchmark API object passed to ``regularized_evolution`` and
            queried for the final information string.

    Returns:
        Tuple ``(logger.log_dir, current_best_index, total_times)`` where the
        last two come from ``regularized_evolution``.
    """
    torch.set_num_threads(4)
    prepare_seed(xargs.rand_seed)
    # Use the parameter, not a module-level ``args`` global, so the function
    # also works when called from another module.
    logger = prepare_logger(xargs)

    search_space = get_search_spaces(xargs.search_space, "nats-bench")
    if xargs.search_space == "tss":
        random_arch = random_topology_func(search_space)
        mutate_arch = mutate_topology_func(search_space)
    else:
        random_arch = random_size_func(search_space)
        mutate_arch = mutate_size_func(search_space)

    x_start_time = time.time()
    logger.log("{:} use api : {:}".format(time_string(), api))
    logger.log("-" * 30 +
               " start searching with the time budget of {:} s".format(
                   xargs.time_budget))
    history, current_best_index, total_times = regularized_evolution(
        xargs.ea_cycles,
        xargs.ea_population,
        xargs.ea_sample_size,
        xargs.time_budget,
        random_arch,
        mutate_arch,
        api,
        xargs.use_proxy > 0,
        xargs.dataset,
    )
    logger.log(
        "{:} regularized_evolution finish with history of {:} arch with {:.1f} s (real-cost={:.2f} s)."
        .format(time_string(), len(history), total_times[-1],
                time.time() - x_start_time))
    # History entries are indexed as (score, arch); keep the top-scoring arch.
    best_arch = max(history, key=lambda x: x[0])[1]
    logger.log("{:} best arch is {:}".format(time_string(), best_arch))

    info = api.query_info_str_by_arch(
        best_arch, "200" if xargs.search_space == "tss" else "90")
    logger.log("{:}".format(info))
    logger.log("-" * 100)
    logger.close()
    return logger.log_dir, current_best_index, total_times
Пример #8
0
def main(xargs, api):
    """Run random search under a simulated time budget and log the best arch.

    Architectures are sampled until the cost reported by
    ``api.simulate_train_eval`` reaches ``xargs.time_budget``.

    Args:
        xargs: configuration namespace; the attributes read here are
            ``rand_seed``, ``search_space``, ``time_budget`` and ``dataset``.
            It is also handed to ``prepare_logger``.
        api: benchmark API object used for simulation and queries.

    Returns:
        Tuple ``(logger.log_dir, current_best_index, total_time_cost)``; the
        two lists have one entry per sampled architecture.
    """
    torch.set_num_threads(4)
    prepare_seed(xargs.rand_seed)
    # Use the parameter, not a module-level ``args`` global, so the function
    # also works when called from another module.
    logger = prepare_logger(xargs)

    logger.log("{:} use api : {:}".format(time_string(), api))
    api.reset_time()

    search_space = get_search_spaces(xargs.search_space, "nats-bench")
    if xargs.search_space == "tss":
        random_arch = random_topology_func(search_space)
    else:
        random_arch = random_size_func(search_space)

    best_arch, best_acc, total_time_cost, history = None, -1, [], []
    current_best_index = []
    # At least one architecture is always evaluated.
    while len(total_time_cost) == 0 or total_time_cost[-1] < xargs.time_budget:
        arch = random_arch()
        accuracy, _, _, total_cost = api.simulate_train_eval(
            arch, xargs.dataset, hp="12"
        )
        total_time_cost.append(total_cost)
        history.append(arch)
        if best_arch is None or best_acc < accuracy:
            best_acc, best_arch = accuracy, arch
        logger.log(
            "[{:03d}] : {:} : accuracy = {:.2f}%".format(len(history), arch, accuracy)
        )
        current_best_index.append(api.query_index_by_arch(best_arch))
    logger.log(
        "{:} best arch is {:}, accuracy = {:.2f}%, visit {:} archs with {:.1f} s.".format(
            time_string(), best_arch, best_acc, len(history), total_time_cost[-1]
        )
    )

    info = api.query_info_str_by_arch(
        best_arch, "200" if xargs.search_space == "tss" else "90"
    )
    logger.log("{:}".format(info))
    logger.log("-" * 100)
    logger.close()
    return logger.log_dir, current_best_index, total_time_cost
Пример #9
0
def search_func(
    xloader, network, criterion, scheduler, w_optimizer, epoch_str, print_freq, logger
):
    """Train the network weights for one pass, one random genotype per batch.

    Args:
        xloader: iterable of ``(base_inputs, base_targets, arch_inputs,
            arch_targets)`` batches with a length.
        network: wrapped network; ``network.module.random_genotype(True)`` is
            called before every forward pass.
        criterion: loss function applied to ``(logits, base_targets)``.
        scheduler: object whose ``update(None, fraction)`` is called per step.
        w_optimizer: optimizer of the network weights.
        epoch_str: epoch label used in log lines.
        print_freq: log every ``print_freq`` steps (and on the last step).
        logger: object with a ``log(str)`` method.

    Returns:
        Tuple of the running averages ``(loss, top-1, top-5)``.
    """
    data_time = AverageMeter()
    batch_time = AverageMeter()
    base_losses = AverageMeter()
    base_top1 = AverageMeter()
    base_top5 = AverageMeter()
    num_batches = len(xloader)
    network.train()
    tick = time.time()
    for step, batch in enumerate(xloader):
        base_inputs, base_targets, arch_inputs, arch_targets = batch
        scheduler.update(None, 1.0 * step / num_batches)
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # Time spent waiting for the batch.
        data_time.update(time.time() - tick)

        # One weight update on a randomly chosen genotype.
        network.module.random_genotype(True)
        w_optimizer.zero_grad()
        _, logits = network(base_inputs)
        base_loss = criterion(logits, base_targets)
        base_loss.backward()
        nn.utils.clip_grad_norm_(network.parameters(), 5)
        w_optimizer.step()

        # Update the running statistics, weighted by batch size.
        base_prec1, base_prec5 = obtain_accuracy(
            logits.data, base_targets.data, topk=(1, 5)
        )
        batch_size = base_inputs.size(0)
        base_losses.update(base_loss.item(), batch_size)
        base_top1.update(base_prec1.item(), batch_size)
        base_top5.update(base_prec5.item(), batch_size)

        # Time for the whole step, then restart the clock.
        batch_time.update(time.time() - tick)
        tick = time.time()

        if step % print_freq == 0 or step + 1 == num_batches:
            header = (
                "*SEARCH* "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, num_batches)
            )
            timing = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            metrics = "Base [Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=base_losses, top1=base_top1, top5=base_top5
            )
            logger.log(" ".join((header, timing, metrics)))
    return base_losses.avg, base_top1.avg, base_top5.avg
Пример #10
0
def filter_indexes(xlist, mode, save_dir, seeds):
    """Return the indexes of ``xlist`` that still need to be processed.

    Args:
        xlist: sized iterable of integer architecture indexes.
        mode: "cover" keeps every index; any other value keeps an index only
            when at least one of its per-seed checkpoint files is missing.
        save_dir: path object supporting ``/``; results must support
            ``exists()``.
        seeds: iterable of integer seeds.

    Returns:
        List of the kept indexes, in the order of ``xlist``.
    """

    def misses_checkpoint(index):
        # True as soon as one seed has no saved file for this index.
        return any(
            not (save_dir / "arch-{:06d}-seed-{:04d}.pth".format(
                index, seed)).exists()
            for seed in seeds)

    if mode == "cover":
        all_indexes = list(xlist)
    else:
        all_indexes = [index for index in xlist if misses_checkpoint(index)]
    summary = "{:} [FILTER-INDEXES] : there are {:}/{:} architectures in total".format(
        time_string(), len(all_indexes), len(xlist))
    print(summary)
    return all_indexes
Пример #11
0
 def sub_plot_fn(ax, search_space, dataset):
     """Draw one accuracy-over-time curve per algorithm on ``ax``.

     Relies on names from the enclosing scope (``api_dict``, ``x_axis_s``,
     ``y_min_s``, ``y_max_s``, ``LabelSize``, ``LegendFontsize``, ...).

     Args:
         ax: axes object to draw on.
         search_space: search-space name, used as lookup key and in the title.
         dataset: dataset name, used as lookup key and in the title.
     """
     key = (dataset, search_space)
     max_time = x_axis_s[key]
     alg2data = fetch_data(search_space=search_space, dataset=dataset)
     total_tickets = 200
     # Evenly spaced query times in [0, int(max_time)).
     time_tickets = [
         float(i) / total_tickets * int(max_time)
         for i in range(total_tickets)
     ]
     ax.set_xlim(0, max_time)
     ax.set_ylim(y_min_s[key], y_max_s[key])
     for tick in ax.get_xticklabels():
         tick.set_rotation(25)
         tick.set_fontsize(LabelSize - 6)
     for tick in ax.get_yticklabels():
         tick.set_fontsize(LabelSize - 6)
     ax.xaxis.set_major_formatter(major_formatter)
     for alg, xdata in alg2data.items():
         accuracies = []
         for ticket in time_tickets:
             # Only the first returned value is plotted.
             accuracy, _ = query_performance(
                 api_dict[search_space], xdata["data"], dataset, ticket)
             accuracies.append(accuracy)
         print("{:} plot alg : {:10s} on {:}".format(
             time_string(), alg, search_space))
         ax.plot(
             time_tickets,
             accuracies,
             c=xdata["color"],
             linestyle=xdata["linestyle"],
             label="{:}".format(alg),
         )
     # Labels and title do not depend on the algorithm, so they are set once
     # here (and also when there is nothing to plot).
     ax.set_xlabel("Estimated wall-clock time", fontsize=LabelSize)
     ax.set_ylabel("Test accuracy", fontsize=LabelSize)
     ax.set_title(
         r"Results on {:} over {:}".format(name2label[dataset],
                                           spaces2latex[search_space]),
         fontsize=LabelSize,
     )
     ax.legend(loc=4, fontsize=LegendFontsize)
Пример #12
0
 def sub_plot_fn(ax, dataset):
     """Draw one accuracy-over-time curve per algorithm on ``ax``.

     Relies on names from the enclosing scope (``api``, ``search_space``,
     ``x_axis_s``, ``y_min_s``, ``y_max_s``, ``LabelSize``, ...).

     Args:
         ax: axes object to draw on.
         dataset: string of the form "<name>-T<max_time>"; it is split on
             "-T" into the dataset name and the integer time limit.
     """
     xdataset, max_time = dataset.split("-T")
     alg2data = fetch_data(search_space=search_space, dataset=dataset)
     total_tickets = 150
     # Evenly spaced query times in [0, int(max_time)).
     time_tickets = [
         float(i) / total_tickets * int(max_time)
         for i in range(total_tickets)
     ]
     # The x axis is shown in units of 1e2 seconds.
     scaled_tickets = [x / 100 for x in time_tickets]
     colors = ["b", "g", "c", "m", "y"]
     ax.set_xlim(0, x_axis_s[(xdataset, search_space)])
     ax.set_ylim(y_min_s[(xdataset, search_space)],
                 y_max_s[(xdataset, search_space)])
     for idx, (alg, data) in enumerate(alg2data.items()):
         accuracies = []
         for ticket in time_tickets:
             # Only the first returned value is plotted.
             accuracy, _ = query_performance(api, data, xdataset, ticket)
             accuracies.append(accuracy)
         valid_str, test_str = show_valid_test(api, data, xdataset)
         print("{:} plot alg : {:10s}  | validation = {:} | test = {:}".
               format(time_string(), alg, valid_str, test_str))
         ax.plot(
             scaled_tickets,
             accuracies,
             # Cycle the palette so more than five algorithms do not raise.
             c=colors[idx % len(colors)],
             label="{:}".format(alg),
         )
     # Labels and title do not depend on the algorithm, so they are set once
     # here (and also when there is nothing to plot).
     ax.set_xlabel("Estimated wall-clock time (1e2 seconds)",
                   fontsize=LabelSize)
     ax.set_ylabel("Test accuracy on {:}".format(name2label[xdataset]),
                   fontsize=LabelSize)
     ax.set_title(
         "Searching results on {:}".format(name2label[xdataset]),
         fontsize=LabelSize + 4,
     )
     ax.legend(loc=4, fontsize=LegendFontsize)
Пример #13
0
 def get_unique_matrix(archs, consider_zero):
     """Group ``archs`` by their unique string.

     Args:
         archs: sequence of objects exposing ``to_unique_str(consider_zero)``.
         consider_zero: forwarded unchanged to ``to_unique_str``.

     Returns:
         Tuple ``(sm_matrix, unique_ids, unique_num)``: a boolean matrix that
         is True wherever two architectures share a unique string, the group
         id of every architecture, and the number of groups.
     """
     keys = [arch.to_unique_str(consider_zero) for arch in archs]
     print("{:} create unique-string ({:}/{:}) done".format(
         time_string(), len(set(keys)), len(keys)))
     # Positions of the architectures sharing each unique string.
     groups = dict()
     for position, key in enumerate(keys):
         groups.setdefault(key, list()).append(position)
     sm_matrix = torch.eye(len(archs)).bool()
     for members in groups.values():
         for row in members:
             for col in members:
                 sm_matrix[row, col] = True
     unique_ids = [-1] * len(archs)
     unique_num = 0
     for row in range(len(unique_ids)):
         if unique_ids[row] >= 0:
             # Already labelled through an earlier member of its group.
             continue
         neighbours = sm_matrix[row].nonzero().view(-1).tolist()
         for nghb in neighbours:
             assert unique_ids[nghb] == -1, "impossible"
             unique_ids[nghb] = unique_num
         unique_num += 1
     return sm_matrix, unique_ids, unique_num
Пример #14
0
def search_train_v2(
    search_loader,
    network,
    criterion,
    scheduler,
    base_optimizer,
    arch_optimizer,
    optim_config,
    extra_info,
    print_freq,
    logger,
):
    """Alternate weight and architecture updates for one pass over the loader.

    Every batch first takes one step of ``base_optimizer`` on the base split,
    then one step of ``arch_optimizer`` on the arch split using the
    classification loss plus ``FLOP-weight`` times the FLOP loss.

    Args:
        search_loader: iterable of ``(base_inputs, base_targets, arch_inputs,
            arch_targets)`` batches with a length.
        network: wrapped network returning ``(logits, expected_flop)``;
            ``network.module.get_flop`` is queried on every step.
        criterion: classification loss function.
        scheduler: object whose ``update(None, fraction)`` is called per step.
        base_optimizer: optimizer stepped after the base-split loss.
        arch_optimizer: optimizer stepped after the arch-split loss.
        optim_config: not used in this function.
        extra_info: mapping with the keys "epoch-str", "FLOP-exp",
            "FLOP-weight" and "FLOP-tolerant".
        print_freq: log every ``print_freq`` steps (and on the last step).
        logger: object with a ``log(str)`` method.

    Returns:
        Tuple of the running averages ``(base loss, arch loss, top-1,
        top-5)``; the accuracies are measured on the base split.
    """
    data_time = AverageMeter()
    batch_time = AverageMeter()
    base_losses = AverageMeter()
    arch_losses = AverageMeter()
    top1 = AverageMeter()
    top5 = AverageMeter()
    arch_cls_losses = AverageMeter()
    arch_flop_losses = AverageMeter()

    epoch_str = extra_info["epoch-str"]
    flop_need = extra_info["FLOP-exp"]
    flop_weight = extra_info["FLOP-weight"]
    flop_tolerant = extra_info["FLOP-tolerant"]
    num_batches = len(search_loader)

    network.train()
    logger.log(
        "[Search] : {:}, FLOP-Require={:.2f} MB, FLOP-WEIGHT={:.2f}".format(
            epoch_str, flop_need, flop_weight
        )
    )
    tick = time.time()
    network.apply(change_key("search_mode", "search"))
    for step, batch in enumerate(search_loader):
        base_inputs, base_targets, arch_inputs, arch_targets = batch
        scheduler.update(None, 1.0 * step / num_batches)
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # Time spent waiting for the batch.
        data_time.update(time.time() - tick)

        # Step 1: update the weights on the base split.
        base_optimizer.zero_grad()
        logits, expected_flop = network(base_inputs)
        base_loss = criterion(logits, base_targets)
        base_loss.backward()
        base_optimizer.step()

        prec1, prec5 = obtain_accuracy(logits.data, base_targets.data, topk=(1, 5))
        base_size = base_inputs.size(0)
        base_losses.update(base_loss.item(), base_size)
        top1.update(prec1.item(), base_size)
        top5.update(prec5.item(), base_size)

        # Step 2: update the architecture on the arch split.
        arch_optimizer.zero_grad()
        logits, expected_flop = network(arch_inputs)
        flop_cur = network.module.get_flop("genotype", None, None)
        flop_loss, flop_loss_scale = get_flop_loss(
            expected_flop, flop_cur, flop_need, flop_tolerant
        )
        acls_loss = criterion(logits, arch_targets)
        arch_loss = acls_loss + flop_loss * flop_weight
        arch_loss.backward()
        arch_optimizer.step()

        arch_size = arch_inputs.size(0)
        arch_losses.update(arch_loss.item(), arch_size)
        arch_flop_losses.update(flop_loss_scale, arch_size)
        arch_cls_losses.update(acls_loss.item(), arch_size)

        # Time for the whole step, then restart the clock.
        batch_time.update(time.time() - tick)
        tick = time.time()
        if step % print_freq == 0 or (step + 1) == num_batches:
            header = (
                "**TRAIN** "
                + time_string()
                + " [{:}][{:03d}/{:03d}]".format(epoch_str, step, num_batches)
            )
            timing = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time
            )
            base_part = "Base-Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})".format(
                loss=base_losses, top1=top1, top5=top5
            )
            arch_part = "Acls-loss {aloss.val:.3f} ({aloss.avg:.3f}) FLOP-Loss {floss.val:.3f} ({floss.avg:.3f}) Arch-Loss {loss.val:.3f} ({loss.avg:.3f})".format(
                aloss=arch_cls_losses, floss=arch_flop_losses, loss=arch_losses
            )
            logger.log(" ".join((header, timing, base_part, arch_part)))

    summary = " **TRAIN** Prec@1 {top1.avg:.2f} Prec@5 {top5.avg:.2f} Error@1 {error1:.2f} Error@5 {error5:.2f} Base-Loss:{baseloss:.3f}, Arch-Loss={archloss:.3f}".format(
        top1=top1,
        top5=top5,
        error1=100 - top1.avg,
        error5=100 - top5.avg,
        baseloss=base_losses.avg,
        archloss=arch_losses.avg,
    )
    logger.log(summary)
    return base_losses.avg, arch_losses.avg, top1.avg, top5.avg
Пример #15
0
def main(xargs):
    """Run the SETN architecture search from data loading to the final report.

    The function builds the search/validation loaders, the "SETN" search
    network and its two optimizers (weights and architecture parameters),
    resumes from the last checkpoint if one exists, and then, for every epoch,
    calls ``search_func`` to train, ``get_best_arch`` to pick a genotype and
    ``valid_func`` to evaluate it. A checkpoint and a small "last-info" file
    are written after each epoch.

    Args:
        xargs: parsed command-line namespace. Attributes read directly here:
            ``workers``, ``rand_seed``, ``dataset``, ``data_path``,
            ``config_path``, ``search_space_name``, ``model_config``,
            ``channel``, ``num_cells``, ``max_nodes``,
            ``track_running_stats``, ``arch_learning_rate``,
            ``arch_weight_decay``, ``arch_nas_dataset``, ``select_num`` and
            ``print_freq``. It is also handed to ``prepare_logger``.

    Raises:
        AssertionError: if CUDA is not available.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    # Use the namespace that was passed in rather than a module-level ``args``
    # global, so the function does not depend on how the script was launched.
    logger = prepare_logger(xargs)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    config = load_config(xargs.config_path, {
        "class_num": class_num,
        "xshape": xshape
    }, logger)
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data,
        valid_data,
        xargs.dataset,
        "configs/nas-benchmark/",
        (config.batch_size, config.test_batch_size),
        xargs.workers,
    )
    logger.log(
        "||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}"
        .format(xargs.dataset, len(search_loader), len(valid_loader),
                config.batch_size))
    logger.log("||||||| {:10s} ||||||| Config={:}".format(
        xargs.dataset, config))

    search_space = get_search_spaces("cell", xargs.search_space_name)
    if xargs.model_config is None:
        # No model-config file: describe the SETN network from CLI arguments.
        model_config = dict2config(
            dict(
                name="SETN",
                C=xargs.channel,
                N=xargs.num_cells,
                max_nodes=xargs.max_nodes,
                num_classes=class_num,
                space=search_space,
                affine=False,
                track_running_stats=bool(xargs.track_running_stats),
            ),
            None,
        )
    else:
        model_config = load_config(
            xargs.model_config,
            dict(
                num_classes=class_num,
                space=search_space,
                affine=False,
                track_running_stats=bool(xargs.track_running_stats),
            ),
            None,
        )
    logger.log("search space : {:}".format(search_space))
    search_model = get_cell_based_tiny_net(model_config)

    # Separate optimizers: one for the network weights, one for the
    # architecture parameters (alphas).
    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config)
    a_optimizer = torch.optim.Adam(
        search_model.get_alphas(),
        lr=xargs.arch_learning_rate,
        betas=(0.5, 0.999),
        weight_decay=xargs.arch_weight_decay,
    )
    logger.log("w-optimizer : {:}".format(w_optimizer))
    logger.log("a-optimizer : {:}".format(a_optimizer))
    logger.log("w-scheduler : {:}".format(w_scheduler))
    logger.log("criterion   : {:}".format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log("FLOP = {:.2f} M, Params = {:.2f} MB".format(flop, param))
    logger.log("search-space : {:}".format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log("{:} create API = {:} done".format(time_string(), api))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        # Keep the path separately so the summary log below reports the file
        # that was loaded instead of dumping the whole loaded dictionary.
        last_info_path = last_info
        last_info = torch.load(last_info_path)
        start_epoch = last_info["epoch"]
        checkpoint = torch.load(last_info["last_checkpoint"])
        genotypes = checkpoint["genotypes"]
        valid_accuracies = checkpoint["valid_accuracies"]
        search_model.load_state_dict(checkpoint["search_model"])
        w_scheduler.load_state_dict(checkpoint["w_scheduler"])
        w_optimizer.load_state_dict(checkpoint["w_optimizer"])
        a_optimizer.load_state_dict(checkpoint["a_optimizer"])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info_path, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        # Fresh start: record the genotype selected before any training
        # under the pseudo-epoch key -1.
        init_genotype, _ = get_best_arch(valid_loader, network,
                                         xargs.select_num)
        start_epoch, valid_accuracies, genotypes = 0, {
            "best": -1
        }, {
            -1: init_genotype
        }

    # start training
    start_time, search_time, epoch_time, total_epoch = (
        time.time(),
        AverageMeter(),
        AverageMeter(),
        config.epochs + config.warmup,
    )
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch)
        logger.log("\n[Search the {:}-th epoch] {:}, LR={:}".format(
            epoch_str, need_time, min(w_scheduler.get_lr())))

        (
            search_w_loss,
            search_w_top1,
            search_w_top5,
            search_a_loss,
            search_a_top1,
            search_a_top5,
        ) = search_func(
            search_loader,
            network,
            criterion,
            w_scheduler,
            w_optimizer,
            a_optimizer,
            epoch_str,
            xargs.print_freq,
            logger,
        )
        search_time.update(time.time() - start_time)
        logger.log(
            "[{:}] search [base] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s"
            .format(epoch_str, search_w_loss, search_w_top1, search_w_top5,
                    search_time.sum))
        logger.log(
            "[{:}] search [arch] : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%"
            .format(epoch_str, search_a_loss, search_a_top1, search_a_top5))

        # Pick a genotype for this epoch and evaluate the network with it.
        genotype, _ = get_best_arch(valid_loader, network, xargs.select_num)
        network.module.set_cal_mode("dynamic", genotype)
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion)
        logger.log(
            "[{:}] evaluate : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}% | {:}"
            .format(epoch_str, valid_a_loss, valid_a_top1, valid_a_top5,
                    genotype))
        # record this epoch's validation accuracy and genotype
        valid_accuracies[epoch] = valid_a_top1

        genotypes[epoch] = genotype
        logger.log("<<<--->>> The {:}-th epoch : {:}".format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "search_model": search_model.state_dict(),
                "w_optimizer": w_optimizer.state_dict(),
                "a_optimizer": a_optimizer.state_dict(),
                "w_scheduler": w_scheduler.state_dict(),
                "genotypes": genotypes,
                "valid_accuracies": valid_accuracies,
            },
            model_base_path,
            logger,
        )
        # The "info" file only points at the full checkpoint; it is what the
        # resume branch above reads first.
        last_info = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )
        with torch.no_grad():
            logger.log("{:}".format(search_model.show_alphas()))
        if api is not None:
            logger.log("{:}".format(api.query_by_arch(genotypes[epoch],
                                                      "200")))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # the final post procedure : count the time
    start_time = time.time()
    genotype, _ = get_best_arch(valid_loader, network, xargs.select_num)
    search_time.update(time.time() - start_time)
    network.module.set_cal_mode("dynamic", genotype)
    valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
        valid_loader, network, criterion)
    logger.log(
        "Last : the gentotype is : {:}, with the validation accuracy of {:.3f}%."
        .format(genotype, valid_a_top1))

    logger.log("\n" + "-" * 100)
    # check the performance from the architecture dataset
    logger.log(
        "SETN : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format(
            total_epoch, search_time.sum, genotype))
    if api is not None:
        logger.log("{:}".format(api.query_by_arch(genotype, "200")))
    logger.close()
Пример #16
0
def search_func(
    xloader,
    network,
    criterion,
    scheduler,
    w_optimizer,
    a_optimizer,
    enable_controller,
    algo,
    epoch_str,
    print_freq,
    logger,
):
    """Run one search epoch, alternating weight and architecture updates.

    Each item of ``xloader`` is unpacked into base inputs/targets and arch
    inputs/targets. The weights are updated on the base pair with
    ``w_optimizer``; an architecture loss is then computed on the arch pair
    and, only when ``enable_controller`` is truthy, back-propagated and
    applied with ``a_optimizer``.

    Args:
        xloader: iterable of ``(base_inputs, base_targets, arch_inputs,
            arch_targets)`` batches; must support ``len``.
        network: model returning a 3-tuple whose second item is the logits
            and whose third item is an iterable of log-probabilities.
        criterion: loss applied to ``(logits, targets)``.
        scheduler: object whose ``update(None, fraction)`` is called per step.
        w_optimizer: optimizer for the network weights.
        a_optimizer: optimizer for the architecture parameters.
        enable_controller: whether to apply the architecture update.
        algo: ``"mask_rl"``, ``"tas"`` or ``"mask_gumbel"``.
        epoch_str: epoch label used in log lines.
        print_freq: log every ``print_freq`` steps (and on the last step).
        logger: object with a ``log(str)`` method.

    Returns:
        ``(base_loss, base_top1, base_top5, arch_loss, arch_top1, arch_top5)``
        as running averages over the epoch.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.
    """
    load_meter, step_meter = AverageMeter(), AverageMeter()
    w_loss_meter, w_top1_meter, w_top5_meter = (
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
    )
    a_loss_meter, a_top1_meter, a_top5_meter = (
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
    )
    tick = time.time()
    network.train()
    for step, batch in enumerate(xloader):
        base_inputs, base_targets, arch_inputs, arch_targets = batch
        num_steps = len(xloader)
        # fractional progress through the epoch for the LR schedule
        scheduler.update(None, float(step) / num_steps)
        base_inputs = base_inputs.cuda(non_blocking=True)
        arch_inputs = arch_inputs.cuda(non_blocking=True)
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # time spent waiting for (and transferring) the batch
        load_meter.update(time.time() - tick)

        # ---- phase 1: network weights on the base split ----
        network.zero_grad()
        _, base_logits, _ = network(base_inputs)
        w_loss = criterion(base_logits, base_targets)
        w_loss.backward()
        w_optimizer.step()
        w_prec1, w_prec5 = obtain_accuracy(
            base_logits.data, base_targets.data, topk=(1, 5)
        )
        base_count = base_inputs.size(0)
        w_loss_meter.update(w_loss.item(), base_count)
        w_top1_meter.update(w_prec1.item(), base_count)
        w_top5_meter.update(w_prec5.item(), base_count)

        # ---- phase 2: architecture parameters on the arch split ----
        network.zero_grad()
        a_optimizer.zero_grad()
        _, arch_logits, log_probs = network(arch_inputs)
        a_prec1, a_prec5 = obtain_accuracy(
            arch_logits.data, arch_targets.data, topk=(1, 5)
        )
        if algo == "mask_rl":
            # The advantage is the top-1 accuracy minus a moving baseline;
            # it is computed without gradient tracking.
            with torch.no_grad():
                RL_BASELINE_EMA.update(a_prec1.item())
                advantage = a_prec1 - RL_BASELINE_EMA.value
            a_loss = -advantage * sum(log_probs)
        elif algo in ("tas", "mask_gumbel"):
            a_loss = criterion(arch_logits, arch_targets)
        else:
            raise ValueError("invalid algorightm name: {:}".format(algo))
        if enable_controller:
            a_loss.backward()
            a_optimizer.step()
        arch_count = arch_inputs.size(0)
        a_loss_meter.update(a_loss.item(), arch_count)
        a_top1_meter.update(a_prec1.item(), arch_count)
        a_top5_meter.update(a_prec5.item(), arch_count)

        # time for the whole step, then restart the clock
        step_meter.update(time.time() - tick)
        tick = time.time()

        is_last_step = (step + 1) == num_steps
        if step % print_freq == 0 or is_last_step:
            header = "".join(
                [
                    "*SEARCH* ",
                    time_string(),
                    " [{:}][{:03d}/{:03d}]".format(epoch_str, step, num_steps),
                ]
            )
            timing = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=step_meter, data_time=load_meter
            )
            base_part = "Base [Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=w_loss_meter, top1=w_top1_meter, top5=w_top5_meter
            )
            arch_part = "Arch [Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=a_loss_meter, top1=a_top1_meter, top5=a_top5_meter
            )
            logger.log(" ".join([header, timing, base_part, arch_part]))

    epoch_summary = (
        w_loss_meter.avg,
        w_top1_meter.avg,
        w_top5_meter.avg,
        a_loss_meter.avg,
        a_top1_meter.avg,
        a_top5_meter.avg,
    )
    return epoch_summary
Пример #17
0
def train_controller(
    xloader,
    shared_cnn,
    controller,
    criterion,
    optimizer,
    config,
    epoch_str,
    print_freq,
    logger,
):
    """Train the controller with a policy-gradient loss on sampled architectures.

    For ``config.ctl_train_steps * config.ctl_num_aggre`` steps, the
    controller samples an architecture, ``shared_cnn`` (in eval mode, without
    gradients) is evaluated on one batch with that architecture, and the loss
    ``-log_prob * (reward - baseline)`` is back-propagated. Gradients are
    accumulated over ``config.ctl_num_aggre`` steps, clipped to norm 5.0 and
    then applied with ``optimizer``.

    Args:
        xloader: data loader yielding ``(inputs, targets)``; it is restarted
            whenever it is exhausted.
        shared_cnn: wrapped network exposing ``module.update_arch`` and
            returning a 2-tuple whose second item is the logits.
        controller: callable returning ``(log_prob, entropy, sampled_arch)``.
        criterion: unused here; kept for a uniform call signature.
        optimizer: optimizer for the controller parameters.
        config: object with ``baseline`` (``None`` or the baseline carried
            over from earlier training), ``ctl_train_steps``,
            ``ctl_num_aggre``, ``ctl_entropy_w`` and ``ctl_bl_dec``.
        epoch_str: epoch label used in log lines.
        print_freq: log every ``print_freq`` steps.
        logger: object with a ``log(str)`` method.

    Returns:
        ``(loss_avg, val_acc_avg, baseline_avg, reward_avg, last_baseline)``,
        where ``last_baseline`` is the baseline of the final step as a float.
    """
    data_time, batch_time = AverageMeter(), AverageMeter()
    (
        GradnormMeter,
        LossMeter,
        ValAccMeter,
        EntropyMeter,
        BaselineMeter,
        RewardMeter,
        xend,
    ) = (
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        time.time(),
    )

    shared_cnn.eval()
    controller.train()
    controller.zero_grad()
    total_steps = config.ctl_train_steps * config.ctl_num_aggre
    loader_iter = iter(xloader)
    for step in range(total_steps):
        try:
            inputs, targets = next(loader_iter)
        except StopIteration:
            # The loader ran out before the requested number of steps: start
            # another pass. Only exhaustion is handled here so that genuine
            # data-loading errors and KeyboardInterrupt are not swallowed.
            loader_iter = iter(xloader)
            inputs, targets = next(loader_iter)
        # NOTE(review): only targets are moved to the GPU explicitly;
        # presumably the wrapper around shared_cnn places inputs - confirm.
        targets = targets.cuda(non_blocking=True)
        # measure data loading time
        data_time.update(time.time() - xend)

        log_prob, entropy, sampled_arch = controller()
        with torch.no_grad():
            shared_cnn.module.update_arch(sampled_arch)
            _, logits = shared_cnn(inputs)
            val_top1, _ = obtain_accuracy(logits.data,
                                          targets.data,
                                          topk=(1, 5))
            # divide by 100; the meter below multiplies by 100 again
            val_top1 = val_top1.view(-1) / 100
        reward = val_top1 + config.ctl_entropy_w * entropy
        if config.baseline is None:
            baseline = val_top1
        else:
            # Move the incoming baseline towards the current reward by a
            # factor of (1 - ctl_bl_dec).
            baseline = config.baseline - (1 - config.ctl_bl_dec) * (
                config.baseline - reward)

        loss = -1 * log_prob * (reward - baseline)

        # account
        RewardMeter.update(reward.item())
        BaselineMeter.update(baseline.item())
        ValAccMeter.update(val_top1.item() * 100)
        LossMeter.update(loss.item())
        EntropyMeter.update(entropy.item())

        # Average gradient over controller_num_aggregate samples
        loss = loss / config.ctl_num_aggre
        loss.backward(retain_graph=True)

        # measure elapsed time
        batch_time.update(time.time() - xend)
        xend = time.time()
        if (step + 1) % config.ctl_num_aggre == 0:
            # A full group of samples has been accumulated: clip and apply.
            grad_norm = torch.nn.utils.clip_grad_norm_(controller.parameters(),
                                                       5.0)
            GradnormMeter.update(grad_norm)
            optimizer.step()
            controller.zero_grad()

        if step % print_freq == 0:
            Sstr = ("*Train-Controller* " + time_string() +
                    " [{:}][{:03d}/{:03d}]".format(epoch_str, step,
                                                   total_steps))
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time)
            Wstr = "[Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Reward {reward.val:.2f} ({reward.avg:.2f})] Baseline {basel.val:.2f} ({basel.avg:.2f})".format(
                loss=LossMeter,
                top1=ValAccMeter,
                reward=RewardMeter,
                basel=BaselineMeter,
            )
            Estr = "Entropy={:.4f} ({:.4f})".format(EntropyMeter.val,
                                                    EntropyMeter.avg)
            logger.log(Sstr + " " + Tstr + " " + Wstr + " " + Estr)

    return (
        LossMeter.avg,
        ValAccMeter.avg,
        BaselineMeter.avg,
        RewardMeter.avg,
        baseline.item(),
    )
Пример #18
0
def main(xargs):
    """Run the ENAS architecture search from data loading to the final report.

    The function builds the train/valid loaders, the "ENAS" shared network
    and its controller, resumes from the last checkpoint if one exists, and
    then, for every epoch, trains the shared network (``train_shared_cnn``),
    trains the controller (``train_controller``), selects an architecture with
    ``get_best_arch`` and evaluates it with ``valid_func``. A checkpoint is
    written after each epoch and copied to the "best" path whenever the
    validation accuracy improves. After the loop, a final architecture is
    selected from ``xargs.controller_num_samples`` samples and reported.

    Args:
        xargs: parsed command-line namespace. Attributes read directly here:
            ``workers``, ``rand_seed``, ``dataset``, ``data_path``,
            ``config_path``, ``search_space_name``, ``channel``,
            ``num_cells``, ``max_nodes``, ``track_running_stats``,
            ``arch_nas_dataset``, ``print_freq``, ``controller_train_steps``,
            ``controller_num_aggregate``, ``controller_entropy_weight``,
            ``controller_bl_dec`` and ``controller_num_samples``. It is also
            handed to ``prepare_logger``.

    Raises:
        AssertionError: if CUDA is not available.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    # Use the namespace that was passed in rather than a module-level ``args``
    # global, so the function does not depend on how the script was launched.
    logger = prepare_logger(xargs)

    train_data, test_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1)
    logger.log("use config from : {:}".format(xargs.config_path))
    config = load_config(xargs.config_path, {
        "class_num": class_num,
        "xshape": xshape
    }, logger)
    _, train_loader, valid_loader = get_nas_search_loaders(
        train_data,
        test_data,
        xargs.dataset,
        "configs/nas-benchmark/",
        config.batch_size,
        xargs.workers,
    )
    # since ENAS will train the controller on valid-loader, we need to use train transformation for valid-loader
    valid_loader.dataset.transform = deepcopy(train_loader.dataset.transform)
    if hasattr(valid_loader.dataset, "transforms"):
        valid_loader.dataset.transforms = deepcopy(
            train_loader.dataset.transforms)
    # data loader
    logger.log(
        "||||||| {:10s} ||||||| Train-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}"
        .format(xargs.dataset, len(train_loader), len(valid_loader),
                config.batch_size))
    logger.log("||||||| {:10s} ||||||| Config={:}".format(
        xargs.dataset, config))

    search_space = get_search_spaces("cell", xargs.search_space_name)
    model_config = dict2config(
        {
            "name": "ENAS",
            "C": xargs.channel,
            "N": xargs.num_cells,
            "max_nodes": xargs.max_nodes,
            "num_classes": class_num,
            "space": search_space,
            "affine": False,
            "track_running_stats": bool(xargs.track_running_stats),
        },
        None,
    )
    shared_cnn = get_cell_based_tiny_net(model_config)
    controller = shared_cnn.create_controller()

    # Separate optimizers: one for the shared network, one for the controller.
    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        shared_cnn.parameters(), config)
    a_optimizer = torch.optim.Adam(
        controller.parameters(),
        lr=config.controller_lr,
        betas=config.controller_betas,
        eps=config.controller_eps,
    )
    logger.log("w-optimizer : {:}".format(w_optimizer))
    logger.log("a-optimizer : {:}".format(a_optimizer))
    logger.log("w-scheduler : {:}".format(w_scheduler))
    logger.log("criterion   : {:}".format(criterion))
    logger.log("search-space : {:}".format(search_space))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log("{:} create API = {:} done".format(time_string(), api))
    shared_cnn, controller, criterion = (
        torch.nn.DataParallel(shared_cnn).cuda(),
        controller.cuda(),
        criterion.cuda(),
    )

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        # Keep the path separately so the summary log below reports the file
        # that was loaded instead of dumping the whole loaded dictionary.
        last_info_path = last_info
        last_info = torch.load(last_info_path)
        start_epoch = last_info["epoch"]
        checkpoint = torch.load(last_info["last_checkpoint"])
        genotypes = checkpoint["genotypes"]
        baseline = checkpoint["baseline"]
        valid_accuracies = checkpoint["valid_accuracies"]
        shared_cnn.load_state_dict(checkpoint["shared_cnn"])
        controller.load_state_dict(checkpoint["controller"])
        w_scheduler.load_state_dict(checkpoint["w_scheduler"])
        w_optimizer.load_state_dict(checkpoint["w_optimizer"])
        a_optimizer.load_state_dict(checkpoint["a_optimizer"])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info_path, start_epoch))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, genotypes, baseline = 0, {
            "best": -1
        }, {}, None

    # start training
    start_time, search_time, epoch_time, total_epoch = (
        time.time(),
        AverageMeter(),
        AverageMeter(),
        config.epochs + config.warmup,
    )
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True))
        epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch)
        logger.log(
            "\n[Search the {:}-th epoch] {:}, LR={:}, baseline={:}".format(
                epoch_str, need_time, min(w_scheduler.get_lr()), baseline))

        cnn_loss, cnn_top1, cnn_top5 = train_shared_cnn(
            train_loader,
            shared_cnn,
            controller,
            criterion,
            w_scheduler,
            w_optimizer,
            epoch_str,
            xargs.print_freq,
            logger,
        )
        logger.log(
            "[{:}] shared-cnn : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%"
            .format(epoch_str, cnn_loss, cnn_top1, cnn_top5))
        # The baseline returned here is fed back in on the next epoch and is
        # stored in the checkpoint.
        ctl_loss, ctl_acc, ctl_baseline, ctl_reward, baseline = train_controller(
            valid_loader,
            shared_cnn,
            controller,
            criterion,
            a_optimizer,
            dict2config(
                {
                    "baseline": baseline,
                    "ctl_train_steps": xargs.controller_train_steps,
                    "ctl_num_aggre": xargs.controller_num_aggregate,
                    "ctl_entropy_w": xargs.controller_entropy_weight,
                    "ctl_bl_dec": xargs.controller_bl_dec,
                },
                None,
            ),
            epoch_str,
            xargs.print_freq,
            logger,
        )
        search_time.update(time.time() - start_time)
        logger.log(
            "[{:}] controller : loss={:.2f}, accuracy={:.2f}%, baseline={:.2f}, reward={:.2f}, current-baseline={:.4f}, time-cost={:.1f} s"
            .format(
                epoch_str,
                ctl_loss,
                ctl_acc,
                ctl_baseline,
                ctl_reward,
                baseline,
                search_time.sum,
            ))
        best_arch, _ = get_best_arch(controller, shared_cnn, valid_loader)
        shared_cnn.module.update_arch(best_arch)
        _, best_valid_acc, _ = valid_func(valid_loader, shared_cnn, criterion)

        genotypes[epoch] = best_arch
        # check the best accuracy
        valid_accuracies[epoch] = best_valid_acc
        find_best = best_valid_acc > valid_accuracies["best"]
        if find_best:
            valid_accuracies["best"] = best_valid_acc
            genotypes["best"] = best_arch

        logger.log("<<<--->>> The {:}-th epoch : {:}".format(
            epoch_str, genotypes[epoch]))
        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "baseline": baseline,
                "shared_cnn": shared_cnn.state_dict(),
                "controller": controller.state_dict(),
                "w_optimizer": w_optimizer.state_dict(),
                "a_optimizer": a_optimizer.state_dict(),
                "w_scheduler": w_scheduler.state_dict(),
                "genotypes": genotypes,
                "valid_accuracies": valid_accuracies,
            },
            model_base_path,
            logger,
        )
        # The "info" file only points at the full checkpoint; it is what the
        # resume branch above reads first.
        last_info = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )
        if find_best:
            logger.log(
                "<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%."
                .format(epoch_str, best_valid_acc))
            copy_checkpoint(model_base_path, model_best_path, logger)
        if api is not None:
            logger.log("{:}".format(api.query_by_arch(genotypes[epoch],
                                                      "200")))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("\n" + "-" * 100)
    logger.log("During searching, the best architecture is {:}".format(
        genotypes["best"]))
    logger.log("Its accuracy is {:.2f}%".format(valid_accuracies["best"]))
    logger.log("Randomly select {:} architectures and select the best.".format(
        xargs.controller_num_samples))
    start_time = time.time()
    final_arch, _ = get_best_arch(controller, shared_cnn, valid_loader,
                                  xargs.controller_num_samples)
    search_time.update(time.time() - start_time)
    shared_cnn.module.update_arch(final_arch)
    final_loss, final_top1, final_top5 = valid_func(valid_loader, shared_cnn,
                                                    criterion)
    logger.log("The Selected Final Architecture : {:}".format(final_arch))
    logger.log("Loss={:.3f}, Accuracy@1={:.2f}%, Accuracy@5={:.2f}%".format(
        final_loss, final_top1, final_top5))
    logger.log(
        "ENAS : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format(
            total_epoch, search_time.sum, final_arch))
    if api is not None:
        # Pass the same "200" argument as the per-epoch query above so the
        # final report is produced with the same benchmark setting.
        logger.log("{:}".format(api.query_by_arch(final_arch, "200")))
    logger.close()
Пример #19
0
def main(args):
    """Train a student network with knowledge distillation and log the results.

    The teacher is restored from ``args.KD_checkpoint`` and the student is
    built from ``args.model_config``.  The training state is restored, in
    priority order, from the logger's "info" file, from ``args.resume`` or
    from ``args.init_model`` (weights only); otherwise training starts from
    scratch.  A checkpoint is written after every epoch and copied to the
    "best" path whenever the validation top-1 accuracy improves.

    Args:
        args: parsed command-line namespace.  This function reads
            ``rand_seed`` , ``dataset``, ``data_path``, ``cutout_length``,
            ``batch_size``, ``workers``, ``model_config``, ``optim_config``,
            ``KD_alpha``, ``KD_temperature``, ``KD_checkpoint``, ``resume``,
            ``init_model``, ``procedure``, ``eval_frequency``,
            ``print_freq`` and ``print_freq_eval``; ``prepare_logger`` may
            read further attributes.

    Raises:
        AssertionError: if CUDA is not available, or if the requested resume /
            initialization file does not exist.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True
    # torch.set_num_threads(args.workers)

    prepare_seed(args.rand_seed)
    logger = prepare_logger(args)

    train_data, valid_data, xshape, class_num = get_datasets(
        args.dataset, args.data_path, args.cutout_length)
    train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
    )
    # get configures
    model_config = load_config(args.model_config, {"class_num": class_num},
                               logger)
    optim_config = load_config(
        args.optim_config,
        {
            "class_num": class_num,
            "KD_alpha": args.KD_alpha,
            "KD_temperature": args.KD_temperature,
        },
        logger,
    )

    # load the teacher checkpoint
    teacher_base = load_net_from_checkpoint(args.KD_checkpoint)
    teacher = torch.nn.DataParallel(teacher_base).cuda()

    base_model = obtain_model(model_config)
    flop, param = get_model_infos(base_model, xshape)
    logger.log("Student ====>>>>:\n{:}".format(base_model))
    logger.log("Teacher ====>>>>:\n{:}".format(teacher_base))
    logger.log("model information : {:}".format(base_model.get_message()))
    logger.log("-" * 50)
    logger.log("Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G".format(
        param, flop, flop / 1e3))
    logger.log("-" * 50)
    logger.log("train_data : {:}".format(train_data))
    logger.log("valid_data : {:}".format(valid_data))
    optimizer, scheduler, criterion = get_optim_scheduler(
        base_model.parameters(), optim_config)
    logger.log("optimizer  : {:}".format(optimizer))
    logger.log("scheduler  : {:}".format(scheduler))
    logger.log("criterion  : {:}".format(criterion))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(
        base_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            last_info))
        # Keep ``last_info`` bound to the path so the message below reports
        # the file that was read rather than dumping the loaded dictionary.
        last_info_data = torch.load(last_info)
        # Checkpoints store the index of the epoch that just finished.
        start_epoch = last_info_data["epoch"] + 1
        checkpoint = torch.load(last_info_data["last_checkpoint"])
        base_model.load_state_dict(checkpoint["base-model"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        valid_accuracies = checkpoint["valid_accuracies"]
        max_bytes = checkpoint["max_bytes"]
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(last_info, start_epoch))
    elif args.resume is not None:
        assert Path(
            args.resume).exists(), "Can not find the resume file : {:}".format(
                args.resume)
        checkpoint = torch.load(args.resume)
        start_epoch = checkpoint["epoch"] + 1
        base_model.load_state_dict(checkpoint["base-model"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        optimizer.load_state_dict(checkpoint["optimizer"])
        valid_accuracies = checkpoint["valid_accuracies"]
        max_bytes = checkpoint["max_bytes"]
        logger.log(
            "=> loading checkpoint from '{:}' start with {:}-th epoch.".format(
                args.resume, start_epoch))
    elif args.init_model is not None:
        # Only the weights are taken; optimizer/scheduler state starts fresh.
        assert Path(args.init_model).exists(
        ), "Can not find the initialization file : {:}".format(args.init_model)
        checkpoint = torch.load(args.init_model)
        base_model.load_state_dict(checkpoint["base-model"])
        start_epoch, valid_accuracies, max_bytes = 0, {"best": -1}, {}
        logger.log("=> initialize the model from {:}".format(args.init_model))
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        start_epoch, valid_accuracies, max_bytes = 0, {"best": -1}, {}

    train_func, valid_func = get_procedures(args.procedure)

    total_epoch = optim_config.epochs + optim_config.warmup
    # Main Training and Evaluation Loop
    start_time = time.time()
    epoch_time = AverageMeter()
    for epoch in range(start_epoch, total_epoch):
        scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True))
        epoch_str = "epoch={:03d}/{:03d}".format(epoch, total_epoch)
        LRs = scheduler.get_lr()
        find_best = False

        logger.log(
            "\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}"
            .format(time_string(), epoch_str, need_time, min(LRs), max(LRs),
                    scheduler))

        # train for one epoch
        train_loss, train_acc1, train_acc5 = train_func(
            train_loader,
            teacher,
            network,
            criterion,
            scheduler,
            optimizer,
            optim_config,
            epoch_str,
            args.print_freq,
            logger,
        )
        # log the results
        logger.log(
            "***{:s}*** TRAIN [{:}] loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}"
            .format(time_string(), epoch_str, train_loss, train_acc1,
                    train_acc5))

        # evaluate periodically and always on the final epoch
        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
            logger.log("-" * 150)
            valid_loss, valid_acc1, valid_acc5 = valid_func(
                valid_loader,
                teacher,
                network,
                criterion,
                optim_config,
                epoch_str,
                args.print_freq_eval,
                logger,
            )
            valid_accuracies[epoch] = valid_acc1
            logger.log(
                "***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}"
                .format(
                    time_string(),
                    epoch_str,
                    valid_loss,
                    valid_acc1,
                    valid_acc5,
                    valid_accuracies["best"],
                    100 - valid_accuracies["best"],
                ))
            if valid_acc1 > valid_accuracies["best"]:
                valid_accuracies["best"] = valid_acc1
                find_best = True
                logger.log(
                    "Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}."
                    .format(
                        epoch,
                        valid_acc1,
                        valid_acc5,
                        100 - valid_acc1,
                        100 - valid_acc5,
                        model_best_path,
                    ))
            # NOTE(review): ``max_memory_cached`` is a deprecated alias of
            # ``max_memory_reserved`` in recent PyTorch releases -- confirm
            # the minimum supported version before switching.
            num_bytes = (torch.cuda.max_memory_cached(
                next(network.parameters()).device) * 1.0)
            logger.log(
                "[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]"
                .format(
                    next(network.parameters()).device,
                    int(num_bytes),
                    num_bytes / 1e3,
                    num_bytes / 1e6,
                    num_bytes / 1e9,
                ))
            max_bytes[epoch] = num_bytes
        if epoch % 10 == 0:
            torch.cuda.empty_cache()

        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "max_bytes": deepcopy(max_bytes),
                "FLOP": flop,
                "PARAM": param,
                "valid_accuracies": deepcopy(valid_accuracies),
                "model-config": model_config._asdict(),
                "optim-config": optim_config._asdict(),
                "base-model": base_model.state_dict(),
                "scheduler": scheduler.state_dict(),
                "optimizer": optimizer.state_dict(),
            },
            model_base_path,
            logger,
        )
        if find_best:
            copy_checkpoint(model_base_path, model_best_path, logger)
        last_info = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    # ``max_bytes`` is empty when no epoch was evaluated in any run (e.g. a
    # fresh start with zero total epochs); report 0 instead of crashing.
    peak_bytes = max(max_bytes.values(), default=0.0)

    logger.log("\n" + "-" * 200)
    logger.log("||| Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G".format(
        param, flop, flop / 1e3))
    logger.log(
        "Finish training/validation in {:} with Max-GPU-Memory of {:.2f} MB, and save final checkpoint into {:}"
        .format(
            convert_secs2time(epoch_time.sum, True),
            peak_bytes / 1e6,
            logger.path("info"),
        ))
    logger.log("-" * 200 + "\n")
    logger.close()
Пример #20
0
def search_func(
    xloader,
    network,
    criterion,
    scheduler,
    w_optimizer,
    a_optimizer,
    epoch_str,
    print_freq,
    logger,
):
    """Run one search epoch, alternating weight and architecture updates.

    Each item of ``xloader`` is unpacked into
    ``(base_inputs, base_targets, arch_inputs, arch_targets)``.  For every
    batch a freshly sampled architecture is trained on the base data through
    ``w_optimizer``; the network is then switched to "joint" mode and the
    arch data drives one step of ``a_optimizer``.

    Returns:
        A 6-tuple of running averages: base loss, base top-1, base top-5,
        arch loss, arch top-1, arch top-5.
    """
    load_meter, step_meter = AverageMeter(), AverageMeter()
    w_loss, w_top1, w_top5 = AverageMeter(), AverageMeter(), AverageMeter()
    a_loss, a_top1, a_top5 = AverageMeter(), AverageMeter(), AverageMeter()

    def _record(loss_meter, top1_meter, top5_meter, loss, logits, targets,
                inputs):
        # Fold the statistics of one forward/backward pass into its meters.
        prec1, prec5 = obtain_accuracy(logits.data, targets.data, topk=(1, 5))
        count = inputs.size(0)
        loss_meter.update(loss.item(), count)
        top1_meter.update(prec1.item(), count)
        top5_meter.update(prec5.item(), count)

    tick = time.time()
    network.train()
    for step, batch in enumerate(xloader):
        base_inputs, base_targets, arch_inputs, arch_targets = batch
        num_steps = len(xloader)
        scheduler.update(None, 1.0 * step / num_steps)
        base_targets = base_targets.cuda(non_blocking=True)
        arch_targets = arch_targets.cuda(non_blocking=True)
        # time spent waiting for the data
        load_meter.update(time.time() - tick)

        # --- weight step on a dynamically sampled architecture ---
        sampled_arch = network.module.dync_genotype(True)
        network.module.set_cal_mode("dynamic", sampled_arch)
        network.zero_grad()
        _, base_logits = network(base_inputs)
        base_loss = criterion(base_logits, base_targets)
        base_loss.backward()
        w_optimizer.step()
        _record(w_loss, w_top1, w_top5, base_loss, base_logits, base_targets,
                base_inputs)

        # --- architecture-parameter step in joint mode ---
        network.module.set_cal_mode("joint")
        network.zero_grad()
        _, arch_logits = network(arch_inputs)
        arch_loss = criterion(arch_logits, arch_targets)
        arch_loss.backward()
        a_optimizer.step()
        _record(a_loss, a_top1, a_top5, arch_loss, arch_logits, arch_targets,
                arch_inputs)

        # time spent on the whole step
        step_meter.update(time.time() - tick)
        tick = time.time()

        if step % print_freq == 0 or step + 1 == num_steps:
            head = "".join([
                "*SEARCH* ",
                time_string(),
                " [{:}][{:03d}/{:03d}]".format(epoch_str, step, num_steps),
            ])
            timing = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=step_meter, data_time=load_meter)
            base_report = "Base [Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=w_loss, top1=w_top1, top5=w_top5)
            arch_report = "Arch [Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})]".format(
                loss=a_loss, top1=a_top1, top5=a_top5)
            logger.log(" ".join([head, timing, base_report, arch_report]))

    return (
        w_loss.avg,
        w_top1.avg,
        w_top5.avg,
        a_loss.avg,
        a_top1.avg,
        a_top5.avg,
    )
Пример #21
0
def main(xargs):
    """Run the DARTS-V1 architecture search and log the genotype per epoch.

    The search model is built either from ``xargs.model_config`` or, when that
    is ``None``, from an inline "DARTS-V1" configuration.  If the logger's
    "info" file exists the search resumes from it; otherwise it starts from
    epoch 0.  After every epoch the current genotype is recorded, a checkpoint
    is saved, and the checkpoint is copied to the "best" path when the
    validation top-1 accuracy improves.

    Args:
        xargs: parsed command-line namespace.  Every option is read from this
            object (never from a module-level ``args``), so the function can
            be called from other modules.

    Raises:
        AssertionError: if CUDA is not available.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    # Use the parameter, not a global ``args``.
    logger = prepare_logger(xargs)

    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, xargs.data_path, -1
    )
    config = load_config(
        xargs.config_path, {"class_num": class_num, "xshape": xshape}, logger
    )
    search_loader, _, valid_loader = get_nas_search_loaders(
        train_data,
        valid_data,
        xargs.dataset,
        "configs/nas-benchmark/",
        config.batch_size,
        xargs.workers,
    )
    logger.log(
        "||||||| {:10s} ||||||| Search-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}".format(
            xargs.dataset, len(search_loader), len(valid_loader), config.batch_size
        )
    )
    logger.log("||||||| {:10s} ||||||| Config={:}".format(xargs.dataset, config))

    search_space = get_search_spaces("cell", xargs.search_space_name)
    if xargs.model_config is None:
        model_config = dict2config(
            {
                "name": "DARTS-V1",
                "C": xargs.channel,
                "N": xargs.num_cells,
                "max_nodes": xargs.max_nodes,
                "num_classes": class_num,
                "space": search_space,
                "affine": False,
                "track_running_stats": bool(xargs.track_running_stats),
            },
            None,
        )
    else:
        model_config = load_config(
            xargs.model_config,
            {
                "num_classes": class_num,
                "space": search_space,
                "affine": False,
                "track_running_stats": bool(xargs.track_running_stats),
            },
            None,
        )
    search_model = get_cell_based_tiny_net(model_config)
    logger.log("search-model :\n{:}".format(search_model))

    # Network weights and architecture parameters use separate optimizers.
    w_optimizer, w_scheduler, criterion = get_optim_scheduler(
        search_model.get_weights(), config
    )
    a_optimizer = torch.optim.Adam(
        search_model.get_alphas(),
        lr=xargs.arch_learning_rate,
        betas=(0.5, 0.999),
        weight_decay=xargs.arch_weight_decay,
    )
    logger.log("w-optimizer : {:}".format(w_optimizer))
    logger.log("a-optimizer : {:}".format(a_optimizer))
    logger.log("w-scheduler : {:}".format(w_scheduler))
    logger.log("criterion   : {:}".format(criterion))
    flop, param = get_model_infos(search_model, xshape)
    logger.log("FLOP = {:.2f} M, Params = {:.2f} MB".format(flop, param))
    if xargs.arch_nas_dataset is None:
        api = None
    else:
        api = API(xargs.arch_nas_dataset)
    logger.log("{:} create API = {:} done".format(time_string(), api))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(search_model).cuda(), criterion.cuda()

    if last_info.exists():  # automatically resume from previous checkpoint
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start".format(last_info)
        )
        last_info = torch.load(last_info)
        # The stored value is already "finished epoch + 1" (see the save below).
        start_epoch = last_info["epoch"]
        checkpoint = torch.load(last_info["last_checkpoint"])
        genotypes = checkpoint["genotypes"]
        valid_accuracies = checkpoint["valid_accuracies"]
        search_model.load_state_dict(checkpoint["search_model"])
        w_scheduler.load_state_dict(checkpoint["w_scheduler"])
        w_optimizer.load_state_dict(checkpoint["w_optimizer"])
        a_optimizer.load_state_dict(checkpoint["a_optimizer"])
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch.".format(
                last_info, start_epoch
            )
        )
    else:
        logger.log("=> do not find the last-info file : {:}".format(last_info))
        # Key -1 holds the initial genotype, so ``genotypes[total_epoch - 1]``
        # below still resolves when ``total_epoch`` is 0.
        start_epoch, valid_accuracies, genotypes = (
            0,
            {"best": -1},
            {-1: search_model.genotype()},
        )

    # start training
    start_time, search_time, epoch_time, total_epoch = (
        time.time(),
        AverageMeter(),
        AverageMeter(),
        config.epochs + config.warmup,
    )
    for epoch in range(start_epoch, total_epoch):
        w_scheduler.update(epoch, 0.0)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.val * (total_epoch - epoch), True)
        )
        epoch_str = "{:03d}-{:03d}".format(epoch, total_epoch)
        logger.log(
            "\n[Search the {:}-th epoch] {:}, LR={:}".format(
                epoch_str, need_time, min(w_scheduler.get_lr())
            )
        )

        search_w_loss, search_w_top1, search_w_top5 = search_func(
            search_loader,
            network,
            criterion,
            w_scheduler,
            w_optimizer,
            a_optimizer,
            epoch_str,
            xargs.print_freq,
            logger,
            xargs.gradient_clip,
        )
        search_time.update(time.time() - start_time)
        logger.log(
            "[{:}] searching : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%, time-cost={:.1f} s".format(
                epoch_str, search_w_loss, search_w_top1, search_w_top5, search_time.sum
            )
        )
        valid_a_loss, valid_a_top1, valid_a_top5 = valid_func(
            valid_loader, network, criterion
        )
        logger.log(
            "[{:}] evaluate  : loss={:.2f}, accuracy@1={:.2f}%, accuracy@5={:.2f}%".format(
                epoch_str, valid_a_loss, valid_a_top1, valid_a_top5
            )
        )
        # check the best accuracy
        valid_accuracies[epoch] = valid_a_top1
        find_best = valid_a_top1 > valid_accuracies["best"]
        if find_best:
            valid_accuracies["best"] = valid_a_top1
            genotypes["best"] = search_model.genotype()

        genotypes[epoch] = search_model.genotype()
        logger.log(
            "<<<--->>> The {:}-th epoch : {:}".format(epoch_str, genotypes[epoch])
        )
        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch + 1,
                "args": deepcopy(xargs),
                "search_model": search_model.state_dict(),
                "w_optimizer": w_optimizer.state_dict(),
                "a_optimizer": a_optimizer.state_dict(),
                "w_scheduler": w_scheduler.state_dict(),
                "genotypes": genotypes,
                "valid_accuracies": valid_accuracies,
            },
            model_base_path,
            logger,
        )
        last_info = save_checkpoint(
            {
                "epoch": epoch + 1,
                # Use the parameter, not a global ``args``.
                "args": deepcopy(xargs),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )
        if find_best:
            logger.log(
                "<<<--->>> The {:}-th epoch : find the highest validation accuracy : {:.2f}%.".format(
                    epoch_str, valid_a_top1
                )
            )
            copy_checkpoint(model_base_path, model_best_path, logger)
        with torch.no_grad():
            logger.log("{:}".format(search_model.show_alphas()))
        if api is not None:
            logger.log("{:}".format(api.query_by_arch(genotypes[epoch], "200")))
        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("\n" + "-" * 100)
    logger.log(
        "DARTS-V1 : run {:} epochs, cost {:.1f} s, last-geno is {:}.".format(
            total_epoch, search_time.sum, genotypes[total_epoch - 1]
        )
    )
    if api is not None:
        logger.log("{:}".format(api.query_by_arch(genotypes[total_epoch - 1], "200")))
    logger.close()
Пример #22
0
def check_files(save_dir, meta_file, basestr):
    """Print statistics about the checkpoints stored under ``save_dir``.

    Args:
        save_dir: path-like object supporting ``glob`` and ``/``; its
            sub-directories matching ``*-*-<basestr>`` are scanned for files
            named ``arch-<index>-seed-<seed>.pth``.
        meta_file: file readable by ``torch.load`` that holds a mapping with
            the keys ``"archs"`` and ``"total"``.
        basestr: ``"C16-N5"`` (seeds 777/888/999) or ``"C16-N5-LESS"``
            (seeds 111/777).

    Raises:
        AssertionError: if the meta file is inconsistent, a checkpoint name is
            malformed, or an architecture index is out of range.
        ValueError: if ``basestr`` is not one of the supported values and at
            least one matching sub-directory exists.
    """
    meta_infos = torch.load(meta_file, map_location="cpu")
    meta_archs = meta_infos["archs"]
    meta_num_archs = meta_infos["total"]
    assert meta_num_archs == len(
        meta_archs), "invalid number of archs : {:} vs {:}".format(
            meta_num_archs, len(meta_archs))

    sub_model_dirs = sorted(save_dir.glob("*-*-{:}".format(basestr)))
    print("{:} find {:} directories used to save checkpoints".format(
        time_string(), len(sub_model_dirs)))

    subdir2archs, num_evaluated_arch = collections.OrderedDict(), 0
    # num_seeds[k] == number of architectures that own exactly k checkpoints
    num_seeds = defaultdict(int)
    for sub_dir in sub_model_dirs:
        # Count the checkpoints of every architecture in a single directory
        # pass instead of re-globbing the directory once per architecture.
        ckps_per_arch = collections.Counter()
        for checkpoint in sub_dir.glob("arch-*-seed-*.pth"):
            temp_names = checkpoint.name.split("-")
            assert (len(temp_names) == 4 and temp_names[0] == "arch"
                    and temp_names[2]
                    == "seed"), "invalid checkpoint name : {:}".format(
                        checkpoint.name)
            ckps_per_arch[temp_names[1]] += 1
        subdir2archs[sub_dir] = sorted(ckps_per_arch)
        num_evaluated_arch += len(ckps_per_arch)
        for num_ckps in ckps_per_arch.values():
            num_seeds[num_ckps] += 1
    print(
        "There are {:5d} architectures that have been evaluated ({:} in total, {:} ckps in total)."
        .format(num_evaluated_arch, meta_num_archs,
                sum(k * v for k, v in num_seeds.items())))
    for key in sorted(num_seeds):
        print("There are {:5d} architectures that are evaluated {:} times.".
              format(num_seeds[key], key))

    # NOTE(review): these two mappings are filled but neither returned nor
    # read again within this function -- confirm whether a caller needs them.
    dir2ckps, dir2ckp_exists = dict(), dict()
    start_time, epoch_time = time.time(), AverageMeter()
    for IDX, (sub_dir, arch_indexes) in enumerate(subdir2archs.items()):
        # Resolved inside the loop on purpose: an unsupported ``basestr`` only
        # raises when there is at least one directory to process.
        if basestr == "C16-N5":
            seeds = [777, 888, 999]
        elif basestr == "C16-N5-LESS":
            seeds = [111, 777]
        else:
            raise ValueError("Invalid base str : {:}".format(basestr))
        # numrs[k] == number of architectures with k of the expected seeds
        numrs = defaultdict(int)
        all_checkpoints, all_ckp_exists = [], []
        for arch_index in arch_indexes:
            checkpoints = [
                "arch-{:}-seed-{:04d}.pth".format(arch_index, seed)
                for seed in seeds
            ]
            ckp_exists = [(sub_dir / x).exists() for x in checkpoints]
            arch_id = int(arch_index)
            assert (
                0 <= arch_id < len(meta_archs)
            ), "invalid arch-index {:} (not found in meta_archs)".format(
                arch_id)
            all_checkpoints += checkpoints
            all_ckp_exists += ckp_exists
            numrs[sum(ckp_exists)] += 1
        dir2ckps[str(sub_dir)] = all_checkpoints
        dir2ckp_exists[str(sub_dir)] = all_ckp_exists
        # measure time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()
        numrstr = ", ".join(
            ["{:}: {:03d}".format(x, numrs[x]) for x in sorted(numrs)])
        print(
            "{:} load [{:2d}/{:2d}] [{:03d} archs] [{:04d}->{:04d} ckps] {:} done, need {:}. {:}"
            .format(
                time_string(),
                IDX + 1,
                len(subdir2archs),
                len(arch_indexes),
                len(all_checkpoints),
                sum(all_ckp_exists),
                sub_dir,
                convert_secs2time(
                    epoch_time.avg * (len(subdir2archs) - IDX - 1), True),
                numrstr,
            ))
Пример #23
0
def meta_train_procedure(base_model, meta_model, criterion, xenv, args,
                         logger):
    """Pre-train ``meta_model`` with Adam, keeping the best-scoring snapshot.

    If a final pre-trained snapshot for ``args.rand_seed`` already exists it
    is loaded and the function returns immediately.  Otherwise each epoch
    samples ``args.meta_batch`` timestamps, sums a "future" loss (generated
    time embeddings), a "present" loss (stored meta embeddings) and an L1
    regularizer between both embeddings, and takes one optimizer step.
    Training stops early once ``args.pretrain_early_stop_thresh`` epochs pass
    without ``meta_model.save_best`` reporting success.  Returns ``None``.
    """
    base_model.train()
    meta_model.train()
    adam = torch.optim.Adam(
        meta_model.get_parameters(True, True, True),
        lr=args.lr,
        weight_decay=args.weight_decay,
        amsgrad=True,
    )
    logger.log("Pre-train the meta-model")
    logger.log("Using the optimizer: {:}".format(adam))

    meta_model.set_best_dir(logger.path(None) / "ckps-pretrain-v2")
    final_best_name = "final-pretrain-{:}.pth".format(args.rand_seed)
    if meta_model.has_best(final_best_name):
        # A finished pre-training run is available: reuse it.
        meta_model.load_best(final_best_name)
        logger.log(
            "Directly load the best model from {:}".format(final_best_name))
        return

    candidate_ids = list(range(meta_model.meta_length))
    meta_model.set_best_name("pretrain-{:}.pth".format(args.rand_seed))
    last_hit = 0
    patience = args.pretrain_early_stop_thresh
    epoch_meter, tick = AverageMeter(), time.time()
    device = args.device
    for epoch_idx in range(args.epochs):
        remaining = args.epochs - epoch_idx
        eta = "Time Left: {:}".format(
            convert_secs2time(epoch_meter.avg * remaining, True))
        adam.zero_grad()

        time_embeds = meta_model.gen_time_embed(meta_model.meta_timestamps)

        picked = random.choices(candidate_ids, k=args.meta_batch)

        picked_steps = meta_model.meta_timestamps[picked]

        reg_loss = F.l1_loss(time_embeds,
                             meta_model.super_meta_embed,
                             reduction="mean")
        # one loss per sampled timestamp, for each kind of embedding
        future_losses, present_losses = [], []
        future_containers = meta_model.gen_model(time_embeds[picked])
        present_containers = meta_model.gen_model(
            meta_model.super_meta_embed[picked])
        for slot, time_step in enumerate(picked_steps.cpu().tolist()):
            _, (inputs, targets) = xenv(time_step)
            inputs, targets = inputs.to(device), targets.to(device)
            for bucket, containers in (
                (future_losses, future_containers),
                (present_losses, present_containers),
            ):
                outputs = base_model.forward_with_container(
                    inputs, containers[slot])
                bucket.append(criterion(outputs, targets))

        with torch.no_grad():
            future_std = torch.stack(future_losses).std().item()
        future_mean = torch.stack(future_losses).mean()
        present_mean = torch.stack(present_losses).mean()
        objective = future_mean + present_mean + reg_loss
        objective.backward()
        adam.step()
        # the score is the negated loss, so a lower loss counts as better
        success, best_score = meta_model.save_best(-objective.item())
        message_parts = [
            "{:} [META {:04d}/{:}] loss : {:.4f} +- {:.4f} = {:.4f} + {:.4f} + {:.4f}"
            .format(
                time_string(),
                epoch_idx,
                args.epochs,
                objective.item(),
                future_std,
                future_mean.item(),
                present_mean.item(),
                reg_loss.item(),
            ),
            ", batch={:}".format(len(future_losses)),
            ", success={:}, best={:.4f}".format(success, -best_score),
            ", LS={:}/{:}".format(epoch_idx - last_hit, patience),
            ", {:}".format(eta),
        ]
        logger.log("".join(message_parts))
        if success:
            last_hit = epoch_idx
        if epoch_idx - last_hit >= patience:
            logger.log("Early stop the pre-training at {:}".format(epoch_idx))
            break
        epoch_meter.update(time.time() - tick)
        tick = time.time()
    meta_model.load_best()
    # store the restored best model under the final name; the small bump makes
    # the score passed to ``save_best`` exceed the recorded best
    meta_model.set_best_name(final_best_name)
    success, _ = meta_model.save_best(best_score + 1e-6)
    assert success
    logger.log("Save the best model into {:}".format(final_best_name))
Пример #24
0
def main(args):
    """Evaluate a saved checkpoint on the validation split and log the metrics.

    The checkpoint at ``args.checkpoint`` supplies the training-time arguments
    (``checkpoint["args"]``), the model/optimizer configurations and the model
    weights.  The validation procedure named by the stored arguments is tried
    first; if it raises, the "basic" procedure is used instead.

    Args:
        args: namespace providing ``data_path`` (an existing directory) and
            ``checkpoint`` (an existing file).

    Raises:
        AssertionError: if ``args.data_path`` is not a directory or
            ``args.checkpoint`` is not a file.
    """
    assert os.path.isdir(args.data_path), "invalid data-path : {:}".format(
        args.data_path)
    assert os.path.isfile(args.checkpoint), "invalid checkpoint : {:}".format(
        args.checkpoint)

    checkpoint = torch.load(args.checkpoint)
    # arguments used when the checkpoint was produced
    xargs = checkpoint["args"]
    train_data, valid_data, xshape, class_num = get_datasets(
        xargs.dataset, args.data_path, xargs.cutout_length)
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=xargs.batch_size,
        shuffle=False,
        num_workers=xargs.workers,
        pin_memory=True,
    )

    logger = PrintLogger()
    model_config = dict2config(checkpoint["model-config"], logger)
    base_model = obtain_model(model_config)
    flop, param = get_model_infos(base_model, xshape)
    logger.log("model ====>>>>:\n{:}".format(base_model))
    logger.log("model information : {:}".format(base_model.get_message()))
    logger.log("-" * 50)
    logger.log("Params={:.2f} MB, FLOPs={:.2f} M ... = {:.2f} G".format(
        param, flop, flop / 1e3))
    logger.log("-" * 50)
    logger.log("valid_data : {:}".format(valid_data))
    optim_config = dict2config(checkpoint["optim-config"], logger)
    # only the criterion is needed for evaluation
    _, _, criterion = get_optim_scheduler(base_model.parameters(),
                                          optim_config)
    logger.log("criterion  : {:}".format(criterion))
    base_model.load_state_dict(checkpoint["base-model"])
    _, valid_func = get_procedures(xargs.procedure)
    logger.log(
        "initialize the CNN done, evaluate it using {:}".format(valid_func))
    network = torch.nn.DataParallel(base_model).cuda()

    eval_args = (
        valid_loader,
        network,
        criterion,
        optim_config,
        "pure-evaluation",
        xargs.print_freq_eval,
        logger,
    )
    try:
        valid_loss, valid_acc1, valid_acc5 = valid_func(*eval_args)
    except Exception as error:
        # Catch ``Exception`` rather than everything, so Ctrl-C and
        # ``SystemExit`` are not turned into a second evaluation run, and
        # report why the stored procedure could not be used.
        logger.log(
            "The '{:}' procedure failed with {!r}; fall back to 'basic'".format(
                xargs.procedure, error))
        _, valid_func = get_procedures("basic")
        valid_loss, valid_acc1, valid_acc5 = valid_func(*eval_args)

    # NOTE(review): ``max_memory_cached`` is a deprecated alias of
    # ``max_memory_reserved`` in recent PyTorch releases -- confirm the
    # minimum supported version before switching.
    num_bytes = torch.cuda.max_memory_cached(
        next(network.parameters()).device) * 1.0
    logger.log(
        "***{:s}*** EVALUATION loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f}, error@1 = {:.2f}, error@5 = {:.2f}"
        .format(
            time_string(),
            valid_loss,
            valid_acc1,
            valid_acc5,
            100 - valid_acc1,
            100 - valid_acc5,
        ))
    logger.log(
        "[GPU-Memory-Usage on {:} is {:} bytes, {:.2f} KB, {:.2f} MB, {:.2f} GB.]"
        .format(
            next(network.parameters()).device,
            int(num_bytes),
            num_bytes / 1e3,
            num_bytes / 1e6,
            num_bytes / 1e9,
        ))
    logger.close()
Пример #25
0
    # Tail of the script's entry block: the code that creates ``parser`` and
    # registers the other options lies outside this excerpt.
    parser.add_argument(
        "--save_dir",
        type=str,
        default="./output/search",
        help="Folder to save checkpoints and log.",
    )
    # A negative seed (the default) selects the repeated random-seed mode below.
    parser.add_argument("--rand_seed", type=int, default=-1, help="manual seed")
    args = parser.parse_args()

    api = create(None, args.search_space, fast_mode=True, verbose=False)

    # Output layout: <save_dir>-<search_space>/<dataset>-T<time_budget>/RANDOM
    args.save_dir = os.path.join(
        "{:}-{:}".format(args.save_dir, args.search_space),
        "{:}-T{:}".format(args.dataset, args.time_budget),
        "RANDOM",
    )
    print("save-dir : {:}".format(args.save_dir))

    if args.rand_seed < 0:
        # Run ``main`` ``args.loops_if_rand`` times, each with a fresh random
        # seed, and gather the results of every run keyed by the loop index.
        save_dir, all_info = None, collections.OrderedDict()
        for i in range(args.loops_if_rand):
            print("{:} : {:03d}/{:03d}".format(time_string(), i, args.loops_if_rand))
            args.rand_seed = random.randint(1, 100000)
            save_dir, all_archs, all_total_times = main(args, api)
            all_info[i] = {"all_archs": all_archs, "all_total_times": all_total_times}
        # ``save_dir`` is the one returned by the last run.
        # NOTE(review): it stays None when ``args.loops_if_rand`` is 0, in
        # which case the ``/`` below fails -- confirm the option is always >= 1.
        save_path = save_dir / "results.pth"
        print("save into {:}".format(save_path))
        torch.save(all_info, save_path)
    else:
        # A non-negative seed was given explicitly: run once with it.
        main(args, api)
Пример #26
0
def main(args):
    """Run the shape-search procedure and write the last/best configurations.

    The function requires CUDA. It seeds the run, builds the data loaders from
    the split file at ``args.split_path``, creates the search model and its two
    optimizers (weights and architecture parameters), optionally resumes from
    a previous checkpoint, and then alternates training and periodic
    validation for ``optim_config.epochs + optim_config.warmup`` epochs. A
    checkpoint is saved after every epoch. At the end, the genotype of the
    last epoch and the best genotype are written next to the logs via
    ``configure2str``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``rand_seed``, ``dataset``, ``data_path``, ``cutout_length``,
            ``batch_size``, ``workers``, ``split_path``, ``model_config``,
            ``search_shape``, ``optim_config``, ``resume``, ``procedure``,
            ``gumbel_tau_max``, ``gumbel_tau_min``, ``FLOP_ratio``,
            ``FLOP_weight``, ``FLOP_tolerant``, ``print_freq``,
            ``print_freq_eval`` and ``eval_frequency``.

    Returns:
        None. Results are written through the logger and to disk.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True

    prepare_seed(args.rand_seed)
    logger = prepare_logger(args)

    # prepare dataset
    train_data, valid_data, xshape, class_num = get_datasets(
        args.dataset, args.data_path, args.cutout_length)
    valid_loader = torch.utils.data.DataLoader(
        valid_data,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True,
    )

    split_file_path = Path(args.split_path)
    assert split_file_path.exists(), "{:} does not exist".format(
        split_file_path)
    split_info = torch.load(split_file_path)

    # The two index lists must be disjoint and together cover train_data.
    train_split, valid_split = split_info["train"], split_info["valid"]
    assert (len(set(train_split).intersection(set(valid_split))) == 0
            ), "There should be 0 element that belongs to both train and valid"
    assert len(train_split) + len(valid_split) == len(
        train_data), "{:} + {:} vs {:}".format(len(train_split),
                                               len(valid_split),
                                               len(train_data))
    search_dataset = SearchDataset(args.dataset, train_data, train_split,
                                   valid_split)

    search_train_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
        pin_memory=True,
        num_workers=args.workers,
    )
    search_valid_loader = torch.utils.data.DataLoader(
        train_data,
        batch_size=args.batch_size,
        sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
        pin_memory=True,
        num_workers=args.workers,
    )
    search_loader = torch.utils.data.DataLoader(
        search_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.workers,
        pin_memory=True,
        sampler=None,
    )
    # get configures
    model_config = load_config(
        args.model_config,
        {
            "class_num": class_num,
            "search_mode": args.search_shape
        },
        logger,
    )

    # obtain the model
    search_model = obtain_search_model(model_config)
    MAX_FLOP, param = get_model_infos(search_model, xshape)
    optim_config = load_config(args.optim_config, {
        "class_num": class_num,
        "FLOP": MAX_FLOP
    }, logger)
    logger.log("Model Information : {:}".format(search_model.get_message()))
    logger.log("MAX_FLOP = {:} M".format(MAX_FLOP))
    logger.log("Params   = {:} M".format(param))
    logger.log("train_data : {:}".format(train_data))
    logger.log("search-data: {:}".format(search_dataset))
    logger.log("search_train_loader : {:} samples".format(len(train_split)))
    logger.log("search_valid_loader : {:} samples".format(len(valid_split)))
    # One optimizer for the network weights, one for the architecture
    # parameters.
    base_optimizer, scheduler, criterion = get_optim_scheduler(
        search_model.base_parameters(), optim_config)
    arch_optimizer = torch.optim.Adam(
        search_model.arch_parameters(),
        lr=optim_config.arch_LR,
        betas=(0.5, 0.999),
        weight_decay=optim_config.arch_decay,
    )
    logger.log("base-optimizer : {:}".format(base_optimizer))
    logger.log("arch-optimizer : {:}".format(arch_optimizer))
    logger.log("scheduler      : {:}".format(scheduler))
    logger.log("criterion      : {:}".format(criterion))

    last_info, model_base_path, model_best_path = (
        logger.path("info"),
        logger.path("model"),
        logger.path("best"),
    )
    network, criterion = torch.nn.DataParallel(
        search_model).cuda(), criterion.cuda()

    # load checkpoint
    if last_info.exists() or (args.resume is not None and osp.isfile(
            args.resume)):  # automatically resume from previous checkpoint
        # An explicit --resume file takes precedence over the last-info file.
        if args.resume is not None and osp.isfile(args.resume):
            resume_path = Path(args.resume)
        elif last_info.exists():
            resume_path = last_info
        else:
            raise ValueError("Something is wrong.")
        logger.log("=> loading checkpoint of the last-info '{:}' start".format(
            resume_path))
        checkpoint = torch.load(resume_path)
        if "last_checkpoint" in checkpoint:
            # The info file only points at the real checkpoint; if the stored
            # path is gone, retry relative to the folder of the info file.
            last_checkpoint_path = checkpoint["last_checkpoint"]
            if not last_checkpoint_path.exists():
                logger.log("Does not find {:}, try another path".format(
                    last_checkpoint_path))
                last_checkpoint_path = (resume_path.parent /
                                        last_checkpoint_path.parent.name /
                                        last_checkpoint_path.name)
            assert (last_checkpoint_path.exists()
                    ), "can not find the checkpoint from {:}".format(
                        last_checkpoint_path)
            checkpoint = torch.load(last_checkpoint_path)
        start_epoch = checkpoint["epoch"] + 1
        search_model.load_state_dict(checkpoint["search_model"])
        scheduler.load_state_dict(checkpoint["scheduler"])
        base_optimizer.load_state_dict(checkpoint["base_optimizer"])
        arch_optimizer.load_state_dict(checkpoint["arch_optimizer"])
        valid_accuracies = checkpoint["valid_accuracies"]
        arch_genotypes = checkpoint["arch_genotypes"]
        discrepancies = checkpoint["discrepancies"]
        logger.log(
            "=> loading checkpoint of the last-info '{:}' start with {:}-th epoch."
            .format(resume_path, start_epoch))
    else:
        logger.log(
            "=> do not find the last-info file : {:} or resume : {:}".format(
                last_info, args.resume))
        start_epoch, valid_accuracies, arch_genotypes, discrepancies = (
            0,
            {
                "best": -1
            },
            {},
            {},
        )

    # main procedure
    train_func, valid_func = get_procedures(args.procedure)
    total_epoch = optim_config.epochs + optim_config.warmup
    start_time, epoch_time = time.time(), AverageMeter()
    for epoch in range(start_epoch, total_epoch):
        scheduler.update(epoch, 0.0)
        search_model.set_tau(args.gumbel_tau_max, args.gumbel_tau_min,
                             epoch * 1.0 / total_epoch)
        need_time = "Time Left: {:}".format(
            convert_secs2time(epoch_time.avg * (total_epoch - epoch), True))
        epoch_str = "epoch={:03d}/{:03d}".format(epoch, total_epoch)
        LRs = scheduler.get_lr()
        find_best = False

        logger.log(
            "\n***{:s}*** start {:s} {:s}, LR=[{:.6f} ~ {:.6f}], scheduler={:}, tau={:}, FLOP={:.2f}"
            .format(
                time_string(),
                epoch_str,
                need_time,
                min(LRs),
                max(LRs),
                scheduler,
                search_model.tau,
                MAX_FLOP,
            ))

        # train for one epoch
        train_base_loss, train_arch_loss, train_acc1, train_acc5 = train_func(
            search_loader,
            network,
            criterion,
            scheduler,
            base_optimizer,
            arch_optimizer,
            optim_config,
            {
                "epoch-str": epoch_str,
                "FLOP-exp": MAX_FLOP * args.FLOP_ratio,
                "FLOP-weight": args.FLOP_weight,
                "FLOP-tolerant": MAX_FLOP * args.FLOP_tolerant,
            },
            args.print_freq,
            logger,
        )
        # log the results
        logger.log(
            "***{:s}*** TRAIN [{:}] base-loss = {:.6f}, arch-loss = {:.6f}, accuracy-1 = {:.2f}, accuracy-5 = {:.2f}"
            .format(
                time_string(),
                epoch_str,
                train_base_loss,
                train_arch_loss,
                train_acc1,
                train_acc5,
            ))
        # A genotype is recorded for every epoch, evaluated or not.
        cur_FLOP, genotype = search_model.get_flop("genotype",
                                                   model_config._asdict(),
                                                   None)
        arch_genotypes[epoch] = genotype
        arch_genotypes["last"] = genotype
        logger.log("[{:}] genotype : {:}".format(epoch_str, genotype))
        arch_info, discrepancy = search_model.get_arch_info()
        logger.log(arch_info)
        discrepancies[epoch] = discrepancy
        logger.log(
            "[{:}] FLOP : {:.2f} MB, ratio : {:.4f}, Expected-ratio : {:.4f}, Discrepancy : {:.3f}"
            .format(
                epoch_str,
                cur_FLOP,
                cur_FLOP / MAX_FLOP,
                args.FLOP_ratio,
                np.mean(discrepancy),
            ))

        # evaluate the performance
        if (epoch % args.eval_frequency == 0) or (epoch + 1 == total_epoch):
            logger.log("-" * 150)
            valid_loss, valid_acc1, valid_acc5 = valid_func(
                search_valid_loader,
                network,
                criterion,
                epoch_str,
                args.print_freq_eval,
                logger,
            )
            valid_accuracies[epoch] = valid_acc1
            # The "best" values logged here are those from before this epoch.
            logger.log(
                "***{:s}*** VALID [{:}] loss = {:.6f}, accuracy@1 = {:.2f}, accuracy@5 = {:.2f} | Best-Valid-Acc@1={:.2f}, Error@1={:.2f}"
                .format(
                    time_string(),
                    epoch_str,
                    valid_loss,
                    valid_acc1,
                    valid_acc5,
                    valid_accuracies["best"],
                    100 - valid_accuracies["best"],
                ))
            if valid_acc1 > valid_accuracies["best"]:
                valid_accuracies["best"] = valid_acc1
                arch_genotypes["best"] = genotype
                find_best = True
                logger.log(
                    "Currently, the best validation accuracy found at {:03d}-epoch :: acc@1={:.2f}, acc@5={:.2f}, error@1={:.2f}, error@5={:.2f}, save into {:}."
                    .format(
                        epoch,
                        valid_acc1,
                        valid_acc5,
                        100 - valid_acc1,
                        100 - valid_acc5,
                        model_best_path,
                    ))

        # save checkpoint
        save_path = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "valid_accuracies": deepcopy(valid_accuracies),
                "model-config": model_config._asdict(),
                "optim-config": optim_config._asdict(),
                "search_model": search_model.state_dict(),
                "scheduler": scheduler.state_dict(),
                "base_optimizer": base_optimizer.state_dict(),
                "arch_optimizer": arch_optimizer.state_dict(),
                "arch_genotypes": arch_genotypes,
                "discrepancies": discrepancies,
            },
            model_base_path,
            logger,
        )
        if find_best:
            copy_checkpoint(model_base_path, model_best_path, logger)
        # The small info file records where the full checkpoint lives; it is
        # what the resume logic above looks for.
        last_info = save_checkpoint(
            {
                "epoch": epoch,
                "args": deepcopy(args),
                "last_checkpoint": save_path,
            },
            logger.path("info"),
            logger,
        )

        # measure elapsed time
        epoch_time.update(time.time() - start_time)
        start_time = time.time()

    logger.log("")
    logger.log("-" * 100)
    last_config_path = logger.path("log") / "seed-{:}-last.config".format(
        args.rand_seed)
    configure2str(arch_genotypes["last"], str(last_config_path))
    logger.log("save the last config int {:} :\n{:}".format(
        last_config_path, arch_genotypes["last"]))

    # Start from the overall best genotype, then prefer any recorded genotype
    # whose FLOP ratio is within tolerance and whose accuracy is higher.
    best_arch, valid_acc = arch_genotypes["best"], valid_accuracies["best"]
    for key, config in arch_genotypes.items():
        # "last" duplicates the final epoch. Epochs skipped because of
        # args.eval_frequency have a genotype but no validation accuracy, so
        # they cannot be compared and must not be looked up.
        if key == "last" or key not in valid_accuracies:
            continue
        FLOP_ratio = config["estimated_FLOP"] / MAX_FLOP
        if abs(FLOP_ratio - args.FLOP_ratio) <= args.FLOP_tolerant:
            if valid_acc < valid_accuracies[key]:
                best_arch, valid_acc = config, valid_accuracies[key]
    print("Best-Arch : {:}\nRatio={:}, Valid-ACC={:}".format(
        best_arch, best_arch["estimated_FLOP"] / MAX_FLOP, valid_acc))
    best_config_path = logger.path("log") / "seed-{:}-best.config".format(
        args.rand_seed)
    configure2str(best_arch, str(best_config_path))
    logger.log("save the best config int {:} :\n{:}".format(
        best_config_path, best_arch))
    logger.log("\n" + "-" * 200)
    logger.log(
        "Finish training/validation in {:}, and save final checkpoint into {:}"
        .format(convert_secs2time(epoch_time.sum, True), logger.path("info")))
    logger.close()
Пример #27
0
def main(args):
    """Train and evaluate one model per requested timestamp.

    For every index selected by ``args.srange``, a fresh ``simple_mlp`` model
    is fitted with full-batch Adam on the (sub-sampled) concatenation of all
    earlier timestamps' data, then evaluated on that index's own data. A
    checkpoint is saved per index, and the collected weight containers are
    saved once at the end in ``final-ckp.pth``.

    Args:
        args: parsed command-line namespace. The attributes read here are
            ``srange``, ``init_lr``, ``epochs`` and ``batch_size``; the whole
            namespace is also passed to ``lfna_setup``.

    Returns:
        None. Results are written through the logger and to disk.
    """
    logger, env_info, model_kwargs = lfna_setup(args)

    # check indexes to be evaluated
    to_evaluate_indexes = split_str2indexes(args.srange, env_info["total"],
                                            None)
    logger.log("Evaluate {:}, which has {:} timestamps in total.".format(
        args.srange, len(to_evaluate_indexes)))

    w_container_per_epoch = dict()

    per_timestamp_time, start_time = AverageMeter(), time.time()
    for i, idx in enumerate(to_evaluate_indexes):

        need_time = "Time Left: {:}".format(
            convert_secs2time(
                per_timestamp_time.avg * (len(to_evaluate_indexes) - i), True))
        logger.log("[{:}]".format(time_string()) +
                   " [{:04d}/{:04d}][{:04d}]".format(i, len(
                       to_evaluate_indexes), idx) + " " + need_time)
        # train the same data
        # Index 0 has no earlier timestamps to train on.
        assert idx != 0
        historical_x, historical_y = [], []
        for past_i in range(idx):
            historical_x.append(env_info["{:}-x".format(past_i)])
            historical_y.append(env_info["{:}-y".format(past_i)])
        historical_x, historical_y = torch.cat(historical_x), torch.cat(
            historical_y)
        historical_x, historical_y = subsample(historical_x, historical_y)
        # build model
        model = get_model(dict(model_type="simple_mlp"), **model_kwargs)
        # build optimizer
        optimizer = torch.optim.Adam(model.parameters(),
                                     lr=args.init_lr,
                                     amsgrad=True)
        criterion = torch.nn.MSELoss()
        # Decay the learning rate at 25%, 50% and 75% of the training epochs.
        lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer,
            milestones=[
                int(args.epochs * 0.25),
                int(args.epochs * 0.5),
                int(args.epochs * 0.75),
            ],
            gamma=0.3,
        )
        train_metric = MSEMetric()
        best_loss, best_param = None, None
        for _iepoch in range(args.epochs):
            preds = model(historical_x)
            optimizer.zero_grad()
            loss = criterion(preds, historical_y)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            # save best
            # NOTE(review): the loss was computed before optimizer.step(),
            # while the snapshot below is taken after it, so the stored
            # parameters are one update ahead of the loss they are paired
            # with.
            if best_loss is None or best_loss > loss.item():
                best_loss = loss.item()
                best_param = copy.deepcopy(model.state_dict())
        # best_param is None only when args.epochs is 0; keep the initial
        # weights in that case instead of failing on load_state_dict(None).
        if best_param is not None:
            model.load_state_dict(best_param)
        with torch.no_grad():
            # Recompute the predictions with the restored parameters so the
            # reported train metric describes the model that is evaluated and
            # saved below, not the last optimisation step.
            preds = model(historical_x)
            train_metric(preds, historical_y)
        train_results = train_metric.get_info()

        metric = ComposeMetric(MSEMetric(), SaveMetric())
        eval_dataset = torch.utils.data.TensorDataset(
            env_info["{:}-x".format(idx)], env_info["{:}-y".format(idx)])
        eval_loader = torch.utils.data.DataLoader(eval_dataset,
                                                  batch_size=args.batch_size,
                                                  shuffle=False,
                                                  num_workers=0)
        results = basic_eval_fn(eval_loader, model, metric, logger)
        log_str = ("[{:}]".format(time_string()) +
                   " [{:04d}/{:04d}]".format(idx, env_info["total"]) +
                   " train-mse: {:.5f}, eval-mse: {:.5f}".format(
                       train_results["mse"], results["mse"]))
        logger.log(log_str)

        save_path = logger.path(None) / "{:04d}-{:04d}.pth".format(
            idx, env_info["total"])
        w_container_per_epoch[idx] = model.get_w_container().no_grad_clone()
        save_checkpoint(
            {
                "model_state_dict": model.state_dict(),
                "model": model,
                "index": idx,
                "timestamp": env_info["{:}-timestamp".format(idx)],
            },
            save_path,
            logger,
        )
        logger.log("")
        per_timestamp_time.update(time.time() - start_time)
        start_time = time.time()

    save_checkpoint(
        {"w_container_per_epoch": w_container_per_epoch},
        logger.path(None) / "final-ckp.pth",
        logger,
    )
    logger.log("-" * 200 + "\n")
    logger.close()
Пример #28
0
def procedure(
    xloader,
    teacher,
    network,
    criterion,
    scheduler,
    optimizer,
    mode,
    config,
    extra_info,
    print_freq,
    logger,
):
    """Run one knowledge-distillation pass over ``xloader``.

    Args:
        xloader: iterable of ``(inputs, targets)`` batches; must support
            ``len()``.
        teacher: teacher model, always put in eval mode; called as
            ``teacher(inputs)`` and expected to return ``(features, logits)``.
        network: student model; called as ``network(inputs)`` and expected to
            return ``(features, logits)``, where ``logits`` may be a list of
            exactly two items ``[logits, logits_aux]``.
        criterion: loss taking ``(logits, targets)``; also passed to
            ``loss_KD_fn``.
        scheduler: learning-rate scheduler, updated once per batch in train
            mode; may be ``None`` otherwise.
        optimizer: optimizer stepped once per batch in train mode.
        mode: ``"train"`` or ``"valid"``.
        config: object providing ``KD_alpha``, ``KD_temperature`` and,
            optionally, ``auxiliary`` (weight of the auxiliary loss).
        extra_info: value printed in the progress line.
        print_freq: log a progress line every ``print_freq`` batches.
        logger: object with a ``log(str)`` method.

    Returns:
        Tuple ``(average loss, average top-1, average top-5)`` of the student.

    Raises:
        ValueError: if ``mode`` is neither ``"train"`` nor ``"valid"``.
    """
    data_time, batch_time, losses, top1, top5 = (
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
        AverageMeter(),
    )
    # Accuracy meters of the teacher, used to report the student's gap.
    Ttop1, Ttop5 = AverageMeter(), AverageMeter()
    if mode == "train":
        network.train()
    elif mode == "valid":
        network.eval()
    else:
        raise ValueError("The mode is not right : {:}".format(mode))
    teacher.eval()

    logger.log(
        "[{:5s}] config :: auxiliary={:}, KD :: [alpha={:.2f}, temperature={:.2f}]"
        .format(
            mode,
            config.auxiliary if hasattr(config, "auxiliary") else -1,
            config.KD_alpha,
            config.KD_temperature,
        ))
    end = time.time()
    for i, (inputs, targets) in enumerate(xloader):
        if mode == "train":
            # Second argument is the fraction of the epoch completed so far.
            scheduler.update(None, 1.0 * i / len(xloader))
        # measure data loading time
        data_time.update(time.time() - end)
        # calculate prediction and loss
        targets = targets.cuda(non_blocking=True)

        if mode == "train":
            optimizer.zero_grad()

        student_f, logits = network(inputs)
        if isinstance(logits, list):
            assert len(
                logits
            ) == 2, "logits must has {:} items instead of {:}".format(
                2, len(logits))
            logits, logits_aux = logits
        else:
            logits, logits_aux = logits, None
        # The teacher only provides targets; no gradient is needed for it.
        with torch.no_grad():
            teacher_f, teacher_logits = teacher(inputs)

        loss = loss_KD_fn(
            criterion,
            logits,
            teacher_logits,
            student_f,
            teacher_f,
            targets,
            config.KD_alpha,
            config.KD_temperature,
        )
        # Add the auxiliary loss only when the network actually produced
        # auxiliary logits; otherwise criterion(None, targets) would fail.
        if (config is not None and hasattr(config, "auxiliary")
                and config.auxiliary > 0 and logits_aux is not None):
            loss_aux = criterion(logits_aux, targets)
            loss += config.auxiliary * loss_aux

        if mode == "train":
            loss.backward()
            optimizer.step()

        # record
        sprec1, sprec5 = obtain_accuracy(logits.data,
                                         targets.data,
                                         topk=(1, 5))
        losses.update(loss.item(), inputs.size(0))
        top1.update(sprec1.item(), inputs.size(0))
        top5.update(sprec5.item(), inputs.size(0))
        # teacher
        tprec1, tprec5 = obtain_accuracy(teacher_logits.data,
                                         targets.data,
                                         topk=(1, 5))
        Ttop1.update(tprec1.item(), inputs.size(0))
        Ttop5.update(tprec5.item(), inputs.size(0))

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0 or (i + 1) == len(xloader):
            Sstr = (
                " {:5s} ".format(mode.upper()) + time_string() +
                " [{:}][{:03d}/{:03d}]".format(extra_info, i, len(xloader)))
            if scheduler is not None:
                Sstr += " {:}".format(scheduler.get_min_info())
            Tstr = "Time {batch_time.val:.2f} ({batch_time.avg:.2f}) Data {data_time.val:.2f} ({data_time.avg:.2f})".format(
                batch_time=batch_time, data_time=data_time)
            Lstr = "Loss {loss.val:.3f} ({loss.avg:.3f})  Prec@1 {top1.val:.2f} ({top1.avg:.2f}) Prec@5 {top5.val:.2f} ({top5.avg:.2f})".format(
                loss=losses, top1=top1, top5=top5)
            Lstr += " Teacher : acc@1={:.2f}, acc@5={:.2f}".format(
                Ttop1.avg, Ttop5.avg)
            Istr = "Size={:}".format(list(inputs.size()))
            logger.log(Sstr + " " + Tstr + " " + Lstr + " " + Istr)

    logger.log(" **{:5s}** accuracy drop :: @1={:.2f}, @5={:.2f}".format(
        mode.upper(), Ttop1.avg - top1.avg, Ttop5.avg - top5.avg))
    logger.log(
        " **{mode:5s}** Prec@1 {top1.avg:.2f} Prec@5 {top5.avg:.2f} Error@1 {error1:.2f} Error@5 {error5:.2f} Loss:{loss:.3f}"
        .format(
            mode=mode.upper(),
            top1=top1,
            top5=top5,
            error1=100 - top1.avg,
            error5=100 - top5.avg,
            loss=losses.avg,
        ))
    return losses.avg, top1.avg, top5.avg
Пример #29
0
 )
 parser.add_argument(
     "--arch_nas_dataset",
     type=str,
     help="The path to load the architecture dataset (tiny-nas-benchmark).",
 )
 parser.add_argument("--print_freq", type=int, help="print frequency (default: 200)")
 parser.add_argument("--rand_seed", type=int, help="manual seed")
 args = parser.parse_args()
 # The benchmark API is only built when the given path is an existing file;
 # otherwise main() receives None.
 if args.arch_nas_dataset is None or not os.path.isfile(args.arch_nas_dataset):
     nas_bench = None
 else:
     print(
         "{:} build NAS-Benchmark-API from {:}".format(
             time_string(), args.arch_nas_dataset
         )
     )
     nas_bench = API(args.arch_nas_dataset)
 # --rand_seed has no default, so it is None when the flag is omitted.
 # Comparing None with 0 raises TypeError on Python 3, so a missing seed is
 # treated like a negative one: run the repeated random-seed mode.
 if args.rand_seed is None or args.rand_seed < 0:
     save_dir, all_indexes, num, all_times = None, [], 500, []
     for i in range(num):
         print("{:} : {:03d}/{:03d}".format(time_string(), i, num))
         # Each run gets a fresh seed in [1, 100000].
         args.rand_seed = random.randint(1, 100000)
         save_dir, index, ctime = main(args, nas_bench)
         all_indexes.append(index)
         all_times.append(ctime)
     print("\n average time : {:.3f} s".format(sum(all_times) / len(all_times)))
     # The directory returned by the last run is used for the summary file.
     torch.save(all_indexes, save_dir / "results.pth")
 else:
     main(args, nas_bench)
Пример #30
0
def main(xargs, nas_bench):
    """Run random architecture search within a time budget.

    Architectures are sampled at random and scored with ``train_and_eval``
    until the accumulated cost would exceed ``xargs.time_budget``; the
    architecture with the highest accuracy is kept and reported.

    Args:
        xargs: parsed command-line namespace. The attributes read here are
            ``workers``, ``rand_seed``, ``dataset``, ``data_path``,
            ``search_space_name``, ``max_nodes`` and ``time_budget``; the
            whole namespace is also passed to ``prepare_logger``.
        nas_bench: benchmark API object used to score and look up
            architectures, or ``None`` when no benchmark file is available.

    Returns:
        Tuple ``(log_dir, index)``: the logger's output directory and the
        result of ``nas_bench.query_index_by_arch(best_arch)``, or ``None``
        as the index when ``nas_bench`` is ``None``.
    """
    assert torch.cuda.is_available(), "CUDA is not available."
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.set_num_threads(xargs.workers)
    prepare_seed(xargs.rand_seed)
    # Use the function's own argument rather than a module-level `args`, so
    # the function also works when called from another module.
    logger = prepare_logger(xargs)

    if xargs.dataset == "cifar10":
        dataname = "cifar10-valid"
    else:
        dataname = xargs.dataset
    if xargs.data_path is not None:
        train_data, valid_data, xshape, class_num = get_datasets(
            xargs.dataset, xargs.data_path, -1)
        split_Fpath = "configs/nas-benchmark/cifar-split.txt"
        cifar_split = load_config(split_Fpath, None, None)
        train_split, valid_split = cifar_split.train, cifar_split.valid
        logger.log("Load split file from {:}".format(split_Fpath))
        config_path = "configs/nas-benchmark/algos/R-EA.config"
        config = load_config(config_path, {
            "class_num": class_num,
            "xshape": xshape
        }, logger)
        # To split data
        # The validation loader reads the training images (through the
        # valid_split indices) but with the validation transform.
        train_data_v2 = deepcopy(train_data)
        train_data_v2.transform = valid_data.transform
        valid_data = train_data_v2
        search_data = SearchDataset(xargs.dataset, train_data, train_split,
                                    valid_split)
        # data loader
        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=config.batch_size,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(train_split),
            num_workers=xargs.workers,
            pin_memory=True,
        )
        valid_loader = torch.utils.data.DataLoader(
            valid_data,
            batch_size=config.batch_size,
            sampler=torch.utils.data.sampler.SubsetRandomSampler(valid_split),
            num_workers=xargs.workers,
            pin_memory=True,
        )
        logger.log(
            "||||||| {:10s} ||||||| Train-Loader-Num={:}, Valid-Loader-Num={:}, batch size={:}"
            .format(xargs.dataset, len(train_loader), len(valid_loader),
                    config.batch_size))
        logger.log("||||||| {:10s} ||||||| Config={:}".format(
            xargs.dataset, config))
        extra_info = {
            "config": config,
            "train_loader": train_loader,
            "valid_loader": valid_loader,
        }
    else:
        # Without a data path no loaders are built; only the config is kept.
        config_path = "configs/nas-benchmark/algos/R-EA.config"
        config = load_config(config_path, None, logger)
        logger.log("||||||| {:10s} ||||||| Config={:}".format(
            xargs.dataset, config))
        extra_info = {
            "config": config,
            "train_loader": None,
            "valid_loader": None
        }
    search_space = get_search_spaces("cell", xargs.search_space_name)
    random_arch = random_architecture_func(xargs.max_nodes, search_space)
    x_start_time = time.time()
    logger.log("{:} use nas_bench : {:}".format(time_string(), nas_bench))
    best_arch, best_acc, total_time_cost, history = None, -1, 0, []
    while total_time_cost < xargs.time_budget:
        arch = random_arch()
        accuracy, cost_time = train_and_eval(arch, nas_bench, extra_info,
                                             dataname)
        # A candidate whose cost would exceed the budget is discarded and
        # ends the search.
        if total_time_cost + cost_time > xargs.time_budget:
            break
        else:
            total_time_cost += cost_time
        history.append(arch)
        if best_arch is None or best_acc < accuracy:
            best_acc, best_arch = accuracy, arch
        logger.log("[{:03d}] : {:} : accuracy = {:.2f}%".format(
            len(history), arch, accuracy))
    logger.log(
        "{:} best arch is {:}, accuracy = {:.2f}%, visit {:} archs with {:.1f} s (real-cost = {:.3f} s)."
        .format(
            time_string(),
            best_arch,
            best_acc,
            len(history),
            total_time_cost,
            time.time() - x_start_time,
        ))

    # nas_bench may be None (it is logged as such above); only query it when
    # it is available instead of failing with an AttributeError.
    if nas_bench is not None:
        info = nas_bench.query_by_arch(best_arch, "200")
        best_index = nas_bench.query_index_by_arch(best_arch)
    else:
        info, best_index = None, None
    if info is None:
        logger.log("Did not find this architecture : {:}.".format(best_arch))
    else:
        logger.log("{:}".format(info))
    logger.log("-" * 100)
    logger.close()
    return logger.log_dir, best_index