Example #1
def read_parent_by_id(id: int, expr_root: str = None, args=None) -> dict:
    """
    Based on the id, find the parent's directory and parse the data relevant
    to weight inheritance.

    Args:
        id: one of the parent ids saved on the individual after traced mating
        expr_root: the experiment root directory; should match args.save_path
        args: run arguments; only args.code is used, in distributed-cloud mode

    Returns:
        A dictionary describing the parent (genome, flops, error, weights).
    """

    if main_config.distributed_cloud:
        task = get_task_by_network_id(id, run_code=args.code)
        if task is None:
            # NOTE: execution continues, so the pickle load below will fail fast
            logger.warning("Could not find expected parent id")
        spec = pickle_load_from_str(task.pkl_data)

        results = pickle_load_from_str(task.result_pkl_data)

        genome = micro_encoding.decode(spec["genome"])
        flops = results["flops"]
        acc = results["valid_acc"]

        wt_path = download_blob(results["wt_blob_name"])

    else:
        # parent is a dict
        arch_path = os.path.join(expr_root, "arch_{}".format(id))

        path = os.path.join(arch_path, "log.txt")
        wt_path = os.path.join(arch_path, "weights.pt")

        with open(path, "r") as f:
            s = f.read()

        # log.txt is written with fixed-format lines ("Genome = ...",
        # "Architecture = ...", "param size = ...MB", "flops = ...MB",
        # "valid_acc = ..."), so the slices below strip the field labels.
        lines = s.split("\n")
        genome = eval(lines[1][15:])   # skips the 15-char "Architecture = "
        flops = float(lines[3][8:-2])  # strips "flops = " and trailing "MB"
        acc = float(lines[4][11:])     # strips "valid_acc ="

    error = 100 - acc
    weights = torch.load(wt_path)

    # Replace any NaNs in the inherited weights with zeros, warning once if found.
    has_nan = False
    for k, v in weights.items():
        if torch.any(torch.isnan(v.cpu())):
            has_nan = True

        weights[k] = torch.nan_to_num(v.cpu())
    if has_nan:
        logger.warning("Nan values detected when trying to load weights for parent with id %i" % id)

    parent = {"genome": genome, "flops": flops, "error": error, "weights": weights}
    return parent
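
The fixed slice offsets above assume the log.txt layout written by the training entry point (Example #3 below). A minimal round-trip sketch with hypothetical values, showing why the slices line up:

# Hypothetical log.txt contents, as written by main() in Example #3:
log_text = (
    "Genome = [[0, 1], [2, 3]]\n"
    "Architecture = (1, 2, 3)\n"
    "param size = 1.23MB\n"
    "flops = 45.6MB\n"
    "valid_acc = 91.7\n"
)

lines = log_text.split("\n")
genome = eval(lines[1][15:])    # -> (1, 2, 3), after "Architecture = "
flops = float(lines[3][8:-2])   # -> 45.6, strips "flops = " and "MB"
acc = float(lines[4][11:])      # -> 91.7, strips "valid_acc ="
assert (genome, flops, acc) == ((1, 2, 3), 45.6, 91.7)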
Example #2
def decode_individual(individual) -> tuple:
    """
    Takes an individual, with a genome stored as a numpy array in
    individual.X, and decodes it into a human-friendly genotype.

    Args:
        individual: an individual whose X attribute holds the encoded genome

    Returns:
        The decoded genotype.
    """

    return micro_encoding.decode(micro_encoding.convert(individual.X))
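
A minimal usage sketch, assuming a hypothetical stand-in for the individual object; the real genome length and dtype depend on micro_encoding:

import numpy as np

class FakeIndividual:
    """Hypothetical stand-in for an evolutionary-search individual."""
    def __init__(self, X: np.ndarray):
        self.X = X

ind = FakeIndividual(np.zeros(20, dtype=int))  # length depends on the encoding
genotype = decode_individual(ind)              # convert() then decode()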
Example #3
def main(macro_genome, micro_genome, epochs, search_space='micro',
         save='Design_1', expr_root='search', seed=0, gpu=0, init_channels=24,
         layers=11, auxiliary=False, cutout=False, drop_path_prob=0.0, batch_size=128):

    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m/%d %I:%M:%S %p')

    # ---- parameter values setting ----- #
    CIFAR_CLASSES = config_dict()['n_classes']
    INPUT_CHANNELS = config_dict()['n_channels']
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space in ('micro', 'micro_garbage'):
        genome = micro_genome
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, config_dict()['n_channels'], layers, auxiliary, genotype)
    elif search_space in ('macro', 'macro_garbage'):
        genome = macro_genome
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2*init_channels),
                    (2*init_channels, 4*init_channels)]
        model = EvoNetwork(genotype, channels, CIFAR_CLASSES, (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']), decoder='residual')
    elif search_space == 'micromacro':
        genome = [macro_genome, micro_genome]
        macro_genotype = macro_encoding.decode(macro_genome)
        micro_genotype = micro_encoding.decode(micro_genome)
        genotype = [macro_genotype, micro_genotype]
        set_config('micro_creator', make_micro_creator(micro_genotype, convert=False))
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(macro_genotype, channels, CIFAR_CLASSES,
                           (config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH']), decoder='residual')

    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    if config_dict()['problem'] == 'classification':
        criterion = nn.CrossEntropyLoss()
    else:
        criterion = nn.MSELoss()
    criterion = criterion.cuda()


    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(
        parameters,
        learning_rate,
        momentum=momentum,
        weight_decay=weight_decay
    )

    CIFAR_MEAN = [0.49139968, 0.48215827, 0.44653124]
    CIFAR_STD = [0.24703233, 0.24348505, 0.26158768]

    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor()
    ])

    if cutout:
        train_transform.transforms.append(utils.Cutout(cutout_length))

    train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))

    valid_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    ])

    train_data = my_cifar10.CIFAR10(root=data_root, train=True, download=False, transform=train_transform)
    valid_data = my_cifar10.CIFAR10(root=data_root, train=False, download=False, transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(
        train_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True, num_workers=1)

    valid_queue = torch.utils.data.DataLoader(
        valid_data, batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True, num_workers=1)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, int(epochs))

    for epoch in range(epochs):
        # NOTE: with PyTorch >= 1.1, scheduler.step() belongs after the
        # training pass; see the sketch after this example.
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer, train_params)
        logging.info(f'train_{config_dict()["performance_measure"]} %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info(f'valid_{config_dict()["performance_measure"]} %f', valid_acc)

    # calculate for flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS, config_dict()['INPUT_HEIGHT'], config_dict()['INPUT_WIDTH'])
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))

    # logging.info("Architecture = %s", genotype))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
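
As the note in the epoch loop says, recent PyTorch versions expect scheduler.step() to run after the optimizer steps taken inside train(); a minimal sketch of the recommended ordering, reusing the names defined above (get_last_lr() is the non-deprecated accessor):

for epoch in range(epochs):
    # get_last_lr() replaces the deprecated get_lr() in recent PyTorch
    logging.info('epoch %d lr %e', epoch, scheduler.get_last_lr()[0])
    model.droprate = drop_path_prob * epoch / epochs

    train_acc, train_obj = train(train_queue, model, criterion, optimizer, train_params)
    scheduler.step()  # one step per epoch, after the optimizer has stepped
    logging.info('train_acc %f', train_acc)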
Example #4
def inherit_one_model(individual,
                      expr_root: str,
                      model=None,
                      args=None) -> nn.Module:
    """
    Very complicated function.
        Handles inheritance of the common components not defined in the genome of the individual.
        Also calls the function to inherit weights for the cells, weight are defined by the individual.

    TODO: Maybe document this better
    TODO: Improve Error Logging

    Args:
        individual: the individual to inherit weights
        expr_root: path as defined in args.save
        model: not used except in testing
        args: not used except in testing

    Returns:
        model
    """

    try:

        r = np.random.uniform(-0.5, 1.5)

        parent1 = projectcode.weightmanagement.common.read_parent_by_id(
            individual.parents[0], expr_root, args)
        parent2 = projectcode.weightmanagement.common.read_parent_by_id(
            individual.parents[1], expr_root, args)

        parents = common.determine_more_fit_parent(parent1, parent2)

        genotype = micro_encoding.decode(micro_encoding.convert(individual.X))
        if model is None:
            CIFAR_CLASSES = 10
            auxiliary = False
            model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                            auxiliary, genotype)
            model = common.initialize_zero(model)

        wcom = WeightComputer(parents)

        # weight merge
        model.stem[0].weight = wcom.compute_child_weight(
            "stem.0.weight",
            r,
            inherit_rules="both",
            weight_tensor=model.stem[0].weight)

        previous_reduction = False
        pp_reduction = False
        for cell_number, cell in enumerate(model.cells):
            try:

                reduction = cell_number in [
                    len(model.cells) // 3,
                    2 * len(model.cells) // 3,
                ]
                # print(reduction, previous_reduction)
                model = inherit_one_cell(
                    cell_number,
                    individual,
                    model,
                    parents,
                    reduce=reduction,
                    previous_reduce=previous_reduction,
                    weight_computer=wcom,
                    pp_reduce=pp_reduction,
                    r=r,
                )
                pp_reduction = previous_reduction
                previous_reduction = reduction

            except Exception:
                logger.warning("error in cell %i" % cell_number)
                raise

        inherit = None
        child_genome = common.decode_individual(individual)
        if parents[0]["genome"].normal_concat == parents[1][
                "genome"].normal_concat:
            inherit = "both"
        elif parents[0]["genome"].normal_concat == child_genome.normal_concat:
            inherit = "first"
        elif parents[1]["genome"].normal_concat == child_genome.normal_concat:
            inherit = "second"
        else:
            inherit = "concat_mismatch"
        assert inherit is not None, "could not determine classifier inheritance"

        key = "classifier.weight"
        model.classifier.weight = wcom.compute_child_weight(
            key, r, inherit, model.classifier.weight)
        key = "classifier.bias"
        model.classifier.bias = wcom.compute_child_weight(
            key, r, inherit, model.classifier.bias)

        common.assert_non_null_weights(model.state_dict())
    except Exception:

        logger.warning(
            projectcode.weightmanagement.common.decode_individual(individual))
        logger.warning(parents[0]["genome"])
        logger.warning(parents[1]["genome"])
        logger.warning(individual.parents)
        raise

    return model
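
WeightComputer is not shown in this snippet; a minimal sketch of the kind of blend compute_child_weight might perform, assuming a plain extrapolative crossover between parent tensors (the function name and formula are assumptions; only the r ~ U(-0.5, 1.5) draw comes from the code above):

import numpy as np
import torch

def blend_parent_weights(w1: torch.Tensor, w2: torch.Tensor, r: float) -> torch.Tensor:
    """Hypothetical sketch of an extrapolative crossover between parent tensors.

    With r drawn from U(-0.5, 1.5), as in inherit_one_model above, the child
    interpolates between the parents for 0 <= r <= 1 and extrapolates past
    one of them otherwise.
    """
    assert w1.shape == w2.shape, "parent tensors must have matching shapes"
    return r * w1 + (1.0 - r) * w2

# usage sketch
r = float(np.random.uniform(-0.5, 1.5))
child = blend_parent_weights(torch.randn(3, 3), torch.randn(3, 3), r)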
Example #5
def main(genome,
         epochs,
         search_space='micro',
         save='Design_1',
         expr_root='search',
         seed=0,
         gpu=0,
         init_channels=24,
         layers=11,
         auxiliary=False,
         cutout=False,
         drop_path_prob=0.0,
         data_path="../data",
         dataset="CIFAR10"):

    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    # fh = logging.FileHandler(os.path.join(save_pth, 'log.txt'))
    # fh.setFormatter(logging.Formatter(log_format))
    # logging.getLogger().addHandler(fh)

    # ---- parameter values setting ----- #
    if dataset == "CIFAR10":
        CLASSES = 10
    elif dataset == "CIFAR100":
        CLASSES = 100
    elif dataset == "Sport8":
        CLASSES = 8
    elif dataset == "MIT67":
        CLASSES = 67
    elif dataset == "flowers102":
        CLASSES = 102
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = data_path
    batch_size = 128
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro':
        genotype = micro_encoding.decode(genome)
        if dataset == "CIFAR10" or dataset == "CIFAR100":
            model = NetworkCIFAR(init_channels, CLASSES, layers, auxiliary,
                                 genotype)
        else:
            model = NetworkImageNet(init_channels, CLASSES, layers, auxiliary,
                                    genotype)
    elif search_space == 'macro':
        genotype = macro_encoding.decode(genome)
        channels = [(3, init_channels), (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype,
                           channels,
                           CLASSES, (32, 32),
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters,
                                learning_rate,
                                momentum=momentum,
                                weight_decay=weight_decay)
    if dataset == "CIFAR10" or dataset == "CIFAR100":
        MEAN = [0.49139968, 0.48215827, 0.44653124]
        STD = [0.24703233, 0.24348505, 0.26158768]

        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()
        ])

        if cutout:
            train_transform.transforms.append(utils.Cutout(cutout_length))

        train_transform.transforms.append(transforms.Normalize(MEAN, STD))

        valid_transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize(MEAN, STD),
        ])
    if dataset == "CIFAR10":
        train_data = my_cifar10.CIFAR10(root=data_root,
                                        train=True,
                                        download=True,
                                        transform=train_transform)
        valid_data = my_cifar10.CIFAR10(root=data_root,
                                        train=True,
                                        download=True,
                                        transform=valid_transform)  # unused: valid_queue below samples from train_data
    elif dataset == "CIFAR100":
        train_data = dset.CIFAR100(root=data_root,
                                   train=True,
                                   download=True,
                                   transform=train_transform)
        valid_data = dset.CIFAR100(root=data_root,
                                   train=True,
                                   download=True,
                                   transform=valid_transform)
    else:
        MEAN = [0.485, 0.456, 0.406]
        STD = [0.229, 0.224, 0.225]
        transf_train = [
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.4,
                                   contrast=0.4,
                                   saturation=0.4,
                                   hue=0.2)
        ]
        transf_val = [
            transforms.Resize(256),
            transforms.CenterCrop(224),
        ]
        normalize = [transforms.ToTensor(), transforms.Normalize(MEAN, STD)]

        train_transform = transforms.Compose(transf_train + normalize)
        valid_transform = transforms.Compose(transf_val + normalize)
        if cutout:
            train_transform.transforms.append(utils.Cutout(cutout_length))

        train_data = dset.ImageFolder(root=data_path + "/" + dataset +
                                      "/train",
                                      transform=train_transform)
        valid_data = dset.ImageFolder(root=data_path + "/" + dataset + "/test",
                                      transform=valid_transform)

    n_train = len(train_data)
    split = n_train // 2
    indices = list(range(n_train))
    random.shuffle(indices)
    train_sampler = torch.utils.data.sampler.SubsetRandomSampler(
        indices[:split])
    valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(
        indices[split:])

    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        sampler=train_sampler,
        pin_memory=True,
        num_workers=4)

    # NOTE: the validation queue also draws from train_data (the held-out
    # half of the split), so valid_data is not used here.
    valid_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        sampler=valid_sampler,
        pin_memory=True,
        num_workers=4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, int(epochs))

    for epoch in range(epochs):
        # NOTE: with PyTorch >= 1.1, scheduler.step() belongs after the training pass
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     train_params)
        logging.info('train_acc %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    # calculate for flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, 3, 32, 32)  #to change
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))
    with open(os.path.join(save_pth, 'genotype.txt'), "w") as f:
        f.write(str(genotype))
    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
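
The half/half SubsetRandomSampler split above reshuffles on every run; a minimal sketch of a seeded, reproducible variant of the same split (sizes are illustrative):

import random
import torch

seed = 0
n_train = 50000                       # e.g. the CIFAR-10 training set size
indices = list(range(n_train))
random.Random(seed).shuffle(indices)  # seeded shuffle for reproducibility
split = n_train // 2

train_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[:split])
valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices[split:])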
Example #6
def main(genome,
         epochs,
         search_space='micro',
         save='Design_1',
         expr_root='search',
         seed=0,
         gpu=0,
         init_channels=24,
         layers=11,
         auxiliary=False,
         cutout=False,
         drop_path_prob=0.0,
         train_dataset="",
         val_dataset=""):

    # ---- train logger ----------------- #
    save_pth = os.path.join(expr_root, '{}'.format(save))
    utils.create_exp_dir(save_pth)
    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m/%d %I:%M:%S %p')
    # fh = logging.FileHandler(os.path.join(save_pth, 'log.txt'))
    # fh.setFormatter(logging.Formatter(log_format))
    # logging.getLogger().addHandler(fh)

    # ---- parameter values setting ----- #
    NUM_CLASSES = 4
    CIFAR_CLASSES = NUM_CLASSES
    DATA_SHAPE = (128, 128)
    INPUT_CHANNELS = 3
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = '../data'
    batch_size = 16
    cutout_length = 16
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        'auxiliary': auxiliary,
        'auxiliary_weight': auxiliary_weight,
        'grad_clip': grad_clip,
        'report_freq': report_freq,
    }

    if search_space == 'micro':
        genotype = micro_encoding.decode(genome)
        model = Network(init_channels, CIFAR_CLASSES, layers, auxiliary,
                        genotype)
    elif search_space == 'macro':
        genotype = macro_encoding.decode(genome)
        channels = [(INPUT_CHANNELS, init_channels),
                    (init_channels, 2 * init_channels),
                    (2 * init_channels, 4 * init_channels)]
        model = EvoNetwork(genotype,
                           channels,
                           CIFAR_CLASSES,
                           DATA_SHAPE,
                           decoder='residual')
    else:
        raise NameError('Unknown search space type')

    # logging.info("Genome = %s", genome)
    logging.info("Architecture = %s", genotype)

    torch.cuda.set_device(gpu)
    cudnn.benchmark = True
    torch.manual_seed(seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(seed)

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6
    model = model.to(device)

    logging.info("param size = %fMB", n_params)

    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.SGD(parameters,
                                learning_rate,
                                momentum=momentum,
                                weight_decay=weight_decay)

    # TODO: change. Dataset-specific statistics override the CIFAR defaults
    # (CIFAR mean [0.49139968, 0.48215827, 0.44653124],
    #  CIFAR std  [0.24703233, 0.24348505, 0.26158768]).
    DATASET_MEAN = [0.4785047, 0.45649716, 0.42604172]
    DATASET_STD = [0.31962952, 0.3112294, 0.31206125]
    CIFAR_MEAN = DATASET_MEAN
    CIFAR_STD = DATASET_STD
    #     # data agumentation
    #     train_transform = transforms.Compose([
    #         transforms.RandomCrop(32, padding=4),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor()
    #     ])

    #     if cutout:
    #         train_transform.transforms.append(utils.Cutout(cutout_length))

    #     train_transform.transforms.append(transforms.Normalize(CIFAR_MEAN, CIFAR_STD))

    #     valid_transform = transforms.Compose([
    #         transforms.ToTensor(),
    #         transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
    #     ])

    #     train_data = my_cifar10.CIFAR10(root=data_root, train=True, download=True, transform=train_transform)
    #     valid_data = my_cifar10.CIFAR10(root=data_root, train=False, download=True, transform=valid_transform)

    #     # num_train = len(train_data)
    #     # indices = list(range(num_train))
    #     # split = int(np.floor(train_portion * num_train))
    train_data = train_dataset
    valid_data = val_dataset
    train_queue = torch.utils.data.DataLoader(
        train_data,
        batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[:split]),
        pin_memory=True,
        num_workers=4)

    valid_queue = torch.utils.data.DataLoader(
        valid_data,
        batch_size=batch_size,
        # sampler=torch.utils.data.sampler.SubsetRandomSampler(indices[split:num_train]),
        pin_memory=True,
        num_workers=4)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, int(epochs))

    for epoch in range(epochs):
        # NOTE: with PyTorch >= 1.1, scheduler.step() belongs after the training pass
        scheduler.step()
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.droprate = drop_path_prob * epoch / epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer,
                                     train_params)
        logging.info('train_acc %f', train_acc)

    valid_acc, valid_obj = infer(valid_queue, model, criterion)
    logging.info('valid_acc %f', valid_acc)

    # calculate for flops
    model = add_flops_counting_methods(model)
    model.eval()
    model.start_flops_count()
    random_data = torch.randn(1, INPUT_CHANNELS, *DATA_SHAPE)
    model(torch.autograd.Variable(random_data).to(device))
    n_flops = np.round(model.compute_average_flops_cost() / 1e6, 4)
    logging.info('flops = %f', n_flops)

    # save to file
    # os.remove(os.path.join(save_pth, 'log.txt'))
    with open(os.path.join(save_pth, 'log.txt'), "w") as file:
        file.write("Genome = {}\n".format(genome))
        file.write("Architecture = {}\n".format(genotype))
        file.write("param size = {}MB\n".format(n_params))
        file.write("flops = {}MB\n".format(n_flops))
        file.write("valid_acc = {}\n".format(valid_acc))

    return {
        'valid_acc': valid_acc,
        'params': n_params,
        'flops': n_flops,
    }
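
The DATASET_MEAN/DATASET_STD constants above were presumably precomputed offline; a minimal sketch of how such per-channel statistics can be derived from a dataset of (C, H, W) float tensors (the helper name is hypothetical):

import torch
from torch.utils.data import DataLoader

def channel_stats(dataset, batch_size: int = 256):
    """Per-channel mean/std over a dataset yielding (C, H, W) float tensors."""
    loader = DataLoader(dataset, batch_size=batch_size)
    n_pixels = 0
    total = torch.zeros(3)
    total_sq = torch.zeros(3)
    for images, _ in loader:
        b, c, h, w = images.shape
        n_pixels += b * h * w
        total += images.sum(dim=(0, 2, 3))
        total_sq += (images ** 2).sum(dim=(0, 2, 3))
    mean = total / n_pixels
    # population variance: E[x^2] - E[x]^2
    std = (total_sq / n_pixels - mean ** 2).sqrt()
    return mean.tolist(), std.tolist()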
Example #7
def train_and_evaluate(
    genome: tuple,
    individual=None,
    args: argparse.Namespace = None,
    first_gen: bool = True,
    save: str = None,
    client_id: str = None,
):
    """
    Function to train and evaluate an individual using a TPU.

    Results are always saved in the save dir to make distributed data management easier.

    Args:
        first_gen:
        genome:
        save:
        individual:
        args:

    Returns:

    """

    if args.stream == "tpu":
        # must warp up TPU
        import torch_xla

    auxiliary = False

    assert hasattr(individual, "id")

    if not first_gen:
        # this is not the first generation, so mating should have occurred
        assert hasattr(individual, "parents")

    expr_root = ""

    save_pth = os.path.join(expr_root, "{}".format(save))
    utils.create_exp_dir(save_pth)

    CIFAR_CLASSES = 10
    learning_rate = 0.025
    momentum = 0.9
    weight_decay = 3e-4
    data_root = "../data"
    batch_size = args.batch_size
    auxiliary_weight = 0.4
    grad_clip = 5
    report_freq = 50
    train_params = {
        "auxiliary": auxiliary,
        "auxiliary_weight": auxiliary_weight,
        "grad_clip": grad_clip,
        "report_freq": report_freq,
    }

    if args.search_space == "micro":
        genotype = micro_encoding.decode(genome)
        model = Network(args.init_channels, CIFAR_CLASSES, args.layers,
                        auxiliary, genotype)

        if not first_gen:
            # change the way the weights are set up
            model = manage_weights(model, individual, expr_root, args)

    elif args.search_space == "macro":
        raise NotImplementedError("Not supported")
    else:
        raise NameError("Unknown search space type")

    logger.info("Architecture = %s", genotype)

    try:
        max_weight = args.max_weight
    except AttributeError:
        print("Could Not Determine Maximum Weight Argument")
        max_weight = 1e20

    clip = weightClip(max_weight=max_weight, min_weight=-max_weight)

    if args.stream == "tpu":
        from projectcode.training.tpu import get_map_fn
        import torch_xla.distributed.xla_multiprocessing as xmp

        WRAPPED_MODEL = xmp.MpModelWrapper(model)

        logger.info("Executing TPU Training")
        map_fn = get_map_fn(model,
                            train_params,
                            data_root,
                            momentum,
                            weight_decay,
                            CIFAR_CLASSES,
                            learning_rate,
                            args.layers,
                            batch_size,
                            epochs=args.epochs,
                            save_pth=save_pth,
                            args=args,
                            WRAPPED_MODEL=WRAPPED_MODEL,
                            clip=clip)

        FLAGS = {}

        xmp.spawn(map_fn, args=(FLAGS, ), nprocs=1, start_method="fork")

        valid_acc, n_flops = torch.load("results.pt")
    elif args.stream == "gpu":
        from projectcode.training.gpu import train_gpu
        logger.info("Executing GPU Training")
        valid_acc, n_flops = train_gpu(model,
                                       train_params,
                                       data_root,
                                       momentum,
                                       weight_decay,
                                       CIFAR_CLASSES,
                                       learning_rate,
                                       args.layers,
                                       batch_size,
                                       epochs=args.epochs,
                                       save_pth=save_pth,
                                       args=args,
                                       clip=clip)

    else:

        raise NameError("Unrecognized client stream")

    n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6

    if main_config.distributed_cloud and args.weight_init == "lammarckian":
        wt_path = f"{args.code}_{client_id}_weights_{individual.id:05d}.pt"
        torch.save(model.state_dict(), wt_path)
        blob_name = upload_blob(wt_path)
    else:
        blob_name = None
        torch.save(model.state_dict(), os.path.join(save_pth, "weights.pt"))

    result_dict = {
        "id": individual.id,
        "save_path": save_pth,
        "valid_acc": valid_acc,
        "params": n_params,
        "flops": n_flops,
        "wt_blob_name": blob_name,
    }

    dump(result_dict, os.path.join(save_pth, "result.pkl"))

    return result_dict
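
weightClip is not defined in this snippet; a minimal sketch of what such a clipping hook could look like, assuming it is applied with model.apply(clip) during training (the class name matches the call above; the internals are an assumption):

import torch
import torch.nn as nn

class weightClip:
    """Hypothetical sketch: clamp module weights into [min_weight, max_weight]."""

    def __init__(self, max_weight: float = 1e20, min_weight: float = -1e20):
        self.max_weight = max_weight
        self.min_weight = min_weight

    def __call__(self, module: nn.Module) -> None:
        # intended for model.apply(clip), which visits every submodule
        weight = getattr(module, "weight", None)
        if isinstance(weight, torch.Tensor):
            with torch.no_grad():
                weight.clamp_(self.min_weight, self.max_weight)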