Example #1
def model_desc_fn_to_option(fullname,
                            ds_name,
                            options,
                            use_latest_input=False,
                            aux_weight=0.4,
                            depth_multiplier=[1]):
    assert os.path.exists(fullname), fullname
    with open(fullname, 'rt') as fin:
        lines = fin.readlines()
        assert lines, 'file is empty'
        line = lines[0].strip()
        assert line, 'the net info line (first line) is empty'

    options.ds_name = ds_name
    if ds_name == 'cifar10' or ds_name == 'cifar100':
        is_ilsvrc = False
        options = cifar_default_train_options(options)
    elif ds_name == 'imagenet' or ds_name == 'ilsvrc':
        is_ilsvrc = True
        options = imagenet_mobile_default_train_options(options)
    else:
        raise ValueError('Unknown ds_name: {}'.format(ds_name))

    net_info = net_info_from_str(line)
    if isinstance(depth_multiplier, int):
        depth_multiplier = [depth_multiplier]
    if any([_x > 1 for _x in depth_multiplier]):
        net_info = increase_net_info_size(net_info, depth_multiplier)
    if is_ilsvrc:
        net_info = net_info_cifar_to_ilsvrc(net_info, options.s_type,
                                            use_latest_input)
    if aux_weight > 0:
        net_info = add_aux_weight(net_info, aux_weight)
    options.net_info = net_info
    return options
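
Only the first line of the description file is used: it must hold the serialized net info string that net_info_from_str can parse. A self-contained sketch of preparing such a file (the path and contents below are placeholders, not taken from the repo):

# Hypothetical illustration of the expected file layout: the first line is the
# serialized net info; anything after it is ignored by model_desc_fn_to_option.
import tempfile

net_info_line = '<serialized LayerInfoList goes here>'  # placeholder string
with tempfile.NamedTemporaryFile('wt', suffix='.txt', delete=False) as fout:
    fout.write(net_info_line + '\n')
    fout.write('trailing lines are ignored\n')
    fullname = fout.name

# model_desc_fn_to_option(fullname, 'cifar10', options) would then parse the
# first line via net_info_from_str; `options` must already provide the fields
# the chosen dataset branch expects (e.g. s_type when targeting ilsvrc).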
Example #2
def server_handle_child_message(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    Petridish server handles the return message of a forked
    process that watches over a child job.
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if (search_depth // 2 < options.max_growth
            and (options.search_max_flops is None
                    or fp < options.search_max_flops)):
        controller.add_one_to_queue(
            q_parent, mi_info, model_iter, None)

    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)
        n_hallu_per_parent = max(
            1,
            min(controller.n_hallu_per_parent_on_idle, n_idle))
        for _ in range(n_hallu_per_parent):
            net_info = copy.deepcopy(net_info_parent)
            hallus = net_info.sample_hallucinations(
                layer_ops=controller.valid_operations,
                merge_ops=controller.merge_operations,
                prob_at_layer=None,
                min_num_hallus=options.n_hallus_per_init,
                hallu_input_choice=options.hallu_input_choice)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
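
A finished child is only pushed back as a parent while two budgets hold: a growth-depth budget (search_depth // 2 < max_growth) and an optional FLOPs budget. A self-contained restatement of that gate (the helper name and the numbers are made up for illustration):

def may_requeue_as_parent(search_depth, fp, max_growth, search_max_flops):
    # Mirrors the two conditions used above: stay within the growth budget
    # and, when a FLOPs cap is configured, below that cap.
    within_growth = (search_depth // 2) < max_growth
    within_flops = (search_max_flops is None) or (fp < search_max_flops)
    return within_growth and within_flops

assert may_requeue_as_parent(search_depth=3, fp=2e8,
                             max_growth=5, search_max_flops=None)
assert not may_requeue_as_parent(search_depth=10, fp=2e8,
                                 max_growth=5, search_max_flops=None)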
Example #3
def model_options_processing(options):
    """
    Populate some complicated default arguments, and parse
    comma-separated integer strings into lists of ints.
    """
    if options.net_info_str is None:
        options.net_info = None
        return options
    if isinstance(options.net_info_str, str):
        try:
            options.net_info = net_info_from_str(options.net_info_str)
        except:
            logger.info("Failed info str is:\n{}".format(options.net_info_str))
            raise
    return options
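
The comma-separated list parsing mentioned in the docstring is not shown in this excerpt. A minimal sketch of such a parser (a hypothetical helper, not part of the function above):

def parse_int_list(comma_str, default=None):
    # Hypothetical helper: turn a string such as "1,2,4" into [1, 2, 4];
    # pass through `default` when the option was never set.
    if comma_str is None or comma_str == '':
        return default
    return [int(tok) for tok in str(comma_str).split(',') if tok != '']

assert parse_int_list('1,2,4') == [1, 2, 4]
assert parse_int_list(None, default=[1]) == [1]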
Example #4
def server_handle_child_message_soft_vs_hard(
        msg_output, controller, mi_info, options, n_idle, curr_iter):
    """
    A special replacement for server_handle_child_message, used to
    experiment with soft vs. hard initialization.

    This is for experiments only.
    TODO: share code with the regular server_handle_child_message?
    """
    log_dir_root = logger.get_logger_dir()
    q_parent, q_hallu = controller.q_parent, controller.q_hallu
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    fp, ve, te = jr['fp'], jr['ve'], jr['te']
    logger.info('CHILD : mi={} val_err={} test_err={} Gflops={}'.format(
        model_iter, ve, te, fp * 1e-9))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    if search_depth > 0:
        return curr_iter

    controller.n_hallu_per_parent_on_idle = 1
    # for soft vs hard experiment, only root generates hallu.
    controller.add_one_to_queue(q_parent, mi_info, model_iter, None)
    if q_parent.size() > 0:
        # choose a parent.
        pqe = controller.choose_parent(q_parent, mi_info)
        model_str, model_iter, _parent_iter, search_depth = pqe
        logger.info('PARENT : mi={}'.format(model_iter))
        # Create hallucinations on the parent
        net_info_parent = net_info_from_str(model_str)

        # This experiment samples hallucinations only once, from the root.
        hallus = net_info_parent.sample_hallucinations(
            layer_ops=controller.valid_operations,
            merge_ops=controller.merge_operations,
            prob_at_layer=None,
            min_num_hallus=options.n_hallus_per_init,
            hallu_input_choice=options.hallu_input_choice)

        for netmorph_method in ['hard', 'soft']:
            controller.set_netmorph_method(netmorph_method)
            net_info = copy.deepcopy(net_info_parent)
            net_info = net_info.add_hallucinations(
                hallus,
                final_merge_op=controller.hallu_final_merge_op,
                stop_gradient_val=controller.stop_gradient_val,
                hallu_gate_layer=controller.hallu_gate_layer)
            # Update mi_info
            curr_iter += 1
            hallu_str = net_info.to_str()
            mi_info.append(ModelSearchInfo(
                curr_iter, model_iter, search_depth + 1,
                None, None, hallu_str))
            controller.add_one_to_queue(
                q_hallu, mi_info, curr_iter, net_info)
    return curr_iter
Example #5
def server_main(
        controller, options,
        hallu_handle=None, child_handle=None, critic_handle=None):
    """
    Server entry point and main loop.
    """
    model_options_base = options
    log_dir_root = logger.get_logger_dir()
    model_dir_root = options.model_dir
    (
        mi_info,
        ipc,
        qname_to_pool,
        philly_wa,
        curr_iter,
        critic_iter,
        n_recv,
        n_last_train,
        n_last_mi_save
    ) = server_init(controller, options)
    # useful alias:
    (q_hallu, q_child) = (controller.q_hallu, controller.q_child)
    # message handles
    hallu_handle = (
        hallu_handle if hallu_handle else server_handle_hallu_message)
    child_handle = (
        child_handle if child_handle else server_handle_child_message)
    critic_handle = (
        critic_handle if critic_handle else server_handle_critic_message)

    # server main loop
    while ipc.pools.has_active() or q_child.size() > 0 or q_hallu.size() > 0:
        # Launch child/hallu sleepers
        for job_type, queue in zip(
                [TRAIN_HALLU, TRAIN_MODEL], [q_hallu, q_child]):
            # Keep launching workers until either the active pool is full
            # or the queue is empty.
            while ipc.pools.has_idle(job_type) and queue.size() > 0:
                model_str, model_iter, parent_iter, search_depth = queue.pop()
                # Log the pop order of models; important for later analysis.
                logger.info("mi={} pi={} sd={}".format(
                    model_iter, parent_iter, search_depth))
                logger.info("LayerInfoList is :\n{}".format(model_str))
                model_options = copy.deepcopy(model_options_base)
                model_options.net_info = net_info_from_str(model_str)
                fork_and_train_model(ipc=ipc,
                        options=model_options,
                        log_dir=_mi_to_dn(log_dir_root, model_iter),
                        child_dir=_mi_to_dn(model_dir_root, model_iter),
                        prev_dir=_mi_to_dn(model_dir_root, parent_iter),
                        model_str=model_str,
                        model_iter=model_iter,
                        parent_iter=parent_iter,
                        search_depth=search_depth,
                        job_type=job_type)

        # launch critic sleepers
        for qname in [q_child.name, q_hallu.name]:
            _n_new = n_recv[qname] - n_last_train[qname]
            _train_every = controller.controller_train_every
            if _n_new >= _train_every:
                pool = qname_to_pool[qname]
                if ipc.pools.has_idle(pool):
                    n_last_train[qname] = n_recv[qname]
                    ci = critic_iter[qname] = 1 + critic_iter[qname]
                    logger.info('Train critic {} ci={} ...'.format(qname, ci))
                    fork_and_train_critic(
                        ipc=ipc,
                        ctrl=controller,
                        data_dir=options.data_dir,
                        crawl_dirs=log_dir_root,
                        log_dir=_ci_to_dn(log_dir_root, ci, qname),
                        model_dir=_ci_to_dn(model_dir_root, ci, qname),
                        prev_dir=_ci_to_dn(model_dir_root, ci-1, qname),
                        critic_iter=ci,
                        queue_name=qname,
                        pool=pool)
                    logger.info('...Train critic launched')

        logger.info('Listening for message...')
        msg_output, job_type = ipc.get_finished_message()
        if job_type == TRAIN_HALLU:
            n_recv[q_hallu.name] += 1
            curr_iter = hallu_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options,
                curr_iter=curr_iter)

        elif job_type == TRAIN_MODEL:
            n_recv[q_child.name] += 1
            n_idle = ipc.pools.num_idle(TRAIN_HALLU)
            curr_iter = child_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options,
                n_idle=n_idle,
                curr_iter=curr_iter)

        elif job_type in [
                TRAIN_CRITIC_MODEL, TRAIN_CRITIC_HALLU, TRAIN_CRITIC_PARENT]:
            critic_handle(
                msg_output=msg_output,
                controller=controller,
                mi_info=mi_info,
                options=options)

        ## Periodic logging, heartbeat, and exit checks.
        n_finished = n_recv[q_child.name] + n_recv[q_hallu.name]
        philly_wa.new_heart_beat(cnt=n_finished)
        philly_wa.print_progress_percent()
        # Save mi_info periodically for critic training,
        # post-processing, and recovery.
        np.savez(_mi_info_save_fn(log_dir_root), mi_info=mi_info)
        # We have explored enough models; quit now.
        if n_finished >= options.max_exploration:
            break
    # end while (server main loop)
    logger.info(
        "Exiting server main. n_recv[hallu]={} n_recv[child]={}".format(
            n_recv[q_hallu.name], n_recv[q_child.name]))
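
Stripped of Petridish specifics, server_main is a standard dispatch loop: keep idle workers busy with queued work, block on the next finished message, route it to a handler by job type, then do periodic bookkeeping and exit checks. A self-contained skeleton of that shape (all names and the single shared queue are hypothetical simplifications):

import queue

def dispatch_loop(jobs, n_workers, handle_result, max_finished):
    # Hypothetical skeleton of the loop above: launch work while workers are
    # idle, block on one finished job, route it to a handler, and stop once
    # enough jobs have finished.
    pending = queue.Queue()
    for j in jobs:
        pending.put(j)
    idle, running, n_finished = n_workers, [], 0
    while running or not pending.empty():
        while idle > 0 and not pending.empty():
            running.append(pending.get())   # stands in for fork_and_train_*
            idle -= 1
        finished = running.pop(0)           # stands in for get_finished_message()
        idle += 1
        n_finished += 1
        handle_result(finished)             # hallu/child/critic handlers
        if n_finished >= max_finished:      # exit check, like max_exploration
            break
    return n_finished

# Example: three jobs, two workers, generous exploration budget.
assert dispatch_loop(['a', 'b', 'c'], 2, lambda j: None, max_finished=10) == 3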
Example #6
def server_handle_hallu_message(
        msg_output, controller, mi_info, options, curr_iter):
    """
    Petridish server handles the return message of a forked
    process that watches over a hallucination job.
    """
    log_dir_root = logger.get_logger_dir()
    q_child = controller.q_child
    model_str, model_iter, _parent_iter, search_depth = msg_output
    # Record performance in the main log
    jr = parse_remote_stop_file(_mi_to_dn(log_dir_root, model_iter))
    if jr is None:
        # job failure: reap the virtual resource and move on.
        logger.info('Failed mi={}'.format(model_iter))
        return curr_iter
    (fp, ve, te, hallu_stats, l_op_indices, l_op_omega) = (
        jr['fp'], jr['ve'], jr['te'], jr['l_stats'],
        jr['l_op_indices'], jr['l_op_omega']
    )
    logger.info(
        ("HALLU : mi={} val_err={} test_err={} "
         "Gflops={} hallu_stats={}").format(
            model_iter, ve, te, fp * 1e-9, hallu_stats))
    mi_info[model_iter].ve = ve
    mi_info[model_iter].fp = fp

    ## compute hallucination related info in net_info
    net_info = net_info_from_str(model_str)
    hallu_locs = net_info.contained_hallucination() # contained
    hallu_indices = net_info.sorted_hallu_indices(hallu_locs)
    # feature selection based on params
    l_fs_ops, l_fs_omega = feature_selection_cutoff(
        l_op_indices, l_op_omega, options)
    separated_hallu_info = net_info.separate_hallu_info_by_cname(
        hallu_locs, hallu_indices, l_fs_ops, l_fs_omega)

    ## Select a subset of hallucination to add to child model
    l_selected = []
    # Sort the indices 0, 1, ..., n_hallu - 1 by -cos(grad, hallu).
    processed_stats = [process_hallu_stats_for_critic_feat([stats]) \
        for stats in hallu_stats]
    logger.info('processed_stats={}'.format(processed_stats))
    logger.info('separated_hallu_info={}'.format(separated_hallu_info))

    # greedy select with gradient boosting
    l_greedy_selected = []
    if options.n_greed_select_per_init:
        greedy_order = sorted(
            range(len(hallu_indices)),
            key=lambda i: -processed_stats[i][0])
        min_select = options.n_hallus_per_select
        max_select = max(min_select, len(hallu_indices) // 2)
        for selected_len in range(min_select, max_select + 1):
            selected = greedy_order[:selected_len]
            l_greedy_selected.append(selected)
        n_greedy_select = len(l_greedy_selected)
        if n_greedy_select > options.n_greed_select_per_init:
            # Randomly down-sample the greedy candidates.
            l_greedy_selected = list(np.random.choice(
                l_greedy_selected,
                options.n_greed_select_per_init,
                replace=False))
    # random select a subset
    l_random_selected = []
    if options.n_rand_select_per_init:
        # also try some random samples
        l_random_selected = online_sampling(
            itertools.combinations(
                range(len(hallu_indices)),
                options.n_hallus_per_select
            ),
            options.n_rand_select_per_init)
        np.random.shuffle(l_random_selected)
    l_selected = l_greedy_selected + l_random_selected

    ## for each selected subset of hallu, make a model for q_child
    # Since more recently added ones tend to be better, insert in
    # reverse order so that the greedy selections are inserted last.
    for selected in reversed(l_selected):
        # new model description
        child_info = copy.deepcopy(net_info)
        l_hi = [ hallu_indices[s] for s in selected ]
        child_info = child_info.select_hallucination(
            l_hi, separated_hallu_info)
        # Compute initialization stat
        stat = process_hallu_stats_for_critic_feat(
            [hallu_stats[s] for s in selected])
        # update mi_info
        curr_iter += 1
        child_str = child_info.to_str()
        mi_info.append(ModelSearchInfo(
            curr_iter, model_iter, search_depth+1,
            None, None, child_str, stat))
        controller.add_one_to_queue(
            q_child, mi_info, curr_iter, child_info)
    return curr_iter
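
The candidate subsets above come from two sources: greedy prefixes of the indices ordered by the processed gradient statistic, and a few random fixed-size combinations (the real code streams the combinations through online_sampling rather than materializing them). A self-contained sketch of the same selection scheme using only the standard library (all names here are hypothetical):

import itertools
import random

def build_candidate_subsets(scores, n_per_select, n_greedy, n_random):
    # scores[i]: larger means the i-th hallucination looks more promising.
    n = len(scores)
    greedy_order = sorted(range(n), key=lambda i: -scores[i])
    max_select = max(n_per_select, n // 2)
    greedy = [greedy_order[:k] for k in range(n_per_select, max_select + 1)]
    if len(greedy) > n_greedy:
        greedy = random.sample(greedy, n_greedy)
    all_combos = list(itertools.combinations(range(n), n_per_select))
    randoms = [list(c) for c in random.sample(all_combos,
                                              min(n_random, len(all_combos)))]
    return greedy + randoms

subsets = build_candidate_subsets([0.9, 0.1, 0.5, 0.7],
                                  n_per_select=2, n_greedy=1, n_random=2)
assert all(len(s) >= 2 for s in subsets)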