Example #1
    def __init__(self, test_case=None):
        self.parser = None
        self.config = Config()
        self.driver = None
        # test_case is expected to be a class: its class attributes and
        # __name__ are read directly
        if test_case is not None:
            self.browser = test_case.BROWSER
            self.headless = test_case.HEAD_LESS
            logger.config_logger(test_case.__name__, self.browser)
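
A minimal usage sketch. Because the snippet reads `test_case.__name__` and class attributes, `test_case` is expected to be a class, not an instance. The enclosing runner class, `Config`, and `logger` are not shown above, so the names below are assumptions:

class LoginTest:
    BROWSER = "chrome"    # attribute names taken from the snippet above
    HEAD_LESS = True

# runner = SeleniumRunner(LoginTest)  # `SeleniumRunner` is a hypothetical name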
Example #2
    def __init__(self,
                 threadID: int,
                 threadName: str,
                 tqueue: queue.Queue,
                 host: str = "localhost",
                 port: str = "5555"):
        threading.Thread.__init__(self)

        # Threading variables
        self._threadID = threadID
        self._threadName = threadName
        self.__tqueue = tqueue
        self.__stoprequest = threading.Event()

        self.__logger = config_logger(name='request_handler',
                                      filepath='./logs/rhandler.log')

        # ZMQ context
        self.__host = host
        self.__port = port

        self.__context = zmq.Context()
        self._server = self.__context.socket(zmq.REP)
        self._server.connect("tcp://%s:%s" % (self.__host, self.__port))

        self.__poller = zmq.Poller()
        self.__poller.register(self._server, zmq.POLLIN)
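
Only the constructor is shown. Below is a hedged sketch of the receive loop such a thread typically runs, built from the registered poller and the stop event; the real run() body is not part of the snippet, so this is an assumption. Note the REP socket connects rather than binds, which matches Example #4, where the environment side binds 'tcp://*:5555'.

import zmq

class RequestHandler:  # continuing the class above; this run() is a sketch
    def run(self):
        while not self.__stoprequest.is_set():
            # poll with a timeout so the stop event is re-checked regularly
            socks = dict(self.__poller.poll(timeout=1000))  # milliseconds
            if socks.get(self._server) == zmq.POLLIN:
                message = self._server.recv()
                # REP must send exactly one reply per request; echo as a stub
                self._server.send(message)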
Example #3
    def __init__(self, threadID: int, threadName: str, queue: queue.Queue):
        # NB: the parameter `queue` shadows the stdlib queue module inside
        # this method; the annotation still resolves because it is evaluated
        # in the enclosing scope
        threading.Thread.__init__(self)

        # Threading variables
        self._threadID = threadID
        self._threadName = threadName
        self._queue = queue
        self._stoprequest = threading.Event()

        self.__logger = config_logger(name=self._threadName,
                                      filepath='./logs/{}.log'.format(
                                          self._threadName))
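
The `stophandler()` calls in Example #7 suggest threads of this shape expose a stop method; a minimal sketch, assuming it only needs to set the stop event:

    def stophandler(self):
        # signal the run() loop to exit; the owner joins the thread afterwards
        self._stoprequest.set()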
Example #4
def environment(bdw_paths: mp.Array, stop_env: mp.Event, end_of_run: mp.Event):
    rhostname = 'mininet' + '@' + SSH_HOST

    config = {
        'server': 'ipc:///tmp/zmq',
        'client': 'tcp://*:5555',
        'publisher': 'tcp://*:5556',
        'subscriber': 'ipc:///tmp/pubsub'
    }
    logger = config_logger('environment', filepath='./logs/environment.log')
    env = Environment(bdw_paths,
                      logger=logger,
                      mconfig=config,
                      remoteHostname=rhostname)

    # Let's time how long each environment run takes
    while not stop_env.is_set():

        # Only the agent can unblock this loop, after a training-batch has been completed
        while not end_of_run.is_set():
            try:
                # update environment config from session
                if env.updateEnvironment() == -1:
                    stop_env.set()
                    end_of_run.set()
                    break

                # run a single session & measure
                #-------------------
                now = time.time()
                env.run()
                end = time.time()
                #-------------------

                diff = int(end - now)
                logger.debug("Time to execute one run: {}s".format(diff))

                end_of_run.set()  # set the end of run so our agent knows
                # env.spawn_middleware() # restart middleware
            except Exception as ex:
                logger.error(ex)
                break
        time.sleep(0.1)

    env.close()
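
Note the event handshake: the environment sets `end_of_run` after each measured run and then idles in the outer loop until the agent (Example #7) clears it after a batch update; `stop_env` is set either here, when `updateEnvironment()` fails, or by the agent when training ends.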
Example #5
def train_net(config):
    pGen, pKv, pRpn, pRoi, pBbox, pDataset, pModel, pOpt, pTest, \
    transform, data_name, label_name, metric_list = config.get_config(is_train=True)
    pGen = patch_config_as_nothrow(pGen)
    pKv = patch_config_as_nothrow(pKv)
    pRpn = patch_config_as_nothrow(pRpn)
    pRoi = patch_config_as_nothrow(pRoi)
    pBbox = patch_config_as_nothrow(pBbox)
    pDataset = patch_config_as_nothrow(pDataset)
    pModel = patch_config_as_nothrow(pModel)
    pOpt = patch_config_as_nothrow(pOpt)
    pTest = patch_config_as_nothrow(pTest)

    ctx = [mx.gpu(int(i)) for i in pKv.gpus]
    pretrain_prefix = pModel.pretrain.prefix
    pretrain_epoch = pModel.pretrain.epoch
    prefix = pGen.name
    save_path = os.path.join("experiments", prefix)
    begin_epoch = pOpt.schedule.begin_epoch
    end_epoch = pOpt.schedule.end_epoch
    lr_iter = pOpt.schedule.lr_iter

    # only rank == 0 prints all debug info
    kvstore_type = "dist_sync" if os.environ.get(
        "DMLC_ROLE") == "worker" else pKv.kvstore
    kv = mx.kvstore.create(kvstore_type)
    rank = kv.rank

    # for distributed training using shared file system
    os.makedirs(save_path, exist_ok=True)

    from utils.logger import config_logger
    config_logger(os.path.join(save_path, "log.txt"))

    model_prefix = os.path.join(save_path, "checkpoint")

    # set up logger
    logger = logging.getLogger()

    sym = pModel.train_symbol

    # setup multi-gpu
    input_batch_size = pKv.batch_image * len(ctx)

    # print config
    # if rank == 0:
    #     logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = pDataset.image_set
    roidbs = [
        pkl.load(open("data/cache/{}.roidb".format(i), "rb"),
                 encoding="latin1") for i in image_sets
    ]
    roidb = reduce(lambda x, y: x + y, roidbs)
    # filter empty image
    roidb = [rec for rec in roidb if rec["gt_bbox"].shape[0] > 0]
    # add flip roi record
    flipped_roidb = []
    for rec in roidb:
        new_rec = rec.copy()
        new_rec["flipped"] = True
        flipped_roidb.append(new_rec)
    roidb = roidb + flipped_roidb

    from core.detection_input import AnchorLoader
    train_data = AnchorLoader(roidb=roidb,
                              transform=transform,
                              data_name=data_name,
                              label_name=label_name,
                              batch_size=input_batch_size,
                              shuffle=True,
                              kv=kv,
                              num_worker=pGen.loader_worker or 12,
                              num_collector=pGen.loader_collector or 1,
                              worker_queue_depth=2,
                              collector_queue_depth=2)

    # infer shape
    worker_data_shape = dict(train_data.provide_data +
                             train_data.provide_label)
    for key in worker_data_shape:
        worker_data_shape[key] = (
            pKv.batch_image, ) + worker_data_shape[key][1:]
    arg_shape, _, aux_shape = sym.infer_shape(**worker_data_shape)

    _, out_shape, _ = sym.get_internals().infer_shape(**worker_data_shape)
    out_shape_dict = list(zip(sym.get_internals().list_outputs(), out_shape))

    _, out_shape, _ = sym.infer_shape(**worker_data_shape)
    terminal_out_shape_dict = zip(sym.list_outputs(), out_shape)

    if rank == 0:
        logger.info('parameter shape')
        logger.info(
            pprint.pformat(
                [i for i in out_shape_dict if not i[0].endswith('output')]))

        logger.info('intermediate output shape')
        logger.info(
            pprint.pformat(
                [i for i in out_shape_dict if i[0].endswith('output')]))

        logger.info('terminal output shape')
        logger.info(pprint.pformat([i for i in terminal_out_shape_dict]))

    # memonger
    if pModel.memonger:
        last_block = pModel.memonger_until or ""
        if rank == 0:
            logger.info("do memonger up to {}".format(last_block))

        type_dict = {k: np.float32 for k in worker_data_shape}
        sym = search_plan_to_layer(sym,
                                   last_block,
                                   1000,
                                   type_dict=type_dict,
                                   **worker_data_shape)

    # load and initialize params
    if pOpt.schedule.begin_epoch != 0:
        arg_params, aux_params = load_checkpoint(model_prefix, begin_epoch)
    elif pModel.from_scratch:
        arg_params, aux_params = dict(), dict()
    else:
        arg_params, aux_params = load_checkpoint(pretrain_prefix,
                                                 pretrain_epoch)

    if pModel.process_weight is not None:
        pModel.process_weight(sym, arg_params, aux_params)
    '''
    There are conflicts between `merge_bn` and `attach_quantize_node` in
    graph_optimize.py when merge_bn runs ahead of attach_quantize_node,
    such as `Symbol.ComposeKeyword`.
    '''
    if pModel.QuantizeTrainingParam is not None and pModel.QuantizeTrainingParam.quantize_flag:
        pQuant = pModel.QuantizeTrainingParam
        assert not pGen.fp16, "quantized training currently supports only fp32 mode."
        from utils.graph_optimize import attach_quantize_node
        _, out_shape, _ = sym.get_internals().infer_shape(**worker_data_shape)
        out_shape_dictionary = dict(
            zip(sym.get_internals().list_outputs(), out_shape))
        sym = attach_quantize_node(sym, out_shape_dictionary,
                                   pQuant.WeightQuantizeParam,
                                   pQuant.ActQuantizeParam,
                                   pQuant.quantized_op)
    # merge batch normalization to save memory when training with fixed BN
    from utils.graph_optimize import merge_bn
    sym, arg_params, aux_params = merge_bn(sym, arg_params, aux_params)

    if pModel.random:
        import time
        mx.random.seed(int(time.time()))
        np.random.seed(int(time.time()))

    init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
    init.set_verbosity(verbose=True)

    # create solver
    fixed_param = pModel.pretrain.fixed_param
    excluded_param = pModel.pretrain.excluded_param
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    if pModel.teacher_param:
        from models.KD.utils import create_teacher_module
        from models.KD.detection_module import KDDetModule
        t_mod, t_label_name, t_label_shape = create_teacher_module(
            pModel.teacher_param, worker_data_shape, input_batch_size, ctx,
            rank, logger)
        mod = KDDetModule(sym,
                          teacher_module=t_mod,
                          teacher_label_names=t_label_name,
                          teacher_label_shapes=t_label_shape,
                          data_names=data_names,
                          label_names=label_names,
                          logger=logger,
                          context=ctx,
                          fixed_param=fixed_param,
                          excluded_param=excluded_param)
    else:
        mod = DetModule(sym,
                        data_names=data_names,
                        label_names=label_names,
                        logger=logger,
                        context=ctx,
                        fixed_param=fixed_param,
                        excluded_param=excluded_param)

    eval_metrics = mx.metric.CompositeEvalMetric(metric_list)

    # callback
    batch_end_callback = [
        callback.Speedometer(train_data.batch_size,
                             frequent=pGen.log_frequency)
    ]
    batch_end_callback += pModel.batch_end_callbacks or []
    epoch_end_callback = callback.do_checkpoint(model_prefix)
    sym.save(model_prefix + ".json")

    # decide learning rate
    lr_mode = pOpt.optimizer.lr_mode or 'step'
    base_lr = pOpt.optimizer.lr * kv.num_workers
    lr_factor = pOpt.schedule.lr_factor or 0.1

    iter_per_epoch = len(train_data) // input_batch_size
    total_iter = iter_per_epoch * (end_epoch - begin_epoch)
    lr_iter = [total_iter + it if it < 0 else it for it in lr_iter]
    lr_iter = [it // kv.num_workers for it in lr_iter]
    lr_iter = [it - iter_per_epoch * begin_epoch for it in lr_iter]
    lr_iter_discount = [it for it in lr_iter if it > 0]
    current_lr = base_lr * (lr_factor**(len(lr_iter) - len(lr_iter_discount)))
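    # Worked example of the schedule arithmetic above (illustrative numbers,
    # not from the source): with iter_per_epoch = 10000, begin_epoch = 0 and
    # end_epoch = 6, total_iter = 60000; lr_iter = [-20000, -10000] becomes
    # [40000, 50000], and with kv.num_workers = 2 it halves to [20000, 25000].
    # When resuming past a step, that step drops out of lr_iter_discount and
    # current_lr is pre-decayed by lr_factor once per dropped step.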
    if rank == 0:
        logging.info('total iter {}'.format(total_iter))
        logging.info('lr {}, lr_iters {}'.format(current_lr, lr_iter_discount))
        logging.info('lr mode: {}'.format(lr_mode))

    if pOpt.warmup and pOpt.schedule.begin_epoch == 0:
        if rank == 0:
            logging.info('warmup lr {}, warmup step {}'.format(
                pOpt.warmup.lr, pOpt.warmup.iter))
        if lr_mode == 'step':
            lr_scheduler = WarmupMultiFactorScheduler(
                step=lr_iter_discount,
                factor=lr_factor,
                warmup=True,
                warmup_type=pOpt.warmup.type,
                warmup_lr=pOpt.warmup.lr,
                warmup_step=pOpt.warmup.iter)
        elif lr_mode == 'cosine':
            warmup_lr_scheduler = AdvancedLRScheduler(mode='linear',
                                                      base_lr=pOpt.warmup.lr,
                                                      target_lr=base_lr,
                                                      niters=pOpt.warmup.iter)
            cosine_lr_scheduler = AdvancedLRScheduler(
                mode='cosine',
                base_lr=base_lr,
                target_lr=0,
                niters=(iter_per_epoch *
                        (end_epoch - begin_epoch)) - pOpt.warmup.iter)
            lr_scheduler = LRSequential(
                [warmup_lr_scheduler, cosine_lr_scheduler])
        else:
            raise NotImplementedError
    else:
        if lr_mode == 'step':
            lr_scheduler = WarmupMultiFactorScheduler(step=lr_iter_discount,
                                                      factor=lr_factor)
        elif lr_mode == 'cosine':
            lr_scheduler = AdvancedLRScheduler(mode='cosine',
                                               base_lr=base_lr,
                                               target_lr=0,
                                               niters=iter_per_epoch *
                                               (end_epoch - begin_epoch))
        else:
            lr_scheduler = None

    # optimizer
    optimizer_params = dict(momentum=pOpt.optimizer.momentum,
                            wd=pOpt.optimizer.wd,
                            learning_rate=current_lr,
                            lr_scheduler=lr_scheduler,
                            rescale_grad=1.0 / (len(ctx) * kv.num_workers),
                            clip_gradient=pOpt.optimizer.clip_gradient)

    if pKv.fp16:
        optimizer_params['multi_precision'] = True
        optimizer_params['rescale_grad'] /= 128.0

    profile = pGen.profile or False
    if profile:
        mx.profiler.set_config(profile_all=True,
                               filename=os.path.join(save_path,
                                                     "profile.json"))

    # train
    mod.fit(train_data=train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=kv,
            optimizer=pOpt.optimizer.type,
            optimizer_params=optimizer_params,
            initializer=init,
            allow_missing=True,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch,
            profile=profile)

    logging.info("Training has done")
    time.sleep(10)
    logging.info("Exiting")
Example #6
    # Draw one rectangle (and an optional label) per box on the matplotlib
    # axis `ax`; boxes are (x1, y1, x2, y2) inclusive pixel coordinates.
    assert boxes.shape[1] == 4
    for i in range(len(boxes)):
        x1, y1, x2, y2 = boxes[i]
        # Rectangle takes ((x, y) lower-left corner, width, height)
        coords = (x1, y1), x2 - x1 + 1, y2 - y1 + 1
        c = color if color else npr.random(3)
        rect_kwargs = dict(fill=False, edgecolor=c, linewidth=2)
        ax.add_patch(plt.Rectangle(*coords, **rect_kwargs))
        if labels is not None:
            text_kwargs = dict(size='small',
                               color='white',
                               bbox=dict(facecolor=c, alpha=0.5, pad=0.15))
            ax.text(x1 - 2, y1 - 2, labels[i], **text_kwargs)


if __name__ == "__main__":
    config_logger()

    # parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, help='deploy prototxt file')
    parser.add_argument('--weights', type=str, help='model weights file')
    parser.add_argument('--gpu',
                        default=0,
                        type=int,
                        help='gpu id, -1 for cpu')
    parser.add_argument('--conf-thresh',
                        default=0.6,
                        type=float,
                        help='detection confidence threshold')
    parser.add_argument('--nms-thresh',
                        default=0.5,
                        type=float,
                        help='detection nms threshold')
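
A plausible invocation (hedged: the script and file names are placeholders; the flags come from the parser above):

python demo.py --model deploy.prototxt --weights model.caffemodel --gpu 0 --conf-thresh 0.6 --nms-thresh 0.5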
Example #7
def agent():
    np.random.seed(RANDOM_SEED)

    # Create results path
    if not os.path.exists(SUMMARY_DIR):
        os.makedirs(SUMMARY_DIR)

    # Spawn request handler
    tqueue = queue.Queue(1)
    rhandler = RequestHandler(1,
                              "rhandler-thread",
                              tqueue=tqueue,
                              host=SSH_HOST,
                              port='5555')
    rhandler.start()

    # Spawn collector thread
    cqueue = queue.Queue(0)
    collector = Collector(2,
                          "collector-thread",
                          queue=cqueue,
                          host=SSH_HOST,
                          port='5556')
    collector.start()

    # Spawn environment process -- not a thread
    bdw_paths = mp.Array('i', 2)
    stop_env = mp.Event()
    end_of_run = mp.Event()
    env = mp.Process(target=environment,
                     args=(bdw_paths, stop_env, end_of_run))
    env.start()

    # keep record of threads and processes
    tp_list = [rhandler, collector, env]

    # Main training loop
    logger = config_logger('agent', './logs/agent.log')
    logger.info("Run Agent until training stops...")

    with tf.Session() as sess, open(LOG_FILE, 'w') as log_file:
        actor = a3c.ActorNetwork(sess,
                                 state_dim=[S_INFO, S_LEN],
                                 action_dim=A_DIM,
                                 learning_rate=ACTOR_LR_RATE)

        critic = a3c.CriticNetwork(sess,
                                   state_dim=[S_INFO, S_LEN],
                                   learning_rate=CRITIC_LR_RATE)

        summary_ops, summary_vars = a3c.build_summaries()

        sess.run(tf.global_variables_initializer())
        writer = tf.summary.FileWriter(SUMMARY_DIR,
                                       sess.graph)  # training monitor
        saver = tf.train.Saver()  # save neural net parameters

        # restore neural net parameters
        nn_model = NN_MODEL
        if nn_model is not None:  # nn_model is the path to file
            saver.restore(sess, nn_model)
            print("Model restored.")

        epoch = EPOCH
        time_stamp = 0

        path = DEFAULT_PATH

        action_vec = np.zeros(A_DIM)
        action_vec[path] = 1

        s_batch = [np.zeros((S_INFO, S_LEN))]
        a_batch = [action_vec]
        r_batch = []
        entropy_record = []

        actor_gradient_batch = []
        critic_gradient_batch = []

        list_states = []
        while not end_of_run.is_set():
            # Get scheduling request from rhandler thread
            request, ev1 = get_request(tqueue, logger, end_of_run=end_of_run)

            # end of iterations -> exit loop -> save -> bb
            if stop_env.is_set():
                break

            if request is None and end_of_run.is_set():
                logger.info("END_OF_RUN => BATCH UPDATE")

                # get all stream_info from collector's queue
                stream_info = []
                with cqueue.mutex:
                    for elem in list(cqueue.queue):
                        stream_info.append(elem)
                    # clear the queue
                    cqueue.queue.clear()

                # Validate
                # Proceed to next run
                # logger.info("len(list_states) {} == len(stream_info) {}".format(len(list_states), len(stream_info)))
                if len(list_states) != len(stream_info) or len(
                        list_states) == 0:
                    entropy_record = []
                    del s_batch[:]
                    del a_batch[:]
                    del r_batch[:]
                    stream_info.clear()
                    list_states.clear()
                    end_of_run.clear()
                    time.sleep(0.01)
                    continue

                # Re-order rewards
                stream_info = arrangeStateStreamsInfo(list_states, stream_info)
                list_ids = [stream['StreamID'] for stream in stream_info]
                logger.info("all unique: {}".format(
                    allUnique(list_ids, debug=True)))

                # for i, stream in enumerate(stream_info):
                #     logger.info(stream)
                #     logger.info(list_states[i]) # print this on index based

                # For each stream calculate a reward
                completion_times = []
                for index, stream in enumerate(stream_info):
                    path1_smoothed_RTT, path1_bandwidth, path1_packets, \
                    path1_retransmissions, path1_losses, \
                    path2_smoothed_RTT, path2_bandwidth, path2_packets, \
                    path2_retransmissions, path2_losses, \
                        = getTrainingVariables(list_states[index])

                    normalized_bwd_path0 = (bdw_paths[0] - 1.0) / (100.0 - 1.0)
                    normalized_bwd_path1 = (bdw_paths[1] - 1.0) / (100.0 - 1.0)
                    normalized_srtt_path0 = ((path1_smoothed_RTT * 1000.0) - 1.0) / 120.0
                    normalized_srtt_path1 = ((path2_smoothed_RTT * 1000.0) - 1.0) / 120.0
                    normalized_loss_path0 = ((path1_retransmissions + path1_losses) - 0.0) / 20.0
                    normalized_loss_path1 = ((path2_retransmissions + path2_losses) - 0.0) / 20.0

                    # aggr_bdw = normalized_bwd_path0 + normalized_bwd_path1
                    aggr_srtt = normalized_srtt_path0 + normalized_srtt_path1
                    aggr_loss = normalized_loss_path0 + normalized_loss_path1

                    reward = (a_batch[index][0] * normalized_bwd_path0 +
                              a_batch[index][1] * normalized_bwd_path1
                              ) - stream['CompletionTime'] - (
                                  0.8 * aggr_srtt) - (1.0 * aggr_loss)
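                    # Reward: normalized bandwidth of the chosen path
                    # (a_batch[index] is one-hot), minus the stream's
                    # completion time and weighted sRTT/loss penalties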
                    r_batch.append(reward)
                    completion_times.append(stream['CompletionTime'])

                # s_batch may hold one more state than r_batch; if so, pad
                # r_batch with a leading 0 reward
                tmp_s_batch = np.stack(s_batch[:], axis=0)
                tmp_r_batch = np.vstack(r_batch[:])
                if tmp_s_batch.shape[0] > tmp_r_batch.shape[0]:
                    logger.debug("s_batch({}) > r_batch({})".format(
                        tmp_s_batch.shape[0], tmp_r_batch.shape[0]))
                    logger.debug(tmp_s_batch[0])
                    r_batch.insert(0, 0)

                # Save metrics for debugging
                # log time_stamp, bit_rate, buffer_size, reward
                for index, stream in enumerate(stream_info):
                    path1_smoothed_RTT, path1_bandwidth, path1_packets, \
                    path1_retransmissions, path1_losses, \
                    path2_smoothed_RTT, path2_bandwidth, path2_packets, \
                    path2_retransmissions, path2_losses, \
                        = getTrainingVariables(list_states[index])
                    log_file.write(
                        str(time_stamp) + '\t' + str(PATHS[path]) + '\t' +
                        str(bdw_paths[0]) + '\t' + str(bdw_paths[1]) + '\t' +
                        str(path1_smoothed_RTT) + '\t' +
                        str(path2_smoothed_RTT) + '\t' +
                        str(path1_retransmissions + path1_losses) + '\t' +
                        str(path2_retransmissions + path2_losses) + '\t' +
                        str(stream['CompletionTime']) + '\t' +
                        str(stream['Path']) + '\n')
                    log_file.flush()
                    time_stamp += 1

                # Single Training step
                # ----------------------------------------------------------------------------------------------------
                actor_gradient, critic_gradient, td_batch = \
                    a3c.compute_gradients(
                        s_batch=np.stack(s_batch[1:], axis=0),  # ignore the first chunk,
                        a_batch=np.vstack(a_batch[1:]),         # since we don't have
                        r_batch=np.vstack(r_batch[1:]),         # control over it
                        terminal=True, actor=actor, critic=critic)
                td_loss = np.mean(td_batch)

                actor_gradient_batch.append(actor_gradient)
                critic_gradient_batch.append(critic_gradient)

                logger.debug("====")
                logger.debug("Epoch: {}".format(epoch))
                msg = "TD_loss: {}, Avg_reward: {}, Avg_entropy: {}".format(
                    td_loss, np.mean(r_batch[1:]), np.mean(entropy_record[1:]))
                logger.debug(msg)
                logger.debug("====")
                # ----------------------------------------------------------------------------------------------------

                # Print summary for tensorflow
                # ----------------------------------------------------------------------------------------------------
                summary_str = sess.run(summary_ops,
                                       feed_dict={
                                           summary_vars[0]: td_loss,
                                           summary_vars[1]: np.mean(r_batch),
                                           summary_vars[2]: np.mean(entropy_record),
                                           summary_vars[3]: np.mean(completion_times)
                                       })

                writer.add_summary(summary_str, epoch)
                writer.flush()
                # ----------------------------------------------------------------------------------------------------

                # Update gradients
                if len(actor_gradient_batch) >= GRADIENT_BATCH_SIZE:
                    assert len(actor_gradient_batch) == len(
                        critic_gradient_batch)

                    for i in range(len(actor_gradient_batch)):
                        actor.apply_gradients(actor_gradient_batch[i])
                        critic.apply_gradients(critic_gradient_batch[i])

                    epoch += 1
                    if epoch % MODEL_SAVE_INTERVAL == 0:
                        save_path = saver.save(
                            sess, SUMMARY_DIR + "/nn_model_ep_" + str(epoch) +
                            ".ckpt")

                entropy_record = []

                # Clear all before proceeding to next run
                del s_batch[:]
                del a_batch[:]
                del r_batch[:]
                stream_info.clear()
                list_states.clear()
                end_of_run.clear()
            else:
                ev1.set()  # let `producer` (rh) know we received request
                list_states.append(request)

                # The bandwidth metrics coming from MPQUIC are not correct:
                # they are constant values that never get updated
                path1_smoothed_RTT, path1_bandwidth, path1_packets, \
                path1_retransmissions, path1_losses, \
                path2_smoothed_RTT, path2_bandwidth, path2_packets, \
                path2_retransmissions, path2_losses, \
                    = getTrainingVariables(request)

                time_stamp += 1  # in ms
                last_path = path

                # retrieve previous state
                if len(s_batch) == 0:
                    state = np.zeros((S_INFO, S_LEN))
                else:
                    state = np.array(s_batch[-1], copy=True)

                # dequeue history record
                state = np.roll(state, -1, axis=1)

                # this should be S_INFO number of terms
                state[0, -1] = (bdw_paths[0] - 1.0) / (100.0 - 1.0)  # bandwidth path1
                state[1, -1] = (bdw_paths[1] - 1.0) / (100.0 - 1.0)  # bandwidth path2
                state[2, -1] = ((path1_smoothed_RTT * 1000.0) - 1.0) / 120.0  # max RTT so far 120ms
                state[3, -1] = ((path2_smoothed_RTT * 1000.0) - 1.0) / 120.0
                state[4, -1] = ((path1_retransmissions + path1_losses) - 0.0) / 20.0
                state[5, -1] = ((path2_retransmissions + path2_losses) - 0.0) / 20.0

                s_batch.append(state)

                action_prob = actor.predict(
                    np.reshape(state, (1, S_INFO, S_LEN)))
                action_cumsum = np.cumsum(action_prob)
                path = (action_cumsum > np.random.randint(1, RAND_RANGE) /
                        float(RAND_RANGE)).argmax()

                action_vec = np.zeros(A_DIM)
                action_vec[path] = 1
                a_batch.append(action_vec)

                logger.debug("PATH: {}".format(path))

                entropy_record.append(a3c.compute_entropy(action_prob[0]))

                # prepare response
                response = [request['StreamID'], PATHS[path]]
                response = [str(r).encode('utf-8') for r in response]
                ev2 = threading.Event()
                put_response((response, ev2), tqueue, logger)
                ev2.wait()  # blocks until `consumer` (i.e. rh) receives response

    # send kill signal to all
    stop_env.set()
    rhandler.stophandler()
    collector.stophandler()

    # wait for threads and process to finish gracefully...
    for tp in tp_list:
        tp.join()
Example #8
def train_net(config):
    pGen, pKv, pRpn, pRoi, pBbox, pDataset, pModel, pOpt, pTest, \
    transform, data_name, label_name, metric_list = config.get_config(is_train=True)

    ctx = [mx.gpu(int(i)) for i in pKv.gpus]
    pretrain_prefix = pModel.pretrain.prefix
    pretrain_epoch = pModel.pretrain.epoch
    prefix = pGen.name
    save_path = os.path.join("experiments", prefix)
    begin_epoch = pOpt.schedule.begin_epoch
    end_epoch = pOpt.schedule.end_epoch
    lr_iter = pOpt.schedule.lr_iter

    # only rank == 0 prints all debug info
    kvstore_type = "dist_sync" if os.environ.get(
        "DMLC_ROLE") == "worker" else pKv.kvstore
    kv = mx.kvstore.create(kvstore_type)
    rank = kv.rank

    # for distributed training using shared file system
    if rank == 0:
        if not os.path.exists(save_path):
            os.makedirs(save_path)

    from utils.logger import config_logger
    config_logger(os.path.join(save_path, "log.txt"))

    model_prefix = os.path.join(save_path, "checkpoint")

    # set up logger
    logger = logging.getLogger()

    sym = pModel.train_symbol

    # setup multi-gpu
    input_batch_size = pKv.batch_image * len(ctx)

    # print config
    # if rank == 0:
    #     logger.info(pprint.pformat(config))

    # load dataset and prepare imdb for training
    image_sets = pDataset.image_set
    roidbs = [
        pkl.load(open("data/cache/{}.roidb".format(i), "rb"),
                 encoding="latin1") for i in image_sets
    ]
    roidb = reduce(lambda x, y: x + y, roidbs)
    # filter empty image
    roidb = [rec for rec in roidb if rec["gt_bbox"].shape[0] > 0]
    # add flip roi record
    flipped_roidb = []
    for rec in roidb:
        new_rec = rec.copy()
        new_rec["flipped"] = True
        flipped_roidb.append(new_rec)
    roidb = roidb + flipped_roidb

    from core.detection_input import AnchorLoader
    train_data = AnchorLoader(roidb=roidb,
                              transform=transform,
                              data_name=data_name,
                              label_name=label_name,
                              batch_size=input_batch_size,
                              shuffle=True,
                              kv=kv)

    # infer shape
    worker_data_shape = dict(train_data.provide_data +
                             train_data.provide_label)
    for key in worker_data_shape:
        worker_data_shape[key] = (
            pKv.batch_image, ) + worker_data_shape[key][1:]
    arg_shape, _, aux_shape = sym.infer_shape(**worker_data_shape)

    _, out_shape, _ = sym.get_internals().infer_shape(**worker_data_shape)
    out_shape_dict = list(zip(sym.get_internals().list_outputs(), out_shape))

    _, out_shape, _ = sym.infer_shape(**worker_data_shape)
    terminal_out_shape_dict = zip(sym.list_outputs(), out_shape)

    if rank == 0:
        logger.info('parameter shape')
        logger.info(
            pprint.pformat(
                [i for i in out_shape_dict if not i[0].endswith('output')]))

        logger.info('intermediate output shape')
        logger.info(
            pprint.pformat(
                [i for i in out_shape_dict if i[0].endswith('output')]))

        logger.info('terminal output shape')
        logger.info(pprint.pformat([i for i in terminal_out_shape_dict]))

    # memonger
    if pModel.memonger:
        last_block = pModel.memonger_until or ""
        if rank == 0:
            logger.info("do memonger up to {}".format(last_block))

        type_dict = {k: np.float32 for k in worker_data_shape}
        sym = search_plan_to_layer(sym,
                                   last_block,
                                   1000,
                                   type_dict=type_dict,
                                   **worker_data_shape)

    # load and initialize params
    if pOpt.schedule.begin_epoch != 0:
        arg_params, aux_params = load_checkpoint(model_prefix, begin_epoch)
    elif pModel.from_scratch:
        arg_params, aux_params = dict(), dict()
    else:
        arg_params, aux_params = load_checkpoint(pretrain_prefix,
                                                 pretrain_epoch)

    try:
        pModel.process_weight(sym, arg_params, aux_params)
    except AttributeError:
        pass

    if pModel.random:
        import time
        mx.random.seed(int(time.time()))
        np.random.seed(int(time.time()))

    init = mx.init.Xavier(factor_type="in", rnd_type='gaussian', magnitude=2)
    init.set_verbosity(verbose=True)

    # create solver
    fixed_param_prefix = pModel.pretrain.fixed_param
    data_names = [k[0] for k in train_data.provide_data]
    label_names = [k[0] for k in train_data.provide_label]

    mod = DetModule(sym,
                    data_names=data_names,
                    label_names=label_names,
                    logger=logger,
                    context=ctx,
                    fixed_param_prefix=fixed_param_prefix)

    eval_metrics = mx.metric.CompositeEvalMetric(metric_list)

    # callback
    batch_end_callback = callback.Speedometer(train_data.batch_size,
                                              frequent=pGen.log_frequency)
    epoch_end_callback = callback.do_checkpoint(model_prefix)
    sym.save(model_prefix + ".json")

    # decide learning rate
    base_lr = pOpt.optimizer.lr * kv.num_workers
    lr_factor = 0.1

    iter_per_epoch = len(train_data) // input_batch_size
    lr_iter = [it // kv.num_workers for it in lr_iter]
    lr_iter = [it - iter_per_epoch * begin_epoch for it in lr_iter]
    lr_iter_discount = [it for it in lr_iter if it > 0]
    current_lr = base_lr * (lr_factor**(len(lr_iter) - len(lr_iter_discount)))
    if rank == 0:
        logging.info('total iter {}'.format(iter_per_epoch *
                                            (end_epoch - begin_epoch)))
        logging.info('lr {}, lr_iters {}'.format(current_lr, lr_iter_discount))
    if pOpt.warmup is not None and pOpt.schedule.begin_epoch == 0:
        if rank == 0:
            logging.info('warmup lr {}, warmup step {}'.format(
                pOpt.warmup.lr, pOpt.warmup.iter))

        lr_scheduler = WarmupMultiFactorScheduler(step=lr_iter_discount,
                                                  factor=lr_factor,
                                                  warmup=True,
                                                  warmup_type=pOpt.warmup.type,
                                                  warmup_lr=pOpt.warmup.lr,
                                                  warmup_step=pOpt.warmup.iter)
    else:
        if len(lr_iter_discount) > 0:
            lr_scheduler = mx.lr_scheduler.MultiFactorScheduler(
                lr_iter_discount, lr_factor)
        else:
            lr_scheduler = None

    # optimizer
    optimizer_params = dict(momentum=pOpt.optimizer.momentum,
                            wd=pOpt.optimizer.wd,
                            learning_rate=current_lr,
                            lr_scheduler=lr_scheduler,
                            rescale_grad=1.0 /
                            (len(pKv.gpus) * kv.num_workers),
                            clip_gradient=pOpt.optimizer.clip_gradient)

    if pKv.fp16:
        optimizer_params['multi_precision'] = True
        optimizer_params['rescale_grad'] /= 128.0

    # train
    mod.fit(train_data=train_data,
            eval_metric=eval_metrics,
            epoch_end_callback=epoch_end_callback,
            batch_end_callback=batch_end_callback,
            kvstore=kv,
            optimizer=pOpt.optimizer.type,
            optimizer_params=optimizer_params,
            initializer=init,
            allow_missing=True,
            arg_params=arg_params,
            aux_params=aux_params,
            begin_epoch=begin_epoch,
            num_epoch=end_epoch)

    logging.info("Training has done")