def _instances(self, file_path: str, manager: Manager, output_queue: Queue) -> Iterator[Instance]:
        """
        Generator that pulls ``Instance``s off ``output_queue`` and yields
        them until every one of the ``num_workers`` worker processes has
        signalled completion by putting its own id on the queue.
        """
        shards = glob.glob(file_path)
        num_shards = len(shards)

        # Size the input queue to hold every (shard x epoch) entry plus one
        # end-of-input sentinel per worker.
        input_queue = manager.Queue(num_shards * self.epochs_per_read + self.num_workers)

        # One shuffled pass over the shards per requested epoch.
        for _ in range(self.epochs_per_read):
            random.shuffle(shards)
            for shard_path in shards:
                input_queue.put(shard_path)

        # A ``None`` per worker tells each one there are no more files.
        for _ in range(self.num_workers):
            input_queue.put(None)

        workers: List[Process] = []
        done_count = 0

        for worker_id in range(self.num_workers):
            logger.info(f"starting worker {worker_id}")
            worker_process = Process(target=_worker,
                                     args=(self.reader, input_queue, output_queue, worker_id))
            worker_process.start()
            workers.append(worker_process)

        # Drain the output queue until every worker has reported in.
        while done_count < self.num_workers:
            item = output_queue.get()
            if isinstance(item, int):
                # An int on the queue is a worker id announcing completion.
                done_count += 1
                logger.info(f"worker {item} finished ({done_count}/{self.num_workers})")
            else:
                # Anything else is an ``Instance``; hand it to the caller.
                yield item

        for worker_process in workers:
            worker_process.join()
        workers.clear()
Пример #2
0
def main(args):
    '''
    For every sentence in the target file, find the most similar sentence in
    a pre-computed source embedding pool (work fanned out across processes)
    and write the pairs to ``args.output_data_path`` as tab-separated lines.

    Data layout at ``source_embedding_path``:
        - [sentence#1, vector#1]
        - [sentence#2, vector#2]
        ...
    '''
    print('> START  ')
    print('> parameter  ')
    for k, v in args._get_kwargs():
        print('> {} : {}'.format(k, v))
    print('')
    print('> Action ')
    # 0. unpack the parameters we actually use
    embedding_type_name = args.embedding_type_name
    target_data_path = args.target_data_path
    ground_data_path = args.ground_data_path
    source_embedding_path = args.source_embedding_path
    number_of_processes = args.gpu_num

    # 1. load the source pool:
    #    [(vector#1, sentence#1), (vector#2, sentence#2), ...]
    source_pool = load_embedding_data(source_embedding_path)
    src_embeddings = [item[0] for item in source_pool]
    src_sentences = [item[1] for item in source_pool]

    # 2. read the target sentences and split them into one chunk per process.
    # BUG FIX: the file handle was previously leaked; use a context manager.
    with open(target_data_path, mode='r', encoding='utf-8') as rdesc:
        target_data_list = [line.strip() for line in rdesc]
    # Never spawn more processes than there are target sentences.
    number_of_processes = min(number_of_processes, len(target_data_list))
    num_of_tasks = len(target_data_list) // number_of_processes
    tasks = [
        target_data_list[i * num_of_tasks:(i + 1) * num_of_tasks]
        for i in range(number_of_processes)
    ]

    # 3. queues: one feeding task chunks to workers, one collecting results.
    tasks_to_accomplish = Manager().Queue()
    tasks_finished = Manager().Queue()
    for task in tasks:
        tasks_to_accomplish.put(task)

    # Each worker encodes its targets and finds the most similar source
    # vector for each one.
    processes = []
    for i in range(number_of_processes):
        p = Process(target=multi_inference,
                    args=(embedding_type_name, tasks_to_accomplish,
                          tasks_finished, i + 1, src_sentences,
                          src_embeddings,))
        processes.append(p)
        p.start()

    for p in processes:
        p.join()

    # Drain the result queue (all workers have joined, so it is complete).
    store_target = []
    while not tasks_finished.empty():
        store_target.append(tasks_finished.get_nowait())

    # Optionally attach the ground-truth sentence to each result row.
    if ground_data_path:
        # BUG FIX: ground-truth file handle was leaked; use a context manager.
        with open(ground_data_path, mode='r', encoding='utf-8') as rdesc:
            gt_data = [line.strip() for line in rdesc]
        for row, ground_truth in zip(store_target, gt_data):
            row.append(ground_truth)

    # Write header plus one tab-separated row per result.
    head_line = ['<target>', '<inference>', '<ground_truth>']
    with open(args.output_data_path, mode='w', encoding='utf-8') as wdesc:
        wdesc.write('\t'.join(head_line))
        wdesc.write('\n')
        for row in store_target:
            wdesc.write('\t'.join(row))
            wdesc.write('\n')

    print('> FINISH - result file : {}'.format(args.output_data_path))
    return True
Пример #3
0
def main():
    """
    Parse command-line arguments and launch one actor-critic training run
    per experiment, each in its own process with a distinct seed.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vac')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--actor_learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--critic_learning_rate', '-clr', type=float)
    parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true')
    parser.add_argument('--num_target_updates', '-ntu', type=int, default=10)
    parser.add_argument('--num_grad_steps_per_target_update', '-ngsptu', type=int, default=10)
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--actor_n_layers', '-l', type=int, default=2)
    parser.add_argument('--critic_n_layers', '-cl', type=int)
    parser.add_argument('--size', '-s', type=int, default=64)
    args = parser.parse_args()

    # Build data/ac_<exp>_<env>_<timestamp> log directory.
    # exist_ok avoids the check-then-create race of the old exists() guard.
    os.makedirs('data', exist_ok=True)
    logdir = 'ac_' + args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join('data', logdir)
    os.makedirs(logdir, exist_ok=True)

    # A non-positive --ep_len means "use the environment's default".
    max_path_length = args.ep_len if args.ep_len > 0 else None

    # The critic falls back to the actor's hyper-parameters when unspecified.
    # BUG FIX: compare against None (the argparse default) so an explicitly
    # passed 0 is not silently replaced.
    if args.critic_learning_rate is None:
        args.critic_learning_rate = args.actor_learning_rate
    if args.critic_n_layers is None:
        args.critic_n_layers = args.actor_n_layers

    processes = []

    for e in range(args.n_experiments):
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        # Bind ``seed`` as a default argument so the closure is immune to
        # late binding should process start ever be deferred.
        def train_func(seed=seed):
            train_AC(
                exp_name=args.exp_name,
                env_name=args.env_name,
                n_iter=args.n_iter,
                gamma=args.discount,
                min_timesteps_per_batch=args.batch_size,
                max_path_length=max_path_length,
                actor_learning_rate=args.actor_learning_rate,
                critic_learning_rate=args.critic_learning_rate,
                num_target_updates=args.num_target_updates,
                num_grad_steps_per_target_update=args.num_grad_steps_per_target_update,
                animate=args.render,
                logdir=os.path.join(logdir, '%d' % seed),
                normalize_advantages=not (args.dont_normalize_advantages),
                seed=seed,
                actor_n_layers=args.actor_n_layers,
                critic_n_layers=args.critic_n_layers,
                size=args.size
                )

        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # Joining here (p.join()) would serialize the experiments.

    for p in processes:
        p.join()
Пример #4
0
def NCS_MP(crates,
           ncs_stepsize,
           masked_models,
           valid,
           corpus,
           acc_constraint,
           orig_fitvalue,
           num_runs=0):
    """
    Search for per-layer pruning rates with Negatively Correlated Search.

    Parameters
    ----------
    crates : list of float
        Initial pruning rate per layer group.
    ncs_stepsize : float
        NCS mutation step size.
    masked_models : list
        One maskable model replica per GPU; element 0 is the reference net.
    valid, corpus :
        Validation data consumed by ``evaluate_lm``.
    acc_constraint : float
        Accuracy constraint handed to the per-individual workers.
    orig_fitvalue :
        Fitness of the unpruned model, supplied by the caller.
    num_runs : int, optional
        Iteration tag used for logging and the saved-model file name.

    Returns
    -------
    tuple
        ``(best_crates, saved_model_name, ref_net)``.
    """
    total_iteration = 100
    # One NCS individual per available GPU (plus the default one).
    popsize = len(other_GPU_IDs) + 1
    __C = edict()
    __C.parameters = {
        'reset_xl_to_pop': False,
        'init_value': crates,
        'stepsize': ncs_stepsize,
        'bounds': [0.1, 0.99999999],
        'ftarget': 0,
        'tmax': total_iteration * popsize,
        'popsize': popsize,
        'best_k': 1
    }
    es = ncs.NCS(__C.parameters)

    start_t = time.time()

    print('***************NCS initialization***************')
    ref_net = masked_models[0]
    # 0.0 represents no parameters have been pruned, so it's original fitness
    ref_net.change_mask(len(crates) * [0.0], apply_MP_on_mask)
    ref_net.apply_mask()
    start_fit = evaluate_lm(ref_net.masked_model, valid, corpus,
                            TEST_BATCH_SIZE)
    orignal_fit = orig_fitvalue
    print('start fit: {}'.format(start_fit))
    print('orig fit: {}'.format(orignal_fit))

    # Evaluate the caller-supplied starting pruning rates.
    ref_net.change_mask(crates, apply_MP_on_mask)
    ref_net.apply_mask()
    tmp_fit = evaluate_lm(ref_net.masked_model, valid, corpus, TEST_BATCH_SIZE)
    print("start init threshold:", crates)
    print('Start sparsity: {}%'.format(ref_net.get_sparsity() * 100))
    # Seed every individual's fitness with the current sparsity.
    es.set_initFitness(
        es.popsize *
        [ref_net.get_sparsity()
         ])  # assume the inital crates store the size of each tensor

    total_time = time.time() - start_t

    print('fit:{}'.format(tmp_fit))
    print('time {}min elapse'.format(total_time / 60.))
    print('***************NCS initialization***************')

    ref_net.clear_cache()
    # Shared-memory tensor through which the worker processes report the
    # best pruning rates found.
    results = {'result_NCS': torch.FloatTensor(crates)}
    results['result_NCS'].share_memory_()

    # Run the NCS individuals in parallel, one process per population member.
    processes = []
    for rank in range(popsize):
        p = Process(target=init_processes,
                    args=(rank, popsize, orignal_fit, acc_constraint,
                          prune_and_eval, valid, corpus, es, masked_models,
                          num_runs, results))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Re-apply the best mask found and measure the final pruned fitness.
    ref_net.change_mask(results['result_NCS'].numpy(), apply_MP_on_mask)
    ref_net.apply_mask()
    best_prune = evaluate_lm(ref_net.masked_model, valid, corpus,
                             TEST_BATCH_SIZE)
    print('Accuracy:{}=>{}, ppl:{}=>{}, sparsity: {}%'.format(
        orignal_fit[1], best_prune[1], orignal_fit[0], best_prune[0],
        ref_net.get_sparsity() * 100.))

    logger.scalar_summary('ncs_start_acc', tmp_fit[1], num_runs)
    logger.scalar_summary('ncs_start_ppl', tmp_fit[0], num_runs)
    logger.scalar_summary('ncs_best_acc', best_prune[1], num_runs)
    logger.scalar_summary('ncs_best_ppl', best_prune[0], num_runs)

    # Always checkpoint the pruned reference network (this replaces a dead
    # ``if True:`` conditional in the original code).
    saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (
        name_mark, num_runs, Model_type, layer_group_type,
        str(acc_constraint))
    torch.save(ref_net, cfg.LM_MODEL_TMP_FOLDER + saved_model_name)

    return results['result_NCS'].numpy(), saved_model_name, ref_net
Пример #5
0
    elif dist.get_rank() == 2:
        input_size = 10
        output_size = 6
    elif dist.get_rank() == 3:
        input_size = 6
        output_size = 2
    elif dist.get_rank() == 4:
        input_size = 2
        output_size = 1
    layer = Layer(input_size=input_size, output_size=output_size)
    fn(layer, size, batcher)


if __name__ == "__main__":
    # Command-line interface: total node count plus shared-filesystem path.
    parser = argparse.ArgumentParser()
    parser.add_argument('-size', type=int, help='input the sum of node')
    parser.add_argument('-path', help='the path fo share file system')
    args = parser.parse_args()
    print("size:" + str(args.size))
    print("path:" + args.path)
    batcher = Batcher()
    # Launch one process per node; each is bootstrapped via init_processes.
    workers = []
    for _ in range(args.size):
        proc = Process(target=init_processes,
                       args=(run, args.path, args.size, batcher))
        proc.start()
        workers.append(proc)
    # Wait for every node to finish.
    for proc in workers:
        proc.join()
    def test_sampling_with_distributed_sampler(self, decoder):
        """
        Run dataset construction inside a PyTorch distributed process group
        (so the distributed sampler is active) and verify that, per epoch,
        the union of clips returned across all ranks matches the expected
        set of (label, clip-tensor) pairs.
        """

        # Make one video with 15 frames and one with 10 frames, producing 3 clips and 2
        # clips respectively.
        num_frames = 10
        fps = 5
        with temp_encoded_video(num_frames=int(num_frames * 1.5), fps=fps) as (
                video_file_name_1,
                data_1,
        ):
            with temp_encoded_video(num_frames=num_frames, fps=fps) as (
                    video_file_name_2,
                    data_2,
            ):
                # delete=False keeps the file on disk, so ``f.name`` remains
                # usable by the child processes after this ``with`` block
                # closes the handle.
                with tempfile.NamedTemporaryFile(delete=False,
                                                 suffix=".txt") as f:
                    f.write(f"{video_file_name_1} 0\n".encode())
                    f.write(f"{video_file_name_2} 1\n".encode())

                total_duration = num_frames / fps
                # _EPS nudges the clip duration just under half the video so
                # clip boundaries don't land exactly on frame edges.
                half_duration = total_duration / 2 - self._EPS

                # Create several processes initialized in a PyTorch distributed process
                # group so that distributed sampler is setup correctly when dataset is
                # constructed.
                num_processes = 2
                processes = []
                return_dict = multiprocessing.Manager().dict()
                for rank in range(num_processes):
                    p = Process(
                        target=run_distributed,
                        args=(
                            rank,
                            num_processes,
                            decoder,
                            half_duration,
                            f.name,
                            return_dict,
                        ),
                    )
                    p.start()
                    processes.append(p)

                for p in processes:
                    p.join()

                # After joining all distributed processes we expect all these label,
                # video pairs to be returned in random order.
                half_frames = num_frames // 2
                expected = {
                    (0, data_1[:, :half_frames]),  # 1/3 clip
                    (0, data_1[:, half_frames:half_frames * 2]),  # 2/3 clip
                    (0, data_1[:, half_frames * 2:]),  # 3/3 clip
                    (1, data_2[:, :half_frames]),  # First half
                    (1, data_2[:, half_frames:]),  # Second half
                }

                # Merge the per-rank results, grouping by epoch key, before
                # comparing against the expected clip set.
                epoch_results = collections.defaultdict(list)
                for v in return_dict.values():
                    for k_2, v_2 in v.items():
                        epoch_results[k_2].extend(v_2)

                assert_unordered_list_compare_true(self, expected,
                                                   epoch_results["epoch_1"])
                assert_unordered_list_compare_true(self, expected,
                                                   epoch_results["epoch_2"])
def NCS_MP(crates,
           ncs_stepsize,
           fields,
           masked_models,
           valid,
           acc_constraint,
           num_runs=0,
           checkpoint=None):
    """
    NCS pruning-rate search for the NMT model.

    Parameters
    ----------
    crates : list of float
        Initial pruning rate per layer group.
    ncs_stepsize : float
        NCS mutation step size.
    fields :
        Dataset fields; temporarily detached from ``valid`` while it is sent
        to the worker processes, then restored.
    masked_models : list
        One maskable model replica per GPU; element 0 is the reference net.
    valid :
        Validation dataset consumed by ``evaluate``.
    acc_constraint : float
        Accuracy constraint handed to the per-individual workers.
    num_runs : int, optional
        Iteration tag used for logging and the saved-model file name.
    checkpoint : dict or None, optional
        When given, the pruned model/generator state dicts are written into
        it and saved to disk.

    Returns
    -------
    tuple
        ``(best_crates, saved_model_name, ref_net)``; ``saved_model_name``
        is None when ``checkpoint`` is None (no file is written).
    """
    # BUG FIX: ``saved_model_name`` was previously assigned only inside the
    # ``checkpoint is not None`` branch yet always returned, which raised
    # NameError whenever no checkpoint was supplied.
    saved_model_name = None
    total_iteration = 100
    # One NCS individual per available GPU (plus the default one).
    popsize = len(other_GPU_IDs) + 1
    __C = edict()
    __C.parameters = {
        'reset_xl_to_pop': False,
        'init_value': crates,
        'stepsize': ncs_stepsize,
        'bounds': [0., 0.95],
        'ftarget': 0,
        'tmax': total_iteration * popsize,
        'popsize': popsize,
        'best_k': 1
    }
    es = ncs.NCS(__C.parameters)

    start_t = time.time()

    print('***************NCS initialization***************')
    ref_net = masked_models[0]
    # 0.0 represents no parameters have been pruned, so it's original fitness
    ref_net.change_mask(len(crates) * [0.0], apply_MP_on_mask)
    ref_net.apply_mask()
    orignal_fit = evaluate(ref_net, valid, fields)
    print('original fit: {}'.format(orignal_fit))

    # Evaluate the caller-supplied starting pruning rates.
    ref_net.change_mask(crates, apply_MP_on_mask)
    ref_net.apply_mask()
    tmp_fit = evaluate(ref_net, valid, fields)
    print("start init threshold:", crates)
    print('Start sparsity: {}%'.format(ref_net.get_sparsity() * 100))
    # Seed every individual's fitness with the current sparsity.
    es.set_initFitness(
        es.popsize *
        [ref_net.get_sparsity()
         ])  # assume the inital crates store the size of each tensor

    total_time = time.time() - start_t

    print('fit:{}'.format(tmp_fit))
    print('time {}min elapse'.format(total_time / 60.))
    print('***************NCS initialization***************')

    ref_net.clear_cache()
    valid.fields = []  # clear fields for send valid among thresholds
    # Shared-memory tensor through which worker processes report the best
    # pruning rates found.
    results = {'result_NCS': torch.FloatTensor(crates)}
    results['result_NCS'].share_memory_()

    # Run the NCS individuals in parallel, one process per population member.
    processes = []
    for rank in range(popsize):
        p = Process(target=init_processes,
                    args=(rank, popsize, orignal_fit, acc_constraint,
                          prune_and_eval, valid, es, masked_models, num_runs,
                          results))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    # Restore the fields, re-apply the best mask, and measure final fitness.
    valid.fields = fields
    ref_net.change_mask(results['result_NCS'].numpy(), apply_MP_on_mask)
    ref_net.apply_mask()
    best_prune = evaluate(ref_net, valid, fields)
    print('Accuracy:{}=>{}, ppl:{}=>{}, sparsity: {}%'.format(
        orignal_fit[1], best_prune[1], orignal_fit[0], best_prune[0],
        ref_net.get_sparsity() * 100.))

    logger.scalar_summary('ncs_start_acc', tmp_fit[1], num_runs)
    logger.scalar_summary('ncs_start_ppl', tmp_fit[0], num_runs)
    logger.scalar_summary('ncs_best_acc', best_prune[1], num_runs)
    logger.scalar_summary('ncs_best_ppl', best_prune[0], num_runs)
    if checkpoint is not None:
        # Strip any DataParallel wrappers before extracting state dicts.
        real_model = (ref_net.masked_model.module if isinstance(
            ref_net.masked_model, nn.DataParallel) else ref_net.masked_model)
        real_generator = (real_model.generator.module if isinstance(
            real_model.generator, nn.DataParallel) else real_model.generator)
        model_state_dict = real_model.state_dict()
        # The generator is stored under its own checkpoint key.
        model_state_dict = {
            k: v
            for k, v in model_state_dict.items() if 'generator' not in k
        }
        generator_state_dict = real_generator.state_dict()
        checkpoint['model'] = model_state_dict
        checkpoint['generator'] = generator_state_dict
        saved_model_name = 'ncs_pruned_model_%s_iteration%s_%s_%s_acc_cons_%s.pt' % (
            name_mark, num_runs, Model_type, layer_group_type,
            str(acc_constraint))
        print("saved model:", saved_model_name)
        torch.save(checkpoint, SAVE_MODEL_TMP_FOLDER + saved_model_name)

    return results['result_NCS'].numpy(), saved_model_name, ref_net
Пример #8
0
def main():
    """
    Parse command-line arguments and launch one policy-gradient training run
    per experiment, each in its own process with a distinct seed.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('env_name', type=str)
    parser.add_argument('--exp_name', type=str, default='vpg')
    parser.add_argument('--render', action='store_true')
    parser.add_argument('--discount', type=float, default=1.0)
    parser.add_argument('--n_iter', '-n', type=int, default=100)
    parser.add_argument('--batch_size', '-b', type=int, default=1000)
    parser.add_argument('--ep_len', '-ep', type=float, default=-1.)
    parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3)
    parser.add_argument('--reward_to_go', '-rtg', action='store_true')
    parser.add_argument('--dont_normalize_advantages',
                        '-dna',
                        action='store_true')
    parser.add_argument('--nn_baseline', '-bl', action='store_true')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--n_experiments', '-e', type=int, default=1)
    parser.add_argument('--n_layers', '-l', type=int, default=2)
    parser.add_argument('--size', '-s', type=int, default=64)
    parser.add_argument('--dir', '-d', type=str, default='test')
    args = parser.parse_args()

    # Build <dir>/<exp>_<env>_<timestamp> log directory.
    # exist_ok avoids the check-then-create race of the old exists() guard.
    os.makedirs(args.dir, exist_ok=True)
    logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime(
        "%d-%m-%Y_%H-%M-%S")
    logdir = os.path.join(args.dir, logdir)
    os.makedirs(logdir, exist_ok=True)

    # A non-positive --ep_len means "use the environment's default".
    max_path_length = args.ep_len if args.ep_len > 0 else None

    processes = []

    for e in range(args.n_experiments):
        seed = args.seed + 10 * e
        print('Running experiment with seed %d' % seed)

        # Bind ``seed`` as a default argument so the closure is immune to
        # late binding should process start ever be deferred.
        def train_func(seed=seed):
            train_PG(exp_name=args.exp_name,
                     env_name=args.env_name,
                     n_iter=args.n_iter,
                     gamma=args.discount,
                     min_timesteps_per_batch=args.batch_size,
                     max_path_length=max_path_length,
                     learning_rate=args.learning_rate,
                     reward_to_go=args.reward_to_go,
                     animate=args.render,
                     logdir=os.path.join(logdir, '%d' % seed),
                     normalize_advantages=not (args.dont_normalize_advantages),
                     nn_baseline=args.nn_baseline,
                     seed=seed,
                     n_layers=args.n_layers,
                     size=args.size)

        p = Process(target=train_func, args=tuple())
        p.start()
        processes.append(p)
        # Joining here (p.join()) would serialize the experiments.

    for p in processes:
        p.join()
Пример #9
0
def train_ai2thor(model, args, rank=0, b=None):
    """
    Train ``model`` with PPO on AI2-THOR tasks for one rank: spawn
    ``args.num_workers`` environment worker processes, run the learner loop
    in this process (wrapped in DistributedDataParallel when
    ``args.world_size > 1``), then join the workers.

    NOTE(review): ``b`` is passed straight through to ``learner``; its
    meaning is defined there — confirm before relying on it.
    """

    # Distinct deterministic seed per rank.
    seed = args.seed + 10000 * rank
    torch.manual_seed(seed)
    np.random.seed(seed)

    # torch.cuda.set_device(rank)
    # One GPU and one X display per rank (the simulator renders to DISPLAY).
    device = torch.device(f'cuda:{rank}')
    os.environ['DISPLAY'] = f':{rank}'

    # Model goes into shared memory so worker processes can run it in place.
    model = model.to(device)
    model.share_memory()

    # Experience buffer; also shared so workers can write rollouts directly.
    storage = PPOBuffer(model.obs_shape,
                        args.steps,
                        args.num_workers,
                        args.state_size,
                        args.gamma,
                        device=device)
    storage.share_memory()

    #torch.multiprocessing.set_start_method('spawn')
    # start multiple processes
    # One gate event per worker; exit_flag is polled by workers to stop.
    ready_to_works = [Event() for _ in range(args.num_workers)]
    exit_flag = Value('i', 0)
    queue = SimpleQueue()

    processes = []
    task_config_file = "config_files/multiMugTaskTrain.json"
    # start workers
    for worker_id in range(args.num_workers):
        p = Process(target=worker,
                    args=(worker_id, model, storage, ready_to_works[worker_id],
                          queue, exit_flag, task_config_file))
        p.start()
        processes.append(p)

    # start trainer
    train_params = {
        "epochs": args.epochs,
        "steps": args.steps,
        "world_size": args.world_size,
        "num_workers": args.num_workers
    }
    ppo_params = {
        "clip_param": args.clip_param,
        "train_iters": args.train_iters,
        "mini_batch_size": args.mini_batch_size,
        "value_loss_coef": args.value_loss_coef,
        "entropy_coef": args.entropy_coef,
        "rnn_steps": args.rnn_steps,
        "lr": args.lr,
        "max_kl": args.max_kl
    }

    # DDP is initialized AFTER the workers are started, so the workers hold
    # the raw (non-wrapped) shared model.
    distributed = False
    if args.world_size > 1:
        distributed = True
        # Initialize Process Group, distributed backend type
        dist_backend = 'nccl'
        # Url used to setup distributed training
        dist_url = "tcp://127.0.0.1:23456"
        print("Initialize Process Group... pid:", os.getpid())
        dist.init_process_group(backend=dist_backend,
                                init_method=dist_url,
                                rank=rank,
                                world_size=args.world_size)
        # Make model DistributedDataParallel
        model = DistributedDataParallel(model,
                                        device_ids=[rank],
                                        output_device=rank)

    # Blocks until training completes; the learner signals workers to exit
    # (presumably via exit_flag — confirm in learner's implementation).
    learner(model, storage, train_params, ppo_params, ready_to_works, queue,
            exit_flag, rank, distributed, b)

    for p in processes:
        # NOTE(review): this prints before join() actually returns.
        print("process ", p.pid, " joined")
        p.join()
Пример #10
0
class MultiprocessIterator(DataIterator):
    """
    Wraps another `DataIterator` and uses it to generate tensor dicts
    using multiple processes.

    # Parameters

    base_iterator : `DataIterator`
        The `DataIterator` for generating tensor dicts. It will be shared among
        processes, so it should not be stateful in any way.
    num_workers : `int`, optional (default = 1)
        The number of processes used for generating tensor dicts.
    output_queue_size : `int`, optional (default = 1000)
        The size of the output queue on which tensor dicts are placed to be consumed.
        You might need to increase this if you're generating tensor dicts too quickly.
    """

    def __init__(
        self, base_iterator: DataIterator, num_workers: int = 1, output_queue_size: int = 1000
    ) -> None:

        super().__init__()
        self.num_workers = num_workers
        self.batch_size = base_iterator._batch_size
        self.output_queue_size = output_queue_size

        # These two options make the iterator stateful, which means it can't be shared
        # across multiple processes.
        if base_iterator._cache_instances:
            raise ConfigurationError("cannot use Multiprocess iterator with cache_instances")
        if base_iterator._instances_per_epoch:
            raise ConfigurationError("cannot use instances_per_epoch with Multiprocess iterator")

        self.iterator = base_iterator

        # Live worker processes and the optional queue-feeding process are
        # tracked here so ``__del__`` can terminate any strays.
        self.processes: List[Process] = []
        self.queuer: Optional[Process] = None

    def _create_batches(self, instances: Iterable[Instance], shuffle: bool) -> Iterable[Batch]:
        # Batching happens inside the worker processes via the wrapped
        # iterator, so this hook must never be invoked directly.
        raise RuntimeError("MultiprocessIterator doesn't use create_batches")

    def index_with(self, vocab: Vocabulary):
        """Delegate vocabulary indexing to the wrapped iterator."""
        self.iterator.index_with(vocab)

    def _call_with_instances(
        self, instances: Iterable[Instance], num_epochs: int, shuffle: bool
    ) -> Iterator[TensorDict]:
        """
        Yield tensor dicts for a plain iterable of instances: one queuer
        process feeds ``input_queue``; ``num_workers`` worker processes
        consume it and place tensor dicts on ``output_queue``.
        """
        # JoinableQueue needed here as sharing tensors across processes
        # requires that the creating process not exit prematurely.
        output_queue = JoinableQueue(self.output_queue_size)
        input_queue = Queue(self.output_queue_size * self.batch_size)

        # Start process that populates the queue.
        self.queuer = Process(
            target=_queuer, args=(instances, input_queue, self.num_workers, num_epochs)
        )
        self.queuer.start()

        # Start the tensor-dict workers.
        for i in range(self.num_workers):
            args = (input_queue, output_queue, self.iterator, shuffle, i)
            process = Process(target=_create_tensor_dicts_from_queue, args=args)
            process.start()
            self.processes.append(process)

        # Workers signal completion by putting their (int) worker id on the
        # output queue; anything else on the queue is a tensor dict.
        num_finished = 0
        while num_finished < self.num_workers:
            item = output_queue.get()
            output_queue.task_done()
            if isinstance(item, int):
                num_finished += 1
                logger.info(f"worker {item} finished ({num_finished} / {self.num_workers})")
            else:
                yield item

        for process in self.processes:
            process.join()
        self.processes.clear()

        if self.queuer is not None:
            self.queuer.join()
            self.queuer = None

    def _call_with_qiterable(
        self, qiterable: QIterable, num_epochs: int, shuffle: bool
    ) -> Iterator[TensorDict]:
        """
        Yield tensor dicts for a ``QIterable`` source, restarting the source
        and a fresh set of worker processes for each epoch.
        """
        # JoinableQueue needed here as sharing tensors across processes
        # requires that the creating tensor not exit prematurely.
        output_queue = JoinableQueue(self.output_queue_size)

        for _ in range(num_epochs):
            qiterable.start()

            # Start the tensor-dict workers.
            for i in range(self.num_workers):
                args = (qiterable, output_queue, self.iterator, shuffle, i)
                process = Process(target=_create_tensor_dicts_from_qiterable, args=args)
                process.start()
                self.processes.append(process)

            # Same protocol as _call_with_instances: an int on the queue is a
            # worker id announcing completion.
            num_finished = 0
            while num_finished < self.num_workers:
                item = output_queue.get()
                output_queue.task_done()
                if isinstance(item, int):
                    num_finished += 1
                    logger.info(f"worker {item} finished ({num_finished} / {self.num_workers})")
                else:
                    yield item

            for process in self.processes:
                process.join()
            self.processes.clear()

            qiterable.join()

    def __call__(
        self, instances: Iterable[Instance], num_epochs: int = None, shuffle: bool = True
    ) -> Iterator[TensorDict]:
        """
        Dispatch to the ``QIterable`` or plain-iterable implementation.
        Raises ``ConfigurationError`` when ``num_epochs`` is None.
        """
        # If you run it forever, the multiprocesses won't shut down correctly.
        # TODO(joelgrus) find a solution for this
        if num_epochs is None:
            raise ConfigurationError(
                "Multiprocess Iterator must be run for a fixed number of epochs"
            )

        if isinstance(instances, QIterable):
            return self._call_with_qiterable(instances, num_epochs, shuffle)
        else:
            return self._call_with_instances(instances, num_epochs, shuffle)

    def __del__(self) -> None:
        """
        Terminate processes if the user hasn't joined implicitly by consuming
        all the tensors. This is necessary as leaving stray processes running
        can corrupt shared state. In brief, we've observed shared memory
        counters being reused (when the memory was free from the perspective of
        the parent process) while the stray workers still held a reference to
        them.

        For a discussion of using destructors in Python in this manner, see
        https://eli.thegreenplace.net/2009/06/12/safely-using-destructors-in-python/.
        """
        for process in self.processes:
            process.terminate()

        if self.queuer is not None:
            self.queuer.terminate()