Example #1
def cast_data(header, tablename, data):
    typedict = get_typedict(tablename)

    # Build one caster per column, in header order.
    type_casters = []
    for column in header:
        sql_type = typedict[column]
        if sql_type == text_type:
            # Avoid naming the lambda parameter "str": it would shadow the builtin.
            type_casters.append(lambda value: value.encode('UTF-8'))
            # type_casters.append(lambda value: value)  # pass-through alternative
        elif sql_type == int_type:
            type_casters.append(int)
        elif sql_type == date_type:
            type_casters.append(timestamp_parser.parse)

    log('casting data for ' + str(len(data)) + " rows")

    def cast_line(dataln):
        return [type_casters[col_id](value)
                for col_id, value in enumerate(dataln)]

    tpool = Pool(processes=6)
    ret = tpool.map(cast_line, data)
    tpool.close()
    return ret
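The closure passed to tpool.map above only serializes under dill-based pools such as multiprocess or pathos; with the standard-library multiprocessing.Pool, the worker typically has to live at module level. A minimal sketch of that variant, using hypothetical module-level casters:

from multiprocessing import Pool

# Hypothetical module-level casters: one callable per column.
CASTERS = [int, str.upper]

def cast_line(row):
    # Apply the matching caster to each column of the row.
    return [caster(value) for caster, value in zip(CASTERS, row)]

if __name__ == '__main__':
    rows = [("1", "abc"), ("2", "def")]
    with Pool(processes=2) as pool:
        print(pool.map(cast_line, rows))  # [[1, 'ABC'], [2, 'DEF']]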
Example #2
def process_experiment(_experiment, _overwrite=False):
    _arguments = [(_experiment, int(_series.split('_')[1]), _overwrite)
                  for _series in paths.image_files(paths.serieses(_experiment))
                  ]
    _p = Pool(CPUS_TO_USE)
    _p.starmap(process_series, _arguments)
    _p.close()
Example #3
def process_experiment(_experiment, _overwrite=False):
    _arguments = []
    for _tuple in load.experiment_groups_as_tuples(_experiment):
        _experiment, _series_id, _group = _tuple
        _arguments.append((_experiment, _series_id, _group, _overwrite))

    _p = Pool(CPUS_TO_USE)
    _p.starmap(process_group, _arguments)
    _p.close()
Example #4
    def runMultiProcessTrajectories(self, repeat):
        pool = Pool(processes=len(self.posIni))
        result = pool.map(partial(self.runNtrajectory, repeat=repeat),
                          [(x, y) for x, y in self.posIni])
        pool.close()
        pool.join()
        meanCost, meanTraj = 0., 0.
        for Cost, traj in result:
            meanCost += Cost
            meanTraj += traj
        size = len(result)
        return meanCost / size, meanTraj / size
Example #5
    def runMultiProcessTrajectories(self, repeat):
        pool = Pool(processes=len(self.posIni))
        result = pool.map(partial(self.nTraj, repeat=repeat),
                          [(x, y) for x, y in self.posIni])
        pool.close()
        pool.join()
        meanCost, meanTraj = 0, 0
        for CostCMAES, traj in result:
            meanCost += CostCMAES
            meanTraj += traj
        size = len(result)
        return meanCost / size, meanTraj / size
Example #6
    def filter(dirty_data):
        log("starting filter")
        tpool = Pool(processes=cpus)
        ret = []
        log("filtering deleted and not english")
        for line in tpool.map(Filter.__is_not_deleted_or_not_non_english,
                              dirty_data):
            if line[1]:
                ret.append(line[0])

        def clean_links_and_punctuation(comment):
            words = comment.split(" ")
            words = list(map(Filter.__filter_links, words))
            return " ".join(words)  # rejoin the filtered words with single spaces

        log("filtering links and punctuation")
        ret = tpool.map(clean_links_and_punctuation, ret)
        tpool.close()
        log("filter done")
        return ret
Example #7
def connect_structures_find_saddles(sequence, structure_list):
    pairs = {}
    #fc = RNA.fold_compound(sequence)
    fp_pool = Pool(Max_Threads)
    res_list=[]
    for i, se_1 in enumerate(structure_list):
        for j in range(i+1, len(structure_list)):
            se_2 = structure_list[j]
            a = fp_pool.apply_async(find_saddle, args=(se_1.Structure, se_2.Structure, i, j))
            res_list.append(a)
            #saddle_energy_dcal = fc.path_findpath_saddle(se_1.Structure, se_2.Structure)
            #saddle_energy_kcal = saddle_energy_dcal / 100.0
            #pairs[(i,j)] = saddle_energy_kcal
            #pairs[(j,i)] = saddle_energy_kcal
    fp_pool.close()
    
    for a in res_list:
        i, j, saddle_energy_kcal = a.get()
        pairs[(i,j)] = saddle_energy_kcal
        pairs[(j,i)] = saddle_energy_kcal
        
        
    # get lowest saddle for each structure that ends in a structure with lower energy than the first structure.
    minimal_saddle_list = []
    for i in range(0, len(structure_list)):
        se_1 = structure_list[i]
        min_saddle_energy = sys.maxsize
        tree_neighbor = None
        for j in range(0, len(structure_list)):
            if i == j:
                continue
            se_2 = structure_list[j]
            saddle_energy = pairs[(i,j)]
            if saddle_energy <= min_saddle_energy and se_2.Energy < se_1.Energy:
                min_saddle_energy = saddle_energy
                tree_neighbor = j
        if tree_neighbor is None:  # it could be the root.
            tree_neighbor = -1
        minimal_saddle_list.append((i, tree_neighbor, min_saddle_energy))
    return minimal_saddle_list
Example #8
def save_word_error_parallel(results_file, output_file, X_test, y_test):
    with open(results_file, 'r') as f:
        lines = f.readlines()

    pool = Pool(4)  # 4 worker processes; adjust to roughly half your CPU count
    f_vals = []

    # pack one tuple per line - (line, index, X_test, y_test, verbose)
    data = [(l, i, X_test, y_test, True) for i, l in enumerate(lines)]

    # imap keeps results in input order while processing in parallel
    accuracies = pool.imap(process_line, data)

    # no further tasks will be submitted; results are consumed below
    pool.close()

    for acc in accuracies:
        f_vals.append(str(acc) + "\n")

    with open(output_file, 'w') as f:
        f.writelines(f_vals)
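As the comments note, Pool.imap yields results in input order even though the work itself runs in parallel, while imap_unordered yields them as workers finish. A small self-contained sketch of the difference, using a toy slow_square worker:

from multiprocessing import Pool
import time

def slow_square(x):
    # Later inputs finish sooner, which makes the ordering difference visible.
    time.sleep(0.1 * (5 - x))
    return x * x

if __name__ == '__main__':
    with Pool(4) as pool:
        print(list(pool.imap(slow_square, range(5))))            # [0, 1, 4, 9, 16]
        print(list(pool.imap_unordered(slow_square, range(5))))  # completion order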
Example #9
        neat = NEAT(nrOfOrgamisms, 1890, 4)

    pyplot.show(block=False)
    
    stillRunning = True
    while stillRunning:
        results = {}
        # randomPhenotype = random.choice(neat.phenotypes)
        # randomPhenotype.toDraw = True

        pool = Pool(nrOfInstances)
        finishedIndex = multiprocessing.Manager().Value('i', 0)
        for i, phenotype in enumerate(neat.phenotypes):
            results[i] = pool.apply_async(testOrganism, (phenotype, instances, finishedIndex, len(neat.phenotypes)))
            # results[i] = pool.apply_async(LogExceptions(testOrganism), (phenotype, instances, finishedIndex, len(neat.phenotypes)))
        pool.close()
        pool.join()

        distances = [result.get()[0] for result in results.values()]
        fitnessScores = [result.get()[1] for result in results.values()]

        for p, d in zip(neat.phenotypes, distances):
            p.genome.distance = d

        print("")
        print("-----------------------------------------------------")
        print("Running epoch")
        neat.phenotypes = neat.epoch(fitnessScores)
        print(distances)
        print("Generation: " + str(neat.generation))
        print("Number of species: " + str(len(neat.species)))
Example #10
        costAll, trajTimeAll = np.zeros(repeat), np.zeros(repeat)
        for i in range(repeat):
            costAll[i], trajTimeAll[i]  = self.runOneTrajectoryOpti(x, y) 
        meanCost = np.mean(costAll)
        meanTrajTime = np.mean(trajTimeAll)
        self.costStore.append([x, y, meanCost])
        self.trajTimeStore.append([x, y, meanTrajTime])
        return meanCost, meanTrajTime
    
    def mapableTrajecrtoryFunction(self, x, y, useless):
        return self.runOneTrajectory(x, y)
    
    def runNtrajectoryMulti(self, xy, repeat):
        # Tuple parameters in the signature are Python 2 only syntax; unpack explicitly.
        x, y = xy
        pool = Pool(processes=4)
        result = pool.map(partial(self.mapableTrajecrtoryFunction, x, y), range(repeat))
        pool.close()
        pool.join()
        meanCost, meanTraj = 0., 0.
        for Cost, traj in result:
            meanCost += Cost
            meanTraj += traj
        size = len(result)
        return meanCost / size, meanTraj / size

    
    def runOneTrajectoryOpti(self, x, y):
        #self.tm.saveTraj = True
        cost, trajTime, lastX = self.tm.runTrajectoryOpti(x, y)
        #cost, trajTime, lastX = self.tm.runTrajectoryOpti(x, y)
        #print "Exp local x y cost : ", x, y, cost
        if lastX != -1000:
Example #11
def pre_stat(paras, df_microsatellites):
    # reference=paras["reference"]
    path_pre_stat = paras["output"].rstrip("/") + "/" + get_value(
        "case") + ".stat"
    path_pre_stat_tmp = paras["output_tmp"].rstrip("/") + "/" + get_value(
        "case") + ".stat"
    file_all_stat = open(path_pre_stat, "w")
    file_all_stat.write("\t".join([
        "repeat_unit_length", "repeat_times", "num_forward", "num_reversed",
        "this_repeat_mean_mean", "this_repeat_mean_std",
        "this_repeat_std_mean", "this_repeat_std_std", "forward_prefix",
        "forward_ms", "forward_suffix", "reversed_prefix", "reversed_ms",
        "reversed_suffix"
    ]) + "\n")

    df_microsatellites_download_sample = microsatellites_sampling(
        df_microsatellites, paras)

    for repeat_unit, info in df_microsatellites_download_sample.items():
        for repeat_times, ms_infos in info.items():
            logger.info("Processing   repeat unit: " + str(repeat_unit) +
                        " repeat times: " + str(repeat_times))
            infos = []
            for id, info in ms_infos.iterrows():
                info["reference"] = paras["reference"]
                info["prefix_len"] = paras["prefix_len"]
                info["suffix_len"] = paras["suffix_len"]
                infos.append(info)
            pool = Pool(processes=paras["threads"])
            res_infos = pool.map(process_one_ms, infos)
            pool.close()
            pool.join()
            suffix_str = "." + str(repeat_unit) + "." + str(repeat_times)
            file = open(path_pre_stat_tmp + suffix_str + ".repeat", "w")
            this_repeat_means = []
            this_repeat_stds = []
            num_forward = 0
            num_reversed = 0
            prefix_forward = []
            suffix_forward = []
            ms_forward = []
            prefix_reversed = []
            suffix_reversed = []
            ms_reversed = []
            for res in res_infos:
                if None not in res:
                    file.write("\t".join(map(str, res[:-2])) + "\n")
                    this_repeat_means.append(res[3])
                    this_repeat_stds.append(res[4])
                    prefix_forward.extend(res[-1]["prefix_forward"])
                    suffix_forward.extend(res[-1]["suffix_forward"])
                    ms_forward.extend(res[-1]["ms_forward"])
                    prefix_reversed.extend(res[-1]["prefix_reversed"])
                    suffix_reversed.extend(res[-1]["suffix_reversed"])
                    ms_reversed.extend(res[-1]["ms_reversed"])
                    num_forward += res[-1]["num_forward"]
                    num_reversed += res[-1]["num_reversed"]

            file.close()
            if num_forward + num_reversed < 2: continue
            this_repeat_mean_mean = np.mean(this_repeat_means)
            this_repeat_mean_std = np.std(this_repeat_means)
            this_repeat_std_mean = np.mean(this_repeat_stds)
            this_repeat_std_std = np.std(this_repeat_stds)
            pd.concat(
                [
                    pd.DataFrame(
                        [np.nanmean(np.array(prefix_forward), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_forward), axis=0)]),
                    pd.DataFrame(
                        [np.nanmean(np.array(suffix_forward), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".forward.qual")
            pd.concat(
                [
                    pd.DataFrame(
                        [np.nanmean(np.array(prefix_reversed), axis=0)]),
                    pd.DataFrame([np.nanmean(np.array(ms_reversed), axis=0)]),
                    pd.DataFrame(
                        [np.nanmean(np.array(suffix_reversed), axis=0)])
                ],
                axis=1,
            ).to_csv(path_pre_stat_tmp + suffix_str + ".reversed.qual")
            forward_prefix = np.nanmean(prefix_forward)
            forward_ms = np.nanmean(ms_forward)
            forward_suffix = np.nanmean(suffix_forward)

            reversed_prefix = np.nanmean(prefix_reversed)
            reversed_ms = np.nanmean(ms_reversed)
            reversed_suffix = np.nanmean(suffix_reversed)
            this_info_list = list(
                map(str, [
                    repeat_unit, repeat_times, num_forward, num_reversed,
                    this_repeat_mean_mean, this_repeat_mean_std,
                    this_repeat_std_mean, this_repeat_std_std, forward_prefix,
                    forward_ms, forward_suffix, reversed_prefix, reversed_ms,
                    reversed_suffix
                ]))
            file_all_stat.write("\t".join(this_info_list) + "\n")
    file_all_stat.close()
    return
Example #12
    def create_test_raw_data(self,
                             ticker_list=None,
                             start_date=None,
                             finish_date=None,
                             folder_prefix=None):
        """Downloads FX tick data from DukasCopy and then dumps each ticker in a separate HDF5 file if a folder is specified.
        If no folder is specified returns a list of DataFrames (note: can be a very large list in memory)

        Parameters
        ----------
        ticker_list : str (list)
            List of FX tickers to download

        start_date : datetime/str
            Start date of FX tick data download

        finish_date : datetime/str
            Finish date of FX tick data download

        folder_prefix : str
            Folder to dump everything

        Returns
        -------
        DataFrame (list)
        """

        from findatapy.market import MarketDataRequest, MarketDataGenerator, Market

        if start_date is None and finish_date is None:
            finish_date = datetime.datetime.utcnow().date() - timedelta(
                days=30)
            start_date = finish_date - timedelta(days=30 * 15)

            start_date = self._compute_random_date(start_date, finish_date)
            finish_date = start_date + timedelta(days=90)

        df_list = []
        result = []

        # from multiprocessing.dummy import Pool  # thread-based alternative
        from multiprocess.pool import Pool  # actual new processes
        import time

        # If we don't specify a folder
        if folder_prefix is None:

            mini_ticker_list = self._split_list(ticker_list, 2)

            # Use multiprocess to speed up the download
            for mini in mini_ticker_list:
                pool = Pool(processes=2)

                for ticker in mini:
                    time.sleep(1)
                    self.logger.info("Loading " + ticker)
                    md_request = MarketDataRequest(
                        start_date=start_date,
                        finish_date=finish_date,
                        category='fx',
                        tickers=ticker,
                        fields=['bid', 'ask', 'bidv', 'askv'],
                        data_source='dukascopy',
                        freq='tick')

                    # self._download(md_request)
                    result.append(
                        pool.apply_async(self._download,
                                         args=(
                                             md_request,
                                             folder_prefix,
                                         )))

                pool.close()
                pool.join()

        else:
            market = Market(market_data_generator=MarketDataGenerator())

            for ticker in ticker_list:

                md_request = MarketDataRequest(
                    start_date=start_date,
                    finish_date=finish_date,
                    category='fx',
                    tickers=ticker,
                    fields=['bid', 'ask', 'bidv', 'askv'],
                    data_source='dukascopy',
                    freq='tick')

                df = market.fetch_market(md_request=md_request)

                df.columns = ['bid', 'ask', 'bidv', 'askv']

                df['venue'] = 'dukascopy'
                df['ticker'] = ticker

                # print(df)

                if folder_prefix is not None:
                    self.dump_hdf5_file(df,
                                        folder_prefix + "_" + ticker + ".h5")
                    # df.to_csv(folder_prefix + "_" + ticker + ".csv") # CSV files can be very large, so try to avoid
                else:
                    df_list.append(df)

            return df_list
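The pooled branch above follows a common throttled-download pattern: submit each request with apply_async, then close and join the pool per chunk so only a couple of downloads run at a time. A stripped-down sketch of that pattern, with a hypothetical fetch function standing in for self._download (no findatapy dependency):

from multiprocessing import Pool
import time

def fetch(ticker, folder_prefix):
    # Hypothetical stand-in for the real download call.
    time.sleep(0.1)
    return ticker, folder_prefix

def download_in_chunks(tickers, folder_prefix, chunk_size=2):
    results = []
    for start in range(0, len(tickers), chunk_size):
        chunk = tickers[start:start + chunk_size]
        pool = Pool(processes=chunk_size)
        async_results = [pool.apply_async(fetch, args=(t, folder_prefix))
                         for t in chunk]
        pool.close()   # no more tasks for this chunk
        pool.join()    # wait until the whole chunk has finished
        results.extend(r.get() for r in async_results)
    return results

if __name__ == '__main__':
    print(download_in_chunks(['EURUSD', 'GBPUSD', 'USDJPY'], '/tmp/ticks'))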
Example #13
class ParallelBackend:
    """
    The unified backend for parallelization.
    
    Currently, we support `multiprocess`, `dask`, `sharedmem` and `loky`.
    `multiprocess` usually has better performance on single-node machines, while
    `dask` can be used for multi-node parallelization. Note the following known
    issues: when used for sampling, (1) `dask` and `loky` do not respect the
    global bayesfast random seed; (2) `sharedmem` may not display the progress
    messages correctly (multiple messages in the same line); (3) `loky` does not
    print any messages at all in Jupyter. So we recommend using the default
    `multiprocess` backend when possible.
    
    Parameters
    ----------
    backend : None, int, Pool, Client or MapReduce, optional
        The backend for parallelization. If `None` or `int`, will be passed as
        the `processes` argument to initialize a Pool in a with context. Set to
        `None` by default.
    """
    def __new__(cls, backend=None):
        if isinstance(backend, ParallelBackend):
            return backend
        else:
            return super(ParallelBackend, cls).__new__(cls)

    def __init__(self, backend=None):
        if isinstance(backend, ParallelBackend):
            return
        self.backend = backend

    def __enter__(self):
        if self.backend is None or isinstance(self.backend, int):
            self._backend_activated = Pool(self.backend)
        elif HAS_SHAREDMEM and isinstance(self.backend, MapReduce):
            self.backend.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        if self.backend is None or isinstance(self.backend, int):
            self._backend_activated.close()
            self._backend_activated.join()
            self._backend_activated = None
        elif HAS_SHAREDMEM and isinstance(self.backend, MapReduce):
            self.backend.__exit__(exc_type, exc_val, exc_tb)

    @property
    def backend(self):
        return self._backend

    @backend.setter
    def backend(self, be):
        if be is None or (isinstance(be, int) and be > 0):
            pass
        elif isinstance(be, Pool):
            pass
        elif HAS_RAY and isinstance(be, RayPool):
            pass
        elif HAS_DASK and isinstance(be, Client):
            pass
        elif HAS_SHAREDMEM and isinstance(be, MapReduce):
            pass
        elif HAS_LOKY and isinstance(be,
                                     reusable_executor._ReusablePoolExecutor):
            pass
        # elif be == 'serial':
        #     pass
        else:
            raise ValueError('invalid value for backend.')
        self._backend_activated = be
        self._backend = be

    @property
    def backend_activated(self):
        return self._backend_activated

    @property
    def kind(self):
        if self.backend is None or isinstance(self.backend, int):
            return 'multiprocess'
        elif isinstance(self.backend, Pool):
            return 'multiprocess'
        elif HAS_RAY and isinstance(self.backend, RayPool):
            return 'ray'
        elif HAS_DASK and isinstance(self.backend, Client):
            return 'dask'
        elif HAS_SHAREDMEM and isinstance(self.backend, MapReduce):
            return 'sharedmem'
        elif HAS_LOKY and isinstance(self.backend,
                                     reusable_executor._ReusablePoolExecutor):
            return 'loky'
        # elif self.backend == 'serial':
        #     return 'serial'
        else:
            raise RuntimeError('unexpected value for self.backend.')

    def map(self, fun, *iters):
        if self.backend_activated is None:
            raise RuntimeError(
                'the backend is not activated. Please use it in '
                'a with context.')
        elif isinstance(self.backend_activated, Pool):
            return self.backend_activated.starmap(fun, zip(*iters))
        elif HAS_RAY and isinstance(self.backend_activated, RayPool):
            return self.backend_activated.starmap(fun, list(zip(*iters)))
            # https://github.com/ray-project/ray/issues/11451
            # that's why I need to explicitly convert it to a list for now
        elif HAS_DASK and isinstance(self.backend_activated, Client):
            return self.gather(self.backend_activated.map(fun, *iters))
        elif HAS_SHAREDMEM and isinstance(self.backend_activated, MapReduce):
            return self.backend_activated.map(fun,
                                              list(zip(*iters)),
                                              star=True)
        elif HAS_LOKY and isinstance(self.backend_activated,
                                     reusable_executor._ReusablePoolExecutor):
            return self.gather(self.backend_activated.map(fun, *iters))
        # elif self.backend_activated == 'serial':
        #     return [deepcopy(fun)(*[i[j] for i in iters]) for j in range(l)]
        else:
            raise RuntimeError('unexpected value for self.backend_activated.')

    def map_async(self, fun, *iters):
        if self.backend_activated is None:
            raise RuntimeError(
                'the backend is not activated. Please use it in '
                'a with context.')
        elif isinstance(self.backend_activated, Pool):
            return self.backend_activated.starmap_async(fun, zip(*iters))
        elif HAS_RAY and isinstance(self.backend_activated, RayPool):
            return self.backend_activated.starmap_async(fun, list(zip(*iters)))
        elif HAS_DASK and isinstance(self.backend_activated, Client):
            return self.backend_activated.map(fun, *iters)
        elif HAS_SHAREDMEM and isinstance(self.backend_activated, MapReduce):
            warnings.warn(
                'sharedmem does not support map_async. Using map '
                'instead.', RuntimeWarning)
            return self.backend_activated.map(fun,
                                              list(zip(*iters)),
                                              star=True)
        elif HAS_LOKY and isinstance(self.backend_activated,
                                     reusable_executor._ReusablePoolExecutor):
            return self.backend_activated.map(fun, *iters)
        # elif self.backend_activated == 'serial':
        #     return self.map(fun, *iters)
        else:
            raise RuntimeError('unexpected value for self.backend_activated.')

    def gather(self, async_result):
        if self.backend_activated is None:
            raise RuntimeError(
                'the backend is not activated. Please use it in '
                'a with context.')
        elif isinstance(self.backend_activated, Pool):
            return async_result.get()
        elif HAS_RAY and isinstance(self.backend_activated, RayPool):
            return async_result.get()
        elif HAS_DASK and isinstance(self.backend_activated, Client):
            return self.backend_activated.gather(async_result)
        elif HAS_SHAREDMEM and isinstance(self.backend_activated, MapReduce):
            return async_result
        elif HAS_LOKY and isinstance(self.backend_activated,
                                     reusable_executor._ReusablePoolExecutor):
            return list(async_result)
        # elif self.backend_activated == 'serial':
        #     return async_result
        else:
            raise RuntimeError('unexpected value for self.backend_activated.')
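A minimal usage sketch of ParallelBackend in a with context, assuming it can be imported from bayesfast (the exact module path below is an assumption) and that the worker function lives at module level:

from bayesfast.utils.parallel import ParallelBackend  # assumed import path

def add(a, b):
    return a + b

if __name__ == '__main__':
    # backend=2 builds a 2-worker multiprocess Pool on __enter__ and closes it on __exit__.
    with ParallelBackend(2) as pb:
        sums = pb.map(add, [1, 2, 3], [10, 20, 30])  # starmaps over the zipped iterables
    print(sums)  # [11, 22, 33]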
Example #14
        ('SN20_Bleb_fromStart', 14, 0, 1, -235, 30),
        ('SN20_Bleb_fromStart', 14, 0, 2, 120, 230),
        ('SN20_Bleb_fromStart', 14, 0, 3, -230, 105),
        ('SN20_Bleb_fromStart', 14, 0, 4, 205, 35),
        ('SN20_Bleb_fromStart', 14, 1, 2, 110, -180),
        ('SN20_Bleb_fromStart', 14, 1, 3, -220, 25),
        ('SN20_Bleb_fromStart', 14, 1, 4, -150, 0),
        ('SN20_Bleb_fromStart', 14, 2, 3, 160, -130),
        ('SN20_Bleb_fromStart', 14, 2, 4, -75, 210),
        ('SN20_Bleb_fromStart', 14, 3, 4, 220, 105),
        ('SN20_Bleb_fromStart', 15, 0, 1, 0, 235),
        ('SN20_Bleb_fromStart', 16, 0, 1, 0, -225),
        ('SN20_Bleb_fromStart', 16, 0, 2, -80, 130),
        ('SN20_Bleb_fromStart', 16, 1, 2, -60, -120),
        ('SN20_Bleb_fromStart', 17, 0, 2, -180, 0),
        ('SN20_Bleb_fromStart', 17, 0, 3, 155, 0),
        ('SN20_Bleb_fromStart', 17, 1, 2, -225, -115),
        ('SN20_Bleb_fromStart', 17, 1, 3, -135, 20),
        ('SN20_Bleb_fromStart', 18, 0, 1, -110, -175),
        ('SN20_Bleb_fromStart', 19, 0, 1, 70, -150),
        ('SN20_Bleb_fromStart', 19, 1, 2, -100, 115),
        ('SN20_Bleb_fromStart', 19, 1, 3, 60, -170),
        ('SN20_Bleb_fromStart', 19, 2, 3, 135, 185),
        ('SN20_Bleb_fromStart', 20, 0, 1, 175, 20),
        ('SN20_Bleb_fromStart', 20, 0, 2, 205, -60),
        ('SN20_Bleb_fromStart', 20, 1, 2, -135, 80),
    ]
    _p = Pool(CPUS_TO_USE)
    _answers = _p.starmap(process_fake_following, _arguments)
    _p.close()
Example #15
class DataPipelineWithReward:
    """
    Creates a data pipeline that also outputs discounted reward.
    """
    def __init__(self, observables: List[AgentHandler],
                 actionables: List[AgentHandler],
                 mission_handlers: List[AgentHandler], nsteps, gamma,
                 data_directory, num_workers, worker_batch_size,
                 min_size_to_dequeue):
        """
        Sets up a tensorflow dataset to load videos from a given data directory.
        :param data_directory: the directory of the data to be loaded, eg: 'minerl.herobraine_parse/output/rendered/'
        """

        self.data_dir = data_directory
        self.observables = observables
        self.actionables = actionables
        self.mission_handlers = mission_handlers
        # self.vectorizer = vectorizer

        self.number_of_workers = num_workers
        self.worker_batch_size = worker_batch_size
        self.size_to_dequeue = min_size_to_dequeue
        self.nsteps = nsteps
        self.gamma = gamma

        self.processing_pool = Pool(self.number_of_workers)
        self.m = multiprocessing.Manager()
        self.data_queue = self.m.Queue(maxsize=self.size_to_dequeue //
                                       self.worker_batch_size * 4)

        pool_size = self.size_to_dequeue * 4
        self.random_queue = PriorityQueue(maxsize=pool_size)

    def batch_iter(self, batch_size):
        """
        Returns a generator for iterating through batches of the dataset.
        :param batch_size:
        :param number_of_workers:
        :param worker_batch_size:
        :param size_to_dequeue:
        :return:
        """
        logger.info("Starting batch iterator on {}".format(self.data_dir))
        data_list = self._get_all_valid_recordings(self.data_dir)

        load_data_func = self._get_load_data_func(self.data_queue, self.nsteps,
                                                  self.worker_batch_size,
                                                  self.mission_handlers,
                                                  self.observables,
                                                  self.actionables, self.gamma)
        map_promise = self.processing_pool.map_async(load_data_func, data_list)

        # We map the files -> load_data -> batch_pool -> random shuffle -> yield.
        # batch_pool = []
        start = 0
        incr = 0
        while (not map_promise.ready() or not self.data_queue.empty()
               or not self.random_queue.empty()):
            #print("d: {} r: {}".format(data_queue.qsize(), random_queue.qsize()))

            while not self.data_queue.empty() and not self.random_queue.full():
                for ex in self.data_queue.get():
                    if not self.random_queue.full():
                        r_num = np.random.rand(1)[0] * (1 - start) + start
                        self.random_queue.put((r_num, ex))
                        incr += 1
                        # print("d: {} r: {} rqput".format(data_queue.qsize(), random_queue.qsize()))
                    else:
                        break

            if incr > self.size_to_dequeue:
                if self.random_queue.qsize() < (batch_size):
                    if map_promise.ready():
                        break
                    else:
                        continue
                batch_with_incr = [
                    self.random_queue.get() for _ in range(batch_size)
                ]

                r1, batch = zip(*batch_with_incr)
                start = 0
                traj_obs, traj_acts, traj_handlers, traj_n_obs, discounted_rewards, elapsed = zip(
                    *batch)

                observation_batch = [
                    HandlerCollection({
                        o: np.asarray(traj_ob[i])
                        for i, o in enumerate(self.observables)
                    }) for traj_ob in traj_obs
                ]
                action_batch = [
                    HandlerCollection({
                        a: np.asarray(traj_act[i])
                        for i, a in enumerate(self.actionables)
                    }) for traj_act in traj_acts
                ]
                mission_handler_batch = [
                    HandlerCollection({
                        m: np.asarray(traj_handler[i])
                        for i, m in enumerate(self.mission_handlers)
                    }) for traj_handler in traj_handlers
                ]
                next_observation_batch = [
                    HandlerCollection({
                        o: np.asarray(traj_n_ob[i])
                        for i, o in enumerate(self.observables)
                    }) for traj_n_ob in traj_n_obs
                ]
                yield observation_batch, action_batch, mission_handler_batch, next_observation_batch, discounted_rewards, elapsed
            # Move on to the next batch bool.
            # Todo: Move to a running pool, sampling as we enqueue. This is basically the random queue impl.
            # Todo: This will prevent the data from getting arbitrarily segmented.
            # batch_pool = []
        try:
            map_promise.get()
        except RuntimeError as e:
            logger.error("Failure in data pipeline: {}".format(e))

        logger.info("Epoch complete.")

    def close(self):
        self.processing_pool.close()
        self.processing_pool.join()

    ############################
    ## PRIVATE METHODS
    #############################

    @staticmethod
    def _get_load_data_func(data_queue, nsteps, worker_batch_size,
                            mission_handlers, observables, actionables, gamma):
        def _load_data(inst_dir):
            recording_path = str(os.path.join(inst_dir, 'recording.mp4'))
            univ_path = str(os.path.join(inst_dir, 'univ.json'))

            try:
                cap = cv2.VideoCapture(recording_path)
                # Load the universal observation stream (univ.json)
                with open(univ_path, 'r') as f:
                    univ = {int(k): v for (k, v) in (json.load(f)).items()}
                    univ = OrderedDict(univ)
                    univ = np.array(list(univ.values()))

                # Walk the video frame by frame
                batches = []
                rewards = []
                frames_queue = Queue(maxsize=nsteps)

                # Loop through the video and construct frames
                # of observations to be sent via the multiprocessing queue
                # in chunks of worker_batch_size to the batch_iter loop.
                frame_num = 0
                while True:
                    ret, frame = cap.read()

                    if not ret or frame_num >= len(univ):
                        break
                    else:
                        #print("Batches {} and worker batch size {}".format(len(batches), self.worker_batch_size))
                        if len(batches) >= worker_batch_size:
                            data_queue.put(batches)
                            batches = []

                        try:
                            # Construct a single observation object.
                            vf = (np.clip(frame[:, :, ::-1], 0, 255))
                            uf = univ[frame_num]

                            frame = {'pov': vf}
                            frame.update(uf)

                            cur_reward = 0
                            for m in mission_handlers:
                                try:
                                    if isinstance(m, RewardHandler):
                                        cur_reward += m.from_universal(frame)
                                except NotImplementedError:
                                    pass
                            rewards.append(cur_reward)

                            #print("Frames queue size {}".format(frames_queue.qsize()))
                            frames_queue.put(frame)
                            if frames_queue.full():
                                next_obs = [
                                    o.from_universal(frame)
                                    for o in observables
                                ]
                                frame = frames_queue.get()
                                obs = [
                                    o.from_universal(frame)
                                    for o in observables
                                ]
                                act = [
                                    a.from_universal(frame)
                                    for a in actionables
                                ]
                                mission = []
                                for m in mission_handlers:
                                    try:
                                        mission.append(m.from_universal(frame))
                                    except NotImplementedError:
                                        mission.append(None)
                                        pass

                                batches.append(
                                    (obs, act, mission, next_obs,
                                     DataPipelineWithReward.
                                     _calculate_discount_rew(
                                         rewards[-nsteps:],
                                         gamma), frame_num + 1 - nsteps))
                        except Exception as e:
                            # If there is some error constructing the batch we just start a new sequence
                            # at the point that the exception was observed
                            logger.warning(
                                "Exception {} caught in the middle of parsing {} in "
                                "a worker of the data pipeline.".format(
                                    e, inst_dir))

                    frame_num += 1

                return batches
            except Exception as e:
                logger.error("Caught Exception")
                raise e

        return _load_data

    @staticmethod
    def _calculate_discount_rew(rewards, gamma):
        total_reward = 0
        for i, rew in enumerate(rewards):
            total_reward += (gamma**i) * rew
        return total_reward

    @staticmethod
    def _get_all_valid_recordings(path):
        directoryList = []

        # return nothing if path is a file
        if os.path.isfile(path):
            return []

        # add dir to directorylist if it contains .txt files
        if len([f for f in os.listdir(path) if f.endswith('.mp4')]) > 0:
            if len([f for f in os.listdir(path) if f.endswith('.json')]) > 0:
                directoryList.append(path)

        for d in os.listdir(path):
            new_path = os.path.join(path, d)
            if os.path.isdir(new_path):
                directoryList += DataPipelineWithReward._get_all_valid_recordings(
                    new_path)

        directoryList = np.array(directoryList)
        np.random.shuffle(directoryList)
        return directoryList.tolist()
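For reference, _calculate_discount_rew above just evaluates the discounted return sum_i gamma**i * r_i over the rewards it is given; a quick check of the static method on a toy reward list, assuming the class above is in scope:

# Toy check of the discounting helper; no pipeline instance is needed for a staticmethod.
rewards = [1.0, 1.0, 1.0]
gamma = 0.9
print(DataPipelineWithReward._calculate_discount_rew(rewards, gamma))  # 1 + 0.9 + 0.81 = 2.71
print(sum(gamma ** i * r for i, r in enumerate(rewards)))              # same value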