Example #1
 def run_multiprocess_dataless(self, func, *args):
     pool = mp.Pool(processes=self.num_of_processes)
     funclist = []
     if len(args) > 0:
         f = pool.apply_async(func, args)
         funclist.append(f)
     else:
         f = pool.apply_async(func)
         funclist.append(f)
     for idx, f in enumerate(funclist):
         print(f)
         print(f.get())
     pool.close()
     pool.join()
Example #2
def ip_convolution(wav, flux, chip_limits, R, fwhm_lim=5.0, numProcs=None):
    """Spectral convolution which allows non-equidistant step values.

    Parameters
    ----------
    wav:
        Wavelength
    flux:
        Flux of spectrum
    chip_limits: List[float, float]
        Wavelength limits of region to return after convolution.
    R:
        Resolution to convolve to.
    fwhm_lim:
        Number of FWHM of convolution kernel to use as edge buffer.
    numProcs: int
        Number of processes to use. Default=None selects cpu_count - 1.
    """
    # Turn into numpy arrays
    wav = np.asarray(wav, dtype="float64")
    flux = np.asarray(flux, dtype="float64")

    wav_chip, flux_chip = wav_selector(wav, flux, chip_limits[0],
                                       chip_limits[1])
    # We need to calculate the fwhm at this value in order to set the starting
    # point for the convolution
    fwhm_min = wav_chip[0] / R  # fwhm at the extremes of vector
    fwhm_max = wav_chip[-1] / R

    # Wide wavelength bin for the resolution_convolution
    wav_min = wav_chip[0] - fwhm_lim * fwhm_min
    wav_max = wav_chip[-1] + fwhm_lim * fwhm_max
    wav_ext, flux_ext = wav_selector(wav, flux, wav_min, wav_max)

    # Multiprocessing part
    if numProcs is None:
        numProcs = mprocess.cpu_count() - 1

    mprocPool = mprocess.Pool(processes=numProcs)

    args_generator = [[wav, R, wav_ext, flux_ext, fwhm_lim]
                      for wav in wav_chip]

    flux_conv_res = np.array(
        mprocPool.map(wrapper_fast_convolve, args_generator))

    mprocPool.close()

    return wav_chip, flux_conv_res
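A minimal call sketch, not from the source: it assumes ip_convolution and its helpers (wav_selector, wrapper_fast_convolve) are importable from the same module, and uses a toy spectrum.

import numpy as np

if __name__ == '__main__':
    wav = np.linspace(2100.0, 2200.0, 5000)                   # toy wavelength grid (nm)
    flux = 1.0 - 0.3 * np.exp(-((wav - 2150.0) / 0.05) ** 2)  # toy absorption line
    wav_out, flux_out = ip_convolution(wav, flux,
                                       chip_limits=[2120.0, 2180.0],
                                       R=50000, fwhm_lim=5.0, numProcs=2)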
Example #3
    def explain(self):
        """Use SHAP values to features' contributions to predict the 
        responsiveness of a gene.
        """
        with mp.Pool(processes=self.k_folds) as pool:
            mp_results = {}

            for k, y_te in enumerate(self.cv_results['preds']):
                n_tfs_te = len(y_te['tf'].unique())
                n_tfs_tr = len(self.tfs) - n_tfs_te

                y_te['tf:gene'] = y_te['tf'] + ':' + y_te['gene']
                te_tg_pairs = y_te['tf:gene'].values
                te_idx = [
                    self.tg_pairs.index(tg_pair) for tg_pair in te_tg_pairs
                ]

                tr_idx = sorted(set(range(len(self.tg_pairs))) - set(te_idx))
                logger.info('Explaining {} genes in fold {}'.format(
                    len(te_idx), k))

                tf_X_tr, tf_X_te = self.tf_X[tr_idx], self.tf_X[te_idx]

                tf_X_tr, tf_X_te = standardize_feat_mtx(
                    tf_X_tr, tf_X_te, 'zscore')
                nontf_X, _ = standardize_feat_mtx(self.nontf_X, None, 'zscore')

                X_tr = np.hstack(
                    [tf_X_tr,
                     np.vstack([nontf_X for i in range(n_tfs_tr)])])
                X_te = np.hstack(
                    [tf_X_te,
                     np.vstack([nontf_X for i in range(n_tfs_te)])])

                bg_idx = np.random.choice(range(X_tr.shape[0]),
                                          BG_GENE_NUM,
                                          replace=False)
                mp_results[k] = pool.apply_async(
                    calculate_tree_shap,
                    args=(
                        self.cv_results['models'][k],
                        X_te,
                        te_tg_pairs,
                        X_tr[bg_idx],
                    ))

            self.shap_vals = [
                mp_results[k].get() for k in sorted(mp_results.keys())
            ]
Example #4
def matchscan(items, fun):
    p = mp.Pool()
    count = len(items)
    # all unordered pairs, excluding self-pairs (the diagonal is filled separately)
    pairs = itertools.combinations_with_replacement(items, 2)
    distinct_pairs = (x for x in pairs if x[0] != x[1])
    results = p.map(fun, distinct_pairs)
    p.close()
    p.join()
    tri = np.zeros((count, count))
    iu1 = np.triu_indices(count,1,count)
    il1 = np.tril_indices(count,-1,count)
    tri[iu1]=results
    tri[il1]=np.flipud(np.rot90(tri, 1))[il1]
    np.fill_diagonal(tri, 100)
    return tri
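A hypothetical driver, assuming matchscan, mp, itertools and numpy live in the same module; the worker is defined at module level so the pool can pickle it.

def percent_ratio(pair):
    # toy similarity: the smaller value as a percentage of the larger one
    a, b = pair
    return 100.0 * min(a, b) / max(a, b)

if __name__ == '__main__':
    sim = matchscan([2, 4, 8], percent_ratio)
    print(sim)   # 3x3 matrix, upper/lower triangles mirrored, diagonal = 100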
Example #5
 def run_hmm(self):
     
     logging.info('Running HMMER against Cas profiles')
     
     # Make dir
     os.mkdir(self.out+'hmmer')
     # Start multiprocess
     pool = mp.Pool(self.threads)
     # Each HMM
     if self.lvl == 'DEBUG' or self.simplelog:
         list(pool.imap(self.hmmsearch, os.listdir(self.pdir)))
     else:
         list(tqdm.tqdm(pool.imap(self.hmmsearch, os.listdir(self.pdir)), total=len(os.listdir(self.pdir))))
     # Close multiprocess
     pool.close()
Example #6
def get_data(batch_size):
	file_batch = random.sample(files_list,batch_size)
	x = 110
	y = 110
	z = 110
	pool = mp.Pool(processes=batch_size)
	for i in range(batch_size):	
		t1=time.time()
		res = pool.apply_async(data_worker,(x,y,z,file_batch[i]))
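		# NOTE: res.get() blocks until this task finishes, so the submissions effectively run one at a time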
		tmp = np.ctypeslib.as_array(res.get())
		print(tmp.shape)
		print(tmp)
		print('Time:'+str(time.time()-t1)+'s')
	pool.close()
	pool.join()
Example #7
def main(datasets):

    t0 = time.time()

    cores = 18  # mp.cpu_count()

    # datasets = ['/g/data1/k88/MODIS_C/MCD43A4.006/2000091/']
    pool = mp.Pool(processes=cores)

    res = pool.map(create_doc_dataset, datasets)

    t1 = time.time()

    print('Processed in {0:.1f} seconds'.format(t1 - t0))

    return
Example #8
def mmap(f, items, callback, interval=0.025, num_cores=0):
	""" num_cores: None or 0 mean maximum ("the number returned by `os.cpu_count()`"). 
			@see https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool
	"""
	results = []
	start = time.time()
	with multiprocess.Pool(num_cores if num_cores != 0 else None) as pool:
		for i, result in enumerate(pool.imap_unordered(f, items), 1):
			results.append(result)
			now = time.time()
			if now-start >= interval:
				start = now
				callback(i/len(items)*100)
		callback(100)
	# pool.clear()  # Using close/join crashes the app on the next iteration; ProcessPool must affect the global space.
	return results
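A usage sketch, assuming the `multiprocess` package is available; `square` and `report` are illustrative stand-ins for the mapped function and the progress callback.

def square(n):
    return n * n

def report(pct):
    print('%5.1f%% done' % pct)

if __name__ == '__main__':
    out = mmap(square, list(range(1000)), report, interval=0.5, num_cores=4)
    print(sum(out))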
Example #9
def update_catalog_collection(elements,
                              max_miller,
                              n_processes=1,
                              mp_query=None):
    '''
    This function will enumerate and add adsorption sites to our `catalog`
    Mongo collection.

    Args:
        elements        A list of strings indicating the elements you are
                        looking for, e.g., ['Cu', 'Al']
        max_miller      An integer indicating the maximum Miller index to be
                        enumerated
        n_processes     An integer indicating how many threads you want to use
                        when running the tasks. If you do not expect many
                        updates, stick to the default of 1, or go up to 4. If
                        you are re-creating your collection from scratch, you
                        may want to increase this argument as high as
                        you can.
        mp_query        We get our bulks from The Materials Project. This
                        dictionary argument is used as a Mongo query to The
                        Materials Project Database. If you do not supply this
                        argument, then it will automatically filter out bulks
                        whose energies above the hull are greater than 0.1 eV
                        and whose formation energy per atom is above 0 eV.
    '''
    # Python doesn't like mutable arguments
    if mp_query is None:
        mp_query = {}

    # Figure out the MPIDs we need to enumerate
    get_mpid_task = _GetMpids(elements=elements, mp_query=mp_query)
    schedule_tasks([get_mpid_task])
    mpids = get_task_output(get_mpid_task)

    # For each MPID, enumerate all the sites and then add them to our `catalog`
    # Mongo collection. Do this in parallel because it can be.
    if n_processes > 1:
        with multiprocess.Pool(n_processes) as pool:
            list(
                pool.imap(func=lambda mpid: __run_insert_to_catalog_task(
                    mpid, max_miller),
                          iterable=mpids,
                          chunksize=20))
    else:
        for mpid in mpids:
            __run_insert_to_catalog_task(mpid, max_miller)
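A hypothetical invocation, assuming the surrounding task framework (schedule_tasks, _GetMpids, the Mongo collections) is configured as the docstring describes:

# Enumerate Cu/Al bulks up to a Miller index of 2 and insert their sites,
# using four worker processes and the default Materials Project filters.
update_catalog_collection(elements=['Cu', 'Al'], max_miller=2, n_processes=4)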
Example #10
    def _run(self):
        data = self.input
        window = self.params['window']
        group_by_feature = self.params['group_by']
        date_field = self.params['date_field']

        # Fixing same data timestamps for same card_id
        # check_data = data.reset_index().set_index([group_by_feature, date_field])
        # duplicate_transactions = check_data[check_data.index.duplicated()]['TransactionID'].values
        #
        # while len(duplicate_transactions) > 0:
        #     print(f"Found {len(duplicate_transactions)} duplicate transactions")
        #     for itid, tid in enumerate(duplicate_transactions):
        #         print(itid)
        #         q = data.loc[tid]
        #         date = q[date_field]
        #         card_id = q[group_by_feature]
        #         alldup = data[data[date_field] == date]
        #         alldup = alldup[alldup[group_by_feature] == card_id]
        #         #     print(alldup.index)
        #         for it, idx in enumerate(alldup.index):
        #             #         print(idx)
        #             data.loc[idx, date_field] += pd.Timedelta(seconds=it)
        #     check_data = data.reset_index().set_index([group_by_feature, date_field])
        #     duplicate_transactions = check_data[check_data.index.duplicated()]['TransactionID'].values

        with mp.Pool() as Pool:

            self.output = pd.DataFrame(index=data.index)
            for nf in self.params['features']:
                if nf not in data.columns:
                    continue

                print(nf)
                df = pd.DataFrame(index=data.index)
                data_slice = data[[date_field, group_by_feature,
                                   nf]].reset_index()
                args = [(data_slice, group_by_feature, nf, ws)
                        for ws in window]
                m = Pool.imap(aggregate_with_time_local, args)

                for i, df_agg in enumerate(m):
                    print('.')
                    assert df.shape[0] == df_agg.shape[0]
                    df = pd.concat([df, df_agg], axis=1)

                self.output = self.output.join(df)
Example #11
def Run_Dynamic_Nested_Fitting(loglikelihood,
                               prior_transform, ndim,
                               nlive_init=100, sample='auto', 
                               nlive_batch=50, maxbatch=2,
                               pfrac=0.8, n_cpu=None,
                               print_progress=True):
    
    """ Run Fitting as a Function.
    
    Parameters
    ----------
    loglikelihood: function
        log likelihood function
    prior_transform: function
        prior transform function
    ndim: int
        number of dimensions
        
    """
    
    print("Run Nested Fitting for the image... #a of params: %d"%ndim)
    
    start = time.time()
    
    if n_cpu is None:
        n_cpu = mp.cpu_count()-1
        
    with mp.Pool(processes=n_cpu) as pool:
        print("Opening pool: # of CPU used: %d"%(n_cpu))
        pool.size = n_cpu

        dlogz = 1e-3 * (nlive_init - 1) + 0.01

        pdsampler = dynesty.DynamicNestedSampler(loglikelihood, prior_transform, ndim,
                                                 sample=sample, pool=pool,
                                                 use_pool={'update_bound': False})
        pdsampler.run_nested(nlive_init=nlive_init, 
                             nlive_batch=nlive_batch, 
                             maxbatch=maxbatch,
                             print_progress=print_progress, 
                             dlogz_init=dlogz, 
                             wt_kwargs={'pfrac': pfrac})
        
    end = time.time()
    print("Finish Fitting! Total time elapsed: %.3gs"%(end-start))
    
    return pdsampler
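A toy call sketch, assuming dynesty and mp are imported as in the snippet above; the Gaussian likelihood and flat prior below are illustrative stand-ins.

import numpy as np

def loglike(theta):
    # standard 2-D Gaussian log-likelihood
    return -0.5 * np.sum(theta ** 2) - np.log(2 * np.pi)

def prior_transform(u):
    # map the unit cube onto a flat prior over [-10, 10] in each dimension
    return 20.0 * u - 10.0

if __name__ == '__main__':
    pdsampler = Run_Dynamic_Nested_Fitting(loglike, prior_transform, ndim=2,
                                           nlive_init=100, maxbatch=1, n_cpu=2)
    # pdsampler.results now holds the dynesty output (samples, weights, logz, ...)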
Example #12
    def __init__(self, Pos=None, MD=None, pth=None, acq=None, frames=None,
                 NucChannel='DeepBlue', threads=10, register=True, **kwargs):

        if Pos is None:
            raise ValueError('Please provide position')

        if pth is None:
            if MD is not None:
                self.pth = MD.base_pth
        else:
            self.pth = pth

        if MD is None:
            MD = Metadata(pth)

        if MD().empty:
            raise AssertionError('No metadata found in supplied path')

        if Pos not in MD.posnames:
            raise AssertionError('Position does not exist in dataset')
        self.posname = Pos

        self.channels = MD.unique('Channel',Position=Pos)
        self.acq = MD.unique('acq',Position=Pos)

        if frames is None:
            self.frames = MD.unique('frame',Position=Pos)
        elif type(frames) is not list:
            self.frames = [frames]
        else:
            self.frames = frames

        if len(self.frames)==1:
            register=False
        self._registerflag=register
        self._tracked=False
        self._splitflag=False
        #self.PixelSize = MD.unique('PixelSize')[0]

        # Create all framelabels for the different TPs. This will segment and measure stuff.

        with mp.Pool(threads) as ppool:
            frames = list(tqdm(ppool.imap(partial(FrameLbl, MD=MD, pth=pth, Pos=Pos,
                                                  NucChannel=NucChannel,
                                                  register=self._registerflag,
                                                  **kwargs),
                                          self.frames),
                               total=len(self.frames)))
            #ppool.close()
            #ppool.join()
        self.framelabels = np.array(frames)
        self._calculate_pointmat()
        print('\nFinished loading and segmenting position ' + str(Pos))
Example #13
 def kline_data(self, pair_list, interval, **kwargs):
     start_date = kwargs.get('start_date', '')
     end_date = kwargs.get('end_date', '')
     storage = kwargs.get('storage', '')
     output = kwargs.get('output', '')
     progress_statements = kwargs.get('progress_statements', '')
     if start_date:
         start_date = datetime.datetime.strptime(start_date, '%m/%d/%Y')
     if end_date:
         end_date = datetime.datetime.strptime(end_date, '%m/%d/%Y')
     valid_kline_intervals = [
         '1m', '3m', '5m', '15m', '30m', '1h', '2h', '4h', '6h', '8h', '12h'
     ]
     if interval not in set(valid_kline_intervals):
         raise ValueError(
             'Invalid Interval: Kline interval should be one of the following - {}'
             .format(','.join(valid_kline_intervals)))
     output = self.process_kline_output(output)
     if not storage:
         storage = ['csv', None]
     try:
         storage_method, intended_dir = storage
     except ValueError:
         storage_method = storage[0]
         intended_dir = None
     if progress_statements:
         self.progress_statements = progress_statements
     if storage_method.lower() == 'csv':
         kline_interval_directory = self.create_csv_directories(
             pair_list, interval, intended_dir)
         csv_file_info = mp.Manager().list()
         pair = [currency_pair for i, currency_pair in enumerate(pair_list)]
         lock = mp.Lock()
         pool = mp.Pool(processes=3, initargs=(lock, ))
         # data = pool.starmap(self.kline_to_csv,zip(pair,re(start_date),re(end_date),re(kline_interval_directory),re(interval),re(titles),re(fields),re(csv_file_info)))
         data = pool.starmap(
             self.kline_to_csv,
             zip(pair, re(start_date), re(end_date),
                 re(kline_interval_directory), re(interval),
                 re(csv_file_info)))
         pool.close()
         pool.join()
         self.concatenate_csvs(set(list(csv_file_info)))
     else:
         raise ValueError(
             'Invalid Storage Type: Currently only csv storage supported')
Example #14
 def fit_transformer(self, iterations=None, samps_per_iteration=None):
     if samps_per_iteration is None:
         samps_per_iteration = 10 * self.n_components
     if iterations is None:
         iterations = np.ceil(self.total_samples /
                              samps_per_iteration).astype(int)
     self.pool = mp.Pool()
     for i in tqdm(range(iterations), desc='incremental char pca'):
         samples = self.make_sample(samps_per_iteration,
                                    seed=i + self.seed,
                                    pool=self.pool)
         self.transformer.partial_fit(samples)
     del self.transformed_expr
     self.pool.close()
     del self.pool
     self.is_fitted = True
     return self.transformer
Example #15
 def _forward_Difference_And_F0(self, x0):
     assert len(x0) == self.numDim
     mask = np.eye(len(x0))
     mask = np.row_stack((np.zeros(self.numDim), mask))
     x_abArr = mask * self.gradStepSampSize + x0  # x upper and lower values for derivative calc
     if self.parallel == True:
         with mp.Pool() as pool:
             vals_ab = np.asarray(pool.map(self.funcObj, x_abArr))
     else:
         vals_ab = np.asarray([self.funcObj(x) for x in x_abArr])
     assert len(vals_ab.shape) == 1 and len(vals_ab) == self.numDim + 1
     F0 = vals_ab[0]
     deltaVals = vals_ab[1:] - F0
     grad = deltaVals / self.gradStepSampSize
     if self.disp == True and self.descentMethod == 'adam':
         print(x0, F0)
     return F0, grad
Example #16
def get_ci_seq(x, ci_fn, times, parallel=False):
    """
    Get sequence of confidence intervals

    Parameters
    ----------
    x, array-like
        The vector of observations between 0 and 1.

    ci_fn, univariate function
        A function which takes an array-like of bounded numbers `x`
        and outputs a tuple `(l, u)` of lower and upper confidence
        intervals. Note that `l` and `u` are scalars (not vectors).

    times, array-like of positive integers
        Times at which to compute the confidence interval.

    parallel, boolean
        Should this function be parallelized?

    Returns
    -------
    l, array-like of [0, 1]-valued reals
        Lower confidence intervals

    u, array-like of [0, 1]-valued reals
        Upper confidence intervals
    """
    x = np.array(x)

    l = np.repeat(0.0, len(times))
    u = np.repeat(1.0, len(times))

    if parallel:
        n_cores = multiprocess.cpu_count()
        print("Using " + str(n_cores) + " cores")
        with multiprocess.Pool(n_cores) as p:
            result = np.array(p.map(lambda time: ci_fn(x[0:time]), times))
        l, u = result[:, 0], result[:, 1]
    else:
        for i in np.arange(0, len(times)):
            time = times[i]
            x_t = x[0:time]
            l[i], u[i] = ci_fn(x_t)

    return l, u
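A small usage sketch; the fixed-sample Hoeffding interval below is only an illustrative stand-in for `ci_fn`.

import numpy as np

def hoeffding_ci(x, alpha=0.05):
    # two-sided Hoeffding interval for the mean of [0, 1]-valued observations
    n = len(x)
    margin = np.sqrt(np.log(2 / alpha) / (2 * n))
    mean = np.mean(x)
    return max(mean - margin, 0.0), min(mean + margin, 1.0)

x = np.random.binomial(1, 0.3, size=10000)
times = np.array([100, 1000, 10000])
l, u = get_ci_seq(x, hoeffding_ci, times, parallel=False)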
Example #17
    def solve_heat_transfer(self):
        """ Solve the heat transfer problem for each tube

        Adds the thermal results to each receiver.Tube object
        """
        #pylint: disable=no-member
        if self.progress:
            print("Running thermal analysis:")
        with multiprocess.Pool(self.nthreads) as p:
            temps = list(
                self.progress_decorator(
                    p.imap(
                        lambda x: self.thermal_solver.solve(
                            x, self.thermal_material, self.fluid_material),
                        self.tubes), self.ntubes))
        for tube, temps in zip(self.tubes, temps):
            tube.add_results("temperature", temps)
Example #18
 def optimize1(self):
     self.initialize_Optimization()
     numSamples = 1000
     samples = np.asarray(skopt.sampler.Sobol().generate(
         self.bounds, numSamples))
     with mp.Pool(maxtasksperchild=1) as pool:
         vals = np.asarray(
             pool.map(self.cost_Function, samples, chunksize=1))
     xOptimal = samples[np.argmin(vals)]
     return gradient_Descent(self.continuous_Cost,
                             xOptimal,
                             200e-6,
                             50,
                             gradStepSize=50e-6,
                             gradMethod='central',
                             descentMethod='adam',
                             disp=True)
Example #19
 def __init__(self,
              data1,
              data2,
              reference_data,
              index_db,
              chromosom_length,
              cpus=10):
     self.data1 = data1
     self.data2 = data2
     self.reference_data = reference_data
     self.index_db = index_db
     self.chromosom_length = chromosom_length
     self.population = []
     self.scores = []
     self.pool = multiprocess.Pool(cpus)
     self.fittness = []
     self.fittness_function = self.fittness_min_mean
Example #20
    def determine_life(self,
                       receiver,
                       material,
                       nthreads=1,
                       decorator=lambda x, n: x):
        """
        Determine the life of the receiver by calculating individual
        material point damage and finding the minimum of all points.

        Parameters:
          receiver        fully-solved receiver object
          material        material model to use

        Additional Parameters:
          nthreads        number of threads
          decorator       progress bar
        """
        with multiprocess.Pool(nthreads) as p:
            results = list(
                decorator(
                    p.imap(
                        lambda x: self.tube_log_reliability(
                            x, material, receiver),
                        receiver.tubes,
                    ),
                    receiver.ntubes,
                ))

        p_tube = np.array([res[0] for res in results])
        tube_fields = [res[1] for res in results]

        # Tube reliability is the minimum of all the time steps
        tube = np.min(p_tube, axis=1)

        # Overall reliability is the minimum of the sum
        overall = np.min(np.sum(p_tube, axis=0))

        # Add the field to the tubes
        for tubei, field in zip(receiver.tubes, tube_fields):
            tubei.add_quadrature_results("log_reliability", field)

        # Convert back from log-prob as we go
        return {
            "tube_reliability": np.exp(tube),
            "overall_reliability": np.exp(overall),
        }
Example #21
def pdist(data, dist_func, n_jobs=1, max_time=None):
    """
    Parallel pairwise distances between the elements of `data`. Similar to scipy's
    pdist, with parallel processing and a bounded runtime.
    """
    x = np.asanyarray(data)
    n_jobs = mp.cpu_count() if n_jobs == -1 else min(n_jobs, mp.cpu_count())
    if max_time is None or max_time == np.inf:
        with mp.Pool(processes=n_jobs) as pool:
            dists = pool.map(lambda a: dist_func(*a), itr.combinations(x, 2))
        return np.array(dists)

    def dist_col(a, b_list):
        return [dist_func(a, b) for b in b_list]

    dist_lists = []
    end = np.inf if max_time is None else time.time() + max_time
    projected_end = np.NINF
    n = len(x)
    i = 0
    end_i = 0

    # calculate matrix columns in chunks of n_jobs at a time
    with jl.Parallel(n_jobs=n_jobs) as parallel:
        while end_i < n and projected_end <= end:
            chunk_start = time.time()
            start_i = i + 1
            end_i = min(i + n_jobs + 1, n)
            a_list = [x[j] for j in range(start_i, end_i)]
            b_lists = [x[:k] for k in range(start_i, end_i)]

            cols = parallel(
                jl.delayed(dist_col)(a, b) for a, b in zip(a_list, b_lists))

            for col in cols:
                for sublist, d in zip(dist_lists, col):
                    sublist.append(d)
                dist_lists.append([col[-1]])

            i += n_jobs
            n_items = sum([len(b) for b in b_lists])
            next_items = n_items + n_jobs * (n_jobs - 1) / 2
            col_time = time.time() - chunk_start
            projected_end = time.time() + col_time * next_items / n_items

    return flat_list(dist_lists)
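A usage sketch with a plain Euclidean metric; it assumes the module's own imports and helpers (mp as the dill-based multiprocess package, itr, jl, flat_list) are in place.

import numpy as np

def euclidean(a, b):
    return float(np.linalg.norm(a - b))

if __name__ == '__main__':
    points = np.random.rand(50, 3)
    d_full = pdist(points, euclidean, n_jobs=2)                  # pool-based branch
    d_capped = pdist(points, euclidean, n_jobs=2, max_time=5.0)  # joblib branch, may stop early
    print(len(d_full), len(d_capped))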
Example #22
    def __init__(self, num_workers, eval_function, timeout=None):
        self.num_workers = num_workers
        self.eval_function = eval_function
        self.timeout = timeout
        self.manager = mp.Manager()
        self.connection_strings = self.manager.Queue()
        for i in range(num_workers):
            #  Connect to SITL directly without mavproxy, can only do one instance
            # port = 5760 + i*10
            # self.connection_strings.put('tcp:127.0.0.1:' + str(port))
            # Connect to mavproxy, uses more resources, max about 20 instances
            port = 14550 + i * 10
            self.connection_strings.put('127.0.0.1:' + str(port))

        self.pool = mp.Pool(num_workers,
                            initializer=self.initializer,
                            initargs=(self.connection_strings, ))
Example #23
    def mt_evaluate_logps(self, parallel, multitry, proposed_pts, pfunc, ref=False):
        """Evaluate the log probability for multiple points in serial or parallel when using multi-try.

        Parameters
        ----------
        parallel : bool
            Whether to evaluate multi-try points in parallel
        multitry : int
            Number of multi-try points
        proposed_pts : numpy 2D array, shape (multitry, n_parameter_dims)
            Proposed points
        pfunc : function
            Function that takes a point in parameter space and
            returns the log of the prior value and the log of the likelihood at that point
        ref : bool
            Whether this is a multi-try reference draw. Default = False"""
        
        # If using multi-try and running in parallel, farm out the proposed points to a process pool.
        if parallel:
            p = mp.Pool(multitry)
            args = list(zip([self]*multitry, np.squeeze(proposed_pts)))
            logps = p.map(call_logp, args)
            p.close()
            p.join()
            log_priors = [val[0] for val in logps]
            log_likes = [val[1] for val in logps]
            
        else:
            log_priors = []
            log_likes = []
            if multitry == 2:
                log_priors, log_likes = np.array([pfunc(np.squeeze(proposed_pts))])
            else:
                for pt in np.squeeze(proposed_pts):
                    log_priors.append(pfunc(pt)[0])  
                    log_likes.append(pfunc(pt)[1])
        
        log_priors = np.array(log_priors)  
        log_likes = np.array(log_likes)
        
        if ref:
            log_likes = np.append(log_likes, self.last_like)
            log_priors = np.append(log_priors, self.last_prior)
            
        return log_priors, log_likes
Example #24
    def _run(self):
        data = self.input
        window = self.params['window']
        group_by_feature = self.params['group_by']
        date_field = self.params['date_field']

        # Fixing same data timestamps for same card_id

        # check_data = data.reset_index().set_index([group_by_feature, 'Date'])
        # duplicate_transactions = check_data[check_data.index.duplicated()]['TransactionID'].values
        # while len(duplicate_transactions) > 0:
        #     print(f"Found {len(duplicate_transactions)} duplicate transactions")
        #     for itid, tid in enumerate(duplicate_transactions):
        #         print(itid)
        #         q = data.loc[tid]
        #         date = q['Date']
        #         card_id = q[group_by_feature]
        #         alldup = data[data['Date'] == date]
        #         alldup = alldup[alldup[group_by_feature] == card_id]
        #         #     print(alldup.index)
        #         for it, idx in enumerate(alldup.index):
        #             #         print(idx)
        #             data.loc[idx, 'Date'] += pd.Timedelta(seconds=it)
        #     check_data = data.reset_index().set_index([group_by_feature, 'Date'])
        #     duplicate_transactions = check_data[check_data.index.duplicated()]['TransactionID'].values
        #     print(data.loc[alldup.index])
        #     break

        with mp.Pool() as Pool:
            self.output = pd.DataFrame(index=data.index)
            df = pd.DataFrame(index=data.index)
            data_slice = data[[date_field, group_by_feature]].reset_index()

            # Hours elapsed since the first transaction in the slice
            data_slice.loc[:, 'hours'] = (
                data_slice[date_field] -
                data_slice[date_field].iloc[0]).dt.total_seconds() / 3600

            args = [(data_slice, group_by_feature, ws) for ws in window]
            m = Pool.imap(aggregate_transaction_frequencies, args)
            for i, df_agg in enumerate(m):
                print('.')
                assert df.shape[0] == df_agg.shape[0]
                df = pd.concat([df, df_agg], axis=1)

            self.output = self.output.join(df)
Example #25
    def _set_alphas(self):
        manager = mp.Manager()
        shared_alphas = manager.dict()
        shared_test_counts = manager.dict()
        self.alphas = dict()
        self.test_counts = dict()
        test_anno = self.anno.groupby('id')['test_data'].agg(is_test=any)
        exclude_test_samples = self.exclude_test_samples

        def load_alphas(entry):
            data = pd.read_csv(entry[1],
                               sep="\t",
                               skip_blank_lines=False,
                               keep_default_na=False)
            samp_names = [re.sub(r'\.[0-9]+', '', x) for x in data.columns]
            if not samp_names:
                return entry[0], False
            is_test_sample = [
                test_anno.loc[x, 'is_test'] if x in test_anno.index else False
                for x in samp_names
            ]
            is_test_sample = np.array(is_test_sample)
            if exclude_test_samples is True and any(is_test_sample):
                shared_alphas[entry[0]] = data.loc[:, ~is_test_sample] + 1
                shared_test_counts[entry[0]] = data.loc[:, is_test_sample]
            else:
                try:
                    shared_alphas[entry[0]] = data + 1
                except Exception as e:
                    raise Exception(f'Unable to deal with {entry[1]}') from e
            return entry[0], True

        tasks = list(self.expression_tsv.items())
        with mp.Pool() as pool:
            for key, success in tqdm(pool.imap(load_alphas, tasks),
                                     total=len(tasks),
                                     desc='loading counts'):
                if success is False:
                    warnings.warn('No samples found for {}.'.format(key),
                                  RuntimeWarning)
                    continue
                self.alphas[key] = shared_alphas[key]
                if key in shared_test_counts.keys():
                    self.test_counts[key] = shared_test_counts[key]
        return self.alphas, self.test_counts
Example #26
    def run_multiple_backtest(self, initial_portf, start_time,
                              end_time, policies,
                              loglevel=logging.WARNING, parallel=True):
        """Backtest multiple policies.
        """

        def _run_backtest(policy):
            return self.run_backtest(initial_portf, start_time, end_time,
                                     policy, loglevel=loglevel)

        num_workers = min(multiprocess.cpu_count(), len(policies))
        if parallel:
            workers = multiprocess.Pool(num_workers)
            results = workers.map(_run_backtest, policies)
            workers.close()
            return results
        else:
            return list(map(_run_backtest, policies))
Example #27
def run_experiment(algorithms_for_experiment, backup, cores=8):

    iterations = list(range(0, 10))
    node_sizes = [250, 500]
    mus = np.arange(0.1, 0.8, 0.1)
    configuration_set = itertools.product(
        *[iterations,
          algorithms_for_experiment.items(), node_sizes, mus])

    cpu_cnt = cores
    pool = mp.Pool(processes=cpu_cnt)
    print(f"Running experiments in parallel with {cpu_cnt} cpus")
    parallel_execution_data = pool.imap_unordered(compute_experiment,
                                                  configuration_set)

    for result in parallel_execution_data:
        save_data(result, backup)
        save_data(result, result["method"])
Example #28
    def process_images(self, image_paths_list):
        '''
        multiprocess load and process frames
        '''
        
        model = self.model

        p = mp.Pool(mp.cpu_count())
        images = p.map(Image.open, image_paths_list)
        images = list(images)
        resized_img = p.map(resize_method, images)
        resized_img = np.concatenate(list(resized_img), axis=0)
        # features = model.predict(resized_img, batch_size=5)
        
        p.close()
        p.join()

        return resized_img
Example #29
    def findTransformation(self, transform, matches, processes, **kwargs):
        ''' Find the optimal transformation between two images given matching
        features, using random sample consensus (RANSAC).
            Input:
                transform: skimage.transform object
                matches (list): matches found through the match_features method.
                processes: Number of processes to use.
                **kwargs are passed to skimage.transform.ransac

            Output:
                Transformations.
        '''

        keypts = self.features[0]

        def optimization(Pts):
            robustTrans, inliers = ransac((Pts[0], Pts[1]), transform,
                                          **kwargs)
            output = [robustTrans, inliers]
            return output

        # start pool of workers
        print('launching %i kernels...' % (processes))
        pool = mp.Pool(processes)
        tasks = [(key1[match[:, 0]], key2[match[:, 1]])
                 for match, key1, key2 in zip(matches, keypts[:], keypts[1:])]
        # chunksize must be at least 1 for imap
        chunk = max(1, len(keypts) // processes)
        jobs = pool.imap(optimization, tasks, chunksize=chunk)

        # get Transforms and inlier matches
        transforms, trueMatches = [], []
        print('Extracting Inlier Matches with RANSAC...')
        try:
            for j in jobs:
                transforms.append(j[0])
                trueMatches.append(j[1])
        except np.linalg.LinAlgError:
            pass

        # close the pool
        pool.close()
        print('Closing down the kernels...\n')

        return transforms, trueMatches
Example #30
def findPeaksArray(vectors, waveletWidths, processes=4, **kwargs):
    '''
    Parallel version of find_peaks_cwt
    Input:
        vectors: 2d array, each column is a spectrum
        waveletWidths: 1d array of expected peak widths
        processes: number of processes to use.
        kwargs: passed to scipy.signal.find_peaks_cwt
        
    Output:
        location of peaks
        
    '''

    # This is the function that will be mapped (in parallel) over the vectors 2d array
    def peaks(vector):
        peakIndices = find_peaks_cwt(vector, waveletWidths, **kwargs)
        return peakIndices

    # start pool of workers
    print('launching %i kernels...' % (processes))
    pool = mp.Pool(processes)
    tasks = [(vector) for vector in vectors]
    # chunksize must be at least 1 for imap
    chunk = max(1, vectors.shape[0] // processes)
    jobs = pool.imap(peaks, tasks, chunksize=chunk)

    # get peaks from different processes
    results = []
    print('Extracting Peaks...')
    try:
        for j in jobs:
            results.append(j)
    except ValueError:
        warnings.warn('Error: Something went wrong!!!')

    # pack all peaks into 2d array
    peaks = [itm for itm in results]

    # close the pool
    print('Closing down the kernels... \n')
    pool.close()

    return peaks
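A quick usage sketch with synthetic spectra; it assumes mp is the dill-based multiprocess package (so the nested worker can be pickled) and that numpy and scipy's find_peaks_cwt are imported as in the snippet.

import numpy as np

if __name__ == '__main__':
    x = np.linspace(0, 10, 500)
    spectra = np.vstack([np.exp(-(x - c) ** 2 / 0.05) for c in (2.0, 5.0, 8.0)])
    peak_locations = findPeaksArray(spectra, waveletWidths=np.arange(1, 20),
                                    processes=2)
    print(peak_locations)   # one array of peak indices per spectrum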