Example #1
def lorenz():
    sigma = 10
    rho = 28
    beta = 8.0 / 3
    theta = 3 * np.pi / 4

    def lor(xyz, t):
        x, y, z = xyz
        x_dot = sigma * (y - x)
        y_dot = x * rho - x * z - y
        z_dot = x * y - beta * z
        return [x_dot, y_dot, z_dot]

    initial = (-10, -7, 35)
    t = np.arange(0, 100, 0.006)
    solution = odeint(lor, initial, t)
    x = solution[:, 0]
    y = solution[:, 1]
    z = solution[:, 2]
    xprime = np.cos(theta) * x - np.sin(theta) * y
    colors = ["#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#08519C", "#08306B"]
    p = figure(title="Lorenz example", tools='', toolbar_location=None, responsive='box')
    p.title_location = 'right'
    p.multi_line(np.array_split(xprime, 7), np.array_split(z, 7), line_color=colors, line_alpha=0.8, line_width=1.5)

    return p
Example #2
def reshape_soln_y(ug, nx, ny, p, px, py):
    # evenly split ug into a list of p parts
    soln = np.array_split(ug, p)
    # reshape each part
    soln = np.hstack([a.reshape(ny, nx) for a in soln])
    soln = np.vstack([arr.transpose() for arr in np.array_split(soln.transpose(), p)])
    return soln
Example #3
File: sobol.py Project: SALib/SALib
def create_task_list(D, calc_second_order, n_processors):
    # Create list with one entry (key, parameter 1, parameter 2) per sobol
    # index (+conf.). This is used to supply parallel tasks to multiprocessing.Pool
    tasks_first_order = [[d, j, None] for j in range(
        D) for d in ('S1', 'S1_conf', 'ST', 'ST_conf')]

    # Add second order (+conf.) to tasks
    tasks_second_order = []
    if calc_second_order:
        tasks_second_order = [[d, j, k] for j in range(D) for k in
                              range(j + 1, D) for d in ('S2', 'S2_conf')]

    if n_processors is None:
        n_processors = min(cpu_count(), len(
            tasks_first_order) + len(tasks_second_order))

    if not calc_second_order:
        tasks = np.array_split(tasks_first_order, n_processors)
    else:
        # merge both lists, alternating their elements, then split the result into n_processors sublists
        tasks = np.array_split([v for v in sum(
            zip_longest(tasks_first_order[::-1], tasks_second_order), ())
            if v is not None], n_processors)

    return tasks, n_processors
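
The last branch packs a lot into one expression: zip_longest interleaves the (reversed) first-order tasks with the second-order tasks, the None padding is filtered out, and np.array_split hands each worker a roughly equal share. A minimal standalone sketch of that pattern (not taken from SALib, assuming Python 3's itertools.zip_longest):

from itertools import zip_longest
import numpy as np

# two task lists of unequal length, shaped like the ones built above
first = [['S1', 0, None], ['ST', 0, None], ['S1', 1, None], ['ST', 1, None]]
second = [['S2', 0, 1], ['S2_conf', 0, 1]]

# zip_longest pads the shorter list with None; sum(..., ()) flattens the pairs
merged = [v for v in sum(zip_longest(first[::-1], second), ()) if v is not None]

# distribute the merged tasks over 3 hypothetical workers
for i, chunk in enumerate(np.array_split(merged, 3)):
    print(i, chunk.tolist())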
Example #4
File: spa.py Project: TylrA/pvlib-python
def solar_position_numba(unixtime, lat, lon, elev, pressure, temp, delta_t,
                         atmos_refract, numthreads, sst=False):
    """Calculate the solar position using the numba compiled functions
    and multiple threads. Very slow if functions are not numba compiled.
    """
    loc_args = np.array([lat, lon, elev, pressure, temp, delta_t,
                         atmos_refract, sst])
    ulength = unixtime.shape[0]
    result = np.empty((6, ulength), dtype=np.float64)
    if unixtime.dtype != np.float64:
        unixtime = unixtime.astype(np.float64)

    if ulength < numthreads:
        pvl_logger.warning('The number of threads is more than the length of' +
                           ' the time array. Only using %s threads.',
                            ulength)
        numthreads = ulength

    if numthreads <= 1:
        pvl_logger.debug('Only using one thread for calculation')
        solar_position_loop(unixtime, loc_args, result)
        return result

    split0 = np.array_split(unixtime, numthreads)
    split2 = np.array_split(result, numthreads, axis=1)
    chunks = [[a0, loc_args, split2[i]] for i, a0 in enumerate(split0)]
    # Spawn one thread per chunk
    threads = [threading.Thread(target=solar_position_loop, args=chunk)
               for chunk in chunks]
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    return result
Example #5
 def distribute_nodes(self, path_index):
     path = self.paths[path_index]
     if path.type == 'linear':
         digits = int(np.ceil(np.log10(path.ne)))
         base = path.index * 10 ** digits
         energies = np.linspace(path.begin, path.end, path.ne)
         weights = path.weights2 + [1] * (path.ne - 6) + path.weights3
         weights = np.array(weights) * path.int_step
         nids = np.arange(path.ne) + base + 1
     
     elif path.type == 'poles':
         base = path.index * 100
         nids0 = base + 10 + np.arange(path.poles_num) + 1
         nids1 = base + 20 + np.arange(path.poles_num) + 1
         nids = np.append(nids0, nids1)
         energies0 = path.begin + (np.arange(path.poles_num) * 2
                                                     - 1) * np.pi * 1.j
         energies1 = path.end + (np.arange(path.poles_num) * 2
                                                     - 1) * np.pi * 1.j
         weights0 = [-1] * path.poles_num
         weights1 = [1] * path.poles_num
         weights = np.append(weights0, weights1)
     
     loc_nids = np.array_split(nids, self.comm.size)[self.comm.rank]
     loc_energies = np.array_split(energies,
                                      self.comm.size)[self.comm.rank]
     loc_weights = np.array_split(weights, self.comm.size)[self.comm.rank]
     return loc_nids, loc_energies, loc_weights
Example #6
File: elm.py Project: IstanbulBoy/hpelm
    def add_data(self, X, T):
        """Feed new training data (X,T) to ELM model in batches; does not solve ELM itself.

        Helper method that updates intermediate solution parameters HH and HT, which are used for solving ELM later.
        Updates accumulate, so this method can be called multiple times with different parts of training data.
        To reset accumulated training data, use `ELM.nnet.reset()`.

        For training an ELM use `ELM.train()` instead.

        Args:
            X (matrix): input training data
            T (matrix): output training data
        """
        # initialize batch size
        nb = int(np.ceil(float(X.shape[0]) / self.batch))
        wc_vector = None

        # find automatic weights if none are given
        if self.classification == "wc" and self.wc is None:
            ns = T.sum(axis=0).astype(self.precision)  # number of samples in classes
            self.wc = ns.sum() / ns  # weights of classes

        for X0, T0 in zip(np.array_split(X, nb, axis=0),
                          np.array_split(T, nb, axis=0)):
            if self.classification == "wc":
                wc_vector = self.wc[np.where(T0 == 1)[1]]  # weights for samples in the batch
            self.nnet.add_batch(X0, T0, wc_vector)
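
As the docstring says, the training data is consumed in row-wise batches. A minimal sketch, independent of hpelm, of how np.array_split produces those batches when the sample count is not a multiple of the batch size:

import numpy as np

X = np.random.rand(10, 4)   # 10 samples, 4 features
T = np.random.rand(10, 2)   # 10 samples, 2 targets
batch = 3
nb = int(np.ceil(float(X.shape[0]) / batch))   # 4 batches for 10 samples

for X0, T0 in zip(np.array_split(X, nb, axis=0),
                  np.array_split(T, nb, axis=0)):
    # array_split tolerates the uneven split; the first batches get the extra rows
    print(X0.shape, T0.shape)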
Example #7
 def vocode(self, segment_voice, segment_gen):
     """This is the vocoder.  It multiplies the amplitudes of two seperate signals
     to produce a singular response""" 
     temp_final = []
     for j in range(self.num_channels):
         saw_spec = segment_gen[j].make_spectrum()
         input_spec = segment_voice[j].make_spectrum()
     
         input_hs = input_spec.hs
         saw_hs = saw_spec.hs
     
         saw_bands = np.array_split(saw_hs, self.num_bands)
         input_bands = np.array_split(input_hs, self.num_bands)
     
         final_bands = np.empty_like(saw_bands)
         for i in range(self.num_bands):
             amp_multi = np.abs(saw_bands[i])*np.abs(input_bands[i])
             phase_multi = np.angle(saw_bands[i])
             final_bands[i] = amp_multi*(np.cos(phase_multi)+(np.sin(phase_multi)*1j))
             
         temp_final.append(np.ma.concatenate(final_bands).data)
     final_wave = []
     for i in range(len(temp_final)):
         final_wave.append(thinkdsp.Spectrum(hs=temp_final[i], framerate = self.framerate).make_wave())
     output = final_wave[0]
     for i in range(1,len(final_wave)):
         output |= final_wave[i]
     return output
Example #8
def generate_indices(mode='r',iterations=1,train_size=300):
    if mode=='d': # deterministic
        def get_indices():
            ind = numpy.arange(1000)
            pos_train_ind = ind[:train_size]
            pos_test_ind = ind[train_size:]
            neg_train_ind = ind[:train_size]
            neg_test_ind = ind[train_size:]
            for i in range(iterations):
                yield (pos_train_ind, pos_test_ind, neg_train_ind, neg_test_ind)
    elif mode=='r': # random
        def get_indices():
            for i in range(iterations):
                pos_ind = shuffle_ind()
                pos_train_ind = pos_ind[:train_size]
                pos_test_ind = pos_ind[train_size:]
                neg_ind = shuffle_ind()
                neg_train_ind = neg_ind[:train_size]
                neg_test_ind = neg_ind[train_size:]
                yield (pos_train_ind, pos_test_ind, neg_train_ind, neg_test_ind)
    elif mode=='k': # k-fold cross-validation
        # here, iterations = number of folds
        pos_ind = shuffle_ind()
        neg_ind = shuffle_ind()
        pos_folds = numpy.array_split(pos_ind,iterations)
        neg_folds = numpy.array_split(neg_ind,iterations)
        def get_indices():
            for i in range(iterations):
                pos_train_ind = numpy.hstack(pos_folds[:i] + pos_folds[i+1:]).tolist()
                pos_test_ind = pos_folds[i].tolist() 
                neg_train_ind = numpy.hstack(neg_folds[:i] + neg_folds[i+1:]).tolist()
                neg_test_ind = neg_folds[i].tolist() 
                yield (pos_train_ind, pos_test_ind, neg_train_ind, neg_test_ind)
    return get_indices()
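
The 'k' branch builds the folds once with numpy.array_split and then, for each fold, stitches the remaining folds back together for training. A self-contained sketch of just that branch, with a hypothetical shuffle_ind helper standing in for the one used above:

import numpy

def shuffle_ind(n=1000):
    # hypothetical stand-in for the shuffle helper used above
    ind = numpy.arange(n)
    numpy.random.shuffle(ind)
    return ind

folds = numpy.array_split(shuffle_ind(), 5)          # 5 folds of roughly equal size
for i in range(5):
    test_ind = folds[i]
    # training indices: every fold except the i-th, concatenated back together
    train_ind = numpy.hstack(folds[:i] + folds[i + 1:])
    print(i, len(train_ind), len(test_ind))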
Example #9
    def process(self, data, output, processes, process):
        """
        """
        print "in the process function"
        if data.center_of_rotation is None:
            centre_of_rotation = np.ones(data.get_number_of_sinograms())
            centre_of_rotation = centre_of_rotation * self.parameters["center_of_rotation"]
        else:
            centre_of_rotation = data.center_of_rotation[:]

        if centre_of_rotation is None:
            centre_of_rotation = np.ones(data.get_number_of_sinograms())
            centre_of_rotation = centre_of_rotation * self.parameters["center_of_rotation"]

        sinogram_frames = np.arange(data.get_number_of_sinograms())

        frames = np.array_split(sinogram_frames, len(processes))[process]
        centre_of_rotations = np.array_split(centre_of_rotation, len(processes))[process]

        angles = data.rotation_angle.data[:]

        for i in range(len(frames)):
            frame_centre_of_rotation = centre_of_rotations[i]
            sinogram = data.data[:, frames[i], :]
            reconstruction = self.reconstruct(
                sinogram,
                frame_centre_of_rotation,
                angles,
                (output.data.shape[0], output.data.shape[2]),
                (output.data.shape[0] / 2, output.data.shape[2] / 2),
            )
            output.data[:, frames[i], :] = reconstruction
            self.count += 1
            print self.count
Example #10
def create_experiment_20151020():
    # Using stratified sampling, select n assets, one from each
    # stratum, for n = 5, 10, 25, 100 for monthly and n = 5, 10, 25,
    # 100, 250, 500 for daily.
    monthly_returns = read_monthly_returns()
    monthly_indices = monthly_returns.tail(12).mean().sort_values().index
    monthly_asset_stratums = { 
        i : np.array_split(monthly_indices, i)
        for i in [5, 10, 25, 100]}

    daily_returns = read_daily_returns()
    daily_indices = daily_returns.tail(90).mean().sort_values().index
    daily_asset_stratums = { 
        i : np.array_split(daily_indices, i)
        for i in [5, 10, 25, 100, 250, 500]}

    def select_assets(stratums):
        return [np.random.choice(i)
                for i in stratums]

    # Write monthly return data.
    for (num_assets, stratum) in monthly_asset_stratums.items():
        filename = (
            '../data/experiments/pu_bounds_uncertainty_20151020' +
            '/monthly_scenario_' + str(num_assets) + '.csv')
        monthly_returns.loc[:, select_assets(stratum)].to_csv(filename)

    # Write daily return data.
    for (num_assets, stratum) in daily_asset_stratums.items():
        filename = (
            '../data/experiments/pu_bounds_uncertainty_20151020' +
            '/daily_scenario_' + str(num_assets) + '.csv')
        daily_returns.loc[:, select_assets(stratum)].to_csv(filename)
Example #11
    def score(self, X, y):
        """Returns the score obtained for each estimators/data slice couple.

        Parameters
        ----------
        X : array, shape (n_samples, n_features, n_estimators)
            The input samples. For each data slice, the corresponding estimator
            scores the prediction: e.g. [estimators[ii].score(X[..., ii], y)
                                        for ii in range(n_estimators)]
        y : array, shape (n_samples,) | (n_samples, n_targets)
            The target values.

        Returns
        -------
        score : array, shape (n_samples, n_estimators)
            Score for each estimator / data slice couple.
        """
        self._check_Xy(X)
        if X.shape[-1] != len(self.estimators_):
            raise ValueError('The number of estimators does not match '
                             'X.shape[2]')
        # For predictions/transforms the parallelization is across the data and
        # not across the estimators to avoid memory load.
        parallel, p_func, n_jobs = parallel_func(_sl_score, self.n_jobs)
        X_splits = np.array_split(X, n_jobs, axis=-1)
        est_splits = np.array_split(self.estimators_, n_jobs)
        score = parallel(p_func(est, x, y)
                         for (est, x) in zip(est_splits, X_splits))

        if n_jobs > 1:
            score = np.concatenate(score, axis=0)
        else:
            score = score[0]
        return score
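
The comment above is the key design choice: the work is split across data slices (the last axis of X) rather than across estimators, so each job gets a block of slices together with the estimators fitted on those same slices. A small sketch of that pairing, outside MNE:

import numpy as np

n_samples, n_features, n_estimators = 20, 5, 7
X = np.random.rand(n_samples, n_features, n_estimators)
estimators = np.array(['est_%d' % i for i in range(n_estimators)], dtype=object)

n_jobs = 3
X_splits = np.array_split(X, n_jobs, axis=-1)     # chunk the slice axis, not the samples
est_splits = np.array_split(estimators, n_jobs)   # matching groups of estimators

for est, x in zip(est_splits, X_splits):
    # each job scores only its own slices, so the shapes stay aligned
    assert x.shape[-1] == len(est)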
Example #12
File: rks.py Project: sunqm/mpi4pyscf
def _setup_grids_(mf, dm):
    mol = mf.mol
    grids = mf.grids

    if rank == 0:
        grids.build(with_non0tab=False)
        grids.coords = numpy.array_split(grids.coords, mpi.pool.size)
        grids.weights = numpy.array_split(grids.weights, mpi.pool.size)
    grids.coords = mpi.scatter(grids.coords)
    grids.weights = mpi.scatter(grids.weights)

    ground_state = (isinstance(dm, numpy.ndarray) and dm.ndim == 2)
    if mf.small_rho_cutoff > 1e-20 and ground_state:
        rho = mf._numint.get_rho(mol, dm, grids, mf.max_memory)
        n = comm.allreduce(numpy.dot(rho, grids.weights))
        if abs(n-mol.nelectron) < rks.NELEC_ERROR_TOL*n:
            rw = mpi.gather(rho * grids.weights)
            idx = abs(rw) > mf.small_rho_cutoff / grids.weights.size
            logger.alldebug1(mf, 'Drop grids %d',
                             grids.weights.size - numpy.count_nonzero(idx))
            grids.coords  = numpy.asarray(grids.coords [idx], order='C')
            grids.weights = numpy.asarray(grids.weights[idx], order='C')

    grids.non0tab = grids.make_mask(mol, grids.coords)

    return grids
Example #13
    def transform(self, pts, verbose=None):
        """Apply the warp.

        Parameters
        ----------
        pts : shape (n_transform, 3)
            Source points to warp to the destination.

        Returns
        -------
        dest : shape (n_transform, 3)
            The transformed points.
        """
        logger.info('Transforming %s points' % (len(pts),))
        from scipy.spatial.distance import cdist
        assert pts.shape[1] == 3
        # for memory reasons, we should do this in ~100 MB chunks
        out = np.zeros_like(pts)
        n_splits = max(int((pts.shape[0] * self._destination.shape[0]) /
                           (100e6 / 8.)), 1)
        for this_out, this_pts in zip(np.array_split(out, n_splits),
                                      np.array_split(pts, n_splits)):
            dists = _tps(cdist(this_pts, self._destination, 'sqeuclidean'))
            L = np.hstack((dists, np.ones((dists.shape[0], 1)), this_pts))
            this_out[:] = np.dot(L, self._weights)
        assert not (out == 0).any()
        return out
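
The chunking comment hides a small calculation: the number of splits is derived from a memory budget for the float64 pairwise-distance matrix, and the output array is split in lockstep with the input so that the chunk views write straight back into the full array. A rough sketch of just that bookkeeping, with an assumed 100 MB budget and destination size:

import numpy as np

pts = np.random.rand(50000, 3)          # source points
n_destination = 2000                    # assumed size of the destination set
budget = 100e6                          # ~100 MB of float64 pairwise distances
n_splits = max(int((pts.shape[0] * n_destination) / (budget / 8.)), 1)

out = np.zeros_like(pts)
for this_out, this_pts in zip(np.array_split(out, n_splits),
                              np.array_split(pts, n_splits)):
    # array_split returns views of `out`, so assigning into this_out fills it
    this_out[:] = this_pts * 2.          # placeholder for the real warp evaluation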
Example #14
def lorenz_example():

    sigma = 10
    rho = 28
    beta = 8.0/3
    theta = 3 * np.pi / 4

    def lorenz(xyz, t):
        x, y, z = xyz
        x_dot = sigma * (y - x)
        y_dot = x * rho - x * z - y
        z_dot = x * y - beta* z
        return [x_dot, y_dot, z_dot]

    initial = (-10, -7, 35)
    t = np.arange(0, 100, 0.006)

    solution = odeint(lorenz, initial, t)

    x = solution[:, 0]
    y = solution[:, 1]
    z = solution[:, 2]
    xprime = np.cos(theta) * x - np.sin(theta) * y

    colors = ["#C6DBEF", "#9ECAE1", "#6BAED6", "#4292C6", "#2171B5", "#08519C", "#08306B",]

    output_file("lorenz.html", title="lorenz.py example")

    multi_line(np.array_split(xprime, 7), np.array_split(z, 7),
               line_color=colors, line_alpha=0.8, line_width=1.5,
               tools="pan,zoom,resize", title="lorenz example", name="lorenz_example")

    return curplot()
Example #15
File: theta.py Project: dunovank/radd_kd
 def filter_params(self, p_sets, p_fmins, nkeep=5, method='best'):
     # rank inits by costfx error low-to-high
     fmin_series = pd.Series(p_fmins)
     rankorder = fmin_series.sort_values()
     # eliminate extremely bad parameter sets
     rankorder = rankorder[rankorder<=5.0]
     if method=='random':
         # return nkeep from randomly sampled inits
         inits = p_sets[:nkeep]
         inits_err = p_fmins[:nkeep]
     elif method=='best':
         # return nkeep from inits with lowest err
         inits = [p_sets[i] for i in rankorder.index[:nkeep]]
         inits_err = rankorder.values[:nkeep]
     elif method=='lmh':
         # split index for low, med, and high err inits
         # if nkeep is odd, will sample more low than high
         if nkeep<3: nkeep=3
         ix = rankorder.index.values
         nl, nm, nh = [arr.size for arr in np.array_split(np.arange(nkeep), 3)]
          # extract indices for roughly equal numbers of parameter sets with low, med, and high err
         keep_ix = np.hstack([ix[:nl], np.array_split(ix,2)[0][-nm:], ix[-nh:]])
         inits = [p_sets[i] for i in keep_ix]
         inits_err = [fmin_series[i] for i in keep_ix]
     return inits, np.min(inits_err)
Example #16
def cross_validate_k(S, X, t, l, min_feature=100, max_feature=250,disp=True):
    """Performs Cross validation to optimize the number of features that gives the best error rate."""
    X_groups = np.array_split(X, S)
    Y_groups = np.array_split(t, S)    
    min_err_hp = (0, float("inf"))
    for k in range(min_feature, max_feature+1):
        if disp: print("Starting S-fold cross-validation for k =", k)
        if disp: print("Training run:", end=' ')
        # i represents the held-out group
        error_rates = np.ndarray(S)
        for i in range(S):
            if disp: print(i+1, end=' ')
            X_others = [X_groups[x] for x in range(S) if x!=i]
            Y_others = [Y_groups[x] for x in range(S) if x!=i]
            X_training = np.concatenate(tuple(X_others), axis=0)
            Y_training = np.concatenate(tuple(Y_others), axis=0)
            
            # Feature selection has to be done for each partition to generalize the fitting.
            # It leads to over-fitting to the validation data if the scoring & selection is done 
            # once on the entire dataset and then masked locally in a partition.
            mask = univariate_fs(X_training, Y_training, k=k)
            X_training_subset = X_training[:, mask]
            X_val_subset = X_groups[i][:, mask]
            
            w_star = train(X_training_subset, Y_training, l, disp=0)
            # Prediction on the hold-out partition
            error_rates[i] = prediction_performance(X_val_subset, Y_groups[i], w_star, report=False)        
        error_rate = error_rates.mean()
        if disp: print("; Error rate:",error_rate)
        if error_rate < min_err_hp[1]:
            min_err_hp = (k, error_rate)
    return min_err_hp
Example #17
def ModelSelectionTest01():
	from sklearn import datasets, svm
	import numpy as np
	digits = datasets.load_digits()
	X_digits = digits.data
	Y_digits = digits.target
	svc = svm.SVC(C = 1, kernel = 'linear')
	score = svc.fit(X_digits[:-100], Y_digits[:-100]).score(X_digits[-100:], Y_digits[-100:])

	#print score

	X_folds = np.array_split(X_digits, 3)
	Y_folds = np.array_split(Y_digits, 3)

	#print len(X_folds[0])

	scores = list()

	for k in range(3):
		X_train = list(X_folds) # X_folds is a list with 3 elements
		X_test = X_train.pop(k) # the test fold is the k-th element popped from the train list
		X_train = np.concatenate(X_train) # concatenate the remaining folds, i.e. X_train minus X_test
		#print len(X_train)
		Y_train = list(Y_folds)
		Y_test = Y_train.pop(k)
		Y_train = np.concatenate(Y_train)

		scores.append(svc.fit(X_train, Y_train).score(X_test, Y_test))

	#print scores


	from sklearn import cross_validation
	k_fold = cross_validation.KFold(n = 6, n_folds = 3)
	for train_indices, test_indices in k_fold:
		print train_indices, test_indices

	k_fold = cross_validation.KFold(len(X_digits), n_folds = 3)
	scores = [svc.fit(X_digits[train], Y_digits[train]).score(X_digits[test], Y_digits[test]) for train , test in k_fold]

	#print scores

	scores = cross_validation.cross_val_score(svc, X_digits, Y_digits, cv = k_fold, n_jobs = 1)
	#print scores

	from sklearn.grid_search import GridSearchCV
	gammas = np.logspace(-6, -1, 10)
	clf = GridSearchCV(estimator = svc, param_grid = dict(gamma = gammas), n_jobs = 1)
	clf.fit(X_digits[:1000], Y_digits[:1000])
	print clf.best_score_
	print clf.best_estimator_.gamma

	from sklearn import linear_model, datasets
	lasso = linear_model.LassoCV()    # how does LassoCV differ from Lasso? (LassoCV selects alpha by cross-validation)
	diabetes = datasets.load_diabetes()
	X_diabetes = diabetes.data
	Y_diabetes = diabetes.target
	lasso.fit(X_diabetes, Y_diabetes)

	print lasso.alpha_
Example #18
def ellis_bpm(fname, start_bpm, hpss=True, hop_length=512, tightness=100.0, plot=False, sound=False):
    y, sr = librosa.load(fname, sr=None)
    log.debug(u'Estimating tempo: {}'.format(TERM.cyan(fname)))
    if hpss:
        log.debug(TERM.magenta("Getting percussive elements"))
        y_harmonic, y_percussive = librosa.effects.hpss(y)
        chunks = np.array_split(y_percussive, PLOT_SPLIT)
        log.debug(TERM.magenta("Estimating beats per minute"))
        bpm, beat_frames = librosa.beat.beat_track(y=y_percussive, sr=sr, start_bpm=start_bpm, hop_length=hop_length, tightness=tightness)
    else:
        log.debug(TERM.magenta("Estimating beats per minute"))
        bpm, beat_frames = librosa.beat.beat_track(y=y, sr=sr, start_bpm=start_bpm, hop_length=hop_length, tightness=tightness)
        chunks = np.array_split(y, PLOT_SPLIT)

    log.debug(u'Tempo: {:6.2f} bpm'.format(bpm))
    if plot:
        plt.figure(figsize=(16,10))

        curr_frame = 0
        for i in range(PLOT_SPLIT):
            plt.subplot(PLOT_SPLIT * 100 + 11 + i)
            plt.plot(curr_frame + np.arange(len(chunks[i])), chunks[i], 'g')
            for b in beat_frames:
                plt.axvline(x=b*hop_length, color='k')
            plt.xlim([curr_frame, len(chunks[i]) + curr_frame])
            curr_frame += len(chunks[i])
        plt.show(block=False)
    if sound:
        beat_times = librosa.frames_to_time(beat_frames, sr=sr, hop_length=hop_length)
        clicks = mir_eval.sonify.clicks(beat_times, sr, length=len(y))
        sd.play(y + clicks, sr)
        input('Press Return key to stop sound')
        sd.stop()
    return bpm
Example #19
def cross_validate(S, X, t, disp=True):
    """ Return the most Optimal value for the regularization hyper-parameter by 
        performing S-fold cross validation for all allowed values of λ."""    
    X_groups = np.array_split(X, S)
    Y_groups = np.array_split(t, S)    
    min_err_hp = (0, float("inf"))
    for p in range(-3,2):        
        l = 10 ** p
        if disp: print("Starting S-fold cross-validation for λ =", l)
        if disp: print("Training run:", end=' ')
        # i represents the held-out group
        error_rates = np.ndarray(S)
        for i in range(S):
            if disp: print(i+1, end=' ')
            X_others = [X_groups[x] for x in range(S) if x!=i]
            Y_others = [Y_groups[x] for x in range(S) if x!=i]
            X_training = np.concatenate(tuple(X_others), axis=0)
            Y_training = np.concatenate(tuple(Y_others), axis=0)
            w_star = train(X_training, Y_training, l, disp=0)

            # Prediction on the held-out group
            error_rates[i] = prediction_performance(X_groups[i], Y_groups[i], w_star, report=False)
        if disp: print("")
        error_rate = error_rates.mean()
        
        if error_rate < min_err_hp[1]:
            min_err_hp = (l, error_rate)
    return min_err_hp
Example #20
 def RSM(self,avgl,rossete = 4,loopdist='gaussian'):
     x = np.arange(1,self.N)
     pickpdist = pdist()
     if loopdist == 'gaussian':
         cdf = np.cumsum(pickpdist.gaussian(avgl)(x))
         
     while True:
         self.looplst = x[np.searchsorted(cdf,np.random.random(self.M))]
         if np.sum(self.looplst) < self.N - 1:
             break
     
     rest = self.N-1-np.sum(self.looplst)
     temp = np.array_split(np.arange(rest),3*rossete)
     anchor = []
     for i in range(rossete):
         temp3 = []
         temp3.append(np.random.choice(temp[i*3+1],1)[0])
         for j in np.arange([len(np.array_split(np.arange(self.M),rossete)[k]) for k in range(rossete)][i]-1):
             temp3.append(temp3[-1]+np.random.randint(1,5))
         anchor.append(temp3)
     
     anchor = np.array(anchor)
     anchor = anchor.flatten()
     temp1 = anchor + np.cumsum(self.looplst)
     temp2 = temp1 - self.looplst
     self.pair = np.array(zip(temp2,temp1))
     
     return self.looplst,self.pair
Example #21
    def __setitem__(self, attr, val):
        if self.read_only:
            raise ValueError('Cannot set item in read-only mode.')
        is_np = type(val).__module__ == np.__name__
        if isinstance(self.data, dict):
            if isinstance(attr, bytes):
                attr = attr.decode('utf-8')
            if is_np:
                self.data[attr] = pickle.dumps(val)
                # We have to remember to unpickle in __getitem__
                self.data['_{}_pickled'.format(attr)] = True
            else:
                self.data[attr] = val
            return
        if isinstance(self.data, h5py.Group) and attr in self.data:
            raise KeyError('Cannot set attribute. '
                           'Group with name "{}" exists.'.format(attr))
        if is_np:
            dataset = self.data.create_dataset(attr, val.shape, dtype=val.dtype)
            if not val.shape:
                # scalar
                dataset[()] = val
            else:
                dataset[:] = val
        elif isinstance(val, (list, tuple)):
            # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT`
            # because in that case even chunking the array would not make the saving
            # possible.
            bad_attributes = [x for x in val if len(x) > HDF5_OBJECT_HEADER_LIMIT]

            # Expecting this to never be true.
            if bad_attributes:
                raise RuntimeError('The following attributes cannot be saved to '
                                   'HDF5 file because they are larger than '
                                   '%d bytes: %s' % (HDF5_OBJECT_HEADER_LIMIT,
                                                     ', '.join(bad_attributes)))

            if (val and sys.version_info[0] == 3 and isinstance(
                    val[0], six.string_types)):
                # convert to bytes
                val = [x.encode('utf-8') for x in val]

            data_npy = np.asarray(val)

            num_chunks = 1
            chunked_data = np.array_split(data_npy, num_chunks)

            # This will never loop forever thanks to the test above.
            is_too_big = lambda x: x.nbytes > HDF5_OBJECT_HEADER_LIMIT
            while any(map(is_too_big, chunked_data)):
                num_chunks += 1
                chunked_data = np.array_split(data_npy, num_chunks)

            if num_chunks > 1:
                for chunk_id, chunk_data in enumerate(chunked_data):
                    self.data.attrs['%s%d' % (attr, chunk_id)] = chunk_data
            else:
                self.data.attrs[attr] = val
        else:
            self.data.attrs[attr] = val
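
The list/tuple branch keeps growing num_chunks until every chunk of the attribute array fits under the HDF5 object header limit. A self-contained sketch of that loop, using an assumed 64 KB limit in place of the real constant:

import numpy as np

HDF5_OBJECT_HEADER_LIMIT = 64 * 1024          # assumed limit, not the real constant
data_npy = np.asarray(['layer_%05d' % i for i in range(20000)])

num_chunks = 1
chunked_data = np.array_split(data_npy, num_chunks)

is_too_big = lambda x: x.nbytes > HDF5_OBJECT_HEADER_LIMIT
while any(map(is_too_big, chunked_data)):     # grow the chunk count until all chunks fit
    num_chunks += 1
    chunked_data = np.array_split(data_npy, num_chunks)

print(num_chunks, max(chunk.nbytes for chunk in chunked_data))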
Example #22
def schedule_generator_maintenance_loop(load, pmaxs, annual_maintenance_rates, dispatch_periods, scheduling_order):
    # if nothing else, better to schedule the large generators first
    scheduling_order = np.argsort(-pmaxs) if scheduling_order is None else scheduling_order

    # annual maintenance rates must be between zero and one
    annual_maintenance_rates = np.clip(annual_maintenance_rates, 0, 1)

    # gives the index for the change between dispatch_periods
    group_cuts = list(np.where(np.diff(dispatch_periods) != 0)[0] + 1) if dispatch_periods is not None else None
    group_lengths = np.array([group_cuts[0]] + list(np.diff(group_cuts)) + [len(load) - group_cuts[-1]])
    num_groups = len(group_cuts) + 1

    # necessary to scale load in some cases for the optimization to work. Basically, load shouldn't be > gen
    load_scaled = scale_load_to_system(load, pmaxs)
    load_scaled = np.concatenate([[np.max(ls)]*gl for gl, ls in zip(group_lengths, np.array_split(load_scaled, np.array(group_cuts)))])

    pmaxs_clipped = copy.deepcopy(pmaxs)
    pmaxs_clipped = np.clip(pmaxs_clipped, 1e-1, None)
    maintenance_energy = annual_maintenance_rates*pmaxs_clipped*len(load)
    scheduled_maintenance = np.zeros((num_groups, len(pmaxs)))

    # loop through and schedule maintenance for each generator one at a time. Update the net load after each one.
    for i in scheduling_order:
        energy_allocation = dispatch_budget.dispatch_to_energy_budget(load_scaled, -maintenance_energy[i], pmins=0, pmaxs=pmaxs_clipped[i])
        scheduled_maintenance[:, i] = np.clip(np.array([np.mean(ls) for ls in np.array_split(energy_allocation, np.array(group_cuts))])/pmaxs_clipped[i], 0, 1)
        load_scaled += np.concatenate([[sm * pmaxs[i]]*gl for gl, sm in zip(group_lengths, scheduled_maintenance[:, i])])

    if not all(np.isclose(annual_maintenance_rates, (scheduled_maintenance.T * group_lengths).sum(axis=1)/len(load))):
        logging.warning("scheduled maintance rates don't all match the annual maintenance rates")
    return scheduled_maintenance
Example #23
def split_data(ras, decs):
	"""
	Split the RAs and DECs into smaller chunks, which is better for
	cache coherence.
	"""
	size = ceil(len(ras)/256.0)
	return zip(array_split(ras, size), array_split(decs, size))
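
A standalone usage sketch of the same idea, assuming Python 3 so that math.ceil returns an int, with the ~256-element chunk size spelled out:

from math import ceil
import numpy as np

ras = np.random.uniform(0, 360, size=1000)
decs = np.random.uniform(-90, 90, size=1000)

size = ceil(len(ras) / 256.0)                 # number of roughly 256-element chunks
for ra_chunk, dec_chunk in zip(np.array_split(ras, size),
                               np.array_split(decs, size)):
    assert len(ra_chunk) == len(dec_chunk) <= 256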
Example #24
def gp2(data, block_size = 100, nugget = 0.005):

	c = data[0]
	s = data[1]
	s_2 = np.array_split(s, len(s)/block_size + 1)
	c_2 = np.array_split(c, len(s)/block_size + 1)
	
	sapflux_pred = []
	
	nug = nugget;
	for a in range(0,len(s_2)):
	
		t0 = time.time()
		X = np.atleast_2d(c_2[a]).T
		y = np.atleast_2d(s_2[a]).T
	
		gproc = gaussian_process.GaussianProcess(theta0=0.01, thetaL=1e-4, thetaU=1e-1,nugget=nug)
	
		
		gproc.fit(X, y)
		y_pred, sigma2_pred = gproc.predict(X, eval_MSE=True)
		sapflux_pred.extend(y_pred.ravel())
		t1 = time.time()
		print t1-t0
	
	return np.array([c, s, np.array(sapflux_pred)])
Example #25
def ensemble_maker_inner(train_mat,labels,model_gen_function, info_dict,num=10):
    ## contains core functions to make ensemble models
    ## from training data and labels
    ## model_gen_function is a function that takes NO arguments and returns a keras model
    ## info_dict is a dictionary of training info 
    train_mat, labels = shuffle(train_mat, labels)
    train_mat = np.array_split(train_mat, num, axis=0)
    labels = np.array_split(labels, num, axis=0)
    earlystop = EarlyStopping(monitor=info_dict['monitor'], min_delta=info_dict['min_delta'],
                              patience=info_dict['patience'],
                              verbose=0,
                              mode='auto')
    callbacks_list = [earlystop]
    model_list = []
    for ii in range(num):
        train_feature = array_stack(train_mat, ii)
        train_labels = array_stack(labels, ii)
        loaded_model = model_gen_function() # note the call to gen new model
        current_model = reset_weights(loaded_model)
        history = current_model.fit(train_feature, train_labels,
                                    epochs=info_dict['epochs'], verbose=0,
                                    batch_size=info_dict['batch_size'],
                                    callbacks=callbacks_list)
        model_list.append(current_model)
    return(model_list)
Example #26
def get_gradient(theta):
    global fractional_counts, event_index, feature_index, event_grad, rc, N
    assert len(theta) == len(feature_index)
    event_grad = {}
    cpu_count = multiprocessing.cpu_count()
    pool = Pool(processes=cpu_count)  # uses all available CPUs
    batches_fractional_counts = np.array_split(range(len(event_index)), cpu_count)
    events_to_split = events_to_features.keys()
    batches_events_to_features = np.array_split(events_to_split, cpu_count)
    # for batch_of_fc in batches_fractional_counts:
    for batch_of_fc in batches_events_to_features:
        pool.apply_async(batch_gradient, args=(theta, batch_of_fc), callback=batch_accumilate_gradient)
    pool.close()
    pool.join()
    # grad = np.zeros_like(theta)
    grad = -2 * rc * theta  # l2 regularization with lambda 0.5
    for e in event_grad:
        feats = events_to_features.get(e, [])
        for f in feats:
            grad[feature_index[f]] += event_grad[e]

    # for s in seen_index:
    # grad[s] += -theta[s]  # l2 regularization with lambda 0.5
    assert len(grad) == len(feature_index)
    return -grad
Example #27
    def generateTrainAndTest(self):
        """
        Generate train and test data and then yield
        :return:
        """
        partitions = np.array_split(self.dataset, self.numOfFolds)
        labels_partitions = np.array_split(self.labels, self.numOfFolds)
        for fold in range(self.numOfFolds):
            self.test = partitions[fold]
            self.labels_test = labels_partitions[fold]

            fold_left = partitions[:fold]
            fold_right = partitions[fold + 1:]

            labels_fold_left = labels_partitions[:fold]
            labels_fold_right = labels_partitions[fold + 1:]

            if fold_left.__len__() == 0:
                self.train = np.concatenate(fold_right)
                self.labels_train = np.concatenate(labels_fold_right)
            elif fold_right.__len__() == 0:
                self.train = np.concatenate(fold_left)
                self.labels_train = np.concatenate(labels_fold_left)
            else:
                self.train = np.concatenate((np.concatenate(fold_left), np.concatenate(fold_right)))
                self.labels_train = np.concatenate(
                        (np.concatenate(labels_fold_left), np.concatenate(labels_fold_right)))
            yield
Example #28
def parallelMorton(iMortonRanges, xMortonRanges, childMethod, numProcessesQuery):
    if iMortonRanges != None:
        numMRanges = max((len(iMortonRanges), len(xMortonRanges)))
        if numMRanges > numProcessesQuery:
            numChunks = numProcessesQuery
        else:
            numChunks = numMRanges
        ichunks = numpy.array_split(iMortonRanges, numChunks)
        xchunks = numpy.array_split(xMortonRanges, numChunks)
    else:
        numMRanges = len(xMortonRanges)
        if numMRanges > numProcessesQuery:
            numChunks = numProcessesQuery
        else:
            numChunks = numMRanges
        ichunks = numpy.array_split([], numChunks)
        xchunks = numpy.array_split(xMortonRanges, numChunks)
    children = []
    for i in range(numChunks):
        children.append(multiprocessing.Process(target=childMethod, 
            args=(ichunks[i],xchunks[i])))
        children[-1].start()  
    # wait for all children to finish their execution
    for i in range(numChunks):
        children[i].join()
Example #29
def make_batches(x, y, batch_size=128, shuffle=True, nest=True):
    for i in range(len(x)):
        x[i] = atleast_4d(x[i])
    y = atleast_4d(y)
    num_batches = (y.shape[0] // batch_size)
    if y.shape[0] % batch_size != 0:
        num_batches += 1
    if shuffle:
        shuffled_arrays = sk.utils.shuffle(*x, y)
        x = shuffled_arrays[:len(x)]
        y = shuffled_arrays[-1]
    x_batches_list = []
    for i in range(len(x)):
        x_batches_list.append(np.array_split(x[i], num_batches))
    if nest:
        x_batches = []
        for i in range(num_batches):
            x_batch = []
            for x_input in x_batches_list:
                x_batch.append(x_input[i])
            x_batches.append(x_batch)
    else:
        x_batches = x_batches_list
    y_batches = np.array_split(y, num_batches)
    return x_batches, y_batches, num_batches
Example #30
def make_batches(X, y, batch_size=128, shuffle=True, nest=True):
    for i in range(len(X)):
        X[i] = atleast_4d(X[i])
    y = atleast_4d(y)
    num_batches = (y.shape[0] // batch_size)
    if y.shape[0] % batch_size != 0:
        num_batches += 1
    if shuffle:
        shuffled_arrays = sk.utils.shuffle(*X, y)
        X = shuffled_arrays[:len(X)]
        y = shuffled_arrays[-1]
    X_batches_list = []
    for i in range(len(X)):
        X_batches_list.append(np.array_split(X[i], num_batches))
    if nest:
        X_batches = []
        for i in range(num_batches):
            X_batch = []
            for X_input in X_batches_list:
                X_batch.append(X_input[i])
            X_batches.append(X_batch)
    else:
        X_batches = X_batches_list
    y_batches = np.array_split(y, num_batches)
    return X_batches, y_batches, num_batches
Example #31
File: als.py Project: rheehot/studio
def apply_by_multiprocessing_list_to_list(df, func, **kwargs):
    workers = kwargs.pop('workers')
    pool = Pool(processes=workers)
    result = pool.map(apply_list, [(d, func, kwargs) for d in np.array_split(df, workers)])
    pool.close()
    return result
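
A self-contained sketch (not the project's code) of the same split-and-map pattern, with a stand-in worker function in place of apply_list:

from multiprocessing import Pool
import numpy as np
import pandas as pd

def row_count(chunk):
    # stand-in for the real per-chunk work done by apply_list
    return len(chunk)

if __name__ == '__main__':
    df = pd.DataFrame({'x': range(100)})
    workers = 4
    with Pool(processes=workers) as pool:
        # one roughly equal DataFrame piece per worker
        result = pool.map(row_count, np.array_split(df, workers))
    print(result)   # [25, 25, 25, 25]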
Example #32
init = tf.global_variables_initializer()
local_init = tf.local_variables_initializer()
pred_data = pd.DataFrame()

# Train model.

with tf.Session() as session:
    epochs = 100
    batch_size = 25

    session.run(init)
    session.run(local_init)

    num_batches = int(l_t_matrix.shape[0] / batch_size)
    l_t_matrix = np.array_split(l_t_matrix, num_batches)

    for i in range(epochs):
        avg_cost = 0
        for batch in l_t_matrix:
            _, l = session.run([optimizer, loss], feed_dict={X: batch})
            avg_cost += l
        avg_cost /= num_batches

        print("epoch: {} Loss: {}".format(i + 1, avg_cost))

    l_t_matrix = np.concatenate(l_t_matrix, axis=0)

    preds = session.run(decoder_op, feed_dict={X: l_t_matrix})

    preds = pd.DataFrame(preds)
Example #33
def classify(data, house, f):
    data, old_labels = relabel(data)
    res_obj = {"y_pred": [], "y_true": [], "acc": []}
    #data.iloc[::6]

    # WWW
    # dividing the data into training and testing
    #trainDf, testDf, trainLens, testLens, testFrac = split.trainTest(
    #    data, 5400, 5400*2, testSize=0.3)

    # e.g. structure of the array
    # X = [np.array([  [f1],[f2],[f3 ] ... [ N days], dtype=uint8 )]
    # Y = [np.array([   a, b , c])]
    # splitting so that we get a fraction of the day for training the labels
    #X_train = np.array(np.array_split(trainDf.values[:, :trainDf.shape[1] - 2], 10))
    #y_train = np.array(np.array_split(trainDf.values[:, trainDf.shape[1] - 1], 10))

    # test dataset - dividing into subsequences
    #X_test = np.array(np.array_split(testDf.values[:, :testDf.shape[1] - 2], 30))
    #y_test = np.array(np.array_split(testDf.values[:, testDf.shape[1] - 1], 30))
    # WWW

    X_train = np.array(data.values[:, :data.shape[1] - 2])
    y_train = np.array(data.values[:, data.shape[1] - 1])

    #print X_train.shape
    #test_SSVM(X_train, X_test, y_train, y_test)
    #exit()
    # 5 fold validation;
    #label = np.unique(data['activity'])
    kf = StratifiedKFold(data['activity'], n_folds=5)

    clfs = []
    accuracies = []
    # cross validation
    for i, (train_index, test_index) in enumerate(kf):
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train1, X_test1 = X_train[train_index], X_train[test_index]
        y_train1, y_test1 = y_train[train_index], y_train[test_index]

        #print np.unique(np.concatenate(y_train1).ravel())
        #print np.unique(np.concatenate(y_test1).ravel())
        X_train1 = np.array_split(X_train1, 100)
        X_test1 = np.array_split(X_test1, 10)
        y_train1 = np.array_split(y_train1, 100)
        y_test1 = np.array_split(y_test1, 10)

        #print X_train1.shape,  y_train1.shape
        #print X_train1[0].shape,  y_train1[0].shape

        fname = 'ssvm_models/ssvm_' + house + f + str(i) + '.pkl'
        if os.path.isfile(fname):
            pkl_file = open(fname, 'rb')
            clf = pickle.load(pkl_file)
            print i, ". Classifier Loaded:", house, f, clf
        else:
            clf = train_SSVM(X_train1, y_train1)
            output = open(fname, 'wb')
            pickle.dump(clf, output)

        accuracy, y_pred, y_true = test_SSVM(clf, X_test1, y_test1)

        y_pred = map(lambda x: old_labels[int(x)], y_pred)
        y_true = map(lambda x: old_labels[int(x)], y_true)

        #save the model
        res_obj['y_pred'].append(y_pred)
        res_obj['y_true'].append(y_true)
        res_obj['acc'].append(accuracy)

        obj = {"y_pred": y_pred, "y_true": y_true, "acc": accuracy}
        #write the results:
        with gzip.open('ssvm_models/ssvm_' + house + f + str(i) + '.json.gz',
                       'w') as out:
            json.dump(obj, out)

        #clfs.append(clf)
        accuracies.append(accuracy)

    print 'House:', house, 'Feature:', f,
    print accuracies
    with gzip.open('ssvm_models/ssvm_' + house + f + '_all.json.gz',
                   'w') as out:
        json.dump(res_obj, out)

    #ssvm = clfs[np.argmax(accuracies)]
    #print "Learning complete..."
    #accuracy = ssvm.score(X_test, y_test)
    #print("Test score with chain CRF: %f" % accuracy )

    print "Learning SVM complete."
Example #34
def process_manager(args):

    (path_to_neatmo_ppt_hdf5, path_to_prim_netw_ppt_hdf5,
     path_to_filtered_pws) = args

    #=========================================================
    HDF5_pws_ppt = HDF5(infile=path_to_neatmo_ppt_hdf5)
    all_pws_ids = HDF5_pws_ppt.get_all_names()
    pws_coords = HDF5_pws_ppt.get_coordinates(all_pws_ids)
    pws_in_coords_df = pd.DataFrame(index=all_pws_ids,
                                    data=pws_coords['easting'],
                                    columns=['X'])
    y_pws_coords = pws_coords['northing']
    pws_in_coords_df.loc[:, 'Y'] = y_pws_coords
    pws_in_coords_df.dropna(how='all', inplace=True)
    assert pws_in_coords_df.isna().sum().sum() == 0
    #=========================================================
    HDF5_prim_netw_ppt = HDF5(infile=path_to_prim_netw_ppt_hdf5)
    all_prim_netw_stns_ids = HDF5_prim_netw_ppt.get_all_names()

    prim_netw_coords = HDF5_prim_netw_ppt.get_coordinates(
        all_prim_netw_stns_ids)
    prim_netw_in_coords_df = pd.DataFrame(index=all_prim_netw_stns_ids,
                                          data=prim_netw_coords['easting'],
                                          columns=['X'])
    y_prim_netw_coords = prim_netw_coords['northing']
    prim_netw_in_coords_df.loc[:, 'Y'] = y_prim_netw_coords
    prim_netw_in_coords_df.dropna(how='all', inplace=True)
    assert prim_netw_in_coords_df.isna().sum().sum() == 0
    #=========================================================
    # select on 'good' pws
    ids_pws_to_use = pd.read_csv(path_to_filtered_pws,
                                 index_col=0).index.to_list()
    #=========================================================
    date_range = pd.date_range(start=start_date, end=end_date, freq='H')
    date_range_summer = pd.DatetimeIndex([
        date_ for date_ in date_range
        if date_.month not in not_convective_season
    ])

    print('Using Workers: ', n_workers)
    # divide the timestamps among the workers
    all_timestamps_worker = np.array_split(date_range_summer, n_workers)
    args_worker = []

    for time_list in all_timestamps_worker:
        empty_data = np.zeros(shape=(len(time_list), len(all_pws_ids)))
        empty_data[empty_data == 0] = np.nan
        df_save_results = pd.DataFrame(index=time_list,
                                       columns=all_pws_ids,
                                       data=empty_data)
        # args_workers = list(repeat(args, n_worker))

        args_worker.append((path_to_prim_netw_ppt_hdf5, prim_netw_in_coords_df,
                            path_to_neatmo_ppt_hdf5, pws_in_coords_df,
                            ids_pws_to_use, time_list, df_save_results))

    my_pool = mp.Pool(processes=n_workers)
    # TODO: Check number of accounts

    results = my_pool.map(on_evt_filter_pws, args_worker)

    # my_pool.terminate()

    my_pool.close()
    my_pool.join()

    results_df = pd.concat(results)

    results_df.to_csv(os.path.join(out_save_dir,
                                   'pws_flagged_%s.csv' % (_year)),
                      sep=';')

    return
Example #35
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'input',
        type=str,
        default=None,
        help=
        'a directory with remapped/precomputed-*, config.pkl, and graph.pkl')
    parser.add_argument('--output',
                        type=str,
                        default=None,
                        help='output_directory')
    parser.add_argument('--resolution', type=str, default='6,6,40')
    parser.add_argument('--chunk_size', type=str, default='256,256,64')
    parser.add_argument(
        '--batch_scale',
        type=int,
        default=1,
        help=
        'Controls how much data is loaded from h5 each time, by multiplying chunk_size'
    )
    parser.add_argument('--global_offset', type=str, default='0,0,0')
    parser.add_argument('--flip_h5', type=bool, default=False)
    parser.add_argument('--verbose', type=bool, default=True)
    args = parser.parse_args()
    if args.verbose:
        logging.basicConfig(level='DEBUG')
    else:
        logging.basicConfig(level='ERROR')
    resolution = [int(i) for i in args.resolution.split(',')]
    chunk_size = [int(i) for i in args.chunk_size.split(',')]
    global_offset = [int(i) for i in args.global_offset.split(',')]

    if args.output is None:
        output = args.input
    config_path = os.path.join(args.input, 'config.pkl')

    if mpi_rank == 0:
        assert os.path.exists(config_path), 'Run reconciliate_remap first'
        with open(config_path, 'rb') as fp:
            seg_map = pickle.load(fp)
        os.makedirs(output, exist_ok=True)
    else:
        seg_map = None
    seg_map = mpi_comm.bcast(seg_map, 0)

    merge_output = os.path.join(output, 'agglomerated')
    h5_path = os.path.join(output, 'intermediate.h5')

    if mpi_rank == 0:
        union_bbox, cv_merge_path = get_union_bbox_and_merge_path(
            seg_map, merge_output, global_offset)

        # preset precomputed
        union_offset = np.array(union_bbox.minpt)
        union_size = np.array(union_bbox.maxpt) - np.array(union_bbox.minpt)
        cv_merge = prepare_precomputed(cv_merge_path,
                                       offset=union_offset,
                                       size=union_size,
                                       resolution=resolution,
                                       chunk_size=chunk_size)

        # sub divide aligned bboxes
        sub_bbox_size = [i * args.batch_scale for i in chunk_size]

        bbs = get_chunk_bboxes(union_bbox, sub_bbox_size)
        sub_bbs = np.array_split(bbs, mpi_size)
        logging.warn('write shapes %s %s', union_bbox, sub_bbox_size)

    else:
        # union_bbox = None
        union_offset = None
        cv_merge_path = None
        sub_bbs = None

    union_offset = mpi_comm.bcast(union_offset, 0)
    cv_merge_path = mpi_comm.bcast(cv_merge_path, 0)
    sub_bbs = mpi_comm.scatter(sub_bbs, 0)

    h5_to_cloudvolume(h5_path, cv_merge_path, union_offset, sub_bbs,
                      resolution, chunk_size, args.flip_h5)
    sys.exit()
Example #36
def main(_):
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%y-%m-%d %H:%M:%S')

    logging.info("job_name:%s, task_index:%d" % (job_name, task_index))

    ps_hosts = cluster_spec['ps']
    worker_hosts = cluster_spec['worker']
    cluster = tf.train.ClusterSpec({"ps": ps_hosts, "worker": worker_hosts})

    # allows this node know about all other nodes
    if job_name == 'ps':  # checks if parameter server
        server = tf.train.Server(cluster, job_name="ps", task_index=task_index)
        server.join()
    else:  # it must be a worker server
        is_chief = (task_index == 0)  # checks if this is the chief node
        server = tf.train.Server(cluster,
                                 job_name="worker",
                                 task_index=task_index)

        logging.info("Loading data from worker index = %d" % task_index)

        if "TRAINING_DATA_PATH" in os.environ:
            logging.info("This is a normal worker..")
            training_data_path = os.environ["TRAINING_DATA_PATH"]
        else:
            logging.info("This is a backup worker")
            # watching certain file in hdfs which contains its training data

        # Read model structure info from ModelConfig
        with open('./ModelConfig.json') as f:
            model_conf = json.load(f)
            logging.info("model" + str(model_conf))
            EPOCH = int(model_conf['train']['numTrainEpochs'])
            global VALID_TRAINING_DATA_RATIO
            VALID_TRAINING_DATA_RATIO = model_conf['train']['validSetRate']
            is_continue_train = model_conf['train']['isContinuous']
            global BATCH_SIZE
            if "MiniBatchs" in model_conf['train']['params']:
                BATCH_SIZE = model_conf['train']['params']['MiniBatchs']

            logging.info("Batch size: " + str(BATCH_SIZE) +
                         ", VALID_TRAINING_DATA_RATIO: " +
                         str(VALID_TRAINING_DATA_RATIO))

        # import data
        context = load_data(training_data_path)

        # split data into batch
        total_batch = int(len(context["train_data"]) / BATCH_SIZE)
        x_batch = np.array_split(context["train_data"], total_batch)
        y_batch = np.array_split(context["train_target"], total_batch)
        sample_w_batch = np.array_split(context["train_data_sample_weight"],
                                        total_batch)

        logging.info("Testing set size: %d" % len(context['valid_data']))
        logging.info("Training set size: %d" % len(context['train_data']))

        valid_x = np.asarray(context["valid_data"])
        valid_y = np.asarray(context["valid_target"])
        valid_sample_w = np.asarray(context["valid_data_sample_weight"])

        # Graph
        worker_device = "/job:%s/task:%d" % (job_name, task_index)
        with tf.device(
                tf.train.replica_device_setter(  #ps_tasks=n_pss,
                    cluster=cluster,
                    worker_device=worker_device)):
            input_placeholder = tf.placeholder(dtype=tf.float32,
                                               shape=(None, FEATURE_COUNT),
                                               name="shifu_input_0")
            label_placeholder = tf.placeholder(dtype=tf.int32, shape=(None, 1))
            sample_weight_placeholder = tf.placeholder(dtype=tf.float32,
                                                       shape=(None, 1))

            opt, train_step, loss, global_step, y = model(
                input_placeholder, label_placeholder,
                sample_weight_placeholder, model_conf)

            # init ops
            init_tokens_op = opt.get_init_tokens_op()
            # initialize local step
            local_init = opt.local_step_init_op
            if is_chief:
                # initializes token queue
                local_init = opt.chief_init_op

            # checks if global vars are init
            ready_for_local_init = opt.ready_for_local_init_op

            # Initializing the variables
            init_op = tf.initialize_all_variables()
            logging.info("---Variables initialized---")

        # **************************************************************************************
        # Session
        sync_replicas_hook = opt.make_session_run_hook(is_chief)
        stop_hook = tf.train.StopAtStepHook(num_steps=EPOCH)
        chief_hooks = [sync_replicas_hook, stop_hook]
        if is_continue_train:
            scaff = None
        else:
            scaff = tf.train.Scaffold(
                init_op=init_op,
                local_init_op=local_init,
                ready_for_local_init_op=ready_for_local_init)
        # Configure
        if "IS_BACKUP" in os.environ:
            config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True,
                                    device_filters=[
                                        '/job:ps', '/job:worker/task:0',
                                        '/job:worker/task:%d' % task_index
                                    ])
        else:
            config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)

        # Create a "supervisor", which oversees the training process.
        sess = tf.train.MonitoredTrainingSession(master=server.target,
                                                 is_chief=is_chief,
                                                 config=config,
                                                 scaffold=scaff,
                                                 hooks=chief_hooks,
                                                 stop_grace_period_secs=10,
                                                 checkpoint_dir=tmp_model_path)

        if is_chief and not is_continue_train:
            sess.run(init_tokens_op)
            #start_tensorboard(tmp_model_path)
            logging.info("chief start waiting 40 sec")
            time.sleep(
                40
            )  # grace period to wait on other workers before starting training
            logging.info("chief finish waiting 40 sec")

        # Train until hook stops session
        logging.info('Starting training on worker %d' % task_index)

        run_metadata = tf.RunMetadata()
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        while not sess.should_stop():
            try:
                start = time.time()
                for i in range(total_batch):
                    train_feed = {
                        input_placeholder: x_batch[i],
                        label_placeholder: y_batch[i],
                        sample_weight_placeholder: sample_w_batch[i]
                    }

                    _, l, gs = sess.run([train_step, loss, global_step],
                                        feed_dict=train_feed,
                                        options=run_options,
                                        run_metadata=run_metadata)
                training_time = time.time() - start

                valid_start = time.time()
                # compute validation loss TODO, check if batch compute
                valid_loss, gs = sess.run(
                    [loss, global_step],
                    feed_dict={
                        input_placeholder: valid_x,
                        label_placeholder: valid_y,
                        sample_weight_placeholder: valid_sample_w
                    })
                valid_time = time.time() - valid_start
                logging.info('Step: ' + str(gs) + ' worker: ' +
                             str(task_index) + " training loss:" + str(l) +
                             " training time:" + str(training_time) +
                             " valid loss:" + str(valid_loss) +
                             " valid time:" + str(valid_time))

                # Send intermediate result to master
                message = "worker_index:{},time:{},current_epoch:{},training_loss:{},valid_loss:{},valid_time:{}\n".format(
                    str(task_index), str(training_time), str(gs), str(l),
                    str(valid_loss), str(valid_time))
                if sys.version_info < (3, 0):
                    socket_client.send(bytes(message))
                else:
                    socket_client.send(bytes(message), 'utf8')

            except RuntimeError as re:
                if 'Run called even after should_stop requested.' == re.args[
                        0]:
                    logging.info('About to execute sync_clean_up_op!')
                else:
                    raise

        logging.info('Done ' + str(task_index))

        # We just need to make sure the chief worker exits with a success status
        if is_chief:
            tf.reset_default_graph()

            # add placeholders for input images (and optional labels)
            x = tf.placeholder(dtype=tf.float32,
                               shape=(None, FEATURE_COUNT),
                               name="shifu_input_0")
            with tf.get_default_graph().as_default():
                if BUILD_MODEL_BY_CONF_ENABLE and model_conf is not None:
                    output_digits, output_nodes = generate_from_modelconf(
                        x, model_conf)
                else:
                    output_digits = nn_layer(x,
                                             FEATURE_COUNT,
                                             HIDDEN_NODES_COUNT,
                                             act_op_name="hidden_layer1")
                    output_nodes = HIDDEN_NODES_COUNT

                logging.info("output_nodes : " + str(output_nodes))
                prediction = nn_layer(output_digits,
                                      output_nodes,
                                      1,
                                      act=tf.nn.sigmoid,
                                      act_op_name="shifu_output_0")

            # restore from last checkpoint
            saver = tf.train.Saver()
            with tf.Session() as sess:
                ckpt = tf.train.get_checkpoint_state(tmp_model_path)
                logging.info("ckpt: {}".format(ckpt))
                assert ckpt, "Invalid model checkpoint path: {}".format(
                    tmp_model_path)
                saver.restore(sess, ckpt.model_checkpoint_path)

                logging.info(
                    "Exporting saved_model to: {}".format(final_model_path))

                # exported signatures defined in code
                simple_save(session=sess,
                            export_dir=final_model_path,
                            inputs={"shifu_input_0": x},
                            outputs={"shifu_output_0": prediction})
                logging.info("Exported saved_model")

            tl = timeline.Timeline(run_metadata.step_stats)
            ctf = tl.generate_chrome_trace_format()
            logging.info("ctf:" + str(ctf))

            with tf.gfile.GFile(final_model_path + "/timeline.json", mode="w+") as f:
                f.write(ctf)
            time.sleep(40)  # grace period to wait before closing session

        #sess.close()
        logging.info('Session from worker %d closed cleanly' % task_index)
        sys.exit()
Example #37
0
from glob import glob
import h5py
import numpy as np
import pandas as pd
from joblib import delayed, Parallel

import os
import sys

from icae.tools.config_loader import config
from icae.tools import performance

# -

in_files = config.root + config.data.raw + "*.hdf"
errors = 0
files = glob(in_files)
batches = np.array_split(files, 24)  # adjust to available RAM
# + {}
import tables


def try_read(file):
    try:
        return pd.read_hdf(file)
    except OSError:
        return None


def process_batch(batch_files, frame_counter):
    print("Reading…")
    read_errors = 0
    unique_frames_seen = 0
Example #38
0
def prepareDataC(function, splitting=False, percentage=100):
    """
    Reads train and test data, prepares data, converts sentence representations
    """
    print(datetime.now())
    #define paths to train and test pickles
    input_train_covid = '../../Covid_data_11nov/traindata_covidbatch.pkl'
    input_test_covid = '../../Covid_data_11nov/testdata_covidbatch.pkl'

    print("Reading pickle files...")
    #read pickle files
    with open(input_train_covid, "rb") as pkl_file:
        traindata_c = pickle.load(pkl_file)

    with open(input_test_covid, "rb") as pkl_file:
        testdata_c = pickle.load(pkl_file)

    print("Creating and filtering dataframes...")
    #prepare training dataframes
    df_tr_c = function(traindata_c)[0]
    #take out sentences with labels that we should ignore (background, target, view_patient, view_thirdparty, info_thirdparty)
    rows_to_delete_tr_c, filtered_df_tr_c = filterDataframe(df_tr_c)

    #prepare test dataframes
    df_te_c = function(testdata_c)[0]
    rows_to_delete_te_c, filtered_df_te_c = filterDataframe(df_te_c)
    #extract test labels
    filtered_labels_te_c = filtered_df_te_c['domain'].to_list()
    filtered_encodings_te_c = filtered_df_te_c['encoding'].tolist()

    print("Retrieve note id's...")
    #get note id's for aggregation
    try:
        ids_c = []
        list_keys_c = filtered_df_te_c['key'].tolist()
        for key in list_keys_c:
            y = key.split('--')[3]
            ids_c.append(y)
    except KeyError:
        ids_c = []

    print("Downsampling training labels...")
    #Original code to randomly select indices of negative examples for downsampling
    #Get original support 0 class
    #seriesObj = filtered_df_tr.apply(lambda x: True if x['domain'] == 'None' else False , axis=1)
    # Count number of True in series
    #numOfRows = len(seriesObj[seriesObj == True].index)
    #print('Number of Rows in dataframe in which domain is None =', numOfRows)

    #per_50 = (numOfRows/2)
    #per_25 = (per_50/2)
    #per_125 = (per_25/2)
    #per_625 = (per_125/2)
    #per_3125 = (per_625/2)
    #N = int(per_50) #+ int(per_25) + int(per_125) + int(per_625) + int(per_3125)

    #down_df_tr, indices = downsample(filtered_df_tr, N)

    with open("down_indices_covid2.pkl", "rb") as f:
        indices = pickle.load(f)
    down_df_tr_c = filtered_df_tr_c.drop(indices)

    if not splitting:
        downsampled_filtered_labels_tr_c = down_df_tr_c['domain'].to_list()
        downsampled_filtered_encodings_tr_c = down_df_tr_c['encoding'].tolist()

    if splitting:
        #splitting final dataframe
        shuffled = down_df_tr_c.sample(frac=1)
        parts = np.array_split(shuffled, 4)

        df_25 = parts[0]
        #DataFrame.append is removed in newer pandas, so use pd.concat (pandas assumed imported as pd)
        df_50 = pd.concat([df_25, parts[1]])
        df_75 = pd.concat([df_50, parts[2]])

        if percentage == 25:
            #extract training labels
            downsampled_filtered_labels_tr_c = df_25['domain'].to_list()
            downsampled_filtered_encodings_tr_c = df_25['encoding'].tolist()
        if percentage == 50:
            downsampled_filtered_labels_tr_c = df_50['domain'].to_list()
            downsampled_filtered_encodings_tr_c = df_50['encoding'].tolist()
        if percentage == 75:
            downsampled_filtered_labels_tr_c = df_75['domain'].to_list()
            downsampled_filtered_encodings_tr_c = df_75['encoding'].tolist()

    print('Converting encodings...')

    sen_reps_tr_c = []
    for entry in downsampled_filtered_encodings_tr_c:
        entry2 = entry[-4:]
        #take mean of last 4 layers to create sentence representation
        entry3 = torch.mean(entry2, dim=0)
        #convert in numpy array
        array = entry3.numpy()
        sen_reps_tr_c.append(array)

    #prepare test features
    #filtered_encodings_te = [i for j, i in enumerate(encodings_te) if j not in set(rows_to_delete_te)]
    sen_reps_te_c = []
    for entry in filtered_encodings_te_c:
        entry2 = entry[-4:]
        #take mean of last 4 layers to create sentence representation
        entry3 = torch.mean(entry2, dim=0)
        #convert in numpy array
        array = entry3.numpy()
        sen_reps_te_c.append(array)

    return (downsampled_filtered_labels_tr_c, filtered_labels_te_c,
            sen_reps_tr_c, sen_reps_te_c, ids_c)

ind = pkl.load(open('pkls/gaus_trans.pkl', 'rb'))
inds = ind
data = parallel_rw_pkl(None, 'inter_sent%i' % 4, 'r')
mask = parallel_rw_pkl(None, 'inter_sentm%i' % 4, 'r')
sent = readfile([
    11,
], fhead)['B11']
sent = ScaleExtent(sent, (10980, 10980))
cm = parallel_rw_pkl(None, '0510diacm', 'r')
sent[cm] = np.nan
stm = parallel_rw_pkl(None, 'std_m', 'r')
print 'finished reading data'
data[mask] = np.nan
modis_sent = np.array(data)
Sent = sent
Stm = stm
patches = np.array(
    zip(np.mgrid[0:10, 0:10][0].ravel(), np.mgrid[0:10, 0:10][1].ravel()))
pros = np.array(np.array_split(patches, 16))

par = partial(applied)
pool = multiprocessing.Pool(processes=16)
data = pool.map(par, pros)
pool.close()
pool.join()
parallel_rw_pkl(data, 'psfb11_modis', 'w')

print 'lol finished psf b11!!!!!'
#     # break

# # tweets_filtered = tweets_filtered[:100]

# print('time taken to load keyword filtered sample:', str(time.time() - start_time), 'seconds')
# print(tweets_filtered.shape)


print('Load Random Tweets:')
# random contains 7.3G of data!!
start_time = time.time()

paths_to_random=list(np.array_split(
                        # glob(os.path.join(path_to_data,'random','*.parquet')),
                        glob(os.path.join(path_to_data,'random_10perct_sample','*.parquet')),
                        # glob(os.path.join(path_to_data,'random_1perct_sample','*.parquet')),
                        SLURM_ARRAY_TASK_COUNT)[SLURM_ARRAY_TASK_ID])
print('#files:', len(paths_to_random))

tweets_random=pd.DataFrame()
for file in paths_to_random:
    print(file)
    tweets_random=pd.concat([tweets_random,pd.read_parquet(file)[['tweet_id','text']]])
    print(tweets_random.shape)

#     break

# tweets_random = tweets_random[:100]

print('time taken to load random sample:', str(time.time() - start_time), 'seconds')
    * sec_loc: knowing that the dimensions of the sample are
      (Ux, Uy, Uz) = (4, 4, 2), and that we slice along the X and Y
      directions, sec_loc gives the locations of the slices in X and Y.
    * patch_1, patch_2 : since we need to make two 129x129 images (patches)
      from each 257x129 image, patch_1 and patch_2 hold the indices needed
      to extract them.
"""
width = 257
height = 129

sec_loc = np.linspace(0.0, 4.0, width)

numb_sections = len(sec_loc[np.arange(0, width, 5)])

patch_1 = np.array_split(np.arange(height * width).reshape(height, width),
                         2,
                         axis=1)[0].flatten()
patch_2 = patch_1 + height - 1
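
# Quick check of the patch indices defined above (illustration only, not part of
# the original pipeline): indexing a flattened 129x257 field with patch_1 and
# patch_2 yields two 129x129 patches that share the middle column.
_field = np.arange(height * width, dtype=float)    # stand-in for one phi slice
_left = _field[patch_1].reshape(height, height)    # columns 0..128
_right = _field[patch_2].reshape(height, height)   # columns 128..256
assert _left.shape == _right.shape == (129, 129)
assert np.array_equal(_left[:, -1], _right[:, 0])  # shared middle column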
""" Reading, slicing, and saving the dataset 
    * time_interval : this is a numpy array used to define the time interval 
      that we want to extract the infromation from the solution which is saved 
      in the dataset. Note that we start from time step number 5 because 
      solutions at the very begining of the dataset is affected by the initial 
      states introduced to the system, and the width of interfaces are pretty 
      large.
    * df : this is a pandas dataframe which includes (X, Y, Z, phi) features
    * saving_frames : this is numpy array in which we save phi values of all 
      the slices we created.
"""
""" Iterating over names of the files (*.h5) in the reading_directory
"""
Example #42
0
            if i % 500 == 0:
                figshow(pred[0, 0, :, :])
                plt.savefig("./png/model_" + modelstr + "." + str(i) + ".pred.png")
                figshow(np.log(((target_r + eps) / (normmat_r + eps)))[0, :, :], np=True)
                plt.savefig("./png/model_" + modelstr + "." + str(i) + ".label.png")
                torch.save(net.state_dict(), "./models/model_" + modelstr + ".checkpoint")
                torch.save(optimizer.state_dict(), "./models/model_" + modelstr + ".optimizer")

            if i % 2000 == 0:
                net.eval()
                corr = []
                mse = []
                mseloss = nn.MSELoss()
                t = 0
                for sequence, target in zip(
                    np.array_split(validation_sequences, 256),
                    np.array_split(validation_targets, 256),
                ):
                    pred = net(torch.Tensor(sequence).transpose(1, 2).cuda())
                    target_r = np.nanmean(
                        np.nanmean(np.reshape(target, (target.shape[0], 250, 4, 250, 4)), axis=4),
                        axis=2,
                    )
                    if t < 10:
                        figshow(pred[0, 0, :, :])
                        plt.savefig("./png/model_" + modelstr + ".test" + str(t) + ".pred.png")
                        figshow(np.log(((target_r + eps) / (normmat_r + eps)))[0, :, :], np=True)
                        plt.savefig("./png/model_" + modelstr + ".test" + str(t) + ".label.png")
                    t += 1
                    if np.mean(np.isnan(target_r)) < 0.7:
                        target_cuda = torch.Tensor(
Example #43
0
 def paaTransformData(self, ts, n_pieces):
     splitted = np.array_split(ts, n_pieces)
     return np.asarray([piece.mean(axis=0) for piece in splitted])
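
# Illustration (not from the original project): piecewise aggregate approximation
# of a length-9 series into 3 pieces averages each third of the series.
_paa = np.asarray([piece.mean(axis=0) for piece in np.array_split(np.arange(9.0), 3)])
assert np.allclose(_paa, [1.0, 4.0, 7.0])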
Example #44
0
    psuX.append(data.columns[-1])
    mList.append(psuX)
print(" M LIST ", mList)
# j=0

# In[6]:

a = 0
pre = 0
re = 0
fM = 0
for train, test in kf.split(vector):

    newData = data

    trainResult = np.array_split(train, split)

    testResult = np.array(test)

    test = vector[testResult]
    testList = []
    for x in range(len(mList)):
        testList.append(test[:, mList[x]])

    groundTruth = test[:, -1]
    decisionTreeLst = []
    #     // TREE FORMATION

    for tr in range(len(trainResult)):
        #         m =
        dataSet = vector[trainResult[tr]]
def calcIntensitiesCUDA(x, waveNumber, y1Vals, y2Vals, y1Amps):

    #number of sections to divide y array into: N/x

    t0 = time.time()

    y1Vals = np.array(y1Vals)
    y2Vals = np.array(y2Vals)

    #print(y1Vals,y2Vals)
    """ Break y1 and y2 into sections """
    """ Need to change this yourself """
    numSections = 20

    #y1sections = np.array([np.array_split(y1Vals, numSections)])
    #y2sections = np.array([np.array_split(y2Vals, numSections)])

    y1secs = np.array_split(y1Vals, numSections)
    y2secs = np.array_split(y2Vals, numSections)

    #y1amps =

    #Need to look through these later for more gratings
    y1AmpSecs = np.array_split(y1Amps, numSections)

    y1sections = np.array(y1secs)
    y2sections = np.array(y2secs)

    y1Amps = np.array(y1AmpSecs)

    ampInc = 0

    for y2section in y2sections:

        inc = 0

        ampColumns = [None] * numSections

        #ampColumns = [[0],[0],[0],[0]]

        # Make double for loop

        for y1section in y1sections:
            #print(y1section)
            #print(y2section)

            y1s, y2s = np.meshgrid(y1section, y2section)

            rArray = np.sqrt(x**2 + (y2s - y1s)**2)

            #rArray = np.transpose(rArray)
            waveNumArray = np.full(rArray.shape, waveNumber)

            #y1Amps = np.transpose(np.repeat(np.array([y1Amps]),rArray.shape[1],0))
            y1Amps = np.repeat(np.array([y1Amps[0]]), rArray.shape[0], 0)

            ampComponentArray = np.zeros_like(rArray, dtype=complex)

            ampComponentArray[:, :] = complexAmplitudeCUDA(
                y1Amps[:, :], waveNumArray[:, :], rArray[:, :])

            Amps = ampComponentArray.sum(axis=1)

            ampColumns[inc] = Amps

            inc += 1

        ampColumns = np.array(ampColumns)
        summedAmps = ampColumns.sum(axis=0)
        #ampColumns[0] + ampColumns[1] + ampColumns[2] + ampColumns[3]

        ampInc += 1

        summedAmps = (summedAmps * np.conjugate(summedAmps)).real

        #print(summedAmps)
        sendSumAndSendTo("tempData.txt", summedAmps, 'a')

    intensities = readFromFile("tempData.txt")

    return intensities
Example #46
0
def get_window_mean(ls):
    WINDOW_COUNT = 30
    return [np.mean(x) for x in np.array_split(ls, WINDOW_COUNT)]
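
# Illustration (not from the original code): 90 samples yield 30 window means,
# one per chunk of 3 consecutive values.
assert len(get_window_mean(np.arange(90))) == 30
assert np.allclose(get_window_mean(np.arange(90))[:3], [1.0, 4.0, 7.0])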
Example #47
0
def reorder_axis(arr, ax):
    pos_part, neg_part = np.array_split(arr, 2, axis=ax)
    rejoined_arr = np.concatenate((neg_part, pos_part), axis=ax)
    return rejoined_arr
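
# Illustration (not from the original code): reorder_axis swaps the two halves of
# an array along the given axis, like np.fft.fftshift for even-length axes.
assert np.array_equal(reorder_axis(np.arange(6), 0), [3, 4, 5, 0, 1, 2])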
def cross_reactivity_density_paratope_epitope_ppi_mpi(nodefile, edgefile,
                                                      sourcetag, targettag):
    '''
    Prepare tidy data for cross-reactivity density plots, for both paratopes and
    epitopes; retrofitted for PPI usage.
    :return:
    '''
    # edge_files = fifi('abdb_outfiles_2019', 'internet_edges.csv')
    # node_files = fifi('abdb_outfiles_2019', 'internet_nodes.csv')
    # infiles = edge_files  + node_files
    # print(infiles)
    # sys.exit()
    peinfile = 'abdb_outfiles_2019/ppi_internet_edges.csv'
    rinfile = 'abdb_outfiles_2019/downsampled_ppi_internet_edges.csv'
    peinfile_nodes = 'abdb_outfiles_2019/ppi_internet_nodes.csv'
    rinfile_nodes = 'abdb_outfiles_2019/downsampled_ppi_internet_nodes.csv'
    penodesdf = pd.read_csv(peinfile_nodes)
    rnodesdf = pd.read_csv(rinfile_nodes)
    pedf = pd.read_csv(peinfile).iloc[:]
    rdf = pd.read_csv(rinfile)
    print(pedf.head())
    data = []
    data2 = []
    n = 4
    nodes_paratope = [
        item for item in penodesdf.id.tolist() if '*' not in item
    ][:]
    nodes_epitope = [item for item in penodesdf.id.tolist() if '*' in item][:]
    # chunk the list
    chunks = np.array_split(nodes_paratope, n)
    chunks2 = np.array_split(nodes_epitope, n)
    print(len(chunks[0]), len(nodes_paratope))
    # scatter the params
    comm = MPI.COMM_WORLD
    print(comm.Get_size())
    if comm.rank == 0:
        params = chunks
        params2 = chunks2
    else:
        params = None
        params2 = None
    params = comm.scatter(params, root=0)
    params2 = comm.scatter(params2, root=0)
    outdir = 'supfig12outs'
    # clear outdir before making a new one (os.rmdir fails on non-empty or missing dirs)
    if os.path.isdir(outdir):
        shutil.rmtree(outdir)  # assumes `import shutil` at module level
    os.mkdir(outdir)
    outname = outdir + '/' + nodefile.split(
        '.')[0] + 'rep%s_%s' % (comm.rank, sourcetag) + '.csv'
    outname2 = outdir + '/' + nodefile.split(
        '.')[0] + 'rep%s_%s' % (comm.rank, targettag) + '.csv'
    print(outname)
    print(outname2)
    print(params, comm.rank)
    print(params2, comm.rank)
    for motif in params:
        mdf = pedf[pedf.source == motif]
        partners = mdf.target
        for motif2 in nodes_paratope:
            mdf2 = pedf[pedf.source == motif2]
            partners2 = mdf2.target
            intersect = set(partners) & set(partners2)
            percent_overlap = round(
                len(intersect) / float(len(partners)) * 100, 1)
            # print(motif, percent_overlap)
            datum = [motif, motif2, percent_overlap, 'ppimotif']
            data.append(datum)
    colnames = ['motif1', 'motif2', 'percent_overlap', 'motif_source']
    outdf1 = pd.DataFrame(data, columns=colnames)
    outdf1.to_csv(outname, index=False)
    for motif in params2:
        mdf = pedf[pedf.target == motif]
        partners = mdf.source
        for motif2 in nodes_epitope:
            mdf2 = pedf[pedf.target == motif2]
            partners2 = mdf2.source
            intersect = set(partners) & set(partners2)
            percent_overlap = round(
                len(intersect) / float(len(partners)) * 100, 1)
            # print(motif, percent_overlap)
            datum = [motif, motif2, percent_overlap, 'ppimotifpartner']
            data2.append(datum)  # collect into data2 so outdf2 below is not empty
    outdf2 = pd.DataFrame(data2, columns=colnames)
    outdf2.to_csv(outname2, index=False)
def _down_sample(ltable,
                 rtable,
                 y_param,
                 show_progress=True,
                 verbose=False,
                 seed=None,
                 rem_puncs=True,
                 rem_stop_words=True,
                 n_ltable_chunks=-1,
                 n_rtable_chunks=-1):
    """
    Down sampling command implementation. We have reproduced the down sample command 
    because the input to the down sample command is the down sampled right table.   
    """

    if not isinstance(ltable, pd.DataFrame):
        logger.error('Input table A (ltable) is not of type pandas DataFrame')
        raise AssertionError(
            'Input table A (ltable) is not of type pandas DataFrame')

    if not isinstance(rtable, pd.DataFrame):
        logger.error('Input table B (rtable) is not of type pandas DataFrame')

        raise AssertionError(
            'Input table B (rtable) is not of type pandas DataFrame')

    if len(ltable) == 0 or len(rtable) == 0:
        logger.error('Size of the input table is 0')
        raise AssertionError('Size of the input table is 0')

    if y_param == 0:
        logger.error('y cannot be zero (3rd and 4th parameter of downsample)')
        raise AssertionError(
            'y_param cannot be zero (3rd and 4th parameter of downsample)')

    if seed is not None and not isinstance(seed, int):
        logger.error('Seed is not of type integer')
        raise AssertionError('Seed is not of type integer')

    validate_object_type(verbose, bool, 'Parameter verbose')
    validate_object_type(show_progress, bool, 'Parameter show_progress')
    validate_object_type(rem_stop_words, bool, 'Parameter rem_stop_words')
    validate_object_type(rem_puncs, bool, 'Parameter rem_puncs')
    validate_object_type(n_ltable_chunks, int, 'Parameter n_ltable_chunks')
    validate_object_type(n_rtable_chunks, int, 'Parameter n_rtable_chunks')

    # rtable_sampled = sample_right_table(rtable, size)
    rtable_sampled = rtable

    ltbl_str_cols = _get_str_cols_list(ltable)
    proj_ltable = ltable[ltable.columns[ltbl_str_cols]]

    if n_ltable_chunks == -1:
        n_ltable_chunks = multiprocessing.cpu_count()

    ltable_chunks = np.array_split(proj_ltable, n_ltable_chunks)
    preprocessed_tokenized_tbl = []
    start_row_id = 0
    for i in range(len(ltable_chunks)):
        result = delayed(process_tokenize_concat_strings)(ltable_chunks[i],
                                                          start_row_id,
                                                          rem_puncs,
                                                          rem_stop_words)
        preprocessed_tokenized_tbl.append(result)
        start_row_id += len(ltable_chunks[i])
    preprocessed_tokenized_tbl = delayed(wrap)(preprocessed_tokenized_tbl)
    if show_progress:
        with ProgressBar():
            logger.info('Preprocessing/tokenizing ltable')
            preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        preprocessed_tokenized_tbl_vals = preprocessed_tokenized_tbl.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    ltable_processed_dict = {}
    for i in range(len(preprocessed_tokenized_tbl_vals)):
        ltable_processed_dict.update(preprocessed_tokenized_tbl_vals[i])

    inverted_index = build_inverted_index(ltable_processed_dict)

    rtbl_str_cols = _get_str_cols_list(rtable_sampled)
    proj_rtable_sampled = rtable_sampled[rtable_sampled.columns[rtbl_str_cols]]

    if n_rtable_chunks == -1:
        n_rtable_chunks = multiprocessing.cpu_count()

    rtable_chunks = np.array_split(proj_rtable_sampled, n_rtable_chunks)
    probe_result = []

    for i in range(len(rtable_chunks)):
        result = delayed(probe)(rtable_chunks[i], y_param, len(proj_ltable),
                                inverted_index, rem_puncs, rem_stop_words,
                                seed)
        probe_result.append(result)

    probe_result = delayed(wrap)(probe_result)
    if show_progress:
        with ProgressBar():
            logger.info('Probing using rtable')
            probe_result = probe_result.compute(
                scheduler="processes", num_workers=multiprocessing.cpu_count())
    else:
        probe_result = probe_result.compute(
            scheduler="processes", num_workers=multiprocessing.cpu_count())

    probe_result = map(list, probe_result)
    l_tbl_indices = set(sum(probe_result, []))

    l_tbl_indices = list(l_tbl_indices)
    ltable_sampled = ltable.iloc[l_tbl_indices]

    # update catalog
    if cm.is_dfinfo_present(ltable):
        cm.copy_properties(ltable, ltable_sampled)

    if cm.is_dfinfo_present(rtable):
        cm.copy_properties(rtable, rtable_sampled)

    return ltable_sampled, rtable_sampled
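
# Standalone illustration of the chunk-then-delay pattern used above (the toy data
# and the `_row_count` helper are assumptions, not part of the original module).
import multiprocessing
import numpy as np
import pandas as pd
from dask import delayed

def _row_count(chunk):
    # stand-in for process_tokenize_concat_strings / probe
    return len(chunk)

_tbl = pd.DataFrame({'name': ['a', 'b', 'c', 'd', 'e']})
_chunks = np.array_split(_tbl, multiprocessing.cpu_count())
_tasks = [delayed(_row_count)(c) for c in _chunks]
_total = delayed(sum)(_tasks).compute(scheduler="threads")
assert _total == len(_tbl)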
            for learnername, learner in classalgs.items():

                meanErrParam = []
                nameParam = []
                params = ""
                #				print(numparams)
                for p in range(numparams):

                    params = parameters[p]
                    learner.reset(params)
                    print('Running learner = ' + learnername +
                          ' on parameters ' + str(learner.getparams()))

                    trainset1 = trainset[1].reshape(trainset[1].shape[0], 1)
                    XSplitter = np.array_split(trainset[0], 5)
                    YSplitter = np.array_split(trainset1, 5)

                    avgError = []
                    nameParam.append(params)

                    for k in range(k_fold):
                        trainX1 = np.zeros((1000, 9))
                        trainX1 = XSplitter[k]

                        trainY1 = np.zeros((1000, 1))
                        trainY1 = YSplitter[k]

                        trainX0 = np.array([], dtype=np.int64).reshape(0, 9)
                        trainY0 = np.array([], dtype=np.int64).reshape(0, 1)
Example #51
0
File: knn.py  Project: chris-bc/cs231n
# In[16]:

num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

X_train_folds = []
y_train_folds = []
################################################################################
# TODO:                                                                        #
# Split up the training data into folds. After splitting, X_train_folds and    #
# y_train_folds should each be lists of length num_folds, where                #
# y_train_folds[i] is the label vector for the points in X_train_folds[i].     #
# Hint: Look up the numpy array_split function.                                #
################################################################################
X_train_folds = np.array_split(X_train, num_folds)
y_train_folds = np.array_split(y_train, num_folds)
################################################################################
#                                 END OF YOUR CODE                             #
################################################################################
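
# Illustrative aside (not part of the original notebook): the training data for
# "all folds except fold 0" is just the remaining folds re-concatenated.
X_tr_example = np.concatenate(X_train_folds[1:])
y_tr_example = np.concatenate(y_train_folds[1:])
assert len(X_tr_example) == len(X_train) - len(X_train_folds[0])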

# A dictionary holding the accuracies for different values of k that we find
# when running cross-validation. After running cross-validation,
# k_to_accuracies[k] should be a list of length num_folds giving the different
# accuracy values that we found when using that value of k.
k_to_accuracies = {}

################################################################################
# TODO:                                                                        #
# Perform k-fold cross validation to find the best value of k. For each        #
# possible value of k, run the k-nearest-neighbor algorithm num_folds times,   #
Example #52
0
def batch_right_rate(output_label, y_variable):
    predict_max_pos = torch.max(output_label, 1)[1]
    right_vector = torch.eq(predict_max_pos, y_variable)
    count = torch.sum(right_vector)
    count = float(count.cpu().data.numpy())
    return count / len(output_label)
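
# Quick self-check of batch_right_rate (illustration only; the values are assumed,
# not from the original training script): 3 of these 4 predictions match.
_logits = torch.tensor([[2.0, 0.1, 0.3],
                        [0.2, 1.5, 0.1],
                        [0.0, 0.3, 2.2],
                        [1.0, 0.2, 0.1]])
_targets = torch.tensor([0, 1, 2, 2])
assert abs(batch_right_rate(_logits, _targets) - 0.75) < 1e-6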


criterion = nn.CrossEntropyLoss()
last_epoch_loss = 0
for epoch in range(1000):
    epoch_loss_list = []
    epoch_right_list = []
    all_labels = np.arange(0, len(x_train))
    np.random.shuffle(all_labels)
    batched_labels = np.array_split(all_labels, int(len(x_train) / batch_size))
    for label_of_label in tqdm(range(len(batched_labels))):
        batched_label = batched_labels[label_of_label]
        input_image_matrix = np.zeros((batch_size, 224, 224, 3),
                                      dtype=np.float32)
        input_label = np.zeros((batch_size), dtype=np.float32)
        for i, ele in enumerate(batched_label):
            input_image_matrix[i] = (x_train[ele] - 127.5) / 127.5
            input_label[i] = y_train[ele]
        x_variable = Variable(
            torch.from_numpy(input_image_matrix).permute(0, 3, 1, 2)).type(
                torch.FloatTensor).cuda()
        y_variable = Variable(torch.from_numpy(input_label)).type(
            torch.LongTensor).cuda()
        output_label = net(x_variable)
        epoch_right_list.append(batch_right_rate(output_label, y_variable))
Example #53
0
def simulate_latent_space(t, labels, seed=None, var=.2, split_prob=.1, gap=.75):
    """
    Simulate splitting events in the latent space. The input time t is
    a one dimensional array having the times in it. The labels is a int
    array-like, which holds the labels for the wanted cell types.
    Basically it is an array of repetitions of 1 to number of cell types,
    e.g.: array([1..1,2..2,3..3,4..4]) for 4 cell types.

    :param array_like t: the time as [nx1] array, where n is the number of cells.
    :param array_like labels: the labels for the cells before splitting.
    :param int seed: the seed for this splitting, for reproducability.
    :param scalar var: the variance of spread of the first split, increasing after that.
    :param [0,1] split_prop: probability of split in the beginning, halfs with each split.
    :param [0,1] gap: the gap size between splitends and the beginning of the next.

    The method returns Xsim, seed, labels, time::

        - Xsim is the two dimensional latent space with splits included.
        - seed is the seed generated, for reproduceability.
        - labels are the corrected labels, for split events.
        - time is the corrected timeline for split events.
    """
    seed = seed or np.random.randint(1000,10000)
    np.random.seed(seed)

    n_data = t.shape[0]
    newlabs = []

    assert np.issubdtype(labels.dtype, np.int_) and np.greater(labels, 0).all(), "labels need to be of positive integer dtype, 0 is not allowed"

    ulabs = []
    for x in range(n_data):
        if labels[x] not in ulabs:
            ulabs.append(labels[x])

    Xsim = np.zeros((n_data, 2))
    split_ends = [Xsim[0]]
    prev_ms = [[.1,.1]]
    split_end_times = [t[labels==ulabs[0]].max()]

    t = np.sort(t.copy(), 0)

    tmax = t.max()

    for lab in ulabs:
        fil = (lab==labels).nonzero()[0]

        # zero out, for simulating linear relation within cluster:
        new_se = []
        new_m = []
        new_set = []

        splits = np.array_split(fil, len(split_ends))

        i = 1
        for s in range(len(split_ends)):
            # for all previously done splits:
            prev_m = prev_ms[s]
            split = splits[s]
            split_end = split_ends[s]
            split_end_time = split_end_times[s]

            pre_theta = None
            prev_split_time = None
            for split in np.array_split(split, np.random.binomial(1, split_prob)+1):
                newlabs.extend(["{} {}".format(_c, i) for _c in labels[split]])
                i += 1
                # If we split a collection into two, we want the two times to match up now:
                if prev_split_time is None:
                    prev_split_time = t[split].ptp()
                else:
                    t[split.min():] -= prev_split_time
                t[split] -= (t[split.min()]-split_end_time)

                # make splits longer, the farther in we are into
                # the split process, it scales with sqrt(<split#>)
                x = t[split].copy()
                x -= x.min()
                x /= x.max()
                x *= np.sqrt(lab)

                # rotate m away a little from the previous direction:
                if pre_theta is None:
                    pre_theta = theta = np.random.uniform(-45, 45)
                else:
                    theta = ((pre_theta+90)%90)-90
                theta *= (np.pi/180.) # radians for rotation matrix
                rot_m = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
                m = np.dot(rot_m, prev_m)

                # later splits have bigger spread:
                v = (x.mean(0) - np.abs((-x+x.mean(0))))
                v -= v.min(0)-1e-6
                v /= v.max(0)
                v *= var*t[split]/tmax

                # make the split
                Xsim[split] = np.random.normal(split_end + m*x, v)

                # put a gap between this and the next split:
                p = m*x[-1]
                #p /= np.sqrt(GPy.util.linalg.tdot(p))

                # save the new sets of splits
                new_se.append(split_end + (1+gap)*p)
                new_m.append(m)
                new_set.append(t[split.max()])

        split_ends = new_se
        prev_ms = new_m
        split_end_times = new_set
        # The split probability goes up every time the cell stage changes:

        split_prob = min(1., split_prob*2)

    Xsim -= Xsim.mean(0)
    Xsim /= Xsim.std(0)
    #Xsim += np.random.normal(0,var,Xsim.shape)

    from scipy.stats import t as tdist
    Xsim += tdist.rvs(3, loc=0, scale=.1*var, size=Xsim.shape) #Add outliers


    return Xsim, seed, np.asarray(newlabs), t
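
# Hedged usage sketch (all values assumed, not from the original code): simulate a
# 2-D latent space for 3 cell types with 100 cells each.
t_demo = np.sort(np.random.uniform(0, 1, (300, 1)), 0)
labels_demo = np.repeat(np.arange(1, 4), 100)
Xsim_demo, seed_demo, labels_out, t_out = simulate_latent_space(t_demo, labels_demo, seed=1234)
print(Xsim_demo.shape, len(labels_out))   # (300, 2) 300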
Example #54
0
        #print("class : ",content.split()[1])
        c = int(content.split()[1]) - 2
        l = int(Lines[count].split()[1]) - 1
        ilp = float(Lines[count + 2].split()[0])
        sn = float(Lines[count + 4].split()[3])
        #print(ilp)
        List_ILP.append(ilp)
        List_SN.append(sn)
        #print("\tIteration : ",Lines[count].split()[1])
        #ILP_String=str(c)+" "+str(l)+" "+str(ilp)
        #SN_String=str(c)+" "+str(l)+" "+str(sn)
        #List_ILP.append(ILP_String)
    count = count + 1
#print(len(List_ILP))
#print(len(List_SN))
split_ilp = np.array_split(List_ILP, n)
split_sn = np.array_split(List_SN, n)

# In[5]:

#split_ilp[2][23]

# In[6]:

accuracy_ILP = []
std_devn_ILP = []
label = []
for i in range(0, n):
    #print(i)
    label.append(i + 2)
    #print(list(split_ilp[i]))
def shuffle_batch(X, y, batch_size):
    rnd_idx = np.random.permutation(len(X))
    n_batches = len(X) // batch_size
    for batch_idx in np.array_split(rnd_idx, n_batches):
        X_batch, y_batch = X[batch_idx], y[batch_idx]
        yield X_batch, y_batch
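
# Illustration (not from the original code): iterate over shuffled mini-batches of
# a toy dataset; np.array_split spreads the remainder, so a few batches hold one
# extra sample.
X_toy = np.arange(100).reshape(50, 2)
y_toy = np.arange(50)
for X_batch, y_batch in shuffle_batch(X_toy, y_toy, batch_size=8):
    assert 8 <= len(X_batch) == len(y_batch) <= 9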
Example #56
0
def reformat (files, features, cls):
    #Mean Absolute Deviation (computed around the mean, despite the "mad" name)
    def mad(data, axis=None):
        return np.mean(np.absolute(data - np.mean(data, axis)), axis)


    #Median Filter
    def strided_app(a, L, S ):  # Window len = L, Stride len/stepsize = S
        nrows = ((a.size-L)//S)+1
        n = a.strides[0]
        return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))       


    appended_features_all=[]
    appended_features_df = pd.DataFrame()
    wrist_class = {'Brush_teeth':0, 
              'Climb_stairs':4, 
              'Comb_hair':2, 
              'Descend_stairs':3, 
              'Drink_glass':1, 
              'Eat_meat':5, 
              'Eat_soup':6, 
              'Getup_bed':7, 
              'Liedown_bed':8, 
              'Pour_water':9, 
              'Sitdown_chair':10, 
              'Standup_chair':11, 
              'Use_telephone':12, 
              'Walk':13
             }

    #Binarization dictionary
    wrist_class_binary = {'Other':0, 
                'Drink_glass':1, 
                }


    if number_components == 3: #FOR XYZ
        data = pd.read_csv(files, sep=' ', header=None, names=['x', 'y', 'z']) 
        
        #Conversion from 0-63 to m/s^2
        df_x = -14.709 + (data.iloc[:,0:1]/63)*(2*14.709)
        df_y = -14.709 + (data.iloc[:,1:2]/63)*(2*14.709)
        df_z = -14.709 + (data.iloc[:,2:3]/63)*(2*14.709)
        
        
        #Median filtering
        x = np.median(strided_app(df_x.values.flatten(), 3,1),axis=1)
        y = np.median(strided_app(df_y.values.flatten(), 3,1),axis=1)
        z = np.median(strided_app(df_z.values.flatten(), 3,1),axis=1)
        
        df_x = pd.DataFrame(x, columns=['x'])
        df_y = pd.DataFrame(y, columns=['y'])
        df_z = pd.DataFrame(z, columns=['z'])
        

        data_x = df_x.values
        data_y = df_y.values
        data_z = df_z.values
        
        #Divide data in segments
        split_index=5 #Number of segments
        data_split_x=np.array_split(data_x, split_index)
        data_split_y=np.array_split(data_y, split_index)
        data_split_z=np.array_split(data_z, split_index)
        
        #Features Calculation

        appended_before=['data_split_x[2].min(axis=0)', 'data_x.min(axis=0)','data_split_x[2].mean(axis=0)',
        'np.median(data_split_x[2],axis=0)','data_split_x[1].min(axis=0)','data_split_x[1].mean(axis=0)',
        'data_x.mean(axis=0)','np.median(data_split_x[1],axis=0)','np.median(data_x,axis=0)',
        'data_split_x[3].mean(axis=0)','np.median(data_split_x[3],axis=0)','data_split_x[3].min(axis=0)',
        'np.median(data_z,axis=0)','data_split_x[4].min(axis=0)','data_split_x[2].max(axis=0)',
        'np.median(data_split_z[2],axis=0)','data_split_x[2].std(axis=0)','data_split_x[4].mean(axis=0)',
        'data_split_x[3].max(axis=0)','np.median(data_split_x[4],axis=0)','data_z.std(axis=0)',
        'mad(data_split_x[2],axis=0)','np.median(data_split_z[4],axis=0)','data_split_z[2].mean(axis=0)',
        'mad(data_z,axis=0)','data_split_z[2].std(axis=0)','data_z.mean(axis=0)',
        'data_split_z[4].mean(axis=0)','data_split_x[0].min(axis=0)','data_z.var(axis=0)',
        'np.median(data_split_z[3],axis=0)','data_split_z[3].mean(axis=0)','mad(data_split_z[2],axis=0)',
        'np.median(data_split_x[0],axis=0)','data_split_x[0].mean(axis=0)','data_split_x[1].max(axis=0)',
        'data_z.min(axis=0)','data_split_x[4].var(axis=0)','data_split_x[2].var(axis=0)',
        'data_split_z[2].var(axis=0)','data_split_z[1].std(axis=0)','data_split_z[2].min(axis=0)',
        'data_split_x[4].std(axis=0)','data_split_z[4].var(axis=0)','mad(data_split_z[1],axis=0)',
        'mad(data_split_y[3],axis=0)','mad(data_split_x[4],axis=0)','mad(data_y,axis=0)',
        'data_split_z[1].var(axis=0)','data_split_z[3].max(axis=0)','data_split_z[4].std(axis=0)',
        'mad(data_split_z[4],axis=0)','data_split_z[1].min(axis=0)','data_y.std(axis=0)',
        'data_split_y[3].std(axis=0)','data_split_z[4].max(axis=0)','data_split_z[0].min(axis=0)',
        'data_split_z[1].mean(axis=0)','data_split_x[0].var(axis=0)','data_split_z[3].min(axis=0)',
        'np.median(data_split_z[1],axis=0)','data_x.var(axis=0)','np.median(data_split_z[0],axis=0)',
        'data_split_z[4].min(axis=0)','data_y.var(axis=0)','data_split_z[0].mean(axis=0)',
        'data_split_x[0].std(axis=0)','kurtosis(data_split_z[4],axis=0)','np.median(data_split_y[2],axis=0)',
        'data_split_x[4].max(axis=0)','data_split_y[3].var(axis=0)','data_x.max(axis=0)',
        'data_split_z[0].var(axis=0)','data_split_y[2].max(axis=0)','data_split_y[2].mean(axis=0)',
        'mad(data_split_x[0],axis=0)','data_split_z[3].var(axis=0)','data_x.std(axis=0)',
        'kurtosis(data_split_y[1],axis=0)','data_split_z[0].std(axis=0)','data_split_z[2].max(axis=0)',
        'mad(data_split_z[0],axis=0)','kurtosis(data_y,axis=0)','data_split_y[0].min(axis=0)',
        'data_split_z[3].std(axis=0)','data_split_x[1].std(axis=0)','kurtosis(data_split_y[0],axis=0)',
        'skew(data_z,axis=0)','mad(data_split_z[3],axis=0)','skew(data_split_y[2],axis=0)',
        'data_split_x[1].var(axis=0)','data_split_x[0].max(axis=0)','np.median(data_split_y[4],axis=0)',
        'data_split_y[4].mean(axis=0)','mad(data_x,axis=0)','data_split_y[0].mean(axis=0)',
        'data_split_y[2].var(axis=0)','data_split_z[0].max(axis=0)','np.median(data_split_y[3],axis=0)',
        'data_split_z[1].max(axis=0)','data_split_y[2].std(axis=0)','data_split_y[3].max(axis=0)',
        'mad(data_split_x[1],axis=0)','np.median(data_split_y[0],axis=0)','mad(data_split_y[1],axis=0)',
        'data_split_y[3].mean(axis=0)','mad(data_split_y[2],axis=0)','data_split_y[0].max(axis=0)',
        'kurtosis(data_x,axis=0)','data_split_y[1].min(axis=0)','skew(data_split_y[3],axis=0)',
        'skew(data_split_x[3],axis=0)','kurtosis(data_split_y[3],axis=0)','data_split_y[4].min(axis=0)',
        'data_split_y[0].var(axis=0)','mad(data_split_x[3],axis=0)','data_split_y[1].std(axis=0)',
        'kurtosis(data_split_z[1],axis=0)','kurtosis(data_split_y[4],axis=0)','skew(data_split_z[2],axis=0)',
        'skew(data_split_x[1],axis=0)','data_split_y[4].max(axis=0)','np.median(data_y,axis=0)',
        'data_split_y[4].std(axis=0)','skew(data_split_z[1],axis=0)','kurtosis(data_split_x[2],axis=0)',
        'skew(data_split_x[2],axis=0)','data_split_y[1].mean(axis=0)','kurtosis(data_split_y[2],axis=0)',
        'skew(data_split_z[0],axis=0)','kurtosis(data_split_x[0],axis=0)','skew(data_split_y[0],axis=0)',
        'data_split_y[1].max(axis=0)','skew(data_split_z[3],axis=0)','kurtosis(data_split_x[1],axis=0)',
        'kurtosis(data_split_x[3],axis=0)','data_split_x[3].std(axis=0)','skew(data_y,axis=0)',
        'data_z.max(axis=0)','mad(data_split_y[4],axis=0)','data_y.mean(axis=0)',
        'np.median(data_split_y[1],axis=0)','data_y.max(axis=0)','skew(data_x,axis=0)',
        'data_split_y[4].var(axis=0)','mad(data_split_y[0],axis=0)','skew(data_split_y[1],axis=0)',
        'kurtosis(data_z,axis=0)','kurtosis(data_split_x[4],axis=0)','data_split_y[2].min(axis=0)',
        'kurtosis(data_split_z[2],axis=0)','skew(data_split_y[4],axis=0)','data_split_y[1].var(axis=0)',
        'data_split_x[3].var(axis=0)','kurtosis(data_split_z[0],axis=0)','data_split_y[3].min(axis=0)',
        'kurtosis(data_split_z[3],axis=0)','data_split_y[0].std(axis=0)','skew(data_split_x[4],axis=0)',
        'skew(data_split_x[0],axis=0)','skew(data_split_z[4],axis=0)','data_y.min(axis=0)']
    else: # For the most representative component (X)


        data = pd.read_csv(files, sep=' ', header=None, names=['x']) 
        
        #Conversion from 0-63 to m/s^2
        df_x = -14.709 + (data.iloc[:,0:1]/63)*(2*14.709)
        
        """
        #Median filtering
        x = np.median(strided_app(df_x.values.flatten(), 3,1),axis=1)

        df_x = pd.DataFrame(x, columns=['x'])
        """
        data_x = df_x.values

        #Divide data in segments
        split_index=5 #Number of segments
        data_split_x=np.array_split(data_x, split_index)

        appended_before=['data_split_x[2].min(axis=0)','data_x.min(axis=0)','data_split_x[2].mean(axis=0)',
        'np.median(data_split_x[2],axis=0)','data_split_x[1].min(axis=0)','data_split_x[1].mean(axis=0)',
        'data_x.mean(axis=0)','np.median(data_split_x[1],axis=0)','np.median(data_x,axis=0)',
        'data_split_x[3].mean(axis=0)','np.median(data_split_x[3],axis=0)','data_split_x[3].min(axis=0)',
        'data_split_x[4].min(axis=0)','data_split_x[2].max(axis=0)','data_split_x[2].std(axis=0)',
        'data_split_x[4].mean(axis=0)','data_split_x[3].max(axis=0)','np.median(data_split_x[4],axis=0)',
        'mad(data_split_x[2],axis=0)','data_split_x[0].min(axis=0)','np.median(data_split_x[0],axis=0)',
        'data_split_x[0].mean(axis=0)','data_split_x[1].max(axis=0)','data_split_x[4].var(axis=0)',
        'data_split_x[2].var(axis=0)','data_split_x[4].std(axis=0)','mad(data_split_x[4],axis=0)',
        'data_split_x[0].var(axis=0)','data_x.var(axis=0)','data_split_x[0].std(axis=0)',
        'data_split_x[4].max(axis=0)','data_x.std(axis=0)','mad(data_split_x[0],axis=0)',
        'data_split_x[1].std(axis=0)','data_x.max(axis=0)','data_split_x[1].var(axis=0)',
        'data_split_x[0].max(axis=0)','mad(data_x,axis=0)','kurtosis(data_x,axis=0)',
        'mad(data_split_x[1],axis=0)','skew(data_split_x[3],axis=0)','mad(data_split_x[3],axis=0)',
        'skew(data_split_x[1],axis=0)','kurtosis(data_split_x[2],axis=0)','skew(data_split_x[2],axis=0)',
        'skew(data_x,axis=0)','kurtosis(data_split_x[0],axis=0)','kurtosis(data_split_x[1],axis=0)',
        'kurtosis(data_split_x[3],axis=0)','data_split_x[3].std(axis=0)','kurtosis(data_split_x[4],axis=0)',
        'data_split_x[3].var(axis=0)','skew(data_split_x[4],axis=0)','skew(data_split_x[0],axis=0)']

    #Create initial_features_matrix
    appended_features_split=[]
    appended_features=[]

    for i in range (0, features):
        appended_features_before = eval(appended_before[i])
        appended_features.append(appended_features_before[0])
        appended_features_before=[] 

    appended_features_all.append(appended_features)

    appended_features_df = pd.DataFrame(appended_features_all)
 
    #Binarize detection
    if wrist_class[cls] != 1:
        wrist_class[cls] = 0 #Other classes than drink are considered as CLASS 0. Drink = CLASS 1
    
    #Access to dictionary class number and add it as a feature
    appended_features_df[-1]= wrist_class[cls]
    
    #Return table containing all rows for every class and the feature columns
    #(mean*3, sd*3, Max*3, Min*3, Y). The row number is maintained. (0~101)
    return appended_features_df
Example #57
0
    def get_spikes(self, label, buffer_manager, region, placements,
                   graph_mapper, application_vertex, machine_time_step):

        spike_times = list()
        spike_ids = list()
        ms_per_tick = machine_time_step / 1000.0

        vertices = \
            graph_mapper.get_machine_vertices(application_vertex)

        missing_str = ""

        progress_bar = ProgressBar(len(vertices),
                                   "Getting spikes for {}".format(label))
        for vertex in vertices:

            placement = placements.get_placement_of_vertex(vertex)
            vertex_slice = graph_mapper.get_slice(vertex)

            x = placement.x
            y = placement.y
            p = placement.p
            lo_atom = vertex_slice.lo_atom

            # Read the spikes
            n_words = int(math.ceil(vertex_slice.n_atoms / 32.0))
            n_bytes = n_words * 4
            n_words_with_timestamp = n_words + 1

            # for buffered output, the info is taken from the buffer manager
            neuron_param_region_data_pointer, data_missing = \
                buffer_manager.get_data_for_vertex(
                    placement, region)
            if data_missing:
                missing_str += "({}, {}, {}); ".format(x, y, p)
            record_raw = neuron_param_region_data_pointer.read_all()
            raw_data = (numpy.asarray(record_raw, dtype="uint8").view(
                dtype="<i4")).reshape([-1, n_words_with_timestamp])
            if len(raw_data) > 0:
                split_record = numpy.array_split(raw_data, [1, 1], 1)
                record_time = split_record[0] * float(ms_per_tick)
                spikes = split_record[2].byteswap().view("uint8")
                bits = numpy.fliplr(
                    numpy.unpackbits(spikes).reshape((-1, 32))).reshape(
                        (-1, n_bytes * 8))
                time_indices, indices = numpy.where(bits == 1)
                times = record_time[time_indices].reshape((-1))
                indices = indices + lo_atom
                spike_ids.append(indices)
                spike_times.append(times)
                progress_bar.update()

        progress_bar.end()
        if len(missing_str) > 0:
            logger.warn(
                "Population {} is missing spike data in region {} from the"
                " following cores: {}".format(label, region, missing_str))

        if len(spike_ids) == 0:
            return numpy.zeros((0, 2), dtype="float")
        spike_ids = numpy.hstack(spike_ids)
        spike_times = numpy.hstack(spike_times)
        result = numpy.dstack((spike_ids, spike_times))[0]
        return result[numpy.lexsort((spike_times, spike_ids))]
Example #58
0
        labelMissingGeoTag, user_top_used_geo_tag)
    return bipolar_tweets_with_geo


if __name__ == '__main__':
    bipolar_tweets = pd.read_csv(
        '../initial_data/selected_normal_users_tweets_less5.csv')

    df = bipolar_tweets.groupby(['username', 'tweetLat', 'tweetLong'
                                 ])['tweetLong'].agg(count='count')
    mask = df.groupby(level=0).agg('idxmax')
    user_top_used_geo_tag = df.loc[mask['count']]
    user_top_used_geo_tag = user_top_used_geo_tag.reset_index()

    p = mp.Pool(processes=8)
    split_dfs = np.array_split(bipolar_tweets, 8)
    pool_results = p.map(process, zip(split_dfs,
                                      repeat(user_top_used_geo_tag)))
    p.close()
    p.join()

    # merging parts processed by different processes
    parts = pd.concat(pool_results)

    # merging newly calculated parts to big_df
    #big_df = pd.concat([big_df, parts], axis=1)
    parts.to_csv(
        '../final_data/users_final_normal/labelMIssingGeo_normalusers.csv',
        index=False,
        quotechar='"',
        quoting=csv.QUOTE_ALL)  #, encoding='utf-8'
Example #59
0
def test_nn_shallow_mnist_smc_enhanced():
    logger.info('test nn shallow in mnist using enhanced smc')

    logger.info('initialize the crypto system ...')
    sec_param_config_file = 'config/sec_param.json'  # indicate kernel size 5
    dlog_table_config_file = 'config/dlog_b8.json'
    with timer('initialize crypto system, cost time', logger) as t:
        eta = 1250
        sec_param = 256
        setup_parties = {
            'id_1': 200,
            'id_2': 200,
            'id_3': 200,
            'id_4': 200,
            'id_5': 200
        }
        logger.info('loading dlog configuration ...')
        dlog = load_dlog_table_config(dlog_table_config_file)
        logger.info('load dlog configuration DONE')
        sife_tpa = SIFEDynamicTPA(eta,
                                  sec_param=sec_param,
                                  sec_param_config=sec_param_config_file)
        sife_tpa.setup()
        sife_enc_client = SIFEDynamicClient(sec_param=256, role='enc')
        sife_dec_client = SIFEDynamicClient(sec_param=256,
                                            role='dec',
                                            dlog=dlog)
        mife_tpa = MIFEDynamicTPA(sec_param=256,
                                  parties=setup_parties,
                                  sec_param_config=sec_param_config_file)
        mife_tpa.setup()
        mife_enc_client = MIFEDynamicClient(sec_param=256, role='enc')
        mife_dec_client = MIFEDynamicClient(sec_param=256,
                                            role='dec',
                                            dlog=dlog)
        logger.info('the crypto system initialization done!')

    precision_data = 0
    precision_weight = 4

    es2pc_client = EnhancedSecure2PCClient(sife=(sife_tpa, sife_enc_client),
                                           mife=(mife_tpa, mife_enc_client),
                                           precision=precision_data)
    es2pc_server = EnhancedSecure2PCServer(sife=(sife_tpa, sife_dec_client),
                                           mife=(mife_tpa, mife_dec_client),
                                           precision=(precision_data,
                                                      precision_weight))

    X_train, y_train = load_mnist_size('datasets/mnist', size=600)
    X_test, y_test = load_mnist_size('datasets/mnist', size=100, kind='t10k')
    # X_train, y_train = load_mnist('datasets/mnist')
    # X_test, y_test = load_mnist('datasets/mnist', kind='t10k')

    # shuffle
    X_data, y_data = X_train.copy(), y_train.copy()
    idx = np.random.permutation(X_data.shape[0])
    X_data, y_data = X_data[idx], y_data[idx]

    features_splits = np.array_split(range(X_data.shape[1]),
                                     len(setup_parties))
    X_data_lst = [X_data[:, idx] for idx in features_splits]

    total_mini_batches = 50

    nn_server = CryptoNNServer(n_output=10,
                               n_features=X_data.shape[1],
                               hidden_layers=[64],
                               l2=0.1,
                               l1=0.0,
                               epochs=50,
                               eta=0.001,
                               alpha=0.001,
                               decrease_const=0.0001,
                               mini_batches=total_mini_batches,
                               smc=es2pc_server)
    logger.info('client start to encrypt dataset ...')
    ct_ff_lst_dict = dict()
    ct_bp_lst_dict = dict()
    x_idx_count = 0
    final_y_onehot_lst = None
    for id in setup_parties.keys():
        if x_idx_count == (len(setup_parties) - 1):
            n_features = X_data_lst[x_idx_count].shape[1] + 1
            nn_client = CryptoNNClient(n_output=10,
                                       mini_batches=total_mini_batches,
                                       n_features=n_features,
                                       smc=es2pc_client,
                                       random_seed=520,
                                       id=id)
            nn_server.register(nn_client)
            ct_feedforward_lst, ct_backpropagation_lst, y_onehot_lst = nn_client.pre_process(
                X_data_lst[x_idx_count], y_data)
            ct_ff_lst_dict[id] = ct_feedforward_lst
            ct_bp_lst_dict[id] = ct_backpropagation_lst
            final_y_onehot_lst = y_onehot_lst
        else:
            n_features = X_data_lst[x_idx_count].shape[1]
            nn_client = CryptoNNClient(n_output=10,
                                       mini_batches=total_mini_batches,
                                       n_features=n_features,
                                       smc=es2pc_client,
                                       random_seed=520,
                                       id=id)
            nn_server.register(nn_client)
            ct_feedforward_lst, ct_backpropagation_lst = nn_client.pre_process(
                X_data_lst[x_idx_count])
            ct_ff_lst_dict[id] = ct_feedforward_lst
            ct_bp_lst_dict[id] = ct_backpropagation_lst
        x_idx_count = x_idx_count + 1
    logger.info('client encrypting DONE')

    logger.info('server start to train ...')
    (train_loss_hist, test_acc_hist, train_batch_time_hist,
     train_time_hist) = nn_server.fit((ct_ff_lst_dict, ct_bp_lst_dict),
                                      final_y_onehot_lst, X_test, y_test)
    logger.info('server training DONE')

    logger.info('training loss: \n\r' + str(train_loss_hist))
    logger.info('test acc: \n\r' + str(test_acc_hist))
Example #60
0
        type=str,
        help="Config file path",
        required=True,
    )
    parser.add_argument("--gen", help="Generate subreddit list", action="store_true")

    args = parser.parse_args()

    if args.c:
        config = configparser.ConfigParser()
        config.read(args.c)
        subreddit_list_path = config["REDDIT"].get("subreddit_list_path")
        if args.gen:
            if not subreddit_list_path:
                raise Exception("Need to provide path to generate subreddit list.")
            generate_subreddit_list(subreddit_list_path)
        else:
            num_processes = int(config["REDDIT"].get("num_process"))
            with open(subreddit_list_path) as f:
                subreddit_list = f.readline().split(",")

            if not subreddit_list:
                raise Exception("Empty subreddit list")

            subreddit_lists = np.array_split(subreddit_list, num_processes)
            with concurrent.futures.ThreadPoolExecutor(max_workers=num_processes) as executor:
                fs = [executor.submit(scrape, subreddit_list, config) for subreddit_list in subreddit_lists]
                for future in concurrent.futures.as_completed(fs):
                    print(future.result())