def data_sampling(data, num_of_samples):
    random = check_random_state(seed=None)
    n_samples, n_features = data.shape
    if num_of_samples > n_samples:
        indexes = np.concatenate(
            (sample_without_replacement(n_samples, n_samples, random_state=random),
             random.randint(0, n_samples, num_of_samples - n_samples)),
            axis=None)
    else:
        indexes = sample_without_replacement(n_samples, num_of_samples,
                                             random_state=random)
    return data.loc[indexes]
def mutation(self, pm):
    n_mutations_in_features = self.random_state.binomial(self.genome_features.shape[0], pm)
    n_mutations_in_samples = self.random_state.binomial(self.genome_samples.shape[0], pm)
    mutated_samples = sample_without_replacement(
        self.genome_samples.shape[0], n_mutations_in_samples,
        random_state=self.random_state)
    mutated_features = sample_without_replacement(
        self.genome_features.shape[0], n_mutations_in_features,
        random_state=self.random_state)
    self.genome_samples[mutated_samples] = ~self.genome_samples[mutated_samples]
    self.genome_features[mutated_features] = ~self.genome_features[mutated_features]
def data_sampling(data, replacement, num_of_samples):
    random = check_random_state(seed=None)
    n_samples, n_features = data.shape
    if replacement:
        indexes = random.randint(0, n_samples, num_of_samples)
    else:
        if num_of_samples > n_samples:
            indexes = sample_without_replacement(n_samples, n_samples,
                                                 random_state=random)
        else:
            indexes = sample_without_replacement(n_samples, num_of_samples,
                                                 random_state=random)
    return data.loc[indexes]
def generate_indices(random_state, bootstrap, n_population, n_samples):
    """Draw randomly sampled indices.

    Internal use only. See sklearn/ensemble/bagging.py

    Parameters
    ----------
    random_state : RandomState
        A random number generator instance to define the state of the
        random permutations generator.

    bootstrap : bool
        Specifies whether to bootstrap index generation.

    n_population : int
        Specifies the population size when generating indices.

    n_samples : int
        Specifies the number of samples to draw.

    Returns
    -------
    indices : numpy array, shape (n_samples,)
        Randomly drawn indices.
    """
    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population, n_samples,
                                             random_state=random_state)
    return indices
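A minimal usage sketch for the helper above (the function and argument names follow the snippet; the seed value is arbitrary, and sample_without_replacement is assumed to be imported as in the original module):

import numpy as np
from sklearn.utils import check_random_state

rng = check_random_state(42)
# bootstrap=True draws with replacement, so duplicates may appear
print(generate_indices(rng, True, n_population=10, n_samples=8))
# bootstrap=False draws without replacement, so all indices are distinct
print(generate_indices(rng, False, n_population=10, n_samples=8))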
def __init__(self, n_samples, n_features, ps, pf, random_state):
    self.random_state = random_state
    self.genome_features = np.zeros((n_features,), dtype=np.bool_)
    self.genome_samples = np.zeros((n_samples,), dtype=np.bool_)
    # cast the picked counts to int so they are valid sample sizes
    n_pick_samples = int(np.floor(n_samples * ps))
    n_pick_features = int(np.floor(n_features * pf))
    picked_samples = sample_without_replacement(n_samples, n_pick_samples,
                                                random_state=random_state)
    picked_features = sample_without_replacement(n_features, n_pick_features,
                                                 random_state=random_state)
    self.genome_samples[picked_samples] = True
    self.genome_features[picked_features] = True
    self.cache_est_weight = 1
    self.cache_contribution = 0
    self.cache_predictions = None
def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # sample generates all possible permutations
    n_population = 10

    # a large number of trials prevents false negatives without slowing normal
    # case
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = combinations(n_population, n_samples, exact=True)

        output = {}
        for i in range(n_trials):
            output[frozenset(sample_without_replacement(n_population,
                                                        n_samples))] = None

            if len(output) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)"
                % (len(output), n_expected))
def run_test(X, Y, A, B, Sigma=None, proj=None, n_combinations=50000):
    X, Y, A, B = normalize_list([X, Y, A, B], Sigma=Sigma, proj=proj)
    if Sigma is not None:
        A = np.matmul(A, Sigma)
        B = np.matmul(B, Sigma)
    base_statistics = statistics(X, Y, A, B)
    union_XY = np.vstack((X, Y))
    xy_size = union_XY.shape[0]
    x_size = X.shape[0]
    count = 0
    all_idx = set(range(xy_size))
    if comb(xy_size, x_size) > n_combinations:
        for _ in range(n_combinations):
            group_1_idx = sample_without_replacement(xy_size, x_size)
            group_2_idx = list(all_idx.difference(group_1_idx))
            sample_stat = statistics(union_XY[group_1_idx],
                                     union_XY[group_2_idx], A, B)
            count += sample_stat > base_statistics
    else:
        for group_1_idx in combinations(range(xy_size), x_size):
            group_2_idx = list(all_idx.difference(group_1_idx))
            sample_stat = statistics(union_XY[list(group_1_idx)],
                                     union_XY[group_2_idx], A, B)
            count += sample_stat > base_statistics
    p_val = count / n_combinations
    effect_val = effect_size(X, Y, A, B)
    print('P-val is %f; effect size is %f' % (p_val, effect_val))
    return p_val, effect_val
def check_sample_int(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert len(s) == n_samples
        unique = np.unique(s)
        assert np.size(unique) == n_samples
        assert np.all(unique < n_population)

    # test edge case n_population == n_samples == 0
    assert np.size(sample_without_replacement(0, 0)) == 0
def check_sample_int_distribution(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # sample generates all possible permutations
    n_population = 10

    # a large number of trials prevents false negatives without slowing normal
    # case
    n_trials = 10000

    for n_samples in range(n_population):
        # Counting the number of combinations is not as good as counting the
        # number of permutations. However, it works with sampling algorithms
        # that do not provide a random permutation of the subset of integers.
        n_expected = comb(n_population, n_samples, exact=True)

        output = {}
        for i in range(n_trials):
            output[frozenset(
                sample_without_replacement(n_population, n_samples))] = None

            if len(output) == n_expected:
                break
        else:
            raise AssertionError(
                "number of combinations != number of expected (%s != %s)"
                % (len(output), n_expected))
def sample_without_replacement_method(n_population, n_samples,
                                      random_state=None):
    # `m` is the sampling method chosen in the enclosing test loop, e.g.
    # "auto", "tracking_selection", "reservoir_sampling" or "pool".
    return sample_without_replacement(n_population, n_samples, method=m,
                                      random_state=random_state)
def check_sample_int(sample_without_replacement):
    # This test is heavily inspired from test_random.py of python-core.
    #
    # For the entire allowable range of 0 <= k <= N, validate that
    # the sample is of the correct length and contains only unique items
    n_population = 100

    for n_samples in range(n_population + 1):
        s = sample_without_replacement(n_population, n_samples)
        assert_equal(len(s), n_samples)
        unique = np.unique(s)
        assert_equal(np.size(unique), n_samples)
        assert_true(np.all(unique < n_population))

    # test edge case n_population == n_samples == 0
    assert_equal(np.size(sample_without_replacement(0, 0)), 0)
def __iter__(self):
    # check if all distributions are given as lists
    # in this case we want to sample without replacement
    all_lists = np.all([not hasattr(v, "rvs")
                        for v in self.param_distributions.values()])
    rnd = check_random_state(self.random_state)

    if all_lists:
        # look up sampled parameter settings in parameter grid
        param_grid = ParameterGrid(self.param_distributions)
        grid_size = len(param_grid)

        if grid_size < self.n_iter:
            raise ValueError(
                "The total space of parameters %d is smaller "
                "than n_iter=%d." % (grid_size, self.n_iter)
                + " For exhaustive searches, use GridSearchCV.")
        for i in sample_without_replacement(grid_size, self.n_iter,
                                            random_state=rnd):
            yield param_grid[i]
    else:
        # Always sort the keys of a dictionary, for reproducibility
        items = sorted(self.param_distributions.items())
        for _ in six.moves.range(self.n_iter):
            params = dict()
            for k, v in items:
                if hasattr(v, "rvs"):
                    if sp_version < (0, 16):
                        params[k] = v.rvs()
                    else:
                        params[k] = v.rvs(random_state=rnd)
                else:
                    params[k] = v[rnd.randint(len(v))]
            yield params
def function4(percent, targetX):
    # add noise to the given percentage of samples in targetX
    percent = percent / 100
    n_noisy = int(percent * len(targetX))
    sample = sample_without_replacement(len(targetX), n_noisy)
    change = random.sample(range(0, 64), 10)
    for i in change:
        # index rows and column together; chained indexing (targetX[sample][i])
        # would modify a temporary copy instead of targetX itself
        targetX[sample, i] = abs(targetX[sample, i] - 16)
    return targetX
def _generate_indices(random_state, bootstrap, n_population, n_samples):
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population, n_samples,
                                             random_state=random_state)
    return indices
def fit(self, X, y=None):
    X = check_array(X)
    n_samples, n_features = X.shape
    random_state = check_random_state(self.random_state)
    self.components = sample_without_replacement(
        n_features, self.n_components, random_state=random_state)
    return self
def check_edge_case_of_sample_int(sample_without_replacement):
    # n_population < n_sample
    assert_raises(ValueError, sample_without_replacement, 0, 1)
    assert_raises(ValueError, sample_without_replacement, 1, 2)

    # n_population == n_samples
    assert_equal(sample_without_replacement(0, 0).shape, (0, ))
    assert_equal(sample_without_replacement(1, 1).shape, (1, ))

    # n_population >= n_samples
    assert_equal(sample_without_replacement(5, 0).shape, (0, ))
    assert_equal(sample_without_replacement(5, 1).shape, (1, ))

    # n_population < 0 or n_samples < 0
    assert_raises(ValueError, sample_without_replacement, -1, 5)
    assert_raises(ValueError, sample_without_replacement, 5, -1)
def check_edge_case_of_sample_int(sample_without_replacement):
    # n_population < n_sample
    assert_raises(ValueError, sample_without_replacement, 0, 1)
    assert_raises(ValueError, sample_without_replacement, 1, 2)

    # n_population == n_samples
    assert_equal(sample_without_replacement(0, 0).shape, (0, ))
    assert_equal(sample_without_replacement(1, 1).shape, (1, ))

    # n_population >= n_samples
    assert_equal(sample_without_replacement(5, 0).shape, (0, ))
    assert_equal(sample_without_replacement(5, 1).shape, (1, ))

    # n_population < 0 or n_samples < 0
    assert_raises(ValueError, sample_without_replacement, -1, 5)
    assert_raises(ValueError, sample_without_replacement, 5, -1)
def _minify_dataset(ratings_df: pd.DataFrame, random_state: int) -> pd.DataFrame:
    users_count = len(ratings_df['user_id'].unique())
    # keep at most 200 users; integer division keeps the count usable as a sample size
    samples = 200 if 200 < users_count else users_count // 2
    users_subset = set(
        sample_without_replacement(users_count, samples,
                                   random_state=random_state))
    return ratings_df[ratings_df['user_id'].isin(users_subset)]
def get_random_gene_df(gene_df, n_genes, label_col="type"):
    labels = gene_df.loc[:, label_col]
    unlab_df = gene_df.drop(label_col, axis=1)
    # sample gene columns from the unlabeled frame so the label column
    # cannot be drawn again
    index_set = sample_without_replacement(unlab_df.shape[1], n_genes)
    gene_arr_all = unlab_df.columns
    gene_arr_rand = gene_arr_all[index_set]
    gene_df_rand = gene_df[gene_arr_rand]
    gene_df_rand["type"] = labels
    return gene_df_rand
def subsampled_hadamard_matrix(n_components, n_features, random_state=None):
    """Sub-sampled Hadamard matrix with shape (n_components, n_features)

    A Hadamard matrix of shape at least (n_components, n_features) is
    subsampled without replacement.

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.
    """
    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d"
                         % n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d"
                         % n_features)
    random_state = check_random_state(random_state)
    n_hadamard_size = int(max(2 ** np.ceil(np.log2(x))
                              for x in (n_components, n_features)))
    row = sample_without_replacement(n_hadamard_size, n_components,
                                     random_state=random_state)
    col = sample_without_replacement(n_hadamard_size, n_features,
                                     random_state=random_state)
    hadamard_matrix = sp_hadamard(n_hadamard_size, dtype=float)[row][:, col]
    hadamard_matrix *= 1 / np.sqrt(n_components)
    return hadamard_matrix
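A brief usage sketch of the helper above (shapes chosen arbitrarily; scipy.linalg.hadamard is assumed to be the sp_hadamard alias used in the snippet):

import numpy as np
from scipy.linalg import hadamard as sp_hadamard
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

P = subsampled_hadamard_matrix(n_components=8, n_features=20, random_state=0)
print(P.shape)          # (8, 20)
X = np.random.RandomState(0).randn(100, 20)
X_proj = X @ P.T        # project 20-dimensional data down to 8 dimensions
print(X_proj.shape)     # (100, 8)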
def observer(state):
    """Observe simulation state with uncertainty -> approximate state."""

    # First average over cells to get non-spatial distribution
    ncells = int(np.round(len(state) / 15, 0))
    state_by_cell = np.reshape(state, (int(ncells), 15))

    # Species proportions
    species_by_cell = np.array([
        np.sum(state_by_cell[:, 0:3], axis=1),
        np.sum(state_by_cell[:, 3:6], axis=1),
        np.sum(state_by_cell[:, 6:9], axis=1),
        np.sum(state_by_cell[:, 9:12], axis=1),
        np.sum(state_by_cell[:, 12:14], axis=1),
        state_by_cell[:, 14]
    ]).T

    # Add empty space to be sampled also
    space = np.array(1.0 - np.sum(state_by_cell, axis=1)).reshape((400, 1))
    cell_props = np.append(species_by_cell, space, axis=1)

    # Create population of hosts and sample appropriately
    population = np.array(pop_size * cell_props)

    obs_states = []
    for i in range(ncells):
        sample = sample_without_replacement(pop_size, n_samples)
        bins = np.append([0.0], np.cumsum(population[i]))
        observed_species = np.histogram(sample, bins)[0]
        observed_state = np.zeros(15)

        # Tanoak
        for j in range(4):
            idcs = ((3 * j), (3 * j + 3))
            inf_probs = state_by_cell[i, idcs[0]:idcs[1]] / np.sum(
                state_by_cell[i, idcs[0]:idcs[1]])
            inf_sample = np.random.choice(3, observed_species[j], p=inf_probs)
            observed_state[idcs[0]:idcs[1]] = np.histogram(
                inf_sample, range(4))[0]

        # Bay
        inf_probs = state_by_cell[i, 12:14] / np.sum(state_by_cell[i, 12:14])
        inf_sample = np.random.choice(2, observed_species[4], p=inf_probs)
        observed_state[12:14] = np.histogram(inf_sample, range(3))[0]

        # Redwood
        observed_state[14] = observed_species[5]

        obs_states.append(observed_state)

    obs_states = np.array(obs_states)
    obs_state = np.sum(obs_states, axis=0) / (n_samples * ncells)

    return obs_state
def bootstrap_generator(n_bootstrap_iterations, sample_fraction, X,
                        random_state=None):
    """Generates bootstrap samples from dataset."""
    if random_state is not None:
        np.random.seed(random_state)
        random.seed(random_state)
    n_samples = len(X)
    n_subsamples = np.floor(sample_fraction * n_samples).astype(int)

    for _ in range(n_bootstrap_iterations):
        subsample = sample_without_replacement(n_samples, n_subsamples)
        yield subsample
def subsampled_hadamard_matrix(n_components, n_features, random_state=None):
    """Sub-sampled Hadamard matrix with shape (n_components, n_features)

    A Hadamard matrix of shape at least (n_components, n_features) is
    subsampled without replacement.

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.
    """
    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d"
                         % n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d"
                         % n_features)
    random_state = check_random_state(random_state)
    n_hadamard_size = int(max(2 ** np.ceil(np.log2(x))
                              for x in (n_components, n_features)))
    row = sample_without_replacement(n_hadamard_size, n_components,
                                     random_state=random_state)
    col = sample_without_replacement(n_hadamard_size, n_features,
                                     random_state=random_state)
    hadamard_matrix = sp_hadamard(n_hadamard_size, dtype=float)[row][:, col]
    hadamard_matrix *= 1 / np.sqrt(n_components)
    return hadamard_matrix
def decoderaccuracy_wtih_numcells(self, x, y, iterations, task, classifier_type):
    numcells = np.size(x, 1)
    percsamples = [1, 5, 10, 20, 50, 80, 100]
    numsamples = [int(numcells * (p / 100)) for p in percsamples]
    numcells_dataframe = pd.DataFrame(
        columns=['SampleSize', 'Split', 'R2', 'rho', 'score', 'errorprob'])
    k = KFold(n_splits=numcell_kfold_splits, random_state=None, shuffle=False)
    for n, ns in enumerate(numsamples):
        print('Fitting on %d neurons' % ns)
        for i in np.arange(iterations):
            cells = sample_without_replacement(numcells, ns)
            x_resample = x[:, cells]
            count_cv = 1
            # Also do k-fold validation for these iterations
            for train_index, test_index in k.split(x_resample):
                # Split data
                x_rs_train, x_rs_test = x_resample[train_index], x_resample[test_index]
                y_rs_train, y_rs_test = y[train_index], y[test_index]
                nbpfmodel = self.fit_SVM(x_rs_train, y_rs_train,
                                         classifier_type=classifier_type)
                scores, prediction, probability = self.validate_model(
                    classifier_type=classifier_type,
                    model=nbpfmodel,
                    x_test=x_rs_test,
                    y_test=y_rs_test,
                    task=task,
                    plotflag=plot_numcells)
                backend.clear_session()
                R2 = CommonFunctions.get_R2(y_actual=y_rs_test,
                                            y_predicted=prediction)
                rho = CommonFunctions.get_R2(y_actual=y_rs_test,
                                             y_predicted=prediction)
                numcells_dataframe = numcells_dataframe.append(
                    {
                        'SampleSize': '%d%%' % percsamples[n],
                        'Split': count_cv,
                        'R2': R2,
                        'rho': rho,
                        'score': scores,
                        'errorprob': probability
                    },
                    ignore_index=True)
                count_cv += 1
    return numcells_dataframe
def _generate_ts_indices(random_state, bootstrap, n_population, block_size):
    """Draw randomly sampled indices."""
    # Draw sample indices
    if bootstrap:
        indices = mb_bootstrap_indicies(n_population, block_size)
    else:
        # FIXME: block bootstrap without replacement
        # (`n_samples` is not defined in this branch in the original code)
        indices = sample_without_replacement(
            n_population, n_samples, random_state=random_state)
    return indices
def data_pseudo_labeling(unlabeled_dataset, neigh, keywords):
    from sklearn.utils.random import sample_without_replacement
    # `seudo_labeled_data_size` is assumed to be defined at module level
    index_pseudo_data = sample_without_replacement(len(unlabeled_dataset),
                                                   seudo_labeled_data_size)
    pseudo_data = unlabeled_dataset.iloc[sorted(index_pseudo_data)]
    pseudo_predict = neigh.predict(pseudo_data[keywords])
    pseudo_data.insert(0, 'Class', pseudo_predict)
    selected_columns = keywords.insert(0, 'Patent_Number')
    selected_columns = selected_columns.insert(0, 'Class')
    pseudo_labeled_data = pseudo_data.filter(selected_columns)
    return pseudo_labeled_data
def _generate_random_features(random_state, bootstrap, n_population,
                              n_samples):
    """Draw randomly sampled indices."""
    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population, n_samples,
                                             random_state=random_state)
    return indices
def reportAccuracyVersusSparsityOfInput(df, normalized_df, model, labeled_tags,
                                        train_index, test_index, percent):
    N_train = train_index.shape[0]
    # cast the sample size to int: N_train * percent / 100 is a float
    train_subset_index = sample_without_replacement(
        N_train, int(N_train * percent / 100))
    subset_train_index = train_index[train_subset_index]
    test_index_updated = set(train_index).union(set(test_index)).difference(
        set(subset_train_index))
    accuracy, c_matrix, ra_score = testClassification(
        df, normalized_df, model, labeled_tags,
        list(subset_train_index), list(test_index_updated))
    return accuracy, ra_score
def _generate_hypercube(samples, dimensions, rng):
    """Returns distinct binary samples of length dimensions
    """
    if dimensions > 30:
        return np.hstack([
            rng.randint(2, size=(samples, dimensions - 30)),
            _generate_hypercube(samples, 30, rng)
        ])
    out = sample_without_replacement(2**dimensions, samples,
                                     random_state=rng).astype(dtype='>u4',
                                                              copy=False)
    out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:]
    return out
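A small sketch of the bit-unpacking trick used above: distinct integers below 2**dimensions are drawn without replacement and their big-endian bit patterns become distinct binary rows (the values below are illustrative, not from the original project):

import numpy as np
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

rng = check_random_state(0)
rows = _generate_hypercube(samples=5, dimensions=4, rng=rng)
print(rows)        # 5 distinct rows of 4 bits each
print(rows.shape)  # (5, 4)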
def _generate_indices(random_state, bootstrap, n_population, n_samples):
    """Draw randomly sampled indices.

    Internal use only. See sklearn/ensemble/bagging.py
    """
    # Draw sample indices
    if bootstrap:
        indices = random_state.randint(0, n_population, n_samples)
    else:
        indices = sample_without_replacement(n_population, n_samples,
                                             random_state=random_state)
    return indices
def bootstrap_generator(n_bootstrap_iterations, sample_fraction, X,
                        random_state=None):
    """Generates bootstrap samples from dataset."""
    n_samples = len(X)
    n_subsamples = np.floor(sample_fraction * n_samples).astype(int)
    subsamples = []
    for _ in range(n_bootstrap_iterations):
        subsample = sample_without_replacement(n_samples, n_subsamples,
                                               random_state=None)
        subsamples.append(subsample)
    return subsamples
def test_sample_without_replacement_algorithms():
    methods = ("auto", "tracking_selection", "reservoir_sampling", "pool")

    for m in methods:
        sample_without_replacement_method = \
            lambda n_population, n_samples, random_state=None: \
            sample_without_replacement(n_population, n_samples,
                                       method=m, random_state=random_state)

        check_edge_case_of_sample_int(sample_without_replacement_method)
        check_sample_int(sample_without_replacement_method)
        check_sample_int_distribution(sample_without_replacement_method)
def subsampled_identity_matrix(n_components, n_features, random_state=None,
                               with_replacement=True):
    """Sub-sampled identity matrix with shape (n_components, n_features)

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    with_replacement : bool,
        Whether or not to draw components with replacement.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.
    """
    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d"
                         % n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d"
                         % n_features)
    rng = check_random_state(random_state)
    components = sparse.dia_matrix((np.ones(n_features), [0]),
                                   shape=(n_features, n_features)).tocsr()

    if with_replacement:
        mask = rng.randint(n_features, size=(n_components, ))
    else:
        mask = sample_without_replacement(n_features, n_components,
                                          random_state=rng)

    components = components[mask]
    return components * np.sqrt(1.0 * n_features / n_components)
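A minimal usage sketch of the helper above (sizes arbitrary; the `sparse` alias and the other imports are assumed to match the snippet's module):

import numpy as np
from scipy import sparse
from sklearn.utils import check_random_state
from sklearn.utils.random import sample_without_replacement

# Each row is a scaled one-hot vector, so projecting with this matrix amounts
# to picking a random subset of features and rescaling them.
P = subsampled_identity_matrix(n_components=3, n_features=10,
                               random_state=0, with_replacement=False)
print(P.shape)                  # (3, 10)
print(P.toarray().sum(axis=1))  # one non-zero per row, equal to sqrt(10 / 3)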
def get_all_indices(self, n_samples=None, max_samples=None,
                    random_state=None):
    """Get the indices on which to evaluate the fitness of a program.

    Parameters
    ----------
    n_samples : int
        The number of samples.

    max_samples : int
        The maximum number of samples to use.

    random_state : RandomState instance
        The random number generator.

    Returns
    -------
    indices : array-like, shape = [n_samples]
        The in-sample indices.

    not_indices : array-like, shape = [n_samples]
        The out-of-sample indices.
    """
    if self._indices_state is None and random_state is None:
        raise ValueError('The program has not been evaluated for fitness '
                         'yet, indices not available.')

    if n_samples is not None and self._n_samples is None:
        self._n_samples = n_samples
    if max_samples is not None and self._max_samples is None:
        self._max_samples = max_samples
    if random_state is not None and self._indices_state is None:
        self._indices_state = random_state.get_state()

    indices_state = check_random_state(None)
    indices_state.set_state(self._indices_state)

    not_indices = sample_without_replacement(
        self._n_samples,
        self._n_samples - self._max_samples,
        random_state=indices_state)
    sample_counts = np.bincount(not_indices, minlength=self._n_samples)
    indices = np.where(sample_counts == 0)[0]

    return indices, not_indices
def make_train_validation_test_triplets_list(triplet_file, random_seed=None):
    random.seed(random_seed)
    triplets = np.loadtxt(triplet_file)

    # sample part of the triplets
    n_triplets = len(triplets)
    triplets = triplets[sample_without_replacement(n_population=n_triplets,
                                                   n_samples=40000)]

    train_triplets_file = "./train_triplets_list.txt"
    validation_triplets_file = "./validation_triplets_list.txt"
    test_triplets_file = "./test_triplets_list.txt"

    if os.path.exists(train_triplets_file) and os.path.exists(
            validation_triplets_file) and os.path.exists(test_triplets_file):
        triplets_train = np.loadtxt(train_triplets_file)
        triplets_validation = np.loadtxt(validation_triplets_file)
        triplets_test = np.loadtxt(test_triplets_file)
    else:
        train_images = random.sample(range(0, 5000), 3600)  # list(range(0, 3800))
        triplets_train = [
            t for t in triplets
            if (t[0] in train_images and t[1] in train_images
                and t[2] in train_images)
        ]
        triplets_vt = [
            t for t in triplets
            if (t[0] not in train_images and t[1] not in train_images
                and t[2] not in train_images)
        ]
        triplets_validation, triplets_test = train_test_split(triplets_vt,
                                                              train_size=0.5)
        np.savetxt(train_triplets_file, triplets_train)
        np.savetxt(validation_triplets_file, triplets_validation)
        np.savetxt(test_triplets_file, triplets_test)

    print("Train dataset size: %d" % (len(triplets_train)))
    print("Validation dataset size: %d" % (len(triplets_validation)))
    print("Test dataset size: %d" % (len(triplets_test)))
    return triplets_train, triplets_validation, triplets_test
def latin_hypercube_sampling(bounds, pop):
    """Latin hypercube sampling to generate more uniformly distributed
    initial parameter values for differential evolution.

    Parameters
    ----------
    bounds : np.array
        Bounds to generate parameters within, should be of shape
        (nb of parameters, 2)
    pop : int
        Number of sets of initial parameters to generate
    """
    ranges = np.linspace(bounds[:, 0], bounds[:, 1], pop + 1).T
    ranges = np.array([ranges[:, :-1], ranges[:, 1:]]).T
    cs = np.random.uniform(low=ranges[:, :, 0], high=ranges[:, :, 1])
    a = sample_without_replacement(pop**len(bounds), pop)
    a = np.array(np.unravel_index(a, [pop] * len(bounds)))
    return np.array([cs[a[i], i] for i in range(len(bounds))]).T
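A short usage sketch of the sampler above (bounds and population size are arbitrary illustrations):

import numpy as np
from sklearn.utils.random import sample_without_replacement

# Two parameters bounded in [0, 1] and [-5, 5]; draw 10 stratified candidates.
bounds = np.array([[0.0, 1.0], [-5.0, 5.0]])
init_pop = latin_hypercube_sampling(bounds, pop=10)
print(init_pop.shape)  # (10, 2), one row per candidate parameter set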
def subsampled_identity_matrix(n_components, n_features, random_state=None,
                               with_replacement=True):
    """Sub-sampled identity matrix with shape (n_components, n_features)

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    random_state : int, RandomState instance or None (default=None)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    with_replacement : bool,
        Whether or not to draw components with replacement.

    Returns
    -------
    components : numpy array of shape [n_components, n_features]
        The generated random matrix.
    """
    if n_components <= 0:
        raise ValueError("n_components must be strictly positive, got %d"
                         % n_components)
    if n_features <= 0:
        raise ValueError("n_features must be strictly positive, got %d"
                         % n_features)
    rng = check_random_state(random_state)
    components = sparse.dia_matrix((np.ones(n_features), [0]),
                                   shape=(n_features, n_features)).tocsr()

    if with_replacement:
        mask = rng.randint(n_features, size=(n_components,))
    else:
        mask = sample_without_replacement(n_features, n_components,
                                          random_state=rng)

    components = components[mask]
    return components * np.sqrt(1.0 * n_features / n_components)
def get_all_indices(self, n_samples=None, max_samples=None,
                    random_state=None):
    """Get the indices on which to evaluate the fitness of a program.

    Parameters
    ----------
    n_samples : int
        The number of samples.

    max_samples : int
        The maximum number of samples to use.

    random_state : RandomState instance
        The random number generator.

    Returns
    -------
    indices : array-like, shape = [n_samples]
        The in-sample indices.

    not_indices : array-like, shape = [n_samples]
        The out-of-sample indices.
    """
    if self._indices_state is None and random_state is None:
        raise ValueError('The program has not been evaluated for fitness '
                         'yet, indices not available.')

    if n_samples is not None and self._n_samples is None:
        self._n_samples = n_samples
    if max_samples is not None and self._max_samples is None:
        self._max_samples = max_samples
    if random_state is not None and self._indices_state is None:
        self._indices_state = random_state.get_state()

    indices_state = check_random_state(None)
    indices_state.set_state(self._indices_state)

    not_indices = sample_without_replacement(
        self._n_samples,
        self._n_samples - self._max_samples,
        random_state=indices_state)
    sample_counts = np.bincount(not_indices, minlength=self._n_samples)
    indices = np.where(sample_counts == 0)[0]

    return indices, not_indices
def fit(self, X, y, random_state=None):
    """
    Train ENOLS on the given training set.

    Parameters
    ----------
    X: an input array of shape (n_sample, n_features)
    y: an array of shape (n_sample,) containing the target values
       for the input examples

    Return
    ------
    self: the fitted model
    """
    # use random instead of np.random to sample random numbers below
    random = check_random_state(random_state)
    estimators = [('lr', LinearRegression())]
    if isinstance(self.sample_size, int):
        self.sample_size = 'reservoir_sampling'

    # add all the trained OLS models to this list
    self.estimators_lr, self.estimators_TSR, self.estimators_enols = [], [], []
    for i in range(self.n_estimators):
        samples = sample_without_replacement(
            n_population=random.choice([50, 100]),
            n_samples=random.choice([10, 20]),
            random_state=random_state,
            method=self.sample_size)

        X_train, y_train = [], []
        for i in samples:
            X_train.append(X[i]), y_train.append(y[i])

        reg = LinearRegression()
        reg.fit(np.array(X_train), np.array(y_train))

        tsr = TheilSenRegressor()
        tsr.fit(np.array(X_train), np.array(y_train))

        enol = StackingRegressor(estimators=estimators,
                                 final_estimator=LinearRegression())
        enol.fit(np.array(X_train), np.array(y_train))

        self.estimators_lr.append(reg)
        self.estimators_TSR.append(tsr)
        self.estimators_enols.append(enol)

    return self
def lesinn(self, x_train, to_query):
    ensemble_size = 50
    subsample_size = int(.01 * x_train.shape[0])
    scores = np.zeros([to_query.shape[0], 1])
    seeds = self.Trainer.rng.randint(MAX_INT, size=ensemble_size)
    for i in range(0, ensemble_size):
        rs = np.random.RandomState(seeds[i])
        sid = sample_without_replacement(n_population=x_train.shape[0],
                                         n_samples=subsample_size,
                                         random_state=rs)
        subsample = x_train[sid]
        kdt = KDTree(subsample, metric='euclidean')
        dists, indices = kdt.query(to_query, k=self.n_neighbors)
        dists = np.mean(dists, axis=1)[:, np.newaxis]
        scores += dists
    scores = scores / ensemble_size
    return scores
def equalise_laps_with_numlaps_innorew(Imgobj, X, Y, Tasklabel):
    stoplicklap = Imgobj.Parsed_Behavior['lick_stop'].item()
    numlaps_afterlickstops = Imgobj.Parsed_Behavior['numlaps'].item()['Task2'] - stoplicklap
    print('Number of laps being chosen', numlaps_afterlickstops)
    numlaps_currenttask = Imgobj.Parsed_Behavior['numlaps'].item()[Tasklabel] - 3
    samplelaps = sample_without_replacement(numlaps_currenttask,
                                            numlaps_afterlickstops)
    lapframes = [scipy.io.loadmat(os.path.join(Imgobj.FolderName, 'Behavior', p))['E'].T
                 for p in Imgobj.PlaceFieldData if Tasklabel in p][0]
    print(samplelaps)
    X_eq = X[np.where(lapframes == samplelaps)[0], :]
    Y_eq = Y[np.where(lapframes == samplelaps)[0]]
    return X_eq, Y_eq
def _generate_hypercube(samples, dimensions, rng):
    """Returns distinct binary samples of length dimensions
    """
    if not has_sklearn():
        raise RuntimeError("Scikit-learn is needed to run "
                           "make_classification.")

    from sklearn.utils.random import sample_without_replacement
    if dimensions > 30:
        return np.hstack([np.random.randint(2, size=(samples, dimensions - 30)),
                          _generate_hypercube(samples, 30, rng)])
    random_state = int(rng.randint(dimensions))
    out = sample_without_replacement(2 ** dimensions, samples,
                                     random_state=random_state).astype(
                                         dtype='>u4', copy=False)
    out = np.unpackbits(out.view('>u1')).reshape((-1, 32))[:, -dimensions:]
    return out
def _aom_moa_helper(mode, scores, n_buckets, method, bootstrap_estimators,
                    random_state):
    """Internal helper function for Average of Maximum (AOM) and
    Maximum of Average (MOA). See :cite:`aggarwal2015theoretical` for details.

    First dividing estimators into subgroups, take the maximum/average score
    as the subgroup score. Finally, take the average/maximum of all subgroup
    outlier scores.

    Parameters
    ----------
    mode : str
        Define the operation model, either "AOM" or "MOA".

    scores : numpy array of shape (n_samples, n_estimators)
        The score matrix outputted from various estimators.

    n_buckets : int, optional (default=5)
        The number of subgroups to build.

    method : str, optional (default='static')
        {'static', 'dynamic'}, if 'dynamic', build subgroups randomly with
        dynamic bucket size.

    bootstrap_estimators : bool, optional (default=False)
        Whether estimators are drawn with replacement.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    combined_scores : Numpy array of shape (n_samples,)
        The combined outlier scores.
    """
    if mode != 'AOM' and mode != 'MOA':
        raise NotImplementedError(
            '{mode} is not implemented'.format(mode=mode))

    scores = check_array(scores)
    # TODO: add one more parameter for max number of estimators
    #       use random_state instead
    #       for now it is fixed at n_estimators/2
    n_estimators = scores.shape[1]
    check_parameter(n_buckets, 2, n_estimators, param_name='n_buckets')

    scores_buckets = np.zeros([scores.shape[0], n_buckets])

    if method == 'static':
        n_estimators_per_bucket = int(n_estimators / n_buckets)
        if n_estimators % n_buckets != 0:
            raise ValueError('n_estimators / n_buckets has a remainder. Not '
                             'allowed in static mode.')

        if not bootstrap_estimators:
            # shuffle the estimator order
            shuffled_list = shuffle(list(range(0, n_estimators, 1)),
                                    random_state=random_state)

            head = 0
            for i in range(0, n_estimators, n_estimators_per_bucket):
                tail = i + n_estimators_per_bucket
                batch_ind = int(i / n_estimators_per_bucket)

                if mode == 'AOM':
                    scores_buckets[:, batch_ind] = np.max(
                        scores[:, shuffled_list[head:tail]], axis=1)
                else:
                    scores_buckets[:, batch_ind] = np.mean(
                        scores[:, shuffled_list[head:tail]], axis=1)

                # increment index
                head = head + n_estimators_per_bucket
        # noinspection PyUnusedLocal
        else:
            for i in range(n_buckets):
                ind = sample_without_replacement(n_estimators,
                                                 n_estimators_per_bucket,
                                                 random_state=random_state)
                if mode == 'AOM':
                    scores_buckets[:, i] = np.max(scores[:, ind], axis=1)
                else:
                    scores_buckets[:, i] = np.mean(scores[:, ind], axis=1)

    elif method == 'dynamic':  # random bucket size
        for i in range(n_buckets):
            # the number of estimators in a bucket should be 2 - n/2
            max_estimator_per_bucket = RandomState(seed=random_state).randint(
                2, int(n_estimators / 2))
            ind = sample_without_replacement(n_estimators,
                                             max_estimator_per_bucket,
                                             random_state=random_state)
            if mode == 'AOM':
                scores_buckets[:, i] = np.max(scores[:, ind], axis=1)
            else:
                scores_buckets[:, i] = np.mean(scores[:, ind], axis=1)

    else:
        raise NotImplementedError(
            '{method} is not implemented'.format(method=method))

    if mode == 'AOM':
        return np.mean(scores_buckets, axis=1)
    else:
        return np.max(scores_buckets, axis=1)
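A hedged usage sketch for the helper above; `check_array`, `check_parameter` and `shuffle` are assumed to be the scikit-learn / pyod utilities imported in the snippet's module, and the score values are made up for illustration:

import numpy as np

# Toy score matrix: 4 samples scored by 6 hypothetical detectors.
rng = np.random.RandomState(0)
scores = rng.rand(4, 6)

# 'AOM' with 3 static buckets of 2 detectors each: take the max within each
# bucket, then average across buckets.
combined = _aom_moa_helper('AOM', scores, n_buckets=3, method='static',
                           bootstrap_estimators=False, random_state=0)
print(combined.shape)  # (4,)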
# sample(n_population, n_sample)
#
sampling_algorithm = {}

###########################################################################
# Set Python core input
sampling_algorithm["python-core-sample"] = \
    lambda n_population, n_sample: \
    random.sample(xrange(n_population), n_sample)

###########################################################################
# Set custom automatic method selection
sampling_algorithm["custom-auto"] = \
    lambda n_population, n_samples, random_state=None: \
    sample_without_replacement(n_population, n_samples, method="auto",
                               random_state=random_state)

###########################################################################
# Set custom tracking based method
sampling_algorithm["custom-tracking-selection"] = \
    lambda n_population, n_samples, random_state=None: \
    sample_without_replacement(n_population, n_samples,
                               method="tracking_selection",
                               random_state=random_state)

###########################################################################
# Set custom reservoir based method
sampling_algorithm["custom-reservoir-sampling"] = \
    lambda n_population, n_samples, random_state=None: \
def _parallel_build_estimators(n_estimators, ensemble, X, y, cost_mat, seeds,
                               verbose):
    """Private function used to build a batch of estimators within a job."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using a mask, and then fit
        if bootstrap:
            indices = random_state.randint(0, n_samples, max_samples)
        else:
            indices = sample_without_replacement(n_samples, max_samples,
                                                 random_state=random_state)

        sample_counts = np.bincount(indices, minlength=n_samples)

        estimator.fit((X[indices])[:, features], y[indices],
                      cost_mat[indices, :])
        samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
def _parallel_build_ranking_estimators(n_estimators, ensemble, X, y, Q,
                                       sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job.
    Now it supports queries and querywise sampling."""
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features
    uQueries = np.unique(Q)

    sample_whole_queries = False
    if hasattr(ensemble, "sample_whole_queries"):
        sample_whole_queries = ensemble.sample_whole_queries

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and \
            (0.0 < max_samples <= 1.0):
        if sample_whole_queries:
            max_samples = int(max_samples * len(uQueries))
        else:
            max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and \
            (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries),
                                                             max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append,
                                     [np.where(Q == i) for i in Qindices])
                else:
                    indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                if sample_whole_queries:
                    notQindices = uQueries[random_state.randint(
                        0, len(uQueries), len(uQueries) - max_samples)]
                    notQindices.sort()
                    # zero out the samples of the queries that were NOT drawn
                    not_indices = reduce(np.append,
                                         [np.where(Q == i) for i in notQindices])
                else:
                    not_indices = sample_without_replacement(
                        n_samples, n_samples - max_samples,
                        random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, Q=Q,
                          sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                if sample_whole_queries:
                    Qindices = uQueries[random_state.randint(0, len(uQueries),
                                                             max_samples)]
                    Qindices.sort()
                    indices = reduce(np.append,
                                     [np.where(Q == i) for i in Qindices])
                else:
                    indices = random_state.randint(0, n_samples, max_samples)
            else:
                if sample_whole_queries:
                    Qindices = uQueries[
                        sample_without_replacement(len(uQueries), max_samples,
                                                   random_state=random_state)]
                    Qindices.sort()
                    indices = reduce(np.append,
                                     [np.where(Q == i) for i in Qindices])
                else:
                    indices = sample_without_replacement(
                        n_samples, max_samples, random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices], Q=Q[indices])
            samples = sample_counts > 0.0

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
def _parallel_build_estimators(n_estimators, ensemble, all_X, all_y,
                               sample_weight, seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    positives = np.where(all_y == 1)[0]
    unlabeled = np.where(all_y == 0)[0]

    X_positives = all_X[positives]
    X_unlabeled = all_X[unlabeled]
    y_positives = all_y[positives]
    y_unlabeled = all_y[unlabeled]

    # Retrieve settings
    n_samples, n_features = X_unlabeled.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features

    # can't currently support sample weights
    if sample_weight is not None:
        raise ValueError("Can't currently support sample weight with PUBagging")
    support_sample_weight = False
    # support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
    #                                           "sample_weight")
    # if not support_sample_weight and sample_weight is not None:
    #     raise ValueError("The base estimator doesn't support sample weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices = sample_without_replacement(
                    n_samples, n_samples - max_samples,
                    random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(all_X[:, features], all_y,
                          sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples, max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            new_X = np.vstack((X_positives, X_unlabeled[indices]))
            new_y = np.concatenate((y_positives, y_unlabeled[indices]))
            estimator.fit(new_X[:, features], new_y)
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
def sparse_random_matrix(n_components, n_features, density='auto',
                         random_state=None):
    """Generalized Achlioptas random sparse matrix for random projection

    Setting density to 1 / 3 will yield the original matrix by Dimitris
    Achlioptas, while setting a lower value will yield the generalization
    by Ping Li et al.

    If we note :math:`s = 1 / density`, the components of the random matrix
    are drawn from:

      - -sqrt(s) / sqrt(n_components)   with probability 1 / 2s
      -  0                              with probability 1 - 1 / s
      - +sqrt(s) / sqrt(n_components)   with probability 1 / 2s

    Parameters
    ----------
    n_components : int,
        Dimensionality of the target projection space.

    n_features : int,
        Dimensionality of the original source space.

    density : float in range ]0, 1/3], optional
        Ratio of non-zero component in the random projection matrix.

        By default the value is set to the minimum density as recommended
        by Ping Li et al.: 1 / sqrt(n_features)

        Use density = 1 / 3.0 if you want to reproduce the results from
        Achlioptas, 2001.

    random_state : integer, RandomState instance or None (default)
        Control the pseudo random number generator used to generate the
        matrix at fit time.

    Returns
    -------
    components : numpy array or CSR matrix with shape [n_components, n_features]
        The generated random matrix.

    See Also
    --------
    gaussian_random_matrix

    References
    ----------
    .. [1] Ping Li, T. Hastie and K. W. Church, 2006,
           "Very Sparse Random Projections".
           http://www.stanford.edu/~hastie/Papers/Ping/KDD06_rp.pdf

    .. [2] D. Achlioptas, 2001, "Database-friendly random projections",
           http://www.cs.ucsc.edu/~optas/papers/jl.pdf
    """
    _check_input_size(n_components, n_features)
    density = _check_density(density, n_features)
    rng = check_random_state(random_state)

    if density == 1:
        # skip index generation if totally dense
        components = rng.binomial(1, 0.5, (n_components, n_features)) * 2 - 1
        return 1 / np.sqrt(n_components) * components

    else:
        # Generate location of non zero elements
        indices = []
        offset = 0
        indptr = [offset]
        for i in xrange(n_components):
            # find the indices of the non-zero components for row i
            n_nonzero_i = rng.binomial(n_features, density)
            indices_i = sample_without_replacement(n_features, n_nonzero_i,
                                                   random_state=rng)
            indices.append(indices_i)
            offset += n_nonzero_i
            indptr.append(offset)

        indices = np.concatenate(indices)

        # Among non zero components the probability of the sign is 50%/50%
        data = rng.binomial(1, 0.5, size=np.size(indices)) * 2 - 1

        # build the CSR structure by concatenating the rows
        components = sp.csr_matrix((data, indices, indptr),
                                   shape=(n_components, n_features))

        return np.sqrt(1 / density) / np.sqrt(n_components) * components
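For reference, the same construction is exposed through scikit-learn's public random-projection API, which may be easier to use than the internal helper above (a minimal sketch; sizes and seed are arbitrary):

import numpy as np
from sklearn.random_projection import SparseRandomProjection

X = np.random.RandomState(0).randn(20, 100)
# density='auto' uses the Ping Li et al. recommendation of 1 / sqrt(n_features)
srp = SparseRandomProjection(n_components=5, density='auto', random_state=0)
X_proj = srp.fit_transform(X)
print(X_proj.shape)           # (20, 5)
print(srp.components_.shape)  # (5, 100) sparse +/- sqrt(s)/sqrt(n_components) matrix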
def _spark_build_estimators(n_estimators, ensemble, X, y, sample_weight,
                            seeds, verbose):
    """Private function used to build a batch of estimators within a job."""
    print("building estimators")
    # Retrieve settings (the broadcast variables are unwrapped via .value)
    X = X.value
    y = y.value
    ensemble = ensemble
    sample_weight = sample_weight.value
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if (not isinstance(max_samples, (numbers.Integral, np.integer)) and
            (0.0 < max_samples <= 1.0)):
        max_samples = int(max_samples * n_samples)

    if (not isinstance(max_features, (numbers.Integral, np.integer)) and
            (0.0 < max_features <= 1.0)):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    estimators = []
    estimators_samples = []
    estimators_features = []

    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices = sample_without_replacement(
                    n_samples, n_samples - max_samples,
                    random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples, max_samples,
                                                     random_state=random_state)

            sample_counts = bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.

        estimators.append(estimator)
        estimators_samples.append(samples)
        estimators_features.append(features)

    return estimators, estimators_samples, estimators_features
def _generator_fitted_estimators(n_estimators, ensemble, X, y, sample_weight,
                                 seeds, verbose):
    """Private function used to build an iterator of estimators."""
    # Modified from sklearn.ensemble.bagging._parallel_build_estimators
    # Retrieve settings
    n_samples, n_features = X.shape
    max_samples = ensemble.max_samples
    max_features = ensemble.max_features

    if not isinstance(max_samples, (numbers.Integral, np.integer)) and \
            (0.0 < max_samples <= 1.0):
        max_samples = int(max_samples * n_samples)

    if not isinstance(max_features, (numbers.Integral, np.integer)) and \
            (0.0 < max_features <= 1.0):
        max_features = int(max_features * n_features)

    bootstrap = ensemble.bootstrap
    bootstrap_features = ensemble.bootstrap_features
    support_sample_weight = has_fit_parameter(ensemble.base_estimator_,
                                              "sample_weight")

    # Build estimators
    for i in range(n_estimators):
        if verbose > 1:
            print("building estimator %d of %d" % (i + 1, n_estimators))

        random_state = check_random_state(seeds[i])
        seed = check_random_state(random_state.randint(MAX_INT))
        estimator = ensemble._make_estimator(append=False)

        try:  # Not all estimators accept a random_state
            estimator.set_params(random_state=seed)
        except ValueError:
            pass

        # Draw features
        if bootstrap_features:
            features = random_state.randint(0, n_features, max_features)
        else:
            features = sample_without_replacement(n_features, max_features,
                                                  random_state=random_state)

        # Draw samples, using sample weights, and then fit
        if support_sample_weight:
            if sample_weight is None:
                curr_sample_weight = np.ones((n_samples,))
            else:
                curr_sample_weight = sample_weight.copy()

            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
                sample_counts = np.bincount(indices, minlength=n_samples)
                curr_sample_weight *= sample_counts
            else:
                not_indices = sample_without_replacement(
                    n_samples, n_samples - max_samples,
                    random_state=random_state)
                curr_sample_weight[not_indices] = 0

            estimator.fit(X[:, features], y, sample_weight=curr_sample_weight)
            samples = curr_sample_weight > 0.0

        # Draw samples, using a mask, and then fit
        else:
            if bootstrap:
                indices = random_state.randint(0, n_samples, max_samples)
            else:
                indices = sample_without_replacement(n_samples, max_samples,
                                                     random_state=random_state)

            sample_counts = np.bincount(indices, minlength=n_samples)

            estimator.fit((X[indices])[:, features], y[indices])
            samples = sample_counts > 0.0

        yield estimator, samples, features