def create_datasets():
    rs = RandomState(0)
    data_a = np.sort(rs.normal(0, 10, 500)).astype(int).reshape(100, 5)
    gene_names_a = list("ABCDE")
    cell_types_a = ["alpha", "beta", "gamma", "delta"]
    labels_a = rs.choice(np.arange(len(cell_types_a)), data_a.shape[0])
    # use the seeded RandomState here as well (the original called
    # np.random.choice, which bypasses the seed above)
    batch_indices_a = rs.choice(np.arange(5), size=data_a.shape[0])
    data_b = np.sort(rs.normal(100, 10, 300)).astype(int).reshape(100, 3)
    gene_names_b = list("BFA")
    cell_types_b = ["alpha", "epsilon", "rho"]
    labels_b = rs.choice(np.arange(len(cell_types_b)), data_b.shape[0])
    batch_indices_b = rs.choice(np.arange(5), size=data_b.shape[0])
    dataset_a = GeneExpressionDataset()
    dataset_b = GeneExpressionDataset()
    dataset_a.populate_from_data(X=data_a, labels=labels_a,
                                 gene_names=gene_names_a,
                                 cell_types=cell_types_a,
                                 batch_indices=batch_indices_a)
    dataset_a.name = "test_a"
    dataset_b.populate_from_data(X=data_b, labels=labels_b,
                                 gene_names=gene_names_b,
                                 cell_types=cell_types_b,
                                 batch_indices=batch_indices_b)
    dataset_b.name = "test_b"
    return dataset_a, dataset_b
def test_update_fundamental_matrix():
    prng = RandomState(20150101)
    P = compute_transition_matrix(karate_club_graph())
    n = P.shape[0]
    order = arange(P.shape[0])
    previous_index = prng.choice(order, 1)
    previous_node = order[previous_index]
    non_absorbing_nodes = chain(range(previous_index),
                                range(previous_index + 1, n))
    non_absorbing_nodes = list(non_absorbing_nodes)
    order = order[non_absorbing_nodes]
    F = compute_fundamental_matrix(
        P[non_absorbing_nodes, :][:, non_absorbing_nodes])
    absorbing_nodes = [previous_node]
    P_updated = P.copy()
    F_updated = F
    while P_updated.shape[0] >= 3:
        next_node = order[prng.choice(len(order), 1)]
        (P_updated, F_updated, order, previous_index) = \
            update_fundamental_matrix(P_updated, F_updated,
                                      next=next_node,
                                      previous=previous_node,
                                      previous_index=previous_index,
                                      node_order=order)
        previous_node = next_node
        absorbing_nodes.append(next_node)
        non_absorbing_nodes = [x for x in range(n)
                               if x not in absorbing_nodes]
        F_slow = compute_fundamental_matrix(
            P[non_absorbing_nodes, :][:, non_absorbing_nodes])
        error_at_step = sum(sum(F_updated - F_slow).T)[0, 0]
        assert abs(error_at_step) < 1e-8, "Error is more than 1e-8."
def test_unrelated_columns(N=60, random_seed=12345):
    """
    Test to see if 'unrelated' columns jam up the analysis.
    See GitHub issue #44:
    https://github.com/ACCLAB/DABEST-python/issues/44.
    Added in v0.2.5.
    """
    # rng = RandomState(MT19937(random_seed))
    rng = RandomState(PCG64(random_seed))
    # rng = np.random.default_rng(seed=random_seed)

    df = pd.DataFrame(
        {'groups': rng.choice(['Group 1', 'Group 2', 'Group 3'], size=(N,)),
         'color': rng.choice(['green', 'red', 'purple'], size=(N,)),
         'value': rng.random(size=(N,))}
    )
    df['unrelated'] = np.nan

    test = load(data=df, x='groups', y='value', idx=['Group 1', 'Group 2'])
    md = test.mean_diff.results

    assert md.difference[0] == pytest.approx(-0.0322, abs=1e-4)
    assert md.bca_low[0] == pytest.approx(-0.2279, abs=1e-4)
    assert md.bca_high[0] == pytest.approx(0.1613, abs=1e-4)
def make_ratings(n_users, n_items, min_rating_per_user, max_rating_per_user,
                 rating_choices, seed=None, shuffle=True):
    """Randomly generate a (user_id, item_id, rating) array.

    Returns
    -------
    ndarray with shape (n_samples, 3)
    """
    if not isinstance(rating_choices, (list, tuple)):
        raise ValueError("'rating_choices' must be a list or tuple")
    if min_rating_per_user < 0 or min_rating_per_user >= n_items:
        raise ValueError("invalid 'min_rating_per_user'")
    if (min_rating_per_user > max_rating_per_user) or \
            (max_rating_per_user >= n_items):
        raise ValueError("invalid 'max_rating_per_user'")

    rs = RandomState(seed=seed)
    user_arrs = []
    for user_id in range(n_users):
        item_count = rs.randint(min_rating_per_user, max_rating_per_user)
        item_ids = rs.choice(n_items, item_count, replace=False)
        ratings = rs.choice(rating_choices, item_count)
        arr = np.stack([np.repeat(user_id, item_count), item_ids, ratings],
                       axis=1)
        user_arrs.append(arr)

    ratings = np.array(np.vstack(user_arrs))
    ratings[:, 2] = ratings[:, 2].astype('float')
    if shuffle:
        rs.shuffle(ratings)
    return ratings
def merge_two_zones(zones, np1, np2, seed=None):
    # RandomState(None) seeds from the OS, so this also covers seed=None
    # (the original left `rnd` undefined in that case)
    rnd = RandomState(seed)

    i = rnd.choice(a=range(0, np2))
    j = rnd.choice(a=range(0, np1))
    i_ = i
    j_ = j
    dir_ = rnd.choice(a=[0, 1])
    if dir_ == 0:
        if 0 < i < np2 - 1:
            i_ = rnd.choice(a=[i - 1, i + 1])
        elif i == 0:
            i_ = 1
        else:
            i_ = i - 1
    else:
        if 0 < j < np1 - 1:
            j_ = rnd.choice(a=[j - 1, j + 1])
        elif j == 0:
            j_ = 1
        else:
            j_ = j - 1

    zones_ = {k: nodes for k, nodes in zones.items()
              if k != (i, j) and k != (i_, j_)}
    new_zone = list(zones[(i, j)])
    new_zone.extend(zones[i_, j_])
    zones_["m"] = new_zone
    return zones_
def create_random_selection(self, N_elements=None, scan_percentage=None,
                            random_type="equal", sort_dimensions=False):
    # Sketch of the incomplete original: `a` was undefined and the draw was
    # discarded. Assuming `a` indexes the available elements, draw 5 of them
    # without replacement, weighted by a softmax over -a.
    rs = RandomState(seed=0)
    a = np.arange(N_elements)
    p = np.exp(-a) / np.sum(np.exp(-a))
    return rs.choice(a, 5, p=p, replace=False)
def test_segmentation():
    PRNG = RandomState()
    PRNG2 = RandomState()
    if args.seed > 0:
        PRNG.seed(args.seed)
        PRNG2.seed(args.seed)

    transform = Compose(
        [
            [ColorJitter(prob=0.75), None],
            Merge(),
            Expand((0.8, 1.5)),
            RandomCompose([
                # RandomResize(1, 1.5),
                RandomRotate(10),
                RandomShift(0.1)]),
            Scale(300),
            # ElasticTransform(100),
            RandomCrop(300),
            HorizontalFlip(),
            Split([0, 3], [3, 6]),
            # [SubtractMean(mean=VOC.MEAN), None],
        ],
        PRNG,
        border='constant',
        fillval=VOC.MEAN,
        anchor_index=3)

    voc_dataset = VOCSegmentation(root=args.root,
                                  image_set=[('2007', 'trainval')],
                                  transform=transform,
                                  instance=False)
    viz = Viz()

    results = []
    count = 0
    i = PRNG2.choice(len(voc_dataset))
    for _ in range(1000):
        img, target = voc_dataset[i]
        img2 = viz.blend_segmentation(img, target)
        con = np.hstack([img, target, img2])
        results.append(con)

        cv2.imshow('result', con[..., ::-1])
        c = cv2.waitKey(500)
        if c == 27 or c == ord('q'):   # ESC / 'q'
            break
        elif c == ord('c') or count >= 3:
            count = 0
            i = PRNG2.choice(len(voc_dataset))
        count += 1
def compute_bootstrapped_diff(x0, x1, is_paired, effect_size,
                              resamples=5000, random_seed=12345):
    """Bootstraps the effect_size for 2 groups."""
    from . import effsize as __es
    import numpy as np
    from numpy.random import PCG64, RandomState

    # rng = RandomState(default_rng(random_seed))
    rng = RandomState(PCG64(random_seed))

    out = np.repeat(np.nan, resamples)
    x0_len = len(x0)
    x1_len = len(x1)

    for i in range(int(resamples)):
        if is_paired:
            if x0_len != x1_len:
                raise ValueError("The two arrays do not have the same length.")
            random_idx = rng.choice(x0_len, x0_len, replace=True)
            x0_sample = x0[random_idx]
            x1_sample = x1[random_idx]
        else:
            x0_sample = rng.choice(x0, x0_len, replace=True)
            x1_sample = rng.choice(x1, x1_len, replace=True)

        out[i] = __es.two_group_difference(x0_sample, x1_sample,
                                           is_paired, effect_size)

    # Check whether there are any infinities in the bootstrap, which likely
    # indicates the sample sizes are too small, as the computation of
    # Cohen's d and Hedges' g then necessitated a division by zero.
    # Added in v0.2.6.
    # num_infinities = len(out[np.isinf(out)])
    # print(num_infinities)
    # if num_infinities > 0:
    #     warn_msg = "There are {} bootstraps that are not defined. "\
    #         "This is likely due to small sample sizes. "\
    #         "The values in a bootstrap for a group will be more likely "\
    #         "to be all equal, with a resulting variance of zero. "\
    #         "The computation of Cohen's d and Hedges' g will therefore "\
    #         "involve a division by zero. "
    #     warnings.warn(warn_msg.format(num_infinities), category="UserWarning")
    return out
def test_RandomRectangularPattern_ca_3ch_postit(self):
    rso = RandomState(1)
    state_tuple = rso.get_state()
    t = image_triggers.RandomRectangularPattern(
        3, 3, 3,
        color_algorithm='channel_assign',
        color_options={'cval': [255, 254, 253]},
        pattern_style='postit',
        random_state_obj=rso)
    actual_img = t.get_data()
    actual_mask = t.get_mask()

    # reset the random state and generate the pattern in the same manner
    rso.set_state(state_tuple)
    per_chan_expected_img = rso.choice(2, 3 * 3).reshape((3, 3)).astype(bool)
    expected_img = np.zeros((3, 3, 3))
    expected_img[:, :, 0] = per_chan_expected_img * 255  # the color
    expected_img[:, :, 1] = per_chan_expected_img * 254  # the color
    expected_img[:, :, 2] = per_chan_expected_img * 253  # the color
    expected_mask = np.ones((3, 3)).astype(bool)

    self.assertTrue(np.array_equal(actual_img, expected_img))
    self.assertTrue(np.array_equal(actual_mask, expected_mask))
def update_metropolis(field: np.ndarray, states: States, free_energy: float,
                      interaction: Interaction, interaction_coefficient: float,
                      magnetization_coefficient: float, temperature: float,
                      random_state: RandomState) -> (np.ndarray, float):
    assert states
    assert field.shape[0] == field.shape[1]
    size = field.shape[0]
    min_x = 0 if FIX_LEFT is None else 1
    max_x = size if FIX_RIGHT is None else size - 1
    min_y = 0 if FIX_BOTTOM is None else 1
    max_y = size if FIX_TOP is None else size - 1
    random_x = random_state.randint(min_x, max_x)
    random_y = random_state.randint(min_y, max_y)

    # a spin flip always needs to lead to a change of spin
    new_spin = field[random_x, random_y]
    while new_spin == field[random_x, random_y]:
        new_spin = random_state.choice(states)

    energy_delta, field_updated = calculate_energy_difference(
        field, random_x, random_y, new_spin, interaction,
        interaction_coefficient, magnetization_coefficient)
    random_number = random_state.uniform()
    acceptance_probability = np.exp(-1. / temperature * energy_delta)
    print_if_verbose(f'Energy delta: {energy_delta}, '
                     f'random number: {random_number}, '
                     f'acceptance_probability: {acceptance_probability}')
    if energy_delta <= 0 or random_number < acceptance_probability:
        print_if_verbose('Change accepted')
        return field_updated, free_energy - energy_delta
    else:
        print_if_verbose('Not accepted')
        return field, free_energy
def sample_group_counts(random_state: RandomState, total: int,
                        lam_low: float = 1.0,
                        lam_high: float = 8.0) -> List[int]:
    """
    Sample a list of integers which sum up to `total`.

    The probability of sampling an integer follows exponential decay,
    k ~ np.exp(-k * lam), where lam is a hyperparameter sampled from the
    range [lam_low, lam_high).

    :param random_state: numpy random state
    :param total: the exact sum of the sampled numbers.
    :param lam_low: lower bound for lambda in exponential decay.
    :param lam_high: upper bound for lambda in exponential decay.
    :return: a list of sampled counts.
    """
    current_max = total
    counts = []
    while current_max > 0:
        candidates = range(1, current_max + 1)
        lam = random_state.uniform(lam_low, lam_high)
        probs = np.array([np.exp(-i * lam) for i in candidates])
        probs /= sum(probs)
        selected = random_state.choice(candidates, p=probs)
        counts.append(selected)
        current_max -= selected

    assert sum(counts) == total
    return counts
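# A minimal usage sketch (not from the original source, assuming numpy and
# RandomState are imported as in the snippet above): the helper draws a
# random partition of `total`, so the parts always sum back to `total`.
rs = RandomState(0)
counts = sample_group_counts(rs, total=10)
assert sum(counts) == 10  # guaranteed by the assert inside the function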
def split_dataset(df, validation_percentage, seed):
    state = RandomState(seed)
    validation_indexes = state.choice(
        df.index, int(len(df.index) * validation_percentage), replace=False)
    training_set = df.loc[~df.index.isin(validation_indexes)]
    validation_set = df.loc[df.index.isin(validation_indexes)]
    return training_set, validation_set
class RandomGenerator(object):
    def __init__(self, seed=None):
        self._random = RandomState(seed=seed)

    def seed(self, seed):
        self._random = RandomState(seed=seed)

    def random(self):
        return self._random.rand()

    def randint(self, a, b=None):
        if b is None:
            b = a
            a = 0
        r = self._random.randint(a, high=b, size=1)
        return r[0]

    def sample(self, population, k):
        if k == 0:
            return []
        return list(self._random.choice(population, size=k, replace=False))

    def uniform(self, low=0.0, high=1.0, size=None):
        return self._random.uniform(low, high, size)

    def __getattr__(self, attr):
        return getattr(self._random, attr)

    def __getstate__(self):
        return {'_random': self._random}

    def __setstate__(self, d):
        self._random = d['_random']
def sample_roi_repr(roi_generator, sample_per_batch, nb_samples, repr_model,
                    batch_size=32, random_seed=12345, q_size=20):
    '''Sample candidate ROIs and then extract their DL representations
    '''
    samples_seen = 0
    repr_list = []
    roi_q = []  # a queue for candidate ROIs before they are scored.
    while samples_seen < nb_samples:
        rng = RandomState(samples_seen + random_seed)
        X, w = next(roi_generator)
        w /= w.sum()
        ri = rng.choice(len(X), sample_per_batch, replace=False, p=w)
        roi_q.append(X[ri])
        samples_seen += len(ri)
        if len(roi_q) >= q_size:
            X_q = np.concatenate(roi_q)
            repr_list.append(repr_model.predict(X_q, batch_size=batch_size))
            roi_q = []
    if len(roi_q) > 0:
        X_q = np.concatenate(roi_q)
        repr_list.append(repr_model.predict(X_q, batch_size=batch_size))
        roi_q = []
    return np.concatenate(repr_list)
def test_chi2mixture():
    dof = 2
    mixture = 0.2
    n = 100

    random = RandomState(1)
    x = random.chisquare(dof, n)
    n0 = int((1 - mixture) * n)
    idxs = random.choice(n, n0, replace=False)
    x[idxs] = 0

    chi2mix = Chi2Mixture(scale_min=0.1, scale_max=5.0,
                          dof_min=0.1, dof_max=5.0,
                          qmax=0.1, tol=4e-3, lrt=x)
    chi2mix.estimate_chi2mixture(x)
    pv = chi2mix.sf([0.0, 0.2])
    assert_allclose(pv, [0.19999999999999996, 0.1412935752078675])
    assert_allclose(chi2mix.scale, 1.9808080808080812)
    assert_allclose(chi2mix.dof, 0.891919191919192)
    assert_allclose(chi2mix.mixture, 0.199999999999999960)
def flexible_values(val, size=1, random_state=None, min=-np.inf, max=np.inf):
    """Flexibly determine a number of values.

    Input format can be:
        - A numeric value, which will be used exactly.
        - A list of possible values, which will be randomly chosen from.
        - A tuple of (dist, arg0[, arg1, ...]), which will be used to
          generate random observations from a scipy random variable.
    """
    if random_state is None:
        random_state = RandomState()

    if np.isscalar(val):
        out = np.ones(size, np.array(val).dtype) * val
    elif isinstance(val, list):
        out = random_state.choice(val, size=size)
    elif isinstance(val, tuple):
        rv = getattr(stats, val[0])(*val[1:])
        out = truncated_sample(rv, size, min, max, random_state=random_state)
    else:
        raise TypeError("`val` must be scalar, list, or tuple")

    if size == 1:
        out = out.item()
    return out
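# A hedged usage sketch (not from the original source) of the three accepted
# input formats; the tuple form assumes scipy.stats is available as `stats`,
# per the function body.
rs = RandomState(0)
exact = flexible_values(5, random_state=rs)           # numeric -> exactly 5
picked = flexible_values([1, 2, 3], random_state=rs)  # random choice from a list
drawn = flexible_values(("norm", 0, 1), size=4,
                        random_state=rs)              # 4 draws from stats.norm(0, 1)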
class RandomGenerator(object):
    def __init__(self, seed=None):
        self._random = RandomState(seed=seed)

    def random(self):
        return self._random.rand()

    def randint(self, a, b=None):
        if b is None:
            b = a
            a = 0
        r = self._random.randint(a, high=b, size=1)
        return r[0]

    def sample(self, population, k):
        if k == 0:
            return []
        return self._random.choice(population, size=k, replace=False)

    def __getattr__(self, attr):
        return getattr(self._random, attr)

    def __getstate__(self):
        return {'_random': self._random}

    def __setstate__(self, d):
        self._random = d['_random']
def sample_from_every_class(y, size, seed=None):
    """
    Get a random sample, ensuring every class is represented.

    This helper function is useful when the sample size is small and we
    want to make sure that at least one sample from each class is included.
    This is required, for example, in logistic regression, where the
    classifier cannot handle classes for which it has never seen any
    training examples.

    Parameters
    ----------
    y : 1-dimensional numpy array
        The label/output array.
    size : int
        The desired number of samples.
    seed : RandomState object or None
        Provide a seed for reproducibility.

    Returns
    -------
    samples : numpy array of shape [size]
        The random samples.
    """
    if seed is None:
        seed = RandomState(1234)

    # Keep track of the classes which have not been sampled yet
    labels = np.unique(y)
    samples = []
    while len(samples) < size:
        idx = seed.choice(np.arange(len(y)))
        if (len(labels) == 0 and idx not in samples) or y[idx] in labels:
            samples.append(idx)
            labels = np.delete(labels, np.argwhere(labels == y[idx]))
    return samples
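# Illustrative sketch (not from the original source): with `size` equal to
# the number of classes, each class contributes exactly one sample.
y = np.array([0, 0, 0, 1, 1, 2])
idx = sample_from_every_class(y, size=3, seed=RandomState(0))
assert np.unique(y[idx]).tolist() == [0, 1, 2]  # every class represented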
def test_ransac():
    '''
    Test RandomSampleConsensus
    '''
    num = 1000
    rng = RandomState()
    scale = 10
    points = np.zeros((num, 3), 'f8')
    for i in range(num):
        points[i, 0] = rng.rand() * scale
        points[i, 1] = rng.rand() * scale
        if i % 2 == 0:
            points[i, 2] = -points[i, 0] - points[i, 1]
        else:
            points[i, 2] = rng.rand() * scale
    cloud = pcl.PointCloud(points, fields=['x', 'y', 'z'])

    ransac = ps.RandomSampleConsensus(ps.SampleConsensusModelPlane(cloud))
    assert len(ransac.get_random_samples(rng.choice(num, 50), 30)) == 30

    ransac.max_iterations = 1000
    ransac.distance_threshold = 0.1
    ransac.compute_model()
    assert len(ransac.model_coefficients) == 4
    assert len(ransac.inliers) > 0
    assert len(ransac.model) == 3

    ransac.refine_model()
    assert len(ransac.model_coefficients) == 4
    assert len(ransac.inliers) > 0
    assert len(ransac.model) == 3
def create(seed, head_prob=0.8, two_col_prob=0.3, section_range=[5, 9]):
    '''
    Creates the same html for a given seed
    '''
    rand = RandomState(seed)
    soup = BeautifulSoup(_template, 'html.parser')
    if rand.rand() < head_prob:
        soup.body.insert(0, create_header(rand, soup, level=1))
    content = soup.body.div
    if rand.rand() < two_col_prob:
        content['class'] = 'col2'

    def append_section(new_elem, header_level=0):
        div = soup.new_tag('div')
        if header_level > 0:
            div.append(create_header(rand, soup, level=header_level))
        div.append(new_elem)
        content.append(div)

    actions = [
        lambda: append_section(create_paragraph(rand, soup)),
        lambda: append_section(create_table(rand, soup), header_level=3),
        lambda: append_section(create_list(rand, soup), header_level=3),
    ]

    section_count = sample_discrete_normal(rand, *section_range)
    for _sec_i in range(section_count):
        action = rand.choice(actions)
        action()
    return soup
def test_01(self):
    n = 64
    k = 4
    random_instance = RandomState()
    challenges = [random_instance.choice((-1, +1), n) for i in range(1000)]
    challenges = array(challenges, dtype=int8)
    weights = random_instance.normal(loc=0, scale=1, size=(k, n))
    transformed_challenges = ph.transform_id(challenges, k)
    ph_solution = ph.eval_sign(transformed_challenges, weights)
    numpy_solution = sign(transpose(array(
        [dot(transformed_challenges[:, l], weights[l]) for l in range(k)]
    ))).astype(int)
    numpy_solution = array(numpy_solution, dtype=int8, copy=True)
    assert_array_equal(ph_solution, numpy_solution,
                       "Comparison of eval_sign with numpy fails.")
def load_dataset(song_folder_name='song_data', artist_folder='artists',
                 nb_classes=20, random_state=42):
    """This function loads the dataset based on a location;
    it returns a list of spectrograms and their corresponding
    artists/song names"""

    # Get all songs saved as numpy arrays in the given folder
    song_list = os.listdir(song_folder_name)

    # Load the list of artists
    artist_list = os.listdir(artist_folder)

    # Select the appropriate number of classes
    prng = RandomState(random_state)
    artists = prng.choice(artist_list, size=nb_classes, replace=False)

    # Create empty lists
    artist = []
    spectrogram = []
    song_name = []

    # Load each song into memory if the artist is included
    for song in song_list:
        with open(os.path.join(song_folder_name, song), 'rb') as fp:
            loaded_song = dill.load(fp)
        if loaded_song[0] in artists:
            artist.append(loaded_song[0])
            spectrogram.append(loaded_song[1])
            song_name.append(loaded_song[2])

    return artist, spectrogram, song_name
def do(self, img_obj: ImageEntity, pattern_obj: ImageEntity,
       random_state_obj: RandomState) -> ImageEntity:
    """
    Perform the described merge operation
    :param img_obj: The input object into which the pattern is to be inserted
    :param pattern_obj: The pattern object which is to be inserted into the image
    :param random_state_obj: used to sample from the possible valid locations;
           by providing a random state, we ensure reproducibility of the data
    :return: the merged object
    """
    img = img_obj.get_data()
    pattern = pattern_obj.get_data()
    num_chans = img.shape[2]
    if num_chans != 4:
        raise ValueError("Alpha Channel expected!")

    # find valid locations & remove bounding box
    i_rows, i_cols, _ = img.shape
    p_rows, p_cols, _ = pattern.shape
    # TODO: remove edges of image so that the patch always stays within
    #  the image
    valid_indices = np.where(img[0:i_rows - p_rows, 0:i_cols - p_cols, 3] != 0)
    num_valid_indices = len(valid_indices[0])
    random_index = random_state_obj.choice(num_valid_indices)
    insert_loc = [valid_indices[0][random_index],
                  valid_indices[1][random_index]]
    insert_loc_per_chan = np.tile(insert_loc, (4, 1)).astype(int)
    logger.debug("Selected insertion location randomly from available locations")
    inserter = InsertAtLocation(insert_loc_per_chan)
    inserted_img_obj = inserter.do(img_obj, pattern_obj, random_state_obj)
    return inserted_img_obj
def __getitem__(self, index):
    person_id = self.sort_keys[index]  # look up the person id string
    nori_ids_list = self.pkl[person_id]['nori_id']
    rng = RandomState()
    nori_ids = rng.choice(nori_ids_list, self.num_instance,
                          replace=(len(nori_ids_list) < self.num_instance))
    img_list = []
    nori_list = []
    for nori_id in nori_ids:
        market_img = self.nf.get(nori_id)
        texture_img = imdecode(market_img)
        while (texture_img is None or texture_img.shape[0] <= 0
               or texture_img.shape[1] <= 0):
            # resample from the full id list (the original indexed the
            # sampled subset with an index drawn over the full range)
            new_nori_id = np.random.randint(0, len(nori_ids_list))
            market_img = self.nf.get(nori_ids_list[new_nori_id])
            texture_img = imdecode(market_img)
        texture_img = self.random_flip(texture_img)
        texture_img = self.to_tensor(texture_img)
        img_list.append(texture_img)
        nori_list.append(nori_id)
    idx_list = [index] * self.num_instance
    # texture_img_path = self.data[index]
    # texture_img = cv2.imread(texture_img_path)
    return img_list, idx_list
def _sample_data(X: pd.DataFrame, y: List[Any], n_sample: int,
                 rng: random.RandomState) -> Tuple[pd.DataFrame, List[Any]]:
    if n_sample <= 0:
        return X, y
    else:
        indices = rng.choice(range(len(X)), n_sample, replace=True)
        return X.iloc[indices, :], itemgetter(*indices)(y)
def assign_random_gt(input_vcf, output, sample_name="HG",
                     default_af=0.01, seed=None):
    vcf_pointer = pysam.VariantFile(filename=input_vcf)
    new_header = vcf_pointer.header.copy()
    if "GT" not in new_header.formats:
        new_header.formats.add("GT", "1", "String",
                               "Consensus Genotype across all datasets with called genotype")
    new_header.samples.add(sample_name)
    output.write(str(new_header))

    # genotype probabilities for "0|0", "0|1", "1|0" and "1|1"
    default_probs = [1 - default_af * (1 + default_af),
                     default_af / 2, default_af / 2,
                     default_af * default_af]
    rng = RandomState(seed)
    previous_pos = 0
    for rec in vcf_pointer.fetch():
        rec_copy = rec.copy()
        if "GT" not in rec_copy.format.keys():
            if rec_copy.pos == previous_pos:
                c = "0|0"
            else:
                if "AF" not in rec_copy.info.keys():
                    gt_probs = default_probs
                else:
                    af = rec_copy.info["AF"]
                    gt_probs = [1 - af * (1 + af), af / 2, af / 2, af * af]
                c = rng.choice(["0|0", "0|1", "1|0", "1|1"], p=gt_probs)
            output.write("\t".join([str(rec_copy)[:-1], "GT", c]) + "\n")
        previous_pos = rec_copy.pos

    vcf_pointer.close()
def __permute(estimator, X, y, best_score, scorer, random_state):
    """
    Permute each predictor and measure difference from best score

    Args
    ----
    estimator (object): scikit-learn estimator
    X, y: 2d and 1d numpy arrays of data and labels from a test partition
    best_score (float): best score obtained on unperturbed data
    scorer (object): scoring method to use to measure importances
    random_state (float): random seed

    Returns
    -------
    scores (2D numpy array): scores for each predictor following permutation
    """
    from numpy.random import RandomState
    rstate = RandomState(random_state)

    # permute each predictor variable and assess difference in score
    scores = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        Xscram = np.copy(X)
        Xscram[:, i] = rstate.choice(X[:, i], X.shape[0])

        # predict the test data with the permuted column
        y_pred = estimator.predict(Xscram)
        scores[i] = best_score - scorer(y, y_pred)
        if scores[i] < 0:
            scores[i] = 0

    return scores
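# Illustrative sketch (not from the original source): permutation importance
# on a toy regression where only the first feature carries signal, using
# sklearn's LinearRegression and r2_score as the estimator/scorer.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

rs = np.random.RandomState(0)
X = rs.rand(200, 3)
y = 2 * X[:, 0] + 0.01 * rs.randn(200)
est = LinearRegression().fit(X, y)
best = r2_score(y, est.predict(X))
importances = __permute(est, X, y, best, r2_score, random_state=42)
# importances[0] should dominate importances[1] and importances[2]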
def mkeindseed(self, desc_in_ind=5):
    if self.mkeindseed.count <= 100:
        prng = RandomState(self.seed + self.mkeindseed.count)
    if self.mkeindseed.count > 100:
        prng = RandomState(self.seed + (self.mkeindseed.count % 100))
    smple = prng.choice(self.basetable.columns, size=desc_in_ind,
                        replace=False)
    return list(smple)
def test_bboxes():
    PRNG = RandomState()
    PRNG2 = RandomState()
    if args.seed > 0:
        PRNG.seed(args.seed)
        PRNG2.seed(args.seed)

    transform = Compose(
        [
            [ColorJitter(prob=0.5)],  # or write [ColorJitter(), None]
            BoxesToCoords(),
            HorizontalFlip(),
            Expand((1, 4), prob=0.5),
            ObjectRandomCrop(),
            Resize(300),
            CoordsToBoxes(),
            # [SubtractMean(mean=VOC.MEAN)],
        ],
        PRNG,
        mode=None,
        fillval=VOC.MEAN,
        outside_points='clamp')

    viz = Viz()
    voc_dataset = VOCDetection(root=args.root,
                               image_set=[('2007', 'trainval')],
                               keep_difficult=True,
                               transform=transform)

    results = []
    count = 0
    i = PRNG2.choice(len(voc_dataset))
    for _ in range(100):
        img, boxes, labels = voc_dataset[i]
        if len(labels) == 0:
            continue
        img = viz.draw_bbox(img, boxes, labels, True)
        results.append(img)

        cv2.imshow('0', img[:, :, ::-1])
        c = cv2.waitKey(500)
        if c == 27 or c == ord('q'):   # ESC / 'q'
            break
        elif c == ord('c') or count >= 5:
            count = 0
            i = PRNG2.choice(len(voc_dataset))
        count += 1
class ClassBalancedBatchSizeIterator(object):
    """
    Create batches of balanced size that are also balanced per class,
    i.e. each class should be sampled roughly with the same frequency
    during training.

    Parameters
    ----------
    batch_size: int
        Resulting batches will not necessarily have the given batch size
        but rather the next largest batch size that allows to split the
        set into balanced batches (maximum size difference 1).
    seed: int
        Random seed for initialization of `numpy.RandomState` random
        generator that shuffles the batches.
    """

    def __init__(self, batch_size, seed=328774):
        self.batch_size = batch_size
        self.seed = seed
        self.rng = RandomState(self.seed)

    def get_batches(self, dataset, shuffle):
        n_trials = len(dataset.X)
        batches = get_balanced_batches(n_trials,
                                       batch_size=self.batch_size,
                                       rng=self.rng,
                                       shuffle=shuffle)
        if shuffle:
            n_classes = np.max(dataset.y) + 1
            class_probabilities = [np.mean(dataset.y == i_class)
                                   for i_class in range(n_classes)]
            class_probabilities = np.array(class_probabilities)
            # choose trials with probability inversely proportional to
            # their class frequency
            trial_probabilities = [1.0 / class_probabilities[y]
                                   for y in dataset.y]
            trial_probabilities = np.array(trial_probabilities) / np.sum(
                trial_probabilities)
            i_trial_to_balanced = self.rng.choice(n_trials, n_trials,
                                                  p=trial_probabilities)
        for batch_inds in batches:
            if shuffle:
                batch_inds = [i_trial_to_balanced[i_trial]
                              for i_trial in batch_inds]
            batch_X = dataset.X[batch_inds]
            batch_y = dataset.y[batch_inds]
            # add empty fourth dimension if necessary
            if batch_X.ndim == 3:
                batch_X = batch_X[:, :, :, None]
            yield (batch_X, batch_y)

    def reset_rng(self):
        self.rng = RandomState(self.seed)
class RandomAgent(Agent):
    def __init__(self):
        # TODO: move seed into an argument.
        self.rng = RandomState(self.config['random_seed'])

    def choose_actions(self, observations, infos, dones):
        return [self.rng.choice(info['admissible_commands'])
                for info in infos]
def id_generator(size=6, chars=string.ascii_uppercase + string.digits,
                 seed=None):
    # RandomState(None) seeds from the OS, so this also covers seed=None
    # (the original left `rnd` undefined in that case)
    rnd = RandomState(seed)
    return ''.join(rnd.choice(list(chars)) for _ in range(size))
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = tagging_utils.standoff_to_sents(corpus.train, tokenizer,
                                                     verbose=True)
    dev_sents, _ = tagging_utils.standoff_to_sents(corpus.dev, tokenizer,
                                                   verbose=True)
    test_sents, test_docs = tagging_utils.standoff_to_sents(corpus.test,
                                                            tokenizer,
                                                            verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(filter(_is_not_meta_sentence, train_sents))
    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False,
                                   size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} '
                'sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    logger.info('Compute features...')
    feature_extractor, meta_sentence_filter = \
        crf_util.FEATURE_EXTRACTOR[args.feature_extractor]
    X_train, y_train = crf_labeler.sents_to_features_and_labels(
        train_sents_sample, feature_extractor)
    X_test, _ = crf_labeler.sents_to_features_and_labels(test_sents,
                                                         feature_extractor)

    logger.info('len(X_train) = {}'.format(len(X_train)))
    logger.info('len(y_train) = {}'.format(len(y_train)))
    logger.info('len(X_test) = {}'.format(len(X_test)))

    crf = crf_labeler.SentenceFilterCRF(
        ignore_sentence=meta_sentence_filter,
        ignored_label='O',
        algorithm='lbfgs',
        c1=0.1,
        c2=0.1,
        max_iterations=100,
        all_possible_transitions=True)

    logger.info('Start training... {}'.format(crf))
    crf.fit(X_train, y_train)
    logger.info('CRF classes: {}'.format(crf.classes_))

    logger.info('Make predictions...')
    y_pred_test = crf.predict(X_test)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=tagging_utils.sents_to_standoff(
                              y_pred_test, test_docs))
    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir,
                                              'scores_token_blind.csv'))
    logger.info('Done.')
class Random(Subset):
    def __init__(self, **kwargs):
        self.size = kwargs['size']
        self.rs = RandomState(kwargs.get('seed'))

    def generate(self, backdoor: Backdoor):
        size = min(self.size, len(backdoor))
        variables = backdoor.snapshot()
        return Backdoor(self.rs.choice(variables, size, replace=False))
def main(args, model_dir):
    logger.info('Args = {}'.format(args))
    corpus = CorpusLoader().load_corpus(CORPUS_PATH[args.corpus])
    tokenizer = TokenizerFactory().tokenizer(args.corpus)
    logger.info('Loaded corpus: {}'.format(corpus))

    logger.info('Get sentences...')
    train_sents, _ = flair_utils.standoff_to_flair_sents(corpus.train,
                                                         tokenizer,
                                                         verbose=True)
    dev_sents, _ = flair_utils.standoff_to_flair_sents(corpus.dev, tokenizer,
                                                       verbose=True)
    test_sents, test_docs = flair_utils.standoff_to_flair_sents(corpus.test,
                                                                tokenizer,
                                                                verbose=True)

    train_sents = train_sents + dev_sents
    train_sents_filtered = list(
        filter(lambda sent: not _ignore_sentence(sent), train_sents))
    sample_size = int(len(train_sents_filtered) * args.train_sample_frac)
    rs = RandomState(seed=args.random_seed)
    train_sents_sample = rs.choice(train_sents_filtered, replace=False,
                                   size=sample_size).tolist()
    logger.info('Train with fraction of training data: {} sents out of {} '
                'sentences ({}%)',
                sample_size, len(train_sents_filtered), args.train_sample_frac)

    # We need to pass some dev data, otherwise flair raises a ZeroDivisionError
    # See: https://github.com/zalandoresearch/flair/issues/1139
    # We just split the training sample into half and instruct Flair to
    # train_with_dev (see below).
    half = len(train_sents_sample) // 2
    flair_corpus = flair_utils.FilteredCorpus(train=train_sents_sample[:half],
                                              dev=train_sents_sample[half:],
                                              test=test_sents,
                                              ignore_sentence=_ignore_sentence)
    logger.info(flair_corpus)

    logger.info('Train model...')
    tagger = run_bilstmcrf.get_model(flair_corpus,
                                     corpus_name=args.corpus,
                                     embedding_lang=args.embedding_lang,
                                     pooled_contextual_embeddings=True)
    trainer = ModelTrainer(tagger, flair_corpus)
    trainer.train(join(model_dir, 'flair'),
                  max_epochs=150,
                  monitor_train=False,
                  train_with_dev=True,
                  save_final_model=args.save_final_model)

    logger.info('Make predictions...')
    run_bilstmcrf.make_predictions(tagger, flair_corpus)

    logger.info('Start evaluation...')
    evaluator = Evaluator(gold=corpus.test,
                          predicted=flair_utils.flair_sents_to_standoff(
                              test_sents, test_docs))
    entity_level_metric = evaluator.entity_level()
    logger.info('\n{}', entity_level_metric)
    entity_level_metric.to_csv(join(model_dir, 'scores_entity.csv'))
    evaluator.token_level().to_csv(join(model_dir, 'scores_token.csv'))
    evaluator.token_level_blind().to_csv(join(model_dir,
                                              'scores_token_blind.csv'))
    logger.info('Done.')
def mkeindseed(self, desc_in_ind=5):
    if self.mkeindseed.count <= 100:
        prng = RandomState(self.seed + int(self.mkeindseed.count))
    if self.mkeindseed.count > 100:
        prng = RandomState(self.seed + int(self.mkeindseed.count % 100))
    smple = prng.choice(self.basetable.columns, size=desc_in_ind,
                        replace=False)
    return list(smple)
def test_phase_equal_after_bandpower_mean():
    rng = RandomState(3098284)
    inputs = rng.randn(50, 20, 1001, 1)
    targets = rng.choice(4, size=50)
    target_arr = np.zeros((50, 4))
    target_arr[:, 0] = targets == 0
    target_arr[:, 1] = targets == 1
    target_arr[:, 2] = targets == 2
    target_arr[:, 3] = targets == 3
    mod_inputs, mod_targets = BandpowerMeaner().process(inputs, target_arr)
    assert np.allclose(np.angle(np.fft.rfft(inputs, axis=2)),
                       np.angle(np.fft.rfft(mod_inputs, axis=2)),
                       rtol=1e-4, atol=1e-5)
    assert np.array_equal(target_arr, mod_targets)
def test_random_choice():
    """nestle.random_choice() is designed to mimic np.random.choice(),
    for numpy < v1.7.0. In cases where we have both, test that they agree.
    """
    rstate = RandomState(0)
    p = rstate.rand(10)
    p /= p.sum()
    for seed in range(10):
        rstate.seed(seed)
        i = rstate.choice(10, p=p)
        rstate.seed(seed)
        j = nestle.random_choice(10, p=p, rstate=rstate)
        assert i == j
def replace_characters(token, index_to_char, n=1,
                       char_pool=string.ascii_lowercase, seed=17):
    if isinstance(seed, RandomState):
        rng = seed
    else:
        rng = RandomState(seed)

    new_token = token
    for _ in range(n):
        idx = max(1, rng.randint(len(new_token)))
        # ch = index_to_char[rng.randint(len(index_to_char))]
        ch = rng.choice(list(char_pool))
        new_token = new_token[0:idx - 1] + ch + new_token[idx:]
    return new_token
def rarefy_seqs(in_filename, out_filename, depth=1000, fmt="fastq", seed=0):
    """Rarefy a sequence file."""
    prng = RandomState(seed)
    records = SeqIO.index(in_filename, fmt)
    record_ids = list(records.keys())
    record_ids = prng.choice(record_ids, replace=False, size=depth)
    with open(out_filename, 'w') as out_handle:
        for record_id in record_ids:
            SeqIO.write(records[record_id], out_handle, fmt)
def choose_random_nodes(G, ntr=1, n_edges=1):
    '''
    Returns a random set of absorbing nodes, together with a flag that
    says whether the remaining nodes still form a single connected
    component after removing the absorbing nodes.

    Parameters
    ----------
    G : Networkx graph
        The graph from which the team will be selected.
    ntr : the number of absorbing nodes

    Returns
    -------
    nodes_to_remove : The list of nodes in the graph to be made absorbing.
    is_viable : Boolean indicating whether the graph will stay connected
        after making the nodes absorbing, i.e. whether the partition is
        viable.
    '''
    prng = RandomState()
    order = array(G.nodes())
    nodes_to_remove = list(prng.choice(order, ntr, replace=False))
    H = G.copy()
    H.remove_nodes_from(nodes_to_remove)
    if G.is_directed():
        n_components = nx.number_strongly_connected_components(H)
    else:
        n_components = nx.number_connected_components(H)
    if n_components == 1:
        is_viable = True
        if G.is_directed():
            for node in nodes_to_remove:
                if (H.number_of_nodes() -
                        len(set(G.predecessors(node)) -
                            set(nodes_to_remove))) < n_edges:
                    is_viable = False
                    break
        else:
            for node in nodes_to_remove:
                if (H.number_of_nodes() -
                        len(set(G.neighbors(node)) -
                            set(nodes_to_remove))) < n_edges:
                    is_viable = False
                    break
        return nodes_to_remove, is_viable
    else:
        is_viable = False
        return nodes_to_remove, is_viable
def generateDegradation(args, seed):
    from numpy.random import RandomState
    from numpy.linalg import norm

    rs = RandomState(seed)

    if args.D == 2:
        rotation = (rs.uniform(*args.rotate),)
    if args.D == 3:
        angle = rs.uniform(*args.rotate)
        axis = rs.uniform(size=3)
        axis = axis / norm(axis)
        rotation = angle, axis
    translation = rs.uniform(*args.translate, size=args.D)
    scale = rs.uniform(*args.scale)

    if args.drop[0] == args.drop[1]:
        ndrops = args.drop[0]
    else:
        ndrops = rs.randint(*sorted(args.drop))
    drops = rs.choice(range(args.N), size=ndrops, replace=False)
    duplications = rs.choice(range(args.duplicate[0], args.duplicate[1] + 1),
                             size=args.N - ndrops)
    noise = rs.uniform(*args.noise) * rs.randn(sum(duplications), args.D)

    return rotation, translation, scale, drops, duplications, noise
def rarefaction(M, seed=0):
    """Rarefy a count matrix to the depth of its shallowest sample.

    Taken from the below link:
    http://stackoverflow.com/posts/18967204/revisions
    """
    prng = RandomState(seed)  # reproducible results

    noccur = np.sum(M, axis=1)  # number of occurrences for each sample
    nvar = M.shape[1]           # number of variables
    depth = np.min(noccur)      # sampling depth

    Mrarefied = np.empty_like(M)
    for i in range(M.shape[0]):        # for each sample
        p = M[i] / float(noccur[i])    # relative frequency / probability
        choice = prng.choice(nvar, depth, p=p)
        Mrarefied[i] = np.bincount(choice, minlength=nvar)

    return Mrarefied
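# Toy check (not from the original source): after rarefaction, every row is
# resampled down to the shallowest sample's depth.
M = np.array([[1, 2, 3, 4],
              [10, 20, 30, 40]])
Mr = rarefaction(M, seed=0)
assert Mr.sum(axis=1).tolist() == [10, 10]  # min depth is 10 reads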
def rarefy_otu_table(in_filename, out_filename, perc=80, seed=0):
    """Rarefy OTU table to perc% of the least abundant sample."""
    # Read OTU table
    otu_table = pd.read_csv(in_filename, sep='\t', index_col=0)

    # rarefaction
    prng = RandomState(seed)
    n_reads = otu_table.sum()
    # np.random.choice requires an integer sample size
    depth = int(n_reads.min() * (perc / 100))
    for sample in otu_table:
        prob = otu_table[sample] / n_reads[sample]
        choice = prng.choice(otu_table.shape[0], depth, p=np.asarray(prob))
        otu_table[sample] = np.bincount(choice, minlength=otu_table.shape[0])

    # OTU pruning
    otu_table = otu_table.loc[otu_table.sum(axis=1) > 0]

    # Write OTU table
    otu_table.to_csv(out_filename, sep='\t')
def __init__(self, X, y, dataset, policy_name, scale=True, n_iter=10,
             passive=True):
    seed = RandomState(1234)
    self.X = np.asarray(X, dtype=np.float64)
    self.y = np.asarray(y)
    self.X = StandardScaler().fit_transform(self.X) if scale else self.X
    self.policy_name = policy_name
    self.dataset = dataset
    self.passive = passive

    # estimate the kernel using the 90th percentile heuristic
    random_idx = seed.choice(X.shape[0], 1000)
    distances = pairwise_distances(self.X[random_idx], metric='l1')
    self.gamma = 1 / np.percentile(distances, 90)
    transformer = RBFSampler(gamma=self.gamma, random_state=seed,
                             n_components=100)
    self.X_transformed = transformer.fit_transform(self.X)

    n_samples = self.X.shape[0]
    train_size = min(10000, int(0.7 * n_samples))
    test_size = min(20000, n_samples - train_size)
    self.kfold = StratifiedShuffleSplit(self.y, n_iter=n_iter,
                                        test_size=test_size,
                                        train_size=train_size,
                                        random_state=seed)
def generate_by_group(frame, by, source_map, source_cols=None, seed=None):
    """
    Adds columns to a trial list from multiple sources.

    Splits a trial list into chunks to add columns from various sources.
    Chunks are paired with sources based on unique values in `frame[by]`.
    See :func:`generate` for more details.

    :param pandas.DataFrame frame: Trial list.
    :param str by: Grouping column in `frame`. Unique values are used as
        keys to get sources from `source_map`.
    :param dict source_map: Container of source lists. Keys are unique
        values of `frame[by]`. Values are pandas.DataFrame sources.
    :param source_cols: Columns of `source` to add to `frame`. Defaults to
        adding all columns of `source`. If `source_cols` is a dict, keys
        will be renamed to values.
    :type source_cols: str, list, dict, or None
    :param seed: Seed random number generator. If `None` the result will
        not be randomized.
    :type seed: int or None
    :return: The `frame` with additional `source_cols` from `source`.
    :rtype: pandas.DataFrame
    """
    # create unique seeds for each part
    num_seeds = len(frame[by].unique()) + 1
    if seed is not None:
        prng = RandomState(seed)
        seeds = list(prng.choice(arange(1000), num_seeds))
    else:
        seeds = [None] * num_seeds

    def _generate_for_group(grp):
        group_key = grp[by].unique()[0]
        group_source = source_map[group_key]
        group_frame = generate(grp, group_source, source_cols, seeds.pop())
        group_frame.index = grp.index
        return group_frame

    return frame.groupby(by, group_keys=False).apply(_generate_for_group)
def expand(valid, name, values=[1, 0], ratio=0.5, sample=False, seed=None):
    """
    Copy rows as necessary to satisfy the valid:invalid ratio.

    Use when complete counterbalancing is not plausible. For example, when
    the ratio of trials requiring response A to those requiring response B
    is not 50:50.

    :param pandas.DataFrame valid: Trial list to be expanded.
    :param str name: Name of new column containing valid and invalid values.
    :param list values: Values for valid and invalid trials, respectively.
    :param float ratio: Desired percentage of valid trials in the resulting
        frame. Must be between 0 and 1. Defaults to 0.5.
    :param bool sample: Should the invalid trials be sampled from the valid
        trials? If True, len(returned) < 2*len(valid). Defaults to False.
    :param seed: Seed random number generator.
    :type seed: int or None
    :return: New trial list where valid and invalid trials are denoted in
        a new column.
    :rtype: pandas.DataFrame
    """
    prng = RandomState(seed)
    num_trials = len(valid)

    if not sample:
        invalid = valid[:]
        num_valid = (num_trials * ratio) / (1.0 - ratio)
        copies = int(num_valid / num_trials)
        valid = pd.concat([valid] * copies, ignore_index=True)
    else:
        num_invalid = int((num_trials * (1.0 - ratio)) / ratio)
        sampled = prng.choice(valid.index, num_invalid, replace=False)
        invalid = valid.reindex(sampled).reset_index(drop=True)

    frame = pd.concat([valid, invalid], keys=values, names=[name, 'DEFAULT'])
    frame = frame.reset_index().drop('DEFAULT', axis=1)
    return frame
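# Illustrative sketch (not from the original source): with ratio=0.75 the
# valid trials are copied three times, yielding 6 valid and 2 invalid rows.
trials = pd.DataFrame({'cue': ['left', 'right']})
out = expand(trials, name='validity', values=[1, 0], ratio=0.75)
assert (out['validity'] == 1).sum() == 6
assert (out['validity'] == 0).sum() == 2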
# for i in range(len(nsgk)):
category = []  # per-category npc results (assumed initialization; the
               # original fragment appends to `category` without defining it)
for i in range(20):   # loop over categories
    d = []            # list of the permutation distributions for each video
    tst = []          # list of test statistics for each video
    for j in range(len(x[i])):   # loop over videos
        res = simulate_ts_dist(x[i][j], keep_dist=True)
        d.append(res['dist'])
        tst.append(res['obs_ts'])
    perm_distr = np.asarray(d).transpose()
    category.append(simulate_npc_dist(perm_distr, size=time_stamps,
                                      obs_ts=tst))

category_pvalues = []
for i in range(len(category)):
    category_pvalues.append(category[i]['pvalue'])

freq = RNG.choice([0.2, 0.8], Ns)
res2 = np.zeros((R, Ns))
for i in range(len(freq)):
    res2[:, i] = RNG.binomial(1, freq[i], R)


def test_irr_concordance():
    rho_s2 = compute_ts(res2)
    assert_almost_equal(rho_s2, 0.70476190476190481)


def test_simulate_ts_dist_concordance():
    expected_res_conc = {'dist': None,
                         'geq': 0,
                         'obs_ts': 0.70476190476190481,
fname = root_path + 'data0_berg.json'
data0_berg = utils_local.load_data0(fname=fname)

item_types = ['bags', 'ties', 'earrings', 'shoes']
data = {}
data['items'] = []
for item_type in item_types:
    N = len(data0_berg[item_type])
    print("item_type", item_type)
    print("number of items in", item_type, ":", N)

    # randomly choose 20% of the image ids for test and validation
    N20 = int(0.2 * N)
    test_val_split = prng.choice(N, N20, replace=False)
    print("len test val split", len(test_val_split))
    # print(test_val_split)

    N10 = int(0.1 * N)  # 10% of the data
    test_split = prng.choice(test_val_split, N10, replace=False)
    print("len test split", len(test_split))
    # print(test_split)

    for item in data0_berg[item_type]:
        new_item = mk_new_item(item)
        # add item to data
        data['items'].append(new_item)
class RandomizationTests(object):
    """Randomization tests for two-sample comparison with a user-defined
    test statistic."""

    def __init__(self, measure_central_tendency=None, name='Arithmetic mean',
                 method='monte', alternative='two_sided', seed=None):
        """Constructor"""
        self.mct = measure_central_tendency
        self.name = name
        self.method = method
        self.alternative = alternative
        self._1stexe = None    # first execution
        self._tobs = None      # observed test statistic value
        self._mctA = None
        self._mctB = None
        self._nA = None
        self._data = None
        self._indices = None
        self._count = 0
        self._nperm = 0        # number of permutations
        self.pvalue = None
        self._prng = RandomState(seed)

    @property
    def mct(self):
        return self._mct

    @mct.setter
    def mct(self, value):
        if isinstance(value, types.FunctionType):
            self._mct = value
        else:
            self._mct = self._arithmetic_mean

    @property
    def name(self):
        return self._name

    @name.setter
    def name(self, value):
        if value:
            if isinstance(value, str):
                self._name = value

    @property
    def method(self):
        return self._method

    @method.setter
    def method(self, value):
        assert value in ['systematic', 'monte']
        self._method = value

    @property
    def alternative(self):
        return self._alternative

    @alternative.setter
    def alternative(self, value):
        assert value in ['two_sided', 'greater', 'less']
        self._alternative = value

    def _compute_test_statistic(self, x, y):
        return self._mct(x) - self._mct(y)

    def _arithmetic_mean(self, x):
        return sum(x) / len(x)

    def _process_data_permutation(self, groupA_indices):
        self._nperm += 1
        groupB_indices = set(self._indices).difference(groupA_indices)
        t = self._compute_test_statistic(
            [self._data[i] for i in groupA_indices],
            [self._data[j] for j in groupB_indices],
        )
        if self._alternative == 'two_sided':
            self._count += 1 if abs(t) >= abs(self._tobs) else 0
        elif self._alternative == 'greater':
            self._count += 1 if t >= self._tobs else 0
        else:
            self._count += 1 if t <= self._tobs else 0

    def execute(self, x=None, y=None, number_of_permutations=10000):
        if x and y:
            if isinstance(x, (list, tuple)) and isinstance(y, (list, tuple)):
                if all([isinstance(i, (int, float)) for i in x]):
                    self._nA = len(x)
                    self._mctA = self._mct(x)
                else:
                    raise TypeError('Elements in x should be numbers')
                if all([isinstance(j, (int, float)) for j in y]):
                    self._mctB = self._mct(y)
                else:
                    raise TypeError('Elements in y should be numbers')
                self._1stexe = True
                self._tobs = self._compute_test_statistic(x, y)
                self._data = x + y
                self._indices = range(self._nA + len(y))
        assert isinstance(number_of_permutations, int)
        if self.method == 'monte':
            # Monte Carlo randomization test with a valid p value, i.e.,
            # include t_obs in the reference set
            if self._1stexe:
                self._1stexe = False
                self._count += 1
                self._nperm += 1
                number_of_permutations -= 1
            for _ in range(number_of_permutations):
                groupA_indices = self._prng.choice(self._indices, self._nA,
                                                   replace=False)
                self._process_data_permutation(groupA_indices)
        else:
            for groupA_indices in combinations(self._indices, self._nA):
                self._process_data_permutation(groupA_indices)
        self.pvalue = self._count / self._nperm

    def summary(self):
        if self.method == 'systematic':
            print('Systematic randomization test for two groups')
        else:
            print('Monte Carlo randomization test for two groups')
        print('Alternative hypothesis: {0}'.format(self._alternative))
        print('{0} of group A: {1:.2f}'.format(self._name, self._mctA))
        print('{0} of group B: {1:.2f}'.format(self._name, self._mctB))
        print('Observed test statistic value: {0:.2f}'.format(self._tobs))
        print('Count: {0:d}'.format(self._count))
        print('Number of permutations: {0:d}'.format(self._nperm))
        print('p value: {0:.4f}'.format(self.pvalue))
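# A minimal usage sketch (not from the original source); any two numeric
# sequences work, the values below are made up for illustration.
groupA = [12.6, 11.4, 13.2, 11.2, 9.4, 12.0]
groupB = [16.4, 14.1, 13.4, 15.4, 14.0, 11.3]
rt = RandomizationTests(seed=42)
rt.execute(groupA, groupB, number_of_permutations=10000)
rt.summary()  # prints the observed statistic, count, and p value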
def find_beacons_sample_inverse(G, num_of_beacons=3, seed=None):
    # Sample beacons with probability inversely proportional to node degree
    from numpy.random import RandomState
    prng = RandomState(seed)
    # dict() handles both the dict returned by networkx 1.x and the
    # DegreeView returned by networkx 2.x
    degrees = np.array(list(dict(nx.degree(G)).values()))
    return prng.choice(np.arange(len(degrees)), num_of_beacons,
                       p=(1. / degrees) * 1. / sum(1. / degrees),
                       replace=False)
def _run_fold(self, train_index, test_index):
    # reset the seed
    seed = RandomState(1234)

    # split data into train and test sets
    pool = self.X_transformed[train_index]
    oracle = self.y[train_index]
    labels = np.ma.MaskedArray(oracle, mask=True, copy=True)
    X_test = self.X_transformed[test_index]
    y_test = self.y[test_index]
    n_classes = len(np.unique(y_test))
    similarity = rbf_kernel(self.X[train_index], gamma=self.gamma)
    mpba, accuracy, f1 = [], [], []
    training_size = min(1000, len(pool))
    initial_n = 50
    horizon = training_size - initial_n

    # initialise classifier
    classifier = LogisticRegression(multi_class='ovr', penalty='l2', C=1000,
                                    random_state=seed,
                                    class_weight='balanced')
    committee = BaggingClassifier(classifier, n_estimators=7, n_jobs=1,
                                  max_samples=100, random_state=seed)

    # select the specified policy
    policy = self._get_policy(self.policy_name, pool, labels, classifier,
                              committee, seed, similarity, horizon)

    # select 50 initial random examples for labelling
    sample_idx = seed.choice(np.arange(len(pool)), initial_n, replace=False)
    policy.add(sample_idx, oracle[sample_idx])
    y_pred = classifier.predict(X_test)
    mpba.append(mpba_score(y_test, y_pred))
    accuracy.append(accuracy_score(y_test, y_pred))
    f1.append(micro_f1_score(y_test, y_pred, n_classes))

    # start running the policy
    while np.sum(~labels.mask) < training_size:
        # use the policy to select the next instance for labelling
        best_candidates = policy.select()

        # query the oracle and add label
        policy.add(best_candidates, oracle[best_candidates])

        # observe the reward
        y_pred = classifier.predict(X_test)
        mpba.append(mpba_score(y_test, y_pred))
        reward = mpba[-1] - mpba[-2]

        # also compute accuracy and f1 score
        accuracy.append(accuracy_score(y_test, y_pred))
        f1.append(micro_f1_score(y_test, y_pred, n_classes))

        # normalise the reward to [0, 1]
        reward = (reward + 1) / 2
        policy.receive_reward(reward)

    history = policy.history()
    history['mpba'] = np.array(mpba)
    history['accuracy'] = np.array(accuracy)
    history['f1'] = np.array(f1)
    return history
class BaseActive:
    """ Base class for active learning. """

    def __init__(self, classifier, best_heuristic=None,
                 accuracy_fn=compute_balanced_accuracy, initial_n=20,
                 training_size=100, sample_size=20, n_candidates=1,
                 verbose=False, random_state=None, pool_random_state=None,
                 **h_kwargs):
        self.classifier = classifier
        self.best_heuristic = best_heuristic
        self.accuracy_fn = accuracy_fn
        self.initial_n = initial_n
        self.training_size = training_size
        self.current_training_size = 0
        self.n_candidates = n_candidates
        self.sample_size = sample_size
        self.verbose = verbose
        self.learning_curve_ = []
        self.h_kwargs = h_kwargs
        self.candidate_selections = []
        self.rng = RandomState(random_state)
        self.pool_rng = RandomState(pool_random_state)

    def _random_sample(self, pool_size, train_mask, sample_size):
        """ Select a random sample from the pool.

            Parameters
            ----------
            pool_size : int
                The total number of data points (both queried and
                unlabelled).

            train_mask : boolean array
                The boolean array that tells us which points are currently
                in the training set.

            sample_size : int
                The size of the random sample.

            Returns
            -------
            candidate_mask : boolean array
                The boolean array that tells us which data points the
                heuristic should examine.
        """
        # `~` inverts the boolean mask (the original used unary minus,
        # which numpy no longer supports on boolean arrays)
        candidate_mask = ~train_mask
        if 0 < self.sample_size < np.sum(candidate_mask):
            unlabelled_index = np.where(candidate_mask)[0]
            candidate_index = self.rng.choice(unlabelled_index,
                                              self.sample_size,
                                              replace=False)
            candidate_mask = np.zeros(pool_size, dtype=bool)
            candidate_mask[candidate_index] = True
        return candidate_mask

    def _select_heuristic(self):
        """ Choose a heuristic to be used (useful in bandits active
            learning). """
        return None

    def _store_results(self, accuracy):
        """ Store results at the end of an iteration. """
        self.learning_curve_.append(accuracy)

    def _print_progress(self):
        """ Print out current progress. """
        if self.current_training_size % 1000 == 0:
            print(self.current_training_size, end='')
        elif self.current_training_size % 100 == 0:
            print('.', end='')

    def select_candidates(self, X, y, candidate_mask, train_mask):
        """ Return the indices of the best candidates.

            Parameters
            ----------
            X : array
                The feature matrix of all the data points.

            y : array
                The target vector of all the data points.

            candidate_mask : boolean array
                The boolean array that tells us which data points the
                heuristic should examine.

            n_candidates : int
                The number of best candidates to be selected at each
                iteration.

            **h_kwargs : other keyword arguments
                All other keyword arguments will be passed onto the
                heuristic function.

            Returns
            -------
            best_candidates : array
                The list of indices of the best candidates.
        """
        return self.best_heuristic(X=X, y=y,
                                   candidate_mask=candidate_mask,
                                   train_mask=train_mask,
                                   classifier=self.classifier,
                                   n_candidates=self.n_candidates,
                                   random_state=self.pool_rng.randint(1000),
                                   **self.h_kwargs)

    def fit(self, X_train, y_train, X_test=None, y_test=None):
        """ Conduct active learning.

            Parameters
            ----------
            X_train : array
                The feature matrix of all the data points.

            y_train : array
                The target vector of all the data points.

            X_test : array
                If supplied, this will be used to compute an accuracy score
                for the learning curve.

            y_test : array
                If supplied, this will be used to compute an accuracy score
                for the learning curve.
        """
        pool_size = X_train.shape[0]
        n_features = X_train.shape[1]

        # boolean index of the samples which have been queried and are in
        # the training set
        train_mask = np.zeros(pool_size, dtype=bool)

        # select an initial random sample from the pool and train the
        # classifier
        sample = self.rng.choice(np.arange(pool_size), self.initial_n,
                                 replace=False)
        self.candidate_selections += list(sample)
        train_mask[sample] = True
        self.classifier.fit(X_train[train_mask], y_train[train_mask])
        self.current_training_size += len(sample)

        # obtain the first data point of the learning curve
        if X_test is not None and y_test is not None:
            accuracy = self.accuracy_fn(self.classifier, X_test, y_test)
            self.learning_curve_.append(accuracy)

        # keep training the classifier until we have a desired sample size
        while np.sum(train_mask) < self.training_size:
            # select a random sample from the unlabelled pool
            candidate_mask = self._random_sample(pool_size, train_mask,
                                                 self.sample_size)

            # select the heuristic to be used
            self._select_heuristic()

            # pick the index of the best candidates
            best_candidates = self.select_candidates(X_train, y_train,
                                                     candidate_mask,
                                                     train_mask)
            self.candidate_selections += list(best_candidates)

            # retrain the classifier
            train_mask[best_candidates] = True
            self.classifier.fit(X_train[train_mask], y_train[train_mask])
            self.current_training_size += len(best_candidates)

            # obtain the next data point of the learning curve
            if X_test is not None and y_test is not None:
                accuracy = self.accuracy_fn(self.classifier, X_test, y_test)
                self._store_results(accuracy)

            # print progress after every 100 queries
            if self.verbose:
                self._print_progress()

        assert self.current_training_size == np.sum(train_mask), \
            'Mismatch detected in the training size. Check your heuristic.'

    def predict(self, X):
        """ Predict the target values of X given the model.

            Parameters
            ----------
            X : array
                The feature matrix

            Returns
            -------
            y : array
                Predicted values.
        """
        return self.classifier.predict(X)
class ImageCollection(object):
    online = True

    def __init__(self, mode="random", random_state=2, nb=100, size=(224, 224),
                 crop=False, folder=None, filename_to_label=None,
                 process_dirs=None, recur=False, verbose=1, **kwargs):
        if not hasattr(self, "folder"):
            assert folder is not None
            self.folder = folder
        if not hasattr(self, "filename_to_label"):
            if filename_to_label is None:
                def filename_to_label(directory, filename):
                    return hash(directory)
            self.filename_to_label = filename_to_label
        if not hasattr(self, "process_dirs"):
            if process_dirs is None:
                def process_dirs(dirs):
                    # materialize as a list so it can be sampled repeatedly
                    return [d for d in dirs if os.path.isdir(d)]
            self.process_dirs = process_dirs

        path = os.path.join(self.folder)
        if recur:
            directories = (root for root, _, _ in os.walk(path))
            all_dirs = list(directories)
        else:
            directories = os.listdir(path)
            all_dirs = [path + "/" + d for d in directories]
        all_dirs = self.process_dirs(all_dirs)
        self.all_dirs = all_dirs
        self.mode = mode
        self.rng = RandomState(random_state)
        self.nb = nb
        self.size = size
        self.crop = crop
        self.verbose = verbose
        if size is not None:
            self.img_dim = (size[1], size[0], 3)
        else:
            self.img_dim = None

    def load(self):
        X = []
        y = []
        if self.mode == 'random':
            def get_next():
                while True:
                    d = self.rng.choice(self.all_dirs)
                    filenames = os.listdir(d)
                    filename = self.rng.choice(filenames)
                    yield d, filename
        elif self.mode == 'all':
            def get_next():
                for d in self.all_dirs:
                    for filename in os.listdir(d):
                        yield d, filename
        else:
            raise Exception("invalid mode : {}".format(self.mode))

        get_next_iter = get_next()
        while len(X) < self.nb:
            try:
                d, filename = next(get_next_iter)
            except StopIteration:
                break
            try:
                x = imread(d + "/" + filename)
                h, w = x.shape[0:2]
                if self.crop:
                    # integer division so the slice bounds stay ints;
                    # slicing a:a+size avoids the empty x[0:-0] slice the
                    # original produced when the image exactly fits
                    if h >= self.size[0]:
                        a = (h - self.size[0]) // 2
                        x = x[a:a + self.size[0]]
                    if w >= self.size[1]:
                        a = (w - self.size[1]) // 2
                        x = x[:, a:a + self.size[1]]
                    x = resize(x, self.size)
                else:
                    if self.size is not None:
                        x = resize(x, self.size)
            except Exception as ex:
                if self.verbose > 0:
                    print("Exception when processing {} : {}".format(
                        filename, repr(ex)))
                continue
            if len(x.shape) == 2:
                x = x[:, :, None] * np.ones((1, 1, 3))
            if len(x.shape) == 3 and x.shape[-1] == 4:
                x = x[:, :, 0:3]
            if len(x.shape) == 3 and x.shape[-1] > 4:
                # skip malformed images (one in the wild had shape[2] == 90)
                continue
            X.append(x)
            l = self.filename_to_label(d, filename)
            y.append(l)
        X = np.array(X).astype(np.float32)
        if self.img_dim is None:
            self.img_dim = X.shape[1:]
        self.X = X
        self.y = y
class BatchWiseCntTrainer(object):
    def __init__(self, exp, n_updates_per_break, batch_size, learning_rate,
                 n_min_trials, trial_start_offset, break_start_offset,
                 break_stop_offset, train_param_values,
                 deterministic_training=False, add_breaks=True):
        self.cnt_model = exp.final_layer
        self.exp = exp
        self.n_updates_per_break = n_updates_per_break
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.n_min_trials = n_min_trials
        self.trial_start_offset = trial_start_offset
        self.break_start_offset = break_start_offset
        self.break_stop_offset = break_stop_offset
        self.train_param_values = train_param_values
        self.deterministic_training = deterministic_training
        self.add_breaks = add_breaks

    def set_predicting_model(self, model):
        """ Needed to keep trained and used params in sync, i.e.
        update the params of the epo model used for prediction with
        the params of the trained cnt model."""
        self.predicting_model = model

    def set_data_processor(self, data_processor):
        self.data_processor = data_processor

    def set_marker_buffer(self, marker_buffer):
        self.marker_buffer = marker_buffer

    def initialize(self):
        """ Initialize data containers and theano functions for training."""
        self.rng = RandomState(30948348)
        self.data_batches = []
        self.y_batches = []
        self.input_time_length = get_input_time_length(self.cnt_model)
        self.n_sample_preds = get_n_sample_preds(self.cnt_model)
        self.n_classes = self.cnt_model.output_shape[1]
        # create train function
        log.info("Compile train function...")
        self._create_train_function()
        log.info("Done compiling train function.")

    def _create_train_function(self):
        # Maybe replace self.exp.final_layer by self.cnt_model?
        # not clear to me why I am using self.exp.final_layer here
        targets = T.ivector()
        input_var = get_input_var(self.exp.final_layer)
        updates_expression = FuncAndArgs(adam, learning_rate=self.learning_rate)
        prediction = lasagne.layers.get_output(
            self.exp.final_layer,
            deterministic=self.deterministic_training,
            input_var=input_var, inputs=input_var)
        # Loss function might need layers or not...
        try:
            loss = self.exp.loss_expression(prediction, targets).mean()
        except TypeError:
            loss = self.exp.loss_expression(prediction, targets,
                                            self.exp.final_layer).mean()
        # create parameter update expressions
        params = lasagne.layers.get_all_params(self.exp.final_layer,
                                               trainable=True)
        updates = updates_expression(loss, params)
        if self.exp.updates_modifier is not None:
            # put norm constraints on all layers, for now fixed to
            # max kernel norm 2 and max col norm 0.5
            updates = self.exp.updates_modifier.modify(updates,
                                                       self.exp.final_layer)
        # store only the optimizer parameters created for training,
        # assumes parameters for the layers are already set
        self.train_params = []
        all_update_params = updates.keys()
        for update_param in all_update_params:
            if update_param not in params:
                self.train_params.append(update_param)

        self.train_func = theano.function([input_var, targets], updates=updates)

        # Set optimizer/train parameter values if given
        if self.train_param_values is not None:
            log.info("Setting train parameter values")
            for param, val in zip(self.train_params, self.train_param_values):
                param.set_value(val)
            log.info("...Done setting train parameter values")
        else:
            log.info("Not setting train parameter values, optimization "
                     "starts from scratch (model params may be loaded anyway).")

    def add_data_from_today(self, data_processor):
        # Check if old data exists, if yes add it
        now = datetime.datetime.now()
        day_string = now.strftime('%Y-%m-%d')
        data_folder = 'data/online/{:s}'.format(day_string)
        # lexicographic sort is chronological for this time format
        data_files = sorted(glob(os.path.join(data_folder, '*.npy')))
        if len(data_files) > 0:
            log.info("Loading {:d} data files for adaptation:\n{:s}".format(
                len(data_files), str(data_files)))
            for filename in data_files:
                log.info("Add data from {:s}...".format(filename))
                samples_markers = np.load(filename)
                samples = samples_markers[:, :-1]
                markers = np.int32(samples_markers[:, -1])
                self.add_training_blocks_from_old_data(samples, markers,
                                                       data_processor)
            log.info("Done loading, now have {:d} trials (including breaks)".format(
                len(self.data_batches)))
        else:
            log.info("No data files found to load for adaptation in {:s}".format(
                data_folder))

    def add_training_blocks_from_old_data(self, old_samples, old_markers,
                                          data_processor):
        # first standardize data
        old_samples = exponential_running_standardize(
            old_samples, factor_new=data_processor.factor_new,
            init_block_size=1000, eps=data_processor.eps)
        trial_starts, trial_stops = self.get_trial_start_stop_indices(
            old_markers)
        log.info("Adding {:d} trials".format(len(trial_starts)))
        for trial_start, trial_stop in zip(trial_starts, trial_stops):
            self.add_blocks(trial_start + self.trial_start_offset, trial_stop,
                            old_samples, old_markers)
        # now add the breaks between trials
        log.info("Adding {:d} breaks".format(len(trial_starts) - 1))
        for break_start, break_stop in zip(trial_stops[:-1], trial_starts[1:]):
            self.add_break(break_start, break_stop, old_samples, old_markers)

    def process_markers(self, markers):
        # Check if a trial has ended within the last samples.
        # Use marker samples with some overlap so we do not miss
        # trial boundaries between two sample blocks.
        marker_samples_with_overlap = np.copy(
            self.marker_buffer[-len(markers) - 2:])
        trial_has_ended = np.sum(np.diff(marker_samples_with_overlap) < 0) > 0
        if trial_has_ended:
            trial_starts, trial_stops = self.get_trial_start_stop_indices(
                self.marker_buffer)
            trial_start = trial_starts[-1]
            trial_stop = trial_stops[-1]
            log.info("Trial has ended for class {:d}".format(
                self.marker_buffer[trial_start]))
            assert trial_start < trial_stop, (
                "trial start {:d} should be before trial stop {:d}, "
                "markers: {:s}").format(trial_start, trial_stop,
                                        str(marker_samples_with_overlap))
            self.add_blocks(trial_start + self.trial_start_offset, trial_stop,
                            self.data_processor.sample_buffer,
                            self.marker_buffer)
            log.info("Now {:d} trials (including breaks)".format(
                len(self.data_batches)))
            with log_timing(log, None, final_msg='Time for training:'):
                self.train()
        trial_has_started = np.sum(np.diff(marker_samples_with_overlap) > 0) > 0
        if trial_has_started:
            trial_end_in_marker_buffer = np.sum(np.diff(self.marker_buffer) < 0) > 0
            if trial_end_in_marker_buffer:
                # +1 necessary since diff removes one index
                trial_start = np.flatnonzero(np.diff(self.marker_buffer) > 0)[-1] + 1
                trial_stop = np.flatnonzero(np.diff(self.marker_buffer) < 0)[-1] + 1
                assert trial_start > trial_stop, (
                    "If a trial has just started, expect its start to be "
                    "after the stop of the last trial")
                self.add_break(break_start=trial_stop, break_stop=trial_start,
                               all_samples=self.data_processor.sample_buffer,
                               all_markers=self.marker_buffer)
                #log.info("Break added, now at {:d} batches".format(len(self.data_batches)))

    def add_break(self, break_start, break_stop, all_samples, all_markers):
        if self.add_breaks:
            all_markers = np.copy(all_markers)
            assert np.all(all_markers[break_start:break_stop] == 0)
            assert all_markers[break_start - 1] != 0
            assert all_markers[break_stop] != 0
            # keep n_classes for the 1-based matlab indexing logic in markers
            all_markers[break_start:break_stop] = self.n_classes
            self.add_blocks(break_start + self.break_start_offset,
                            break_stop + self.break_stop_offset,
                            all_samples, all_markers)
        else:
            pass  # ignore the break that was supposed to be added

    def get_trial_start_stop_indices(self, markers):
        # +1 as diff "removes" one index, i.e. diff will be above zero
        # at the index just before the increase => the trial start
        trial_starts = np.flatnonzero(np.diff(markers) > 0) + 1
        # diff removes one index, so this index is the last sample of the
        # trial, but stop indices in python are exclusive, so +1
        trial_stops = np.flatnonzero(np.diff(markers) < 0) + 1
        if trial_starts[0] >= trial_stops[0]:
            # cut out first trial which only has an end marker
            trial_stops = trial_stops[1:]
        if trial_starts[-1] >= trial_stops[-1]:
            # cut out last trial which only has a start marker
            trial_starts = trial_starts[:-1]
        assert len(trial_starts) == len(trial_stops)
        assert np.all(trial_starts <= trial_stops)
        return trial_starts, trial_stops

    def add_blocks(self, trial_start, trial_stop, all_samples, all_markers):
        """Trial start offset as parameter to allow different offsets
        for breaks and normal trials."""
        # n_sample_preds is how many predictions are made in one forward
        # pass of the network, i.e. how many crops are predicted together
        # for the given input time length of the ConvNet
        # -> crop size is how many samples are needed for one prediction
        crop_size = self.input_time_length - self.n_sample_preds + 1
        if trial_start + self.n_sample_preds > trial_stop:
            log.info("Too little data in this trial to train on it, only "
                     "{:d} predictable samples, need at least {:d}".format(
                         trial_stop - trial_start, self.n_sample_preds))
            return  # too little data in this trial to train on it
        needed_sample_start = trial_start - crop_size + 1
        # not sure if the copy is necessary, but why not :)
        needed_samples = np.copy(all_samples[needed_sample_start:trial_stop])
        trial_markers = all_markers[needed_sample_start:trial_stop]
        # trial start can't be at zero atm or else we would have to take more data
        assert len(np.unique(trial_markers[(crop_size - 1):])) == 1, (
            ("Trial should have exactly one class, markers: {:s}, "
             "trial start: {:d}, trial stop: {:d}").format(
                # crop_size - 1 is the index of the first prediction
                np.unique(trial_markers[(crop_size - 1):]),
                needed_sample_start, trial_stop))
        self.add_trial_topo_trial_y(needed_samples, trial_markers)

    def add_trial_topo_trial_y(self, needed_samples, trial_markers):
        """ needed_samples are the samples needed for predicting the entire
        trial, i.e. they typically include a part before the first sample
        of the trial."""
        crop_size = self.input_time_length - self.n_sample_preds + 1
        assert len(np.unique(trial_markers[(crop_size - 1):])) == 1, (
            "Trial should have exactly one class, markers: {:s}".format(
                np.unique(trial_markers[(crop_size - 1):])))
        trial_topo = needed_samples[:, :, np.newaxis, np.newaxis]
        trial_y = np.copy(trial_markers) - 1  # -1 as zero is the non-trial marker
        trial_len = len(trial_topo)
        start_end_blocks = get_start_end_blocks_for_trial(
            crop_size - 1, trial_len - 1, self.input_time_length,
            self.n_sample_preds)
        assert start_end_blocks[0][0] == 0, "First block should start at first sample"
        batch = create_batch(trial_topo, trial_y, start_end_blocks,
                             self.n_sample_preds)
        self.data_batches.append(batch[0])
        self.y_batches.append(batch[1])

    def train(self):
        n_trials = len(self.data_batches)
        if n_trials >= self.n_min_trials:
            log.info("Training model...")
            # Remember values as backup in case of NaNs
            model_param_vals_before = lasagne.layers.get_all_param_values(
                self.exp.final_layer)
            train_param_vals_before = [p.get_value() for p in self.train_params]
            all_blocks = np.concatenate(self.data_batches, axis=0)
            all_y_blocks = np.concatenate(self.y_batches, axis=0)
            # reshape to per block, assuming targets are plain labels,
            # not one-hot encoded
            all_y_blocks = np.reshape(all_y_blocks, (-1, self.n_sample_preds))
            # make classes balanced; within a block any sample should carry
            # the same label, -10 is an arbitrary within-block index
            labels_per_block = all_y_blocks[:, -10]
            unique_labels = sorted(np.unique(labels_per_block))
            if not np.array_equal(range(len(unique_labels)), unique_labels):
                missing_classes = np.setdiff1d(range(len(unique_labels)),
                                               unique_labels)
                log.info("Do not have labels for all classes yet, "
                         "missing: {:s}. Skipping training...".format(
                             str(missing_classes)))
                return
            # inverse-frequency weights so each class is drawn equally often
            class_probs = np.zeros(len(unique_labels))
            for i_class in unique_labels:
                freq = np.mean(labels_per_block == i_class)
                prob = 1.0 / (len(unique_labels) * freq)
                class_probs[i_class] = prob
            block_probs = np.zeros(len(labels_per_block))
            for i_class in unique_labels:
                block_probs[labels_per_block == i_class] = class_probs[i_class]
            block_probs = block_probs / np.sum(block_probs)

            assert len(all_blocks) == len(all_y_blocks)
            for _ in xrange(self.n_updates_per_break):
                i_blocks = self.rng.choice(len(all_y_blocks),
                                           size=self.batch_size,
                                           p=block_probs)
                this_y = np.concatenate(all_y_blocks[i_blocks], axis=0)
                this_topo = all_blocks[i_blocks]
                self.train_func(this_topo, this_y)
            # Check for NaNs and if necessary reset to old values
            if np.any([np.any(np.isnan(p.get_value()))
                       for p in self.train_params]):
                log.warn("Reset train parameters due to NaNs")
                for p, old_val in zip(self.train_params, train_param_vals_before):
                    p.set_value(old_val)
            all_layers_trained = lasagne.layers.get_all_layers(self.exp.final_layer)
            if np.any([np.any(np.isnan(p_val)) for p_val in
                       lasagne.layers.get_all_param_values(all_layers_trained)]):
                log.warn("Reset model params due to NaNs")
                lasagne.layers.set_all_param_values(self.exp.final_layer,
                                                    model_param_vals_before)
            assert not np.any([np.any(np.isnan(p.get_value()))
                               for p in self.train_params])
            assert not np.any([np.any(np.isnan(p_val)) for p_val in
                               lasagne.layers.get_all_param_values(all_layers_trained)])
            # Copy the new values over to the model used for prediction
            all_layers_used = lasagne.layers.get_all_layers(self.predicting_model)
            lasagne.layers.set_all_param_values(
                all_layers_used,
                lasagne.layers.get_all_param_values(all_layers_trained))
        else:
            log.info("Not training model yet, only have {:d} of {:d} trials".format(
                n_trials, self.n_min_trials))
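# NOTE: a self-contained sketch of the np.diff trick used twice above in
# get_trial_start_stop_indices: trial boundaries are recovered from a marker
# stream by looking at sign changes of the first difference. Toy data only.
import numpy as np

# toy marker stream: 0 = no trial, >0 = class marker during a trial
markers = np.array([0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 3, 3, 3, 0])

# diff is positive at the index just before a trial starts and negative at
# the last sample of a trial; +1 shifts to the start index and to an
# exclusive (pythonic) stop index respectively
trial_starts = np.flatnonzero(np.diff(markers) > 0) + 1
trial_stops = np.flatnonzero(np.diff(markers) < 0) + 1

print(trial_starts)  # [ 2  7 10]
print(trial_stops)   # [ 5  9 13]
for start, stop in zip(trial_starts, trial_stops):
    print(markers[start:stop])  # [1 1 1], then [2 2], then [3 3 3]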
for _ in range(params["NUM_FEATURES"])] learner = Learner(features) sum_of_errors = 0.0 mse_history = np.zeros(10000) ema_history = np.zeros(10000) exponential_moving_avg = 0.0 for iteration in range(params["NUM_ITERATIONS"]) : if params["CONTINUOUS_INPUTS"] : random_input = rng.rand(params["NUM_INPUT_VARS"]) else : random_input = rng.choice([0,1], params["NUM_INPUT_VARS"]) if params["NON_STATIONARY"] : target_val = (target_functions[(iteration / 100000) % 2 ].get_output(random_input) + rng.normal(0,1)) else : target_val = (target_functions[0].get_output(random_input) + rng.normal(0,1)) mse = learner.train(random_input, target_val, params["BASE_LEARNING_RATE"], iteration) if iteration == 0 : exponential_moving_avg = mse else : exponential_moving_avg = exponential_moving_avg * 0.999 + mse*0.001
        dress[f] = ''
    return dress

data0_dress = utils_local.load_data0(
    fname='../../DATASETS/dress_attributes/data/json/data0.json')
#i = 239 #1224 #357

data = {}
data['items'] = []  # a list of dictionaries
N = len(data0_dress['dresses'])  # number of dresses
# randomly choose 5000 image ids for test and validation
test_val_split = prng.choice(N, 5000, replace=False)
# randomly choose 1000 of those for test; the rest are for validation
test_split = prng.choice(test_val_split, 1000, replace=False)

# # sample from a bernoulli distribution N times
# # toss a coin N times with prob. p of getting heads (1)
# N = len(data0['dresses'])  # number of dresses
# p = 0.8  # with probability p a dress is assigned to the train split
# split = np.random.binomial(1, p, N)  # bernoulli is a binomial with only 1 trial, thus 1
# # Make sure that we have at least 80% for training
# while sum(split) < (p * N):
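# NOTE: a self-contained sketch of the two-stage split above: draw the
# held-out pool first, carve the test set out of it, and recover the
# validation set as the remainder. N is a stand-in size, not the real count.
import numpy as np
from numpy.random import RandomState

prng = RandomState(123)
N = 20000  # hypothetical number of items

test_val_split = prng.choice(N, 5000, replace=False)
test_split = prng.choice(test_val_split, 1000, replace=False)
val_split = np.setdiff1d(test_val_split, test_split)

# the two-stage draw guarantees test and validation are disjoint
assert len(np.intersect1d(test_split, val_split)) == 0
print(len(test_split), len(val_split))  # 1000 4000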
def balanced_train_test_split(X, y, test_size=None, train_size=None,
                              bootstrap=False, random_state=None):
    """ Split the data into a balanced training set and test set of some given size.

    For a dataset with an unequal number of samples in each class, one useful
    procedure is to split the data into a training and a test set in such a
    way that the classes are balanced.

    Parameters
    ----------
    X : array, shape = [n_samples, n_features]
        Feature matrix.

    y : array, shape = [n_samples]
        Target vector.

    test_size : float or int (default=0.3)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the test split. If int, represents the
        absolute number of test samples. If None, the value is automatically
        set to the complement of the train size. If train size is also None,
        test size is set to 0.3.

    train_size : float or int (default=1-test_size)
        If float, should be between 0.0 and 1.0 and represent the proportion
        of the dataset to include in the train split. If int, represents the
        absolute number of train samples. If None, the value is automatically
        set to the complement of the test size.

    bootstrap : bool, optional (default=False)
        Whether to sample with replacement. Required when the smallest class
        does not contain enough examples for a balanced split.

    random_state : int, optional (default=None)
        Pseudo-random number generator state used for random sampling.

    Returns
    -------
    X_train : array
        The feature vectors in the training set.

    X_test : array
        The feature vectors in the test set.

    y_train : array
        The target vector in the training set.

    y_test : array
        The target vector in the test set.
    """
    # initialise the random number generator
    rng = RandomState(random_state)

    # make sure X and y are numpy arrays
    X = np.asarray(X)
    y = np.asarray(y)

    # get information about the class distribution
    classes, y_indices = np.unique(y, return_inverse=True)
    n_classes = len(classes)
    cls_count = np.bincount(y_indices)

    # get the training and test size
    train_size, test_size = _get_train_test_size(train_size, test_size, len(y))

    # number of samples from each class in the training and test set
    n_train = np.round(train_size / n_classes).astype(int)
    n_test = np.round(test_size / n_classes).astype(int)
    n_total = n_train + n_test

    # make sure we have enough samples to create a balanced split
    min_count = min(cls_count)
    if min_count < n_total and not bootstrap:
        raise ValueError('The smallest class contains {} examples, which is '
                         'not enough to create a balanced split. Choose a '
                         'smaller size or enable bootstrapping.'.format(min_count))

    # selected indices are stored here
    train = []
    test = []

    # get the desired sample from each class
    for i, cls in enumerate(classes):
        if bootstrap:
            shuffled = rng.choice(cls_count[i], n_total, replace=True)
        else:
            shuffled = rng.permutation(cls_count[i])
        cls_i = np.where(y == cls)[0][shuffled]
        train.extend(cls_i[:n_train])
        test.extend(cls_i[n_train:n_total])

    train = list(rng.permutation(train))
    test = list(rng.permutation(test))

    return X[train], X[test], y[train], y[test]
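# NOTE: a self-contained sketch of the per-class permutation idea behind
# balanced_train_test_split (the helper _get_train_test_size is not shown
# above, so the per-class counts are fixed by hand here). Toy data only.
import numpy as np
from numpy.random import RandomState

rng = RandomState(0)
# imbalanced toy data: 80 samples of class 0, 20 of class 1
y = np.array([0] * 80 + [1] * 20)
X = rng.rand(len(y), 3)

n_train, n_test = 15, 5  # per class
train, test = [], []
for cls in np.unique(y):
    cls_idx = np.flatnonzero(y == cls)
    shuffled = cls_idx[rng.permutation(len(cls_idx))]
    train.extend(shuffled[:n_train])
    test.extend(shuffled[n_train:n_train + n_test])

print(np.bincount(y[train]))  # [15 15] -- balanced despite the 80/20 skew
print(np.bincount(y[test]))   # [ 5  5]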
    content_draw = get_content_generator(rs, zipf_param, contentHistory,
                                         5000000, 1, content_duration)
    # contentHistory = get_updated_history(contentHistory, content)
    # update_vcdn_storage(g, contentHistory)
    # winner, price = create_content_delivery(g=g, peers=servers, content=content, consumer=consumer)

    env = simpy.Environment()
    the_time = 30
    ticker = get_ticker(rs, poisson_param)
    while the_time < max_time_experiment:
        location = rs.choice(consumers)
        the_time = ticker() + the_time
        User(g, {"CDN": cdns, "VCDN": vcdns, "MUCDN": mucdns}, env, location,
             the_time, content_draw)

    for vcdn in vcdns:
        TE(rs, env, vcdn, g, contentHistory,
           refresh_delay=vcdn_refresh_delay,
           download_delay=vcdn_download_delay,
           concurent_download=vcdn_concurent_download)

    for mucdn in mucdns:
        TE(rs, env, mucdn, g, contentHistory,
           refresh_delay=mucdn_refresh_delay,
           download_delay=mucdn_download_delay,
           concurent_download=mucdn_concurent_download)

    def capacity_vcdn_monitor():
        while True:
            yield env.timeout(11)
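# NOTE: capacity_vcdn_monitor above is cut off; here is a minimal,
# self-contained sketch of the same simpy pattern (a process that wakes up
# on a fixed interval). The function and variable names are illustrative,
# not from the original experiment.
import simpy

def periodic_monitor(env, interval, readings):
    """Wake up every `interval` time units and record the current time;
    a real monitor would sample capacity or queue lengths here."""
    while True:
        yield env.timeout(interval)
        readings.append(env.now)

env = simpy.Environment()
readings = []
env.process(periodic_monitor(env, interval=11, readings=readings))
env.run(until=60)
print(readings)  # [11, 22, 33, 44, 55]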