def split(self, p, seed=None, stratify=True):
    """
    Randomly splits the dataset into two disjoint subsets.

    If `stratify` is true and `self.has_discrete_labels()` returns true,
    the selection is made class by class: int(p * class_size) instances
    of every class in `self.classes` go to the first subset. Otherwise
    int(p * self.num_instances) instances are drawn from the dataset as
    a whole.

    Parameters:
        p: fraction of the data for the first subset; must be strictly
            between 0 and 1.
        seed: handed to `numpy_random_seed_temporarily` around the
            shuffling.
        stratify: request a per-class split; silently disabled when the
            labels are not discrete.

    Returns:
        The pair (self.subset(first_indices), self.subset(second_indices)),
        where the first index set holds the selected instances and the
        second holds all remaining ones.

    Raises:
        ValueError: if p is not strictly between 0 and 1.
        Exception: if p is so small that no instance would be selected
            (from the dataset, or from any single class when stratifying).
    """
    if p <= 0.0 or p >= 1.0:
        raise ValueError(
            'Split proportion must be a float between 0 and 1')

    # Fall back to an unstratified split when labels are not discrete.
    if stratify and not self.has_discrete_labels():
        stratify = False

    with numpy_random_seed_temporarily(seed):
        instances_selected = np.zeros((self.num_instances, ), dtype='bool')

        if stratify:
            examples_in_class = {
                c: np.flatnonzero(self.targets == c)
                for c in self.classes
            }
            # .values() exists on both Python 2 and 3 (iteritems() does
            # not) and iterates in the same order as the dict's items.
            index_groups = examples_in_class.values()
        else:
            index_groups = [np.arange(self.num_instances)]

        for indices in index_groups:
            np.random.shuffle(indices)

            # Pick data points for the first split
            n = int(p * len(indices))
            if n == 0:
                # Parenthesised: '%' and '*' share precedence, so
                # "fmt % p * 100." would multiply the formatted string.
                raise Exception(
                    'Not enough data instances to select %.2f%%'
                    % (p * 100.))
            instances_selected[indices[:n]] = True

    first_indices = np.flatnonzero(instances_selected)
    # '~' is the supported way to invert a boolean mask; np.negative
    # rejects boolean arrays on current NumPy.
    second_indices = np.flatnonzero(~instances_selected)

    return self.subset(first_indices), self.subset(second_indices)
def random_stratified_subset(self, p=None, n=None, seed=None):
    """
    Randomly selects documents from each class.

    Exactly one of `p` or `n` must be provided:
      * p: fraction (0 <= p < 1) of each class's labeled documents to
        select; int(p * class_size) documents are taken per class.
      * n: number of documents to select from every class.

    Only documents with observed labels are candidates
    (`self.labels_observed`).

    Returns the selected document indices as produced by np.flatnonzero,
    or an empty list when p or n is zero.

    Raises:
        ValueError: if neither or both of p and n are given, if p is
            outside [0, 1), or if n is negative.
        Exception: if, for any class, the number of documents to select
            is zero or exceeds the labeled documents available.
    """
    # XXX this only selects documents with observed labels

    # Compare against None rather than truthiness, so that an explicit
    # p=0.0 or n=0 reaches the "select nothing" branches below instead
    # of being reported as a missing argument.
    if (p is None) == (n is None):
        raise ValueError('Must provide exactly one of p or n kwargs')

    # Validate the arguments before touching any instance state.
    if p is not None:
        # Ensure p is a float between 0 and 1
        p = float(p)
        if p < 0.0 or p >= 1.0:
            raise ValueError("p must be between 0 and 1 (got: %f)" % p)
        if p == 0.0:
            return []
    else:
        n = int(n)
        if n == 0:
            return []
        if n < 0:
            raise ValueError('n must be positive (got: %d)' % n)

    docs_selected = np.zeros((self.num_docs,), dtype='bool')
    docs_in_classes = [
        np.flatnonzero((self.labels == c) * self.labels_observed)
        for c in range(self.num_classes)
    ]
    num_docs_in_classes = [len(each) for each in docs_in_classes]

    # Determine number of documents to select per class
    if p is not None:
        # int() truncation equals floor here: p and the counts are >= 0.
        num_docs_to_select = [int(p * each) for each in num_docs_in_classes]
    else:
        num_docs_to_select = [n] * self.num_classes

    with numpy_random_seed_temporarily(seed):
        for c, docs in enumerate(docs_in_classes):
            shuffle(docs)
            wanted = num_docs_to_select[c]
            available = num_docs_in_classes[c]
            if wanted == 0 or wanted > available:
                raise Exception(
                    "Class %d doesn't have enough labeled documents to select %d (has %d)"
                    % (c, wanted, available))
            docs_selected[docs[:wanted]] = True

    return np.flatnonzero(docs_selected)
def split(self, p, seed=None, stratify=True):
    """
    Randomly splits the dataset into two disjoint subsets.

    If `stratify` is true and `self.has_discrete_labels()` returns true,
    the selection is made class by class: int(p * class_size) instances
    of every class in `self.classes` go to the first subset. Otherwise
    int(p * self.num_instances) instances are drawn from the dataset as
    a whole.

    Parameters:
        p: fraction of the data for the first subset; must be strictly
            between 0 and 1.
        seed: handed to `numpy_random_seed_temporarily` around the
            shuffling.
        stratify: request a per-class split; silently disabled when the
            labels are not discrete.

    Returns:
        The pair (self.subset(first_indices), self.subset(second_indices)),
        where the first index set holds the selected instances and the
        second holds all remaining ones.

    Raises:
        ValueError: if p is not strictly between 0 and 1.
        Exception: if p is so small that no instance would be selected
            (from the dataset, or from any single class when stratifying).
    """
    if p <= 0.0 or p >= 1.0:
        raise ValueError('Split proportion must be a float between 0 and 1')

    # Fall back to an unstratified split when labels are not discrete.
    if stratify and not self.has_discrete_labels():
        stratify = False

    with numpy_random_seed_temporarily(seed):
        instances_selected = np.zeros((self.num_instances,), dtype='bool')

        if stratify:
            examples_in_class = {c: np.flatnonzero(self.targets == c)
                                 for c in self.classes}
            # .values() exists on both Python 2 and 3 (iteritems() does
            # not) and iterates in the same order as the dict's items.
            index_groups = examples_in_class.values()
        else:
            index_groups = [np.arange(self.num_instances)]

        for indices in index_groups:
            np.random.shuffle(indices)

            # Pick data points for the first split
            n = int(p * len(indices))
            if n == 0:
                # Parenthesised: '%' and '*' share precedence, so
                # "fmt % p*100." would multiply the formatted string.
                raise Exception(
                    'Not enough data instances to select %.2f%%' % (p * 100.))
            instances_selected[indices[:n]] = True

    first_indices = np.flatnonzero(instances_selected)
    # '~' is the supported way to invert a boolean mask; np.negative
    # rejects boolean arrays on current NumPy.
    second_indices = np.flatnonzero(~instances_selected)

    return self.subset(first_indices), self.subset(second_indices)