Пример #1
0
    def split(self, p, seed=None, stratify=True):
        """
        Randomly splits the dataset into two disjoint subsets holding roughly
        p and (1-p) of the instances, respectively.

        When ``stratify`` is true and ``self.has_discrete_labels()`` is true,
        the split is made per class: for every class ``c`` in ``self.classes``,
        ``int(p * count_c)`` of the instances whose target equals ``c`` go to
        the first subset.  Otherwise ``int(p * self.num_instances)`` instances
        are drawn from the dataset as a whole.  Instances that are not picked
        (including any whose target matches no entry of ``self.classes``) go
        to the second subset.

        Args:
            p: Fraction of the data for the first subset; must be strictly
                between 0 and 1.
            seed: Forwarded to ``numpy_random_seed_temporarily``, which wraps
                the shuffling done here.
            stratify: Request a per-class split; silently ignored when the
                dataset does not have discrete labels.

        Returns:
            A pair ``(self.subset(first_indices), self.subset(second_indices))``
            where both index arrays are in ascending order.

        Raises:
            ValueError: If ``p`` is not strictly between 0 and 1.
            Exception: If rounding down leaves no instance for the first
                subset (for any single class when stratifying).
        """
        if p <= 0.0 or p >= 1.0:
            raise ValueError(
                'Split proportion must be a float between 0 and 1')
        if stratify and not self.has_discrete_labels():
            stratify = False

        with numpy_random_seed_temporarily(seed):
            instances_selected = np.zeros((self.num_instances, ), dtype='bool')
            if stratify:
                examples_in_class = {
                    c: np.flatnonzero(self.targets == c)
                    for c in self.classes
                }
                # .values() instead of .iteritems(): works on Python 2 and 3
                # and walks the dict in the same order.
                index_groups = examples_in_class.values()
            else:
                # A single group covering the whole dataset.
                index_groups = [np.arange(self.num_instances)]

            for indices in index_groups:
                np.random.shuffle(indices)

                # Pick data points for the first split
                n = int(p * len(indices))
                if n == 0:
                    # Parenthesised on purpose: `fmt % p * 100.` would format
                    # first and then try to multiply the string by a float.
                    raise Exception(
                        'Not enough data instances to select %.2f%%' %
                        (p * 100.))
                instances_selected[indices[:n]] = True

        first_indices = np.flatnonzero(instances_selected)
        # `~` inverts a boolean mask; np.negative is rejected for bool arrays
        # by current NumPy releases.
        second_indices = np.flatnonzero(~instances_selected)
        return self.subset(first_indices), self.subset(second_indices)
Пример #2
0
    def random_stratified_subset(self, p=None, n=None, seed=None):
        """
        Randomly picks documents from every class and returns their indices.

        Only documents whose label is observed (``self.labels_observed``) are
        candidates.  Exactly one of ``p`` and ``n`` must be supplied; the
        other must be left as ``None``.

        Args:
            p: Fraction of each class's labeled documents to select, with
                ``0 <= p < 1``.  ``floor(p * count)`` documents are taken
                from a class that has ``count`` labeled documents.
            n: Number of documents to select from every class; must not be
                negative.
            seed: Forwarded to ``numpy_random_seed_temporarily``, which wraps
                the shuffling done here.

        Returns:
            An empty list when ``p == 0`` or ``n == 0``; otherwise the array
            of selected document indices in ascending order.

        Raises:
            ValueError: If both or neither of ``p`` and ``n`` are given, if
                ``p`` is outside ``[0, 1)``, or if ``n`` is negative.
            Exception: If a class would contribute zero documents, or has
                fewer labeled documents than requested.
        """
        # XXX this only selects documents with observed labels
        # Compare against None rather than truthiness, so that an explicit
        # p=0.0 or n=0 counts as "provided" and reaches the early returns
        # below instead of being rejected here.
        if (p is None) == (n is None):
            raise ValueError('Must provide exactly one of p or n kwargs')

        docs_selected = np.zeros((self.num_docs,), dtype='bool')
        docs_in_classes = [
            np.flatnonzero((self.labels == c) * self.labels_observed)
            for c in range(self.num_classes)
        ]
        num_docs_in_classes = [len(docs) for docs in docs_in_classes]

        # Determine number of documents to select per class
        if p is not None:
            # Ensure p is a float between 0 and 1
            p = float(p)
            if p < 0.0 or p >= 1.0:
                raise ValueError("p must be between 0 and 1 (got: %f)" % p)
            if p == 0.0:
                return []
            num_docs_to_select = [int(floor(p * count)) for count in num_docs_in_classes]
        else:
            n = int(n)
            if n == 0:
                return []
            if n < 0:
                raise ValueError('n must be positive (got: %d)' % n)
            num_docs_to_select = [n] * self.num_classes

        with numpy_random_seed_temporarily(seed):
            for c, docs in enumerate(docs_in_classes):
                # NOTE(review): `shuffle` is defined outside this method;
                # presumably numpy.random.shuffle so the seed applies -- confirm.
                shuffle(docs)

                wanted = num_docs_to_select[c]
                available = num_docs_in_classes[c]
                if wanted == 0 or wanted > available:
                    raise Exception("Class %d doesn't have enough labeled documents to select %d (has %d)" % \
                                    (c, wanted, available))
                docs_selected[docs[:wanted]] = True
        return np.flatnonzero(docs_selected)
Пример #3
0
    def split(self, p, seed=None, stratify=True):
        """
        Splits the dataset at random into two non-overlapping parts containing
        about p and (1-p) of the instances, respectively.

        If ``stratify`` is true and ``self.has_discrete_labels()`` is true,
        each class ``c`` in ``self.classes`` is split on its own, sending
        ``int(p * count_c)`` of the instances with target ``c`` to the first
        part.  In every other case ``int(p * self.num_instances)`` instances
        are taken from the dataset as a whole.  Everything not taken
        (including instances whose target matches no entry of
        ``self.classes``) lands in the second part.

        Args:
            p: Share of the data for the first part, strictly between 0 and 1.
            seed: Handed to ``numpy_random_seed_temporarily``, which surrounds
                the shuffling performed here.
            stratify: Ask for a per-class split; has no effect when the
                dataset lacks discrete labels.

        Returns:
            ``(self.subset(first_indices), self.subset(second_indices))``,
            with both index arrays sorted in ascending order.

        Raises:
            ValueError: If ``p`` does not lie strictly between 0 and 1.
            Exception: If truncation leaves the first part with no instance
                (or with no instance of some class when stratifying).
        """
        if p <= 0.0 or p >= 1.0:
            raise ValueError('Split proportion must be a float between 0 and 1')
        if stratify and not self.has_discrete_labels():
            stratify = False

        with numpy_random_seed_temporarily(seed):
            instances_selected = np.zeros((self.num_instances,), dtype='bool')
            if stratify:
                examples_in_class = {c: np.flatnonzero(self.targets == c) for c in self.classes}
                # .values() exists on both Python 2 and 3 (unlike .iteritems())
                # and preserves the dict's iteration order.
                index_groups = examples_in_class.values()
            else:
                # The whole dataset is treated as one group.
                index_groups = [np.arange(self.num_instances)]

            for indices in index_groups:
                np.random.shuffle(indices)

                # Pick data points for the first split
                n = int(p * len(indices))
                if n == 0:
                    # The parentheses matter: without them the string is
                    # formatted first and then multiplied by a float.
                    raise Exception('Not enough data instances to select %.2f%%' % (p * 100.))
                instances_selected[indices[:n]] = True

        first_indices = np.flatnonzero(instances_selected)
        # Boolean masks are inverted with `~`; current NumPy releases refuse
        # np.negative on bool arrays.
        second_indices = np.flatnonzero(~instances_selected)
        return self.subset(first_indices), self.subset(second_indices)