Пример #1
0
def test_label_binarize_with_class_order():
    """Output columns of ``label_binarize`` follow the order of ``classes``."""
    labels = [1, 6]

    # Ascending class order: 1 maps to column 0 and 6 to column 3.
    ascending = label_binarize(labels, classes=[1, 2, 4, 6])
    assert_array_equal(ascending, np.array([[1, 0, 0, 0], [0, 0, 0, 1]]))

    # Modified class order: 6 is now the second class, hence column 1.
    reordered = label_binarize(labels, classes=[1, 6, 4, 2])
    assert_array_equal(reordered, np.array([[1, 0, 0, 0], [0, 1, 0, 0]]))
Пример #2
0
def test_label_binarize_with_class_order():
    """Check that ``label_binarize`` lays out columns in ``classes`` order."""
    cases = (
        # Sorted classes: label 6 ends up in the last column.
        ([1, 2, 4, 6], [[1, 0, 0, 0], [0, 0, 0, 1]]),
        # Modified class order: label 6 moves to the second column.
        ([1, 6, 4, 2], [[1, 0, 0, 0], [0, 1, 0, 0]]),
    )
    for class_order, expected_rows in cases:
        result = label_binarize([1, 6], classes=class_order)
        assert_array_equal(result, np.array(expected_rows))
Пример #3
0
def test_label_binarizer_errors():
    """Check that invalid arguments and inputs raise ``ValueError``."""
    # A binarizer fitted on a single class is expected to reject
    # multilabel (sequence of sequences) input.
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    multi_label = [(2, 3), (0, ), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    # Empty input on a binarizer that was never fitted
    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])
    with pytest.raises(ValueError):
        lb.inverse_transform([])

    # neg_label greater than, or equal to, pos_label
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)

    # Non-zero neg_label combined with sparse output
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo",
                                       classes=[1, 2],
                                       threshold=0)

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo",
                                       classes=[1, 2, 3],
                                       threshold=0)

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
                                       output_type="binary",
                                       classes=[1, 2, 3],
                                       threshold=0)

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        # `classes` is passed by keyword: scikit-learn releases that make it
        # keyword-only would otherwise raise TypeError before the check
        # under test is reached.
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
Пример #4
0
def test_label_binarize_multiclass():
    """Check ``label_binarize`` on a three-class target with pos_label=2."""
    y = [0, 1, 2]
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    # Each sample i is expected to have pos_label in column i, zeros elsewhere.
    expected = 2 * np.eye(3)

    check_binarized_results(y, classes, pos_label, neg_label, expected)

    # A non-zero neg_label together with sparse output must be rejected.
    # `classes` is passed by keyword so that scikit-learn releases with a
    # keyword-only `classes` reach the ValueError instead of a TypeError.
    with pytest.raises(ValueError):
        label_binarize(y,
                       classes=classes,
                       neg_label=-1,
                       pos_label=pos_label,
                       sparse_output=True)
Пример #5
0
def test_label_binarize_with_multilabel_indicator():
    """Check that binarizing a binary indicator matrix is not a no-op."""
    neg_label, pos_label = -1, 2
    classes = np.arange(3)

    y = np.array([[0, 1, 0], [1, 1, 1]])
    # Zeros are expected to become neg_label and ones pos_label.
    expected = np.array([[-1, 2, -1], [2, 2, 2]])

    # Functional API.
    # NOTE(review): `classes` is positional and `multilabel` is an old
    # keyword -- confirm both are accepted by the scikit-learn version
    # this test targets.
    assert_array_equal(
        label_binarize(y,
                       classes,
                       multilabel=True,
                       neg_label=neg_label,
                       pos_label=pos_label),
        expected)

    # Transformer API, fitting and transforming in one step ...
    lb = LabelBinarizer(pos_label=pos_label, neg_label=neg_label)
    assert_array_equal(lb.fit_transform(y), expected)

    # ... and as two chained steps.
    assert_array_equal(lb.fit(y).transform(y), expected)
Пример #6
0
def load_kaggle_mnist_train(filen, nrows=None, zero_to_negone=False):
    """
    Read the Kaggle MNIST training dataset and return X (input) and
    Y (label) data.

    The dataset should be a .csv or .csv.gz file
    (NOBSERVATIONS+1 x NDIMENSIONS+1), with first row = header and
    first column = labels.

    Parameters:
        filen: path of the file; a name ending in '.gz' is read as gzip.
        nrows: number of patterns to read, or None to read all of them.
        zero_to_negone: if True, every 0 in X is replaced by -1.

    Returns:
        (x, y): x is the NOBSERVATIONS x 784 integer array of inputs and
        y is the NOBSERVATIONS x 10 binarized label matrix holding a
        single 1 per row (i.e. 1-vs-all).

    Raises:
        AssertionError: if the loaded data does not have the expected
        shape or label range.
    """
    # Parenthesised single-argument print works as a statement on Python 2
    # and as a function call on Python 3.
    print('Loading %s Kaggle MNIST train patterns from %s' % (str(nrows) if nrows else 'all', filen))
    t = Stopwatch()
    panda = pd.read_csv(filen, delimiter=',', dtype=int, header=None, nrows=nrows,
                        skiprows=1, compression=('gzip' if filen.endswith('.gz') else None))
    data = panda.values  # numpy array
    x = data[:, 1:]
    y_vec = data[:, 0]

    assert x.shape[0] == y_vec.shape[0]
    assert x.shape[1] == 784
    assert len(y_vec.shape) == 1
    assert np.min(y_vec) == 0 and np.max(y_vec) == 9
    # turn labels from vector (with values from 0-9) to
    # NOBSERVATIONSx10 matrix, with a single 1 in each row (i.e. 1-vs-all)
    y = label_binarize(y_vec, classes=range(10))
    assert y.shape == (x.shape[0], 10)
    assert all(np.sum(y, axis=1) == 1)
    if zero_to_negone:
        x[x == 0] = -1
    if nrows is not None:
        assert x.shape[0] == nrows
    print('done: %r in %is' % (x.shape, t.finish(milli=False)))
    return x, y
Пример #7
0
    def load_dataset(self):
        """Load the ARFF file at ``self.file_name`` as a numeric feature matrix.

        The attribute named ``self.label_attribute`` is split off as the label
        column; when it is ``None`` the last attribute of the file is used and
        stored back on the instance.  Nominal attributes are either binarized
        with ``label_binarize`` (``self.binarize`` true) or integer-encoded
        with a ``LabelEncoder``; numeric attributes have their NaNs replaced
        by the mean of the non-NaN values.  Attributes with no usable value
        are dropped, and zero-variance columns are removed at the end.

        Returns:
            tuple: ``(instances, labels)`` where ``instances`` is the 2-D
            float64 feature array and ``labels`` is the raw label column, or
            ``None`` if no attribute matched ``self.label_attribute``.
        """
        # Only the parsing needs the file handle; release it right away.
        with open(self.file_name) as f:
            dataset = arff.load(f)

        if self.label_attribute is None:
            self.label_attribute = dataset["attributes"][-1][0]

        # One entry per attribute (a column of the original data matrix).
        data = list(numpy.asarray(dataset["data"]).transpose())
        labels = None

        row = 0
        for attribute_name, attribute_type in dataset["attributes"]:
            if attribute_name == self.label_attribute:
                # Labels found!
                labels = data.pop(row)
                continue
            # Nominal attribute
            if isinstance(attribute_type, list):
                column = data[row]
                # Convert None in '?' for next check and to make
                # label_binarize work
                for j, value in enumerate(column):
                    if value is None:
                        column[j] = "?"
                if numpy.all(column == "?"):
                    # If no data is present, just remove the row
                    data.pop(row)
                    continue
                if self.binarize:
                    # `classes` by keyword: it is keyword-only in recent
                    # scikit-learn releases.
                    data[row] = numpy.asarray(
                        label_binarize(column, classes=attribute_type),
                        dtype=numpy.float64)
                else:
                    encoder = LabelEncoder()
                    encoder.classes_ = attribute_type
                    if "?" not in encoder.classes_:
                        encoder.classes_.insert(0, "?")
                    data[row] = encoder.transform(column).reshape(
                        (len(column), 1)).astype(numpy.float64)
            else:
                # Numeric attributes: check for nan values
                column = data[row].astype(numpy.float64)
                nans = numpy.isnan(column)
                if numpy.all(nans):
                    # If everything is nan, remove the feature
                    data.pop(row)
                    continue
                if numpy.any(nans):
                    # Impute missing entries with the mean of the valid ones
                    valid = numpy.invert(nans)
                    column[nans] = column[valid].sum() / valid.sum()
                # Reshape to do hstack later
                data[row] = column.reshape((len(column), 1))
            # Go to next row only if we have NOT removed the current one
            row += 1

        instances = numpy.hstack(tuple(data))
        # Constant columns carry no information; drop them.
        useless_indices = numpy.where(instances.var(axis=0) == 0)
        instances = numpy.delete(instances, useless_indices, axis=1)

        return instances, labels
Пример #8
0
def otto_dataset(params):
    """Load the Otto training set from ``train.csv.gz``.

    ``params.get('n_rows')`` limits the number of rows read and
    ``params['est']`` selects the label format: for ``'keras'`` the labels
    are binarized, otherwise they stay a zero-based int16 series.

    Returns a ``(features, labels)`` pair.
    """
    row_limit = params.get('n_rows')
    df = pd.read_csv('train.csv.gz', index_col='id', nrows=row_limit)

    features = df.drop(['target'], axis=1)

    # Drop the first six characters of each target value, parse the
    # remainder as an integer and shift it to start at zero.
    class_suffix = df.target.apply(lambda e: e[6:])
    labels = class_suffix.astype(np.int16) - 1

    if params['est'] != 'keras':
        return features, labels

    distinct = sorted(set(labels))
    return features, label_binarize(labels, classes=distinct)
Пример #9
0
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    For both sparse and dense output, ``y`` is binarized with the function
    and with the transformer, the result is compared to ``expected``, and the
    inverse transformation is checked to give back ``y``.  Label combinations
    that cannot be combined with sparse output (``pos_label == 0`` or
    ``neg_label != 0``) are only checked to raise ``ValueError``.
    """
    for sparse_output in [True, False]:
        if ((pos_label == 0 or neg_label != 0) and sparse_output):
            # `classes` is passed by keyword so that scikit-learn releases
            # with a keyword-only `classes` reach the ValueError under test.
            with pytest.raises(ValueError):
                label_binarize(y,
                               classes=classes,
                               neg_label=neg_label,
                               pos_label=pos_label,
                               sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y,
                                   classes=classes,
                                   neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            # Threshold halfway between the two label values
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label,
                            pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert issparse(binarized) == sparse_output
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        # The inverse is expected to be sparse exactly when the input was
        assert issparse(inverse_output) == issparse(y)
Пример #10
0
    def load_dataset(self):
        """Load the ARFF file at ``self.file_name`` as a numeric feature matrix.

        The attribute named ``self.label_attribute`` is split off as the label
        column; when it is ``None`` the last attribute of the file is used and
        stored back on the instance.  Nominal attributes are either binarized
        with ``label_binarize`` (``self.binarize`` true) or integer-encoded
        with a ``LabelEncoder``; numeric attributes have their NaNs replaced
        by the mean of the non-NaN values.  Attributes with no usable value
        are dropped, and zero-variance columns are removed at the end.

        Returns:
            tuple: ``(instances, labels)`` where ``instances`` is the 2-D
            float64 feature array and ``labels`` is the raw label column, or
            ``None`` if no attribute matched ``self.label_attribute``.
        """
        # Only the parsing needs the file handle; release it right away.
        with open(self.file_name) as f:
            dataset = arff.load(f)

        if self.label_attribute is None:
            self.label_attribute = dataset["attributes"][-1][0]

        # One entry per attribute (a column of the original data matrix).
        data = list(numpy.asarray(dataset["data"]).transpose())
        labels = None

        row = 0
        for attribute_name, attribute_type in dataset["attributes"]:
            if attribute_name == self.label_attribute:
                # Labels found!
                labels = data.pop(row)
                continue
            # Nominal attribute
            if isinstance(attribute_type, list):
                column = data[row]
                # Convert None in '?' for next check and to make
                # label_binarize work
                for j, value in enumerate(column):
                    if value is None:
                        column[j] = "?"
                if numpy.all(column == "?"):
                    # If no data is present, just remove the row
                    data.pop(row)
                    continue
                if self.binarize:
                    # `classes` by keyword: it is keyword-only in recent
                    # scikit-learn releases.
                    data[row] = numpy.asarray(
                        label_binarize(column, classes=attribute_type),
                        dtype=numpy.float64)
                else:
                    encoder = LabelEncoder()
                    encoder.classes_ = attribute_type
                    if "?" not in encoder.classes_:
                        encoder.classes_.insert(0, "?")
                    data[row] = encoder.transform(column).reshape(
                        (len(column), 1)).astype(numpy.float64)
            else:
                # Numeric attributes: check for nan values
                column = data[row].astype(numpy.float64)
                nans = numpy.isnan(column)
                if numpy.all(nans):
                    # If everything is nan, remove the feature
                    data.pop(row)
                    continue
                if numpy.any(nans):
                    # Impute missing entries with the mean of the valid ones
                    valid = numpy.invert(nans)
                    column[nans] = column[valid].sum() / valid.sum()
                # Reshape to do hstack later
                data[row] = column.reshape((len(column), 1))
            # Go to next row only if we have NOT removed the current one
            row += 1

        instances = numpy.hstack(tuple(data))
        # Constant columns carry no information; drop them.
        useless_indices = numpy.where(instances.var(axis=0) == 0)
        instances = numpy.delete(instances, useless_indices, axis=1)

        return instances, labels
Пример #11
0
def test_label_binarize_multilabel():
    """Check ``label_binarize`` on dense and sparse indicator matrices."""
    y_ind = np.array([[0, 1, 0], [1, 1, 1], [0, 0, 0]])
    classes = [0, 1, 2]
    pos_label = 2
    neg_label = 0
    # With neg_label == 0 the expected output is the indicator matrix
    # scaled by pos_label.
    expected = pos_label * y_ind
    y_sparse = [
        sparse_matrix(y_ind) for sparse_matrix in
        [coo_matrix, csc_matrix, csr_matrix, dok_matrix, lil_matrix]
    ]

    for y in [y_ind] + y_sparse:
        check_binarized_results(y, classes, pos_label, neg_label, expected)

    # A non-zero neg_label together with sparse output must be rejected.
    # The input is named explicitly (the last sparse variant, i.e. the value
    # the loop variable ended on) instead of relying on the leaked loop
    # variable, and `classes` is passed by keyword so that scikit-learn
    # releases with a keyword-only `classes` reach the ValueError.
    with pytest.raises(ValueError):
        label_binarize(y_sparse[-1],
                       classes=classes,
                       neg_label=-1,
                       pos_label=pos_label,
                       sparse_output=True)
Пример #12
0
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    For both sparse and dense output, ``y`` is binarized with the function
    and with the transformer, the result is compared to ``expected``, and the
    inverse transformation is checked to give back ``y``.  Label combinations
    that cannot be combined with sparse output (``pos_label == 0`` or
    ``neg_label != 0``) are only checked to raise ``ValueError``.
    """
    for sparse_output in [True, False]:
        if ((pos_label == 0 or neg_label != 0) and sparse_output):
            # `classes` is forwarded by keyword so that scikit-learn releases
            # with a keyword-only `classes` reach the ValueError under test.
            assert_raises(ValueError, label_binarize, y, classes=classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes=classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)

        else:
            # Threshold halfway between the two label values
            inversed = _inverse_binarize_thresholding(
                binarized,
                output_type=y_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                            sparse_output=sparse_output)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        # The inverse is expected to be sparse exactly when the input was
        assert_equal(issparse(inverse_output), issparse(y))
Пример #13
0
def test_label_binarize_with_multilabel_indicator():
    """Binarizing a binary indicator matrix must remap its values."""
    y = np.array([[0, 1, 0], [1, 1, 1]])
    label_kwargs = dict(neg_label=-1, pos_label=2)
    # Every 0 is expected to turn into -1 and every 1 into 2.
    expected = np.array([[-1, 2, -1], [2, 2, 2]])

    # With label binarize.
    # NOTE(review): `classes` is positional and `multilabel` is an old
    # keyword -- confirm both are accepted by the scikit-learn version
    # this test targets.
    from_function = label_binarize(y, np.arange(3), multilabel=True,
                                   **label_kwargs)
    assert_array_equal(from_function, expected)

    # With the transformer: one-step fit_transform first ...
    lb = LabelBinarizer(**label_kwargs)
    from_fit_transform = lb.fit_transform(y)
    assert_array_equal(from_fit_transform, expected)

    # ... then a separate fit followed by transform.
    from_two_steps = lb.fit(y).transform(y)
    assert_array_equal(from_two_steps, expected)
Пример #14
0
def roc_auc_avg_score(y_true, y_score):
    """Return ``roc_auc_score`` computed on a binarized ``y_true``.

    The distinct values of ``y_true``, in sorted order, are used as the
    classes for ``label_binarize``.
    """
    distinct_labels = sorted(set(y_true))
    indicator = label_binarize(y_true, classes=distinct_labels)
    return roc_auc_score(indicator, y_score)
Пример #15
0
def test_invalid_input_label_binarize():
    """``label_binarize`` must reject a neg_label larger than pos_label."""
    labels = [0, 2]
    known_classes = [0, 2]
    with pytest.raises(ValueError):
        label_binarize(labels, classes=known_classes, pos_label=0, neg_label=1)