예제 #1
0
def test_label_binarizer_multilabel_unlabeled():
    """Check that LabelBinarizer can handle an unlabeled sample."""
    samples = [[1, 2], [1], []]
    # The last sample carries no label, so its indicator row is all zeros.
    expected = np.array([[1, 1], [1, 0], [0, 0]])
    binarizer = LabelBinarizer()
    assert_array_equal(binarizer.fit_transform(samples), expected)
예제 #2
0
def test_label_binarizer_unseen_labels():
    """Check that labels not seen during fit transform to all-zero rows."""
    binarizer = LabelBinarizer()

    # Fit on three labels: one indicator column per label.
    want = np.array([[1, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1]])
    have = binarizer.fit_transform(["b", "d", "e"])
    assert_array_equal(want, have)

    # "a", "c" and "f" were not fitted; their rows are expected to be zero.
    want = np.array([[0, 0, 0],
                     [1, 0, 0],
                     [0, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1],
                     [0, 0, 0]])
    have = binarizer.transform(["a", "b", "c", "d", "e", "f"])
    assert_array_equal(want, have)
예제 #3
0
def test_label_binarizer_multilabel():
    """Exercise LabelBinarizer on multilabel input in several forms."""
    binarizer = LabelBinarizer()

    # Input given as a list of label tuples.
    label_tuples = [(2, 3), (1,), (1, 2)]
    indicator = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
    binarized = binarizer.fit_transform(label_tuples)
    assert_true(binarizer.multilabel_)
    assert_array_equal(indicator, binarized)
    assert_equal(binarizer.inverse_transform(binarized), label_tuples)

    # Input given as a label indicator matrix.
    binarizer.fit(indicator)
    assert_array_equal(indicator, binarizer.inverse_transform(indicator))

    # Regression test for the two-class multilabel case.
    binarizer = LabelBinarizer()
    label_lists = [[1, 0], [0], [1], [0, 1]]
    two_class_indicator = np.array([[1, 1], [1, 0], [0, 1], [1, 1]])
    binarized = binarizer.fit_transform(label_lists)
    assert_true(binarizer.multilabel_)
    assert_array_equal(two_class_indicator, binarized)
    # Compare as sets: label order within a sample is not part of the check.
    recovered = binarizer.inverse_transform(binarized)
    assert_equal([set(row) for row in recovered],
                 [set(row) for row in label_lists])
예제 #4
0
def test_label_binarizer_iris():
    """Compare per-class SGD on binarized targets with multiclass SGD on iris."""
    binarizer = LabelBinarizer()
    indicator = binarizer.fit_transform(iris.target)

    # Fit one binary classifier per indicator column before scoring any of them.
    classifiers = []
    for k in range(len(binarizer.classes_)):
        classifiers.append(SGDClassifier().fit(iris.data, indicator[:, k]))

    # Stack the decision values so that each column belongs to one class.
    scores = np.array([clf.decision_function(iris.data)
                       for clf in classifiers]).T
    ovr_pred = binarizer.inverse_transform(scores)
    ovr_accuracy = np.mean(iris.target == ovr_pred)

    # Reference: a single classifier fitted directly on the original targets.
    direct_pred = SGDClassifier().fit(iris.data, iris.target).predict(iris.data)
    direct_accuracy = np.mean(iris.target == direct_pred)

    assert_almost_equal(ovr_accuracy, direct_accuracy)
def dbpedia_convgemb(sample=None, n_procs=None):
    """Train a 1-D convolutional text classifier on DBpedia data and print accuracy.

    Parameters
    ----------
    sample : optional
        Forwarded to ``get_dbpedia_data`` as ``size``. When truthy, the test
        size is derived from the per-category counts of the loaded frame;
        otherwise it is fixed to ``5000 * 14``.
    n_procs : optional
        Replaced by ``cpu_count()`` when falsy. It is not used anywhere else
        in this function.

    Returns
    -------
    None
        The accuracy on the test split is printed, not returned.
    """
    n_procs = n_procs or cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        per_category = df.category.value_counts().values
        test_size = int(round(np.sum(5000 * per_category / 45000)))
    else:
        test_size = 5000 * 14

    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_idx, test_idx = next(iter(splitter))
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    label_binarizer = LabelBinarizer()

    def encode(docs):
        # Known tokens become their vocabulary id shifted by one, so that 0
        # stays available as the padding value; unknown tokens are dropped.
        token_ids = [[vocab.token2id[tok] + 1 for tok in sent if tok in vocab.token2id]
                     for sent in docs]
        return np.array(pad_sentences(token_ids, max_length=100, padding_word=0))

    x_train = encode(train_docs)
    y_train = label_binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = encode(test_docs)
    y_test = label_binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    # Layers are added one at a time because the pooling length is read from
    # the model's output shape after the convolution has been added.
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2,
                        weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    # Column index of each non-zero entry of y_test; presumably one per row
    # (one-hot targets) -- TODO confirm.
    true_classes = np.argwhere(y_test)[:, 1]
    print(accuracy_score(true_classes, model.predict_classes(x_test)))
예제 #6
0
def test_label_binarizer():
    """Round-trip one-class, two-class and multi-class string labels."""
    binarizer = LabelBinarizer()

    # One-class case: defaults to the negative label.
    labels = ["pos", "pos", "pos", "pos"]
    want = np.array([[0, 0, 0, 0]]).T
    have = binarizer.fit_transform(labels)
    # Reading ``multilabel_`` is expected to emit a DeprecationWarning.
    assert_false(assert_warns(DeprecationWarning, getattr, binarizer,
                              "multilabel_"))
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)

    # Two-class case: a single output column.
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0, 1, 1, 0]]).T
    have = binarizer.fit_transform(labels)
    assert_false(assert_warns(DeprecationWarning, getattr, binarizer,
                              "multilabel_"))
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, have)

    # A two-column indicator is expected to invert to the same labels.
    two_column = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_column), labels)

    # Multi-class case.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([[0, 0, 0, 1],
                     [0, 0, 1, 0],
                     [0, 1, 0, 0],
                     [0, 0, 1, 0],
                     [1, 0, 0, 0]])
    have = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_false(assert_warns(DeprecationWarning, getattr, binarizer,
                              "multilabel_"))
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)
예제 #7
0
def test_label_binarizer_column_y():
    """Contrast column-shaped list input with the equivalent ndarray input."""
    # Binary classification vs multi-label with one possible class per row:
    # a list of lists is treated as multi-label, an array as multi-class.
    as_list = [[1], [2], [1]]
    as_array = np.array(as_list)

    expected_from_list = np.array([[1, 0], [0, 1], [1, 0]])
    expected_from_array = np.array([[0], [1], [0]])

    from_list = LabelBinarizer().fit_transform(as_list)
    from_array = LabelBinarizer().fit_transform(as_array)

    assert_array_equal(from_list, expected_from_list)
    assert_array_equal(from_array, expected_from_array)

    # Multiclass classification vs multi-label with multiple classes:
    # here both inputs yield the same indicator matrix.
    as_list = [[1], [2], [1], [3]]
    as_array = np.array(as_list)

    shared_indicator = np.array([[1, 0, 0],
                                 [0, 1, 0],
                                 [1, 0, 0],
                                 [0, 0, 1]])

    from_list = LabelBinarizer().fit_transform(as_list)
    from_array = LabelBinarizer().fit_transform(as_array)

    assert_array_equal(from_list, from_array)
    assert_array_equal(from_array, shared_indicator)
예제 #8
0
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    fitted_on_one_class = LabelBinarizer().fit(np.array([0, 0, 0, 0]))

    # Multilabel input handed to a binarizer fitted on a single class.
    assert_raises(ValueError, fitted_on_one_class.transform,
                  [(2, 3), (0, ), (0, 2)])

    # Empty input on a binarizer that has not been fitted.
    unfitted = LabelBinarizer()
    for method in (unfitted.transform, unfitted.inverse_transform):
        assert_raises(ValueError, method, [])

    # neg_label greater than, or equal to, pos_label.
    for pos_label in (1, 2):
        assert_raises(ValueError, LabelBinarizer, neg_label=2,
                      pos_label=pos_label)

    # Sparse output requested together with a non-zero neg_label.
    sparse_kwargs = dict(neg_label=1, pos_label=2, sparse_output=True)
    assert_raises(ValueError, LabelBinarizer, **sparse_kwargs)

    invalid_thresholding_inputs = [
        # Fail on y_type
        (csr_matrix([[1, 2], [2, 1]]), "foo", [1, 2]),
        # Fail on the number of classes
        (csr_matrix([[1, 2], [2, 1]]), "foo", [1, 2, 3]),
        # Fail on the dimension of 'binary'
        (np.array([[1, 2, 3], [2, 1, 3]]), "binary", [1, 2, 3]),
    ]
    for y, output_type, classes in invalid_thresholding_inputs:
        assert_raises(ValueError,
                      _inverse_binarize_thresholding,
                      y=y,
                      output_type=output_type,
                      classes=classes,
                      threshold=0)

    # Fail on multioutput data
    assert_raises(ValueError, LabelBinarizer().fit,
                  np.array([[1, 3], [2, 1]]))
    assert_raises(ValueError, label_binarize,
                  np.array([[1, 3], [2, 1]]), [1, 2, 3])
    def __init__(self, trainFile, devFile, testFile, d=100, yita=0.1):
        """Set up language ids, punctuation list and label encoder, then load data.

        ``trainFile``, ``devFile`` and ``testFile`` are passed unchanged to
        ``self.Initialize``; ``d`` and ``yita`` are stored and also passed to
        ``self.setParameters``.
        """
        self.d = d
        self.yita = yita  # presumably the learning rate (eta) -- TODO confirm
        self.languages = {
            "ENGLISH": 1,
            "FRENCH": 3,
            "ITALIAN": 2,
        }
        self.punctuations = [
            ".", "'", ":", ",", "-", "...", "!", "_", "(", ")", "?", '"',
            ";", "/", "\\", "{", "}", "[", "]", "|", "<", ">", "+", "=",
            "@", "#", "$", "%", "^", "&", "*",
        ]
        self.noPunctuation = False

        # The label encoder is fitted on the three language ids used above.
        self.answerLables = LabelBinarizer()
        self.answerLables.fit([1, 2, 3])
        self.c = set()

        self.Initialize(trainFile, devFile, testFile)

        # Input width depends on the size of self.c after Initialize has run
        # (assumes Initialize fills self.c -- TODO confirm).
        self.input = 5 * len(self.c) + 1
        self.setParameters(d, yita)
예제 #10
0
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    binarizer = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    assert_false(binarizer.multilabel_)

    # Multilabel input handed to a binarizer fitted on a single class.
    assert_raises(ValueError, binarizer.transform, [(2, 3), (0, ), (0, 2)])

    # Empty input on a binarizer that has not been fitted.
    binarizer = LabelBinarizer()
    for method in (binarizer.transform, binarizer.inverse_transform):
        assert_raises(ValueError, method, [])

    # neg_label greater than, or equal to, pos_label.
    for pos_label in (1, 2):
        assert_raises(ValueError, LabelBinarizer, neg_label=2,
                      pos_label=pos_label)
예제 #11
0
def test_label_binarizer_multilabel_unlabeled():
    """Check that LabelBinarizer can handle an unlabeled sample."""
    binarizer = LabelBinarizer()
    samples = [[1, 2], [1], []]
    expected = np.array([[1, 1],
                         [1, 0],
                         [0, 0]])
    # fit_transform on this input is expected to emit a DeprecationWarning;
    # assert_warns hands back the call's result for the comparison.
    binarized = assert_warns(DeprecationWarning, binarizer.fit_transform,
                             samples)
    assert_array_equal(binarized, expected)
예제 #12
0
def fit_binarizers(all_values):
    """Fit one binarizer per categorical feature position.

    Parameters
    ----------
    all_values : sequence of sequences
        One feature vector ("context") per sample. The feature positions are
        taken from the first vector, and the type of each feature is decided
        from its value in the first vector.

    Returns
    -------
    dict
        Maps a feature index to a fitted ``LabelBinarizer`` (string-valued
        feature) or ``MultiLabelBinarizer`` (list-valued feature). Features
        of any other type (e.g. ints/floats) get no entry. An empty
        ``all_values`` yields an empty dict.
    """
    # ``unicode`` only exists on Python 2. Referencing it unguarded raises
    # NameError on Python 3 as soon as a non-str feature is inspected.
    try:
        string_types = (str, unicode)  # noqa: F821 (Python 2 only)
    except NameError:
        string_types = (str,)

    binarizers = {}
    if len(all_values) == 0:
        return binarizers

    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        first = cur_features[0]
        # only categorical values need to be binarized, ints/floats are left as they are
        if isinstance(first, string_types):
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif isinstance(first, list):
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            label_sets = [tuple(x) for x in cur_features]
            label_sets.append(("__unk__",))
            mlb.fit(label_sets)
            binarizers[f] = mlb
    return binarizers
예제 #13
0
 def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
     """Record the hyperparameters and build the wrapped ``SKLModel`` from them."""
     self._hyperparams = dict(
         neg_label=neg_label,
         pos_label=pos_label,
         sparse_output=sparse_output,
     )
     self._wrapped_model = SKLModel(**self._hyperparams)
예제 #14
0
 def fit(self, X, y=None):
     """Create a fresh ``SKLModel`` from the stored hyperparameters and fit it.

     ``y`` is forwarded to the model only when it is not ``None``.
     Returns ``self``.
     """
     model = SKLModel(**self._hyperparams)
     self._sklearn_model = model
     if y is None:
         model.fit(X)
     else:
         model.fit(X, y)
     return self
예제 #15
0
def fit_binarizers(all_values):
    """Build a ``{feature index: fitted binarizer}`` mapping for categorical features.

    Parameters
    ----------
    all_values : sequence of sequences
        One feature vector ("context") per sample. Feature positions and
        feature types are both read from the first vector.

    Returns
    -------
    dict
        String-valued features map to a fitted ``LabelBinarizer`` and
        list-valued features to a fitted ``MultiLabelBinarizer``. Features of
        other types (e.g. ints/floats) are left out. Empty input gives an
        empty dict.
    """
    binarizers = {}
    if len(all_values) == 0:
        return binarizers

    # On Python 3 there is no ``unicode`` builtin; evaluating the name there
    # raises NameError, so it is only added when it actually exists.
    try:
        text_types = (str, unicode)  # noqa: F821 (Python 2 only)
    except NameError:
        text_types = (str,)

    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        sample_value = cur_features[0]
        # only categorical values need to be binarized, ints/floats are left as they are
        if isinstance(sample_value, text_types):
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif isinstance(sample_value, list):
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            cur_features.append(("__unk__",))
            mlb.fit([tuple(x) for x in cur_features])
            binarizers[f] = mlb
    return binarizers
예제 #16
0
def test_label_binarizer_unseen_labels():
    """Transforming labels that were never fitted is expected to give zero rows."""
    binarizer = LabelBinarizer()

    one_hot = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    fitted_output = binarizer.fit_transform(['b', 'd', 'e'])
    assert_array_equal(one_hot, fitted_output)

    # Rows for 'a', 'c' and 'f' stay zero because those labels were not fitted.
    with_unseen = np.array([[0, 0, 0], [1, 0, 0], [0, 0, 0],
                            [0, 1, 0], [0, 0, 1], [0, 0, 0]])
    transformed = binarizer.transform(['a', 'b', 'c', 'd', 'e', 'f'])
    assert_array_equal(with_unseen, transformed)
예제 #17
0
def test_label_binarizer_unseen_labels():
    """Check the indicator rows produced for fitted and for unseen labels."""
    known_labels = ['b', 'd', 'e']
    mixed_labels = ['a', 'b', 'c', 'd', 'e', 'f']
    encoder = LabelBinarizer()

    # Fitted labels: one indicator column each.
    expected_known = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    assert_array_equal(expected_known, encoder.fit_transform(known_labels))

    # Unseen labels ('a', 'c', 'f') are expected to map to all-zero rows.
    expected_mixed = np.array([
        [0, 0, 0],
        [1, 0, 0],
        [0, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0],
    ])
    assert_array_equal(expected_mixed, encoder.transform(mixed_labels))
class CategoryBinarizer(TransformerMixin):
    """Transformer that binarizes a categorical pandas object with ``LabelBinarizer``.

    ``fit`` and ``transform`` both read ``X.values``, so ``X`` must expose a
    ``.values`` attribute (e.g. a pandas Series).
    """

    def __init__(self):
        self.__encoder = LabelBinarizer(sparse_output=False)

    def fit(self, X, y=None):
        """Fit the underlying encoder on ``X.values``; ``y`` is ignored."""
        self.__encoder.fit(X.values)
        return self

    def transform(self, X):
        """Return a DataFrame of indicator columns named after the fitted classes.

        The returned frame gets a fresh default index; the index of ``X`` is
        not carried over.
        """
        encoded = self.__encoder.transform(X.values)
        column_names = self.__encoder.classes_
        # With exactly two classes LabelBinarizer emits a single column that
        # marks the second (positive) class, so naming the columns after both
        # classes would fail with a length mismatch.
        if encoded.shape[1] == 1 and len(column_names) == 2:
            column_names = column_names[1:]

        result = pd.DataFrame(encoded)
        result.columns = column_names
        return result
예제 #19
0
def test_label_binarizer():
    """Check fit/transform/inverse for one, two and several string classes."""
    encoder = LabelBinarizer()

    def assert_not_multilabel():
        # Reading ``multilabel_`` is expected to emit a DeprecationWarning
        # and to evaluate to a falsy value.
        assert_false(assert_warns(DeprecationWarning, getattr, encoder,
                                  "multilabel_"))

    # one-class case defaults to negative label
    raw = ["pos", "pos", "pos", "pos"]
    target = np.array([[0], [0], [0], [0]])
    encoded = encoder.fit_transform(raw)
    assert_not_multilabel()
    assert_array_equal(encoder.classes_, ["pos"])
    assert_array_equal(target, encoded)
    assert_array_equal(encoder.inverse_transform(encoded), raw)

    # two-class case
    raw = ["neg", "pos", "pos", "neg"]
    target = np.array([[0], [1], [1], [0]])
    encoded = encoder.fit_transform(raw)
    assert_not_multilabel()
    assert_array_equal(encoder.classes_, ["neg", "pos"])
    assert_array_equal(target, encoded)

    # a two-column indicator must invert to the same labels
    two_column = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(encoder.inverse_transform(two_column), raw)

    # multi-class case
    raw = ["spam", "ham", "eggs", "ham", "0"]
    target = np.array([[0, 0, 0, 1],
                       [0, 0, 1, 0],
                       [0, 1, 0, 0],
                       [0, 0, 1, 0],
                       [1, 0, 0, 0]])
    encoded = encoder.fit_transform(raw)
    assert_array_equal(encoder.classes_, ["0", "eggs", "ham", "spam"])
    assert_not_multilabel()
    assert_array_equal(target, encoded)
    assert_array_equal(encoder.inverse_transform(encoded), raw)
예제 #20
0
def test_label_binarizer():
    """Round-trip string labels through LabelBinarizer for 1, 2 and 4 classes."""
    binarizer = LabelBinarizer()

    # One-class case: defaults to the negative label.
    labels = ["pos", "pos", "pos", "pos"]
    want = np.array([[0, 0, 0, 0]]).T
    have = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)

    # Two-class case: a single output column.
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0, 1, 1, 0]]).T
    have = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, have)

    # A two-column indicator is expected to invert to the same labels.
    two_column = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_column), labels)

    # Multi-class case.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
                     [0, 0, 1, 0], [1, 0, 0, 0]])
    have = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)
예제 #21
0
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    Runs once with sparse output and once with dense output. For sparse
    output combined with ``pos_label == 0`` or a non-zero ``neg_label``, only
    the ValueError from ``label_binarize`` is checked.
    """
    for sparse_output in (True, False):
        encoding = dict(neg_label=neg_label,
                        pos_label=pos_label,
                        sparse_output=sparse_output)

        if sparse_output and (pos_label == 0 or neg_label != 0):
            assert_raises(ValueError, label_binarize, y, classes, **encoding)
            continue

        # Functional interface.
        binarized = label_binarize(y, classes, **encoding)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # Inverse of the functional interface, chosen by target type.
        y_type = type_of_target(y)
        if y_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            midpoint = (neg_label + pos_label) / 2.
            inversed = _inverse_binarize_thresholding(binarized,
                                                      output_type=y_type,
                                                      classes=classes,
                                                      threshold=midpoint)
        assert_array_equal(toarray(inversed), toarray(y))

        # Transformer interface, including its inverse.
        lb = LabelBinarizer(**encoding)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)
        inverse_output = lb.inverse_transform(binarized)
        assert_array_equal(toarray(inverse_output), toarray(y))
        assert_equal(issparse(inverse_output), issparse(y))
예제 #22
0
class LabelBinarizerImpl:
    """Thin wrapper that forwards ``fit`` and ``transform`` to an ``SKLModel``."""

    def __init__(self, neg_label=0, pos_label=1, sparse_output=False):
        """Record the hyperparameters and build the wrapped model from them."""
        self._hyperparams = dict(
            neg_label=neg_label,
            pos_label=pos_label,
            sparse_output=sparse_output,
        )
        self._wrapped_model = SKLModel(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when it is not ``None``."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def transform(self, X):
        """Return the wrapped model's ``transform`` of ``X``."""
        wrapped = self._wrapped_model
        return wrapped.transform(X)
예제 #23
0
def getLoss(w, x, y, lam):
    """Return the L2-regularised softmax cross-entropy loss and its gradient.

    Parameters
    ----------
    w : array
        Weights; ``np.dot(x, w)`` gives the raw class scores.
    x : array
        Training inputs; ``x.shape[0]`` is the number of examples.
    y : array-like
        Class labels, binarized here with ``LabelBinarizer`` (assumes more
        than two classes, so that there is one indicator column per class
        -- TODO confirm).
    lam : number
        Regularisation strength.

    Returns
    -------
    tuple
        ``(loss, grad)`` where ``grad`` is the gradient of ``loss`` with
        respect to ``w``.
    """
    m = x.shape[0]  # number of training examples
    y_mat = LabelBinarizer().fit_transform(y)  # indicator form of the labels

    scores = np.dot(x, w)  # raw class scores for the current weights
    prob = softmax(scores)  # class probabilities

    # Float literals keep the scale factors exact under Python 2, where
    # ``-1 / m`` and ``lam / 2`` floor-divide for integer operands.
    scale = -1.0 / m
    loss = scale * np.sum(y_mat * np.log(prob)) + (lam / 2.0) * np.sum(w * w)
    grad = scale * np.dot(x.T, (y_mat - prob)) + lam * w
    return loss, grad
예제 #24
0
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """Shorter and simpler version of log_loss, which supports sample_weight."""
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    targets = LabelBinarizer().fit_transform(y_true)
    if targets.shape[1] == 1:
        # A single indicator column: prepend its complement so that there
        # are two columns.
        targets = numpy.append(1 - targets, targets, axis=1)

    # Keep predictions inside [eps, 1 - eps].
    probs = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    targets, probs = check_arrays(targets, probs)

    # Renormalize each row to sum to one.
    probs /= probs.sum(axis=1)[:, numpy.newaxis]
    weighted = targets * numpy.log(probs) * sample_weight[:, numpy.newaxis]
    return -weighted.sum() / numpy.sum(sample_weight)
예제 #25
0
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """Compute a sample-weighted log loss (a shorter, simpler log_loss).

    Predictions are clipped to ``[eps, 1 - eps]`` and renormalised per row
    before the weighted average is taken.
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    binarizer = LabelBinarizer()
    indicator = binarizer.fit_transform(y_true)
    single_column = indicator.shape[1] == 1
    if single_column:
        # Expand the single indicator column into complement + indicator.
        indicator = numpy.append(1 - indicator, indicator, axis=1)

    # Clipping
    clipped = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    indicator, clipped = check_arrays(indicator, clipped)

    # Renormalize
    row_sums = clipped.sum(axis=1)
    clipped /= row_sums[:, numpy.newaxis]

    weights_column = sample_weight[:, numpy.newaxis]
    total = (indicator * numpy.log(clipped) * weights_column).sum()
    loss = -total / numpy.sum(sample_weight)
    return loss
예제 #26
0
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Verify binarization of ``y`` for sparse and dense output.

    Both ``label_binarize`` and ``LabelBinarizer`` must produce ``expected``
    and invert back to ``y``. When sparse output is combined with
    ``pos_label == 0`` or a non-zero ``neg_label``, ``label_binarize`` must
    raise ValueError and the remaining checks are skipped for that setting.
    """
    for want_sparse in [True, False]:
        unsupported = want_sparse and (pos_label == 0 or neg_label != 0)
        if unsupported:
            assert_raises(ValueError, label_binarize, y, classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=want_sparse)
            continue

        # check label_binarize
        output = label_binarize(y, classes, neg_label=neg_label,
                                pos_label=pos_label,
                                sparse_output=want_sparse)
        assert_array_equal(toarray(output), expected)
        assert_equal(issparse(output), want_sparse)

        # check inverse
        target_type = type_of_target(y)
        if target_type != "multiclass":
            recovered = _inverse_binarize_thresholding(
                output,
                output_type=target_type,
                classes=classes,
                threshold=((neg_label + pos_label) / 2.))
        else:
            recovered = _inverse_binarize_multiclass(output, classes=classes)

        assert_array_equal(toarray(recovered), toarray(y))

        # Check label binarizer
        transformer = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                                     sparse_output=want_sparse)
        output = transformer.fit_transform(y)
        assert_array_equal(toarray(output), expected)
        assert_equal(issparse(output), want_sparse)
        round_trip = transformer.inverse_transform(output)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert_equal(issparse(round_trip), issparse(y))
예제 #27
0
def test_label_binarize_with_multilabel_indicator():
    """Check that passing a binary indicator matrix is not a no-op."""
    neg_label, pos_label = -1, 2
    classes = np.arange(3)

    indicator = np.array([[0, 1, 0], [1, 1, 1]])
    # Zeros become neg_label and ones become pos_label.
    relabeled = np.array([[-1, 2, -1], [2, 2, 2]])

    # Functional interface.
    from_function = label_binarize(indicator, classes, multilabel=True,
                                   neg_label=neg_label, pos_label=pos_label)
    assert_array_equal(from_function, relabeled)

    # Transformer interface: fit_transform, then fit followed by transform.
    binarizer = LabelBinarizer(pos_label=pos_label, neg_label=neg_label)
    assert_array_equal(binarizer.fit_transform(indicator), relabeled)
    assert_array_equal(binarizer.fit(indicator).transform(indicator),
                       relabeled)
예제 #28
0
def test_label_binarizer_set_label_encoding():
    """Check custom neg_label/pos_label values for two and four classes."""
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)

    # Two-class case: a single column of -2 / +2.
    labels = np.array([0, 1, 1, 0])
    want = np.array([[-2], [2], [2], [-2]])
    have = binarizer.fit_transform(labels)
    assert_false(binarizer.multilabel_)
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)

    # Multi-class case: +2 marks the class column, -2 fills the rest.
    labels = np.array([3, 2, 1, 2, 0])
    want = np.array([[-2, -2, -2, 2],
                     [-2, -2, 2, -2],
                     [-2, 2, -2, -2],
                     [-2, -2, 2, -2],
                     [2, -2, -2, -2]])
    have = binarizer.fit_transform(labels)
    assert_false(binarizer.multilabel_)
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)
예제 #29
0
def test_label_binarizer_set_label_encoding():
    """Check custom label encodings, including ``pos_label=0``."""
    # Two-class case with pos_label=0.
    zero_pos = LabelBinarizer(neg_label=-2, pos_label=0)
    labels = np.array([0, 1, 1, 0])
    want = np.array([[-2], [0], [0], [-2]])
    have = zero_pos.fit_transform(labels)
    assert_array_equal(want, have)
    assert_array_equal(zero_pos.inverse_transform(have), labels)

    # Multi-class case with a symmetric -2 / +2 encoding.
    symmetric = LabelBinarizer(neg_label=-2, pos_label=2)
    labels = np.array([3, 2, 1, 2, 0])
    want = np.array([[-2, -2, -2, 2],
                     [-2, -2, 2, -2],
                     [-2, 2, -2, -2],
                     [-2, -2, 2, -2],
                     [2, -2, -2, -2]])
    have = symmetric.fit_transform(labels)
    assert_array_equal(want, have)
    assert_array_equal(symmetric.inverse_transform(have), labels)
예제 #30
0
def test_label_binarizer_multilabel():
    """Exercise multilabel input, expecting DeprecationWarning on sequence input."""
    binarizer = LabelBinarizer()

    # Input as a list of tuples; fit_transform is expected to warn.
    label_tuples = [(2, 3), (1, ), (1, 2)]
    indicator = np.array([[0, 1, 1],
                          [1, 0, 0],
                          [1, 1, 0]])
    binarized = assert_warns(DeprecationWarning, binarizer.fit_transform,
                             label_tuples)
    assert_true(binarizer.multilabel_)
    assert_array_equal(indicator, binarized)
    assert_equal(binarizer.inverse_transform(binarized), label_tuples)

    # Input as a label indicator matrix.
    binarizer.fit(indicator)
    assert_array_equal(indicator, binarizer.inverse_transform(indicator))

    # Regression test for the two-class multilabel case.
    binarizer = LabelBinarizer()
    label_lists = [[1, 0], [0], [1], [0, 1]]
    two_class_indicator = np.array([[1, 1],
                                    [1, 0],
                                    [0, 1],
                                    [1, 1]])
    binarized = assert_warns(DeprecationWarning, binarizer.fit_transform,
                             label_lists)
    assert_true(binarizer.multilabel_)
    assert_array_equal(two_class_indicator, binarized)
    # Compare as sets: label order within a sample is not part of the check.
    recovered = binarizer.inverse_transform(binarized)
    assert_equal([set(row) for row in recovered],
                 [set(row) for row in label_lists])
예제 #31
0
def test_label_binarizer_set_label_encoding():
    """Check custom label encodings and the deprecated ``multilabel_`` attribute."""
    # two-class case with pos_label=0
    encoder = LabelBinarizer(neg_label=-2, pos_label=0)
    raw = np.array([0, 1, 1, 0])
    target = np.array([[-2, 0, 0, -2]]).T
    encoded = encoder.fit_transform(raw)
    # Reading ``multilabel_`` is expected to emit a DeprecationWarning.
    assert_false(assert_warns(DeprecationWarning, getattr, encoder,
                              "multilabel_"))
    assert_array_equal(target, encoded)
    assert_array_equal(encoder.inverse_transform(encoded), raw)

    # multi-class case
    encoder = LabelBinarizer(neg_label=-2, pos_label=2)
    raw = np.array([3, 2, 1, 2, 0])
    target = np.array([[-2, -2, -2, +2],
                       [-2, -2, +2, -2],
                       [-2, +2, -2, -2],
                       [-2, -2, +2, -2],
                       [+2, -2, -2, -2]])
    encoded = encoder.fit_transform(raw)
    assert_false(assert_warns(DeprecationWarning, getattr, encoder,
                              "multilabel_"))
    assert_array_equal(target, encoded)
    assert_array_equal(encoder.inverse_transform(encoded), raw)
예제 #32
0
def test_label_binarizer():
    """Round-trip two-class and multi-class string labels."""
    binarizer = LabelBinarizer()

    # Two-class case: a single output column.
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0], [1], [1], [0]])
    have = binarizer.fit_transform(labels)
    assert_false(binarizer.multilabel_)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)

    # Multi-class case.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([[0, 0, 0, 1],
                     [0, 0, 1, 0],
                     [0, 1, 0, 0],
                     [0, 0, 1, 0],
                     [1, 0, 0, 0]])
    have = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_false(binarizer.multilabel_)
    assert_array_equal(want, have)
    assert_array_equal(binarizer.inverse_transform(have), labels)
 def __init__(self):
     """Create the private ``LabelBinarizer`` used by this object."""
     # Dense (non-sparse) output is requested explicitly.
     encoder = LabelBinarizer(sparse_output=False)
     self.__encoder = encoder
class GOAMultilayerPerceptron:
    def __init__(self, N, hidden_layer_sizes, max_iter, random_state, x_val, y_val, activation="relu"):
        """Store the search and network settings; no training happens here.

        N, max_iter: read by ``fit`` as the number of candidate positions
            and the iteration limit.
        hidden_layer_sizes: hidden-layer widths used to build the network.
        random_state: normalised through ``check_random_state``.
        x_val, y_val: validation data scored inside ``fit``.
        activation: key into ``ACTIVATIONS`` for the hidden layers.
        """
        # Network description.
        self.hidden_layer_sizes, self.activation = hidden_layer_sizes, activation
        # Search settings.
        self.N, self.max_iter = N, max_iter
        self.random_state = check_random_state(random_state)
        # Hold-out data used only for reporting during ``fit``.
        self.xval, self.yval = x_val, y_val
    def _forward_pass(self, activations, coefs, intercepts):
        """Fill ``activations`` layer by layer and return the same list.

        ``activations[0]`` is the input; each following entry is overwritten
        with ``activations[i] . coefs[i] + intercepts[i]``.  Hidden layers go
        through the function selected by ``self.activation``; the output
        layer always goes through ``logistic``.
        """
        hidden_activation = ACTIVATIONS[self.activation]
        output_index = self.n_layers_ - 1
        for layer in range(output_index):
            nxt = layer + 1
            activations[nxt] = safe_sparse_dot(activations[layer], coefs[layer])
            activations[nxt] += intercepts[layer]
            # Only hidden layers get the configurable activation.
            if nxt != output_index:
                activations[nxt] = hidden_activation(activations[nxt])
        # Output layer.
        activations[output_index] = logistic(activations[output_index])
        return activations

    def initialize(self, y, layer_units, coefs_, intercepts_):
        self.n_outputs_ = y.shape[1]
        self.n_layers_ = len(layer_units)
        self.out_activation_ = 'logistic'
        self.n_coefs = []
        self.n_intercepts = []
        self.bound = 0
        bound = 0
        self.coefs_ = coefs_
        self.intercepts_ = intercepts_
        grasshopper_vector = self.encode(coefs_, intercepts_)
        for x in grasshopper_vector:
            if abs(x) > bound:
                bound = abs(x)
        bound = math.ceil(bound)
        self.grasshopper_vector = grasshopper_vector
        self.dim = len(grasshopper_vector)
        self.ub = bound
        self.lb = -bound

    def fit(self, X, y):
        """Tune the network weights with a grasshopper-style population search.

        An ``MLPClassifier`` trained with SGD supplies the starting weights.
        ``N`` copies of that flattened weight vector form the population,
        which is moved while the iteration counter (starting at 2) is at most
        ``max_iter``.  The position with the highest training ROC AUC is
        decoded into ``self.coefs_`` / ``self.intercepts_``.

        Parameters
        ----------
        X, y : training data, validated by ``validate_input``.

        Notes
        -----
        Progress is reported with ``print``.  ``self.grasshopper_vector``
        gains a trailing 0 when the weight count is odd.  Nothing is returned.
        """
        # A conventionally trained network seeds every candidate position.
        inicial_mlp = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=self.hidden_layer_sizes, random_state=8997)
        inicial_mlp.fit(X, y)
        N = self.N
        max_iter = self.max_iter
        hidden_layer_sizes = list(self.hidden_layer_sizes)
        X, y = self.validate_input(X, y)
        n_features = X.shape[1]
        if y.ndim == 1:
            y = y.reshape((-1, 1))
        self.n_outputs_ = y.shape[1]
        layer_units = ([n_features] + hidden_layer_sizes +
                       [self.n_outputs_])
        self.initialize(y, layer_units, inicial_mlp.coefs_, inicial_mlp.intercepts_)
        # Scoring below compares against the original labels, not indicators.
        y = self.label_binarizer.inverse_transform(y)
        bestauc = 0
        dim = self.dim
        print("dim:", dim)
        ub = np.ones((dim, 1)) * self.ub
        lb = np.ones((dim, 1)) * self.lb
        # The search works on an even number of dimensions: an odd weight
        # vector is padded with one 0 that is dropped again before decoding.
        flag = 0
        if dim % 2 != 0:
            dim = dim + 1
            ub = np.append(ub, self.ub)
            lb = np.append(lb, self.lb)
            flag = 1
            self.grasshopper_vector.append(0)
        # NOTE(review): every candidate starts at the same point.  If
        # ``distance`` returns 0 for identical vectors the population never
        # spreads out; the randomised start below was disabled upstream.
        # grasshopper_positions = initialization(N, dim, self.lb, self.ub)
        grasshopper_positions = np.array(
            [self.grasshopper_vector for _ in range(N)])
        cmax = 1
        cmin = 0.00004
        initial_fitness = [
            self._training_auc(X, y, grasshopper_positions[i], flag)[0]
            for i in range(np.size(grasshopper_positions, 0))
        ]
        # Take position and fitness from the same index so they always match.
        best_index = int(np.argmax(initial_fitness))
        # Copy the row: the population array is updated in place below and a
        # view would silently change with it.
        target_position = grasshopper_positions[best_index].copy()
        target_fitness = initial_fitness[best_index]
        print("target_position:",  target_position)
        print("target_fitness:", target_fitness)
        l = 2
        print(np.shape(grasshopper_positions))
        while l < max_iter + 1:
            print("iteration ", l)
            tp = np.array(target_position)
            # Coefficient shrinks linearly with the iteration counter.
            cc = cmax - l * ((cmax - cmin) / max_iter)
            for i in range(np.size(grasshopper_positions, 0)):
                # Transposed view: column k is candidate k, and it reflects
                # rows already moved earlier in this sweep.
                temp = np.transpose(grasshopper_positions)
                s_i = np.zeros((dim, 1))
                for j in range(N):
                    if i != j:
                        dist = distance(temp[:, j], temp[:, i])
                        r_ij_vec = (temp[:, j] - temp[:, i]) / (dist + eps(1))
                        xj_xi = 2 + dist % 2
                        s_ij = np.multiply((ub - lb)*cc/2*s_func(xj_xi), r_ij_vec)
                        s_i = s_i + np.transpose(s_ij)
                X_new = cc * np.transpose(s_i) + tp
                grasshopper_positions[i, :] = np.squeeze(np.transpose(X_new))
            for i in range(N):
                # Relocate grasshoppers that go outside the search space
                above = np.greater(grasshopper_positions[i, :], np.transpose(ub))
                below = np.less(grasshopper_positions[i, :], np.transpose(lb))
                grasshopper_positions[i, :] = grasshopper_positions[i, :] * np.logical_not(above + below) + np.transpose(
                    ub) * above + np.transpose(lb) * below
                fitness, coefs, intercepts = self._training_auc(
                    X, y, grasshopper_positions[i], flag)
                if fitness > target_fitness:
                    # Keep an independent copy of the new best position.
                    target_position = grasshopper_positions[i].copy()
                    target_fitness = fitness
                    print("new_fitness:", target_fitness)
                    print("training auc:", fitness)

                    # Report (but do not select on) the validation score.
                    y_pred = self._predict(self.xval, coefs, intercepts)
                    y_pred = y_pred.ravel()
                    fpr, tpr, thresholds = roc_curve(self.yval, y_pred)
                    auc1 = auc(fpr, tpr)
                    if auc1 > bestauc:
                        bestauc = auc1
                        print("best auc on validation set:", bestauc)
            l = l + 1
        if flag == 1:
            target_position = target_position[0:-1]
        coefss, interceptss = self.decode(target_position)
        self.coefs_ = coefss
        self.intercepts_ = interceptss

    def _training_auc(self, X, y, position, flag):
        """Decode ``position`` and return ``(auc, coefs, intercepts)`` on ``(X, y)``."""
        if flag == 1:
            # Drop the padding element added for odd-sized weight vectors.
            position = position[0:-1]
        coefs, intercepts = self.decode(position)
        y_pred = self._predict(X, coefs, intercepts).ravel()
        fpr, tpr, _ = roc_curve(y, y_pred)
        return auc(fpr, tpr), coefs, intercepts

    def init_coef(self, fan_in, fan_out):
        # Use the initialization method recommended by
        # Glorot et al.
        factor = 6.
        if self.activation == 'logistic':
            factor = 2.
        init_bound = np.sqrt(factor / (fan_in + fan_out))

        # Generate weights and bias:
        coef_init = self.random_state.uniform(-init_bound, init_bound, (fan_in, fan_out))
        intercept_init = self.random_state.uniform(-init_bound, init_bound, fan_out)
        return coef_init, intercept_init, init_bound
    def encode(self, coefs, intercepts):
        self.n_coefs = []
        self.n_intercepts = []
        grasshopper_position = []
        for array in coefs:
            self.n_coefs.append(np.shape(array))
            for line in array:
                grasshopper_position += list(line)
        for array in intercepts:
            self.n_intercepts.append(np.shape(array))
            grasshopper_position += list(array)
        return grasshopper_position
    def decode(self, grasshopper_position:list):
        coefs = []
        intercepts = []
        pos = 0
        for shape in self.n_coefs:
            coef = []
            for j in range(shape[0]):
                coe = []
                for k in range(shape[1]):
                    coe.append(grasshopper_position[pos])
                    pos = pos+1
                coef.append(coe)
            coefs.append(np.array(coef))
        for shape in self.n_intercepts:
            intercept = []
            for j in range(shape[0]):
                intercept.append(grasshopper_position[pos])
                pos = pos+1
            intercepts.append(np.array(intercept))
        return coefs, intercepts

    def _predict(self, X, coefs, intercepts):
        """Run a forward pass with the given weights and return the raw output.

        The result is the last activation array produced by
        ``_forward_pass``; no label decoding is applied.
        """
        X = check_array(X, accept_sparse=['csr', 'csc', 'coo'])

        # A scalar layer size is treated as a single hidden layer.
        hidden = self.hidden_layer_sizes
        if not hasattr(hidden, "__iter__"):
            hidden = [hidden]
        layer_units = [X.shape[1]] + list(hidden) + [self.n_outputs_]

        # One placeholder per non-input layer; the forward pass overwrites them.
        n_rows = X.shape[0]
        activations = [X] + [np.empty((n_rows, layer_units[k + 1]))
                             for k in range(self.n_layers_ - 1)]

        self._forward_pass(activations, coefs, intercepts)
        return activations[-1]

    def predict(self, X):
        """Predict labels for ``X`` with the fitted weights.

        The forward pass is delegated to ``_predict`` using ``self.coefs_``
        and ``self.intercepts_``; a single-output result is flattened before
        ``self.label_binarizer`` maps it back to the original labels.
        """
        raw_output = self._predict(X, self.coefs_, self.intercepts_)
        if self.n_outputs_ == 1:
            raw_output = raw_output.ravel()
        return self.label_binarizer.inverse_transform(raw_output)

    def validate_input(self, X, y):
        """Validate ``X``/``y`` and return ``X`` with binarized targets.

        A fresh ``LabelBinarizer`` fitted on the unique labels of ``y`` is
        stored as ``self.label_binarizer``.
        """
        X, y = check_X_y(X, y, accept_sparse=['csr', 'csc', 'coo'],
                         multi_output=True)
        # A single-column 2-D target is flattened (with a warning).
        is_column_vector = y.ndim == 2 and y.shape[1] == 1
        if is_column_vector:
            y = column_or_1d(y, warn=True)
        classes = unique_labels(y)
        self.label_binarizer = binarizer = LabelBinarizer()
        binarizer.fit(classes)
        return X, binarizer.transform(y)
예제 #35
0
def main():
    """Load the data, then train and score a decision tree and a 1-D CNN.

    Both models are evaluated on a 25 % hold-out split (``random_state=42``);
    confusion matrix, classification report, accuracy and Cohen's kappa are
    printed for each.  The function waits on ``input("")`` before returning.
    """
    print("Loading samples and labels")
    samples, labels, _ = load_files("data")
    print("Loaded {} samples".format(samples.shape[0]))

    sequence_dim = 100
    print("Converting to sequences of length {}".format(sequence_dim))
    samples, labels = make_sequences(samples, labels, sequence_dim)

    print("Number of samples from sequences: {}".format(samples.shape[0]))

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    # The tree needs 2-D input, so every sample is flattened into one row.
    flatSamples = samples.reshape(samples.shape[0], -1)
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        flatSamples, labels, test_size=0.25, random_state=42)

    print("=" * 20)
    print("Building DecisionTree model")
    model = DecisionTreeClassifier()
    model.fit(trainSamples, trainLabels)
    treeResults = model.predict(testSamples)
    # Indicator rows are collapsed to class indices once for all metrics.
    expected = testLabels.argmax(axis=1)
    predicted = treeResults.argmax(axis=1)
    print(confusion_matrix(expected, predicted))
    print(classification_report(expected, predicted))
    treeAcc = accuracy_score(expected, predicted)
    print("Accuracy Tree: {:.2f}".format(treeAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, predicted)))

    print("=" * 20)
    print("Building CNN model")

    # Same split parameters, but on the unflattened sequences.
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        samples, labels, test_size=0.25, random_state=42)
    inputShape = (samples.shape[1], samples.shape[2])
    model = Sequential()
    cnn_layers = (
        Conv1D(32, 10, padding="same", input_shape=inputShape),
        Activation("relu"),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(64, 10, padding="same"),
        Activation("relu"),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(128, 10, padding="same"),
        Activation("relu"),
        Dropout(0.2),
        Flatten(input_shape=inputShape),
        Dense(128, activation='sigmoid'),
        Dense(64, activation='sigmoid'),
        Dense(labels.shape[1], activation='softmax'),
    )
    for layer in cnn_layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 10
    BATCH = 128
    model.fit(trainSamples,
              trainLabels,
              batch_size=BATCH,
              epochs=EPOCHS,
              validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)
    expected = testLabels.argmax(axis=1)
    predicted = cnnResults.argmax(axis=1)

    print(confusion_matrix(expected, predicted))
    print(classification_report(expected, predicted,
                                target_names=lb.classes_))
    print("CNN Accuracy: {:.2f}".format(accuracy_score(expected, predicted)))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, predicted)))
    # Keep the console open until the user presses Enter.
    input("")
예제 #36
0
# Find pairs for samplesIMG in samplesCSV: for every image, collect the CSV
# rows whose name equals the image name, in image order.
# NOTE(review): there is no ``break``, so a name that occurs several times in
# namesCSV contributes several rows and an image without a match contributes
# none; the paired array is then not guaranteed to line up with samplesIMG.
samples_paired = []
for i in range(samplesIMG.shape[0]):
    for j in range(samplesCSV.shape[0]):
        if namesCSV[j] == namesIMG[i]:
            samples_paired.append(samplesCSV[j])

# Replace the CSV samples with the paired subset and append a new axis
# at position 3 to the image array.
samplesCSV = np.array(samples_paired)
samplesIMG = np.expand_dims(samplesIMG, axis=3)

print("Paired")
print("Samples IMG: {}".format(len(samplesIMG)))
print("Samples CSV: {}".format(len(samplesCSV)))

# one-hot encoding
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
# Taken from the number of indicator columns produced above.
numClasses = labels.shape[1]

# Hard-coded input shape; presumably matches samplesIMG.shape[1:] -- TODO confirm.
inputShape = (108, 192, 1)  #samplesIMG.shape

# Model for images: two conv/ReLU/max-pool stages, dropout, then flatten.
cnnmodel = Sequential()
cnnmodel.add(Conv2D(16, (3, 3), padding="same", input_shape=inputShape))
cnnmodel.add(Activation("relu"))
cnnmodel.add(MaxPooling2D(pool_size=(2, 2)))
cnnmodel.add(Conv2D(32, (3, 3), padding="same"))
cnnmodel.add(Activation("relu"))
cnnmodel.add(MaxPooling2D(pool_size=(2, 2)))
cnnmodel.add(Dropout(0.25))
cnnmodel.add(Flatten())
def dbpedia_smallcharconv(sample=None, n_procs=None):
    """Train and score a small character-level CNN on the DBpedia data.

    ``sample`` is forwarded to ``get_dbpedia_data(size=...)``; when it is
    falsy the test split is fixed at ``5000 * 14`` rows.  ``n_procs`` falls
    back to ``cpu_count()`` but is not used by the rest of this function.
    The model summary and the final test accuracy are printed; nothing is
    returned.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(
            round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    # First-occurrence lookup table: same answers as ``CHAR_MAP.index(c)``
    # without a linear scan for every character of every document.
    char_index = {}
    for position, char in enumerate(CHAR_MAP):
        char_index.setdefault(char, position)
    unknown_index = len(CHAR_MAP)
    # Train and test documents must be padded with the same symbol; the
    # space character is used for both.
    pad_index = CHAR_MAP.index(' ')

    def encode_docs(frame):
        """Join title and abstract and map every character to its index."""
        texts = frame[['title', 'abstract']].apply(
            lambda cols: u'\n'.join(cols), axis=1).values
        return [[char_index.get(c, unknown_index) for c in text]
                for text in texts]

    binarizer = LabelBinarizer()

    x_train = np.array(
        pad_sentences(encode_docs(train_df),
                      max_length=1014,
                      padding_word=pad_index))
    y_train = binarizer.fit_transform(train_df.category.values)

    x_test = np.array(
        pad_sentences(encode_docs(test_df),
                      max_length=1014,
                      padding_word=pad_index))
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')

    def conv(filter_length, **extra):
        """Build one 256-filter ReLU convolution with 'valid' borders."""
        return Convolution1D(nb_filter=256,
                             filter_length=filter_length,
                             border_mode='valid',
                             activation='relu',
                             **extra)

    model = Sequential()
    # Frozen embedding initialised from ``char_embedding()``; the extra row
    # is the index used for characters outside CHAR_MAP.
    model.add(
        Embedding(len(CHAR_MAP) + 1,
                  len(CHAR_MAP) + 1,
                  input_length=1014,
                  weights=[char_embedding()],
                  trainable=False))
    model.add(conv(7))
    model.add(MaxPooling1D(pool_length=3))
    model.add(conv(7, subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    for _ in range(4):
        model.add(conv(3, subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    print(model.summary())

    model.fit(x_train,
              y_train,
              batch_size=64,
              nb_epoch=5,
              validation_data=[x_test, y_test])

    # Column index of the single 1 in each indicator row is the true class.
    print(
        accuracy_score(
            np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
def dbpedia_smallwordconv(sample=None, n_procs=None):
    """Train and score a small word-level CNN on the DBpedia data.

    ``sample`` is forwarded to ``get_dbpedia_data(size=...)``; when it is
    falsy the test split is fixed at ``5000 * 14`` rows.  ``n_procs`` falls
    back to ``cpu_count()`` but is not used further.  The final test
    accuracy is printed; nothing is returned.
    """
    n_procs = n_procs or cpu_count()

    df = get_dbpedia_data(size=sample)

    test_size = (int(
        round(np.sum(5000 * df.category.value_counts().values / 45000)))
        if sample else 5000 * 14)

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df, test_df = df.iloc[train_split], df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df,
                                    cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()

    def to_padded_ids(docs):
        """Map known tokens to ``id + 1`` (0 is the padding id) and pad to 100."""
        id_lists = [[vocab.token2id[tok] + 1
                     for tok in s if tok in vocab.token2id]
                    for s in docs]
        return np.array(
            pad_sentences(id_lists, max_length=100, padding_word=0))

    x_train = to_padded_ids(train_docs)
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df,
                                   cols=['title', 'abstract'],
                                   flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')

    def conv(filter_length):
        """Build one 300-filter ReLU convolution with 'valid' borders."""
        return Convolution1D(nb_filter=300,
                             filter_length=filter_length,
                             border_mode='valid',
                             activation='relu',
                             subsample_length=1)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    # Two wide convolutions, each followed by stride-1 pooling.
    for _ in range(2):
        model.add(conv(7))
        model.add(MaxPooling1D(pool_length=3, stride=1))
    # Four narrow convolutions, then a final pooling step.
    for _ in range(4):
        model.add(conv(3))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    # Two dropout-regularised dense layers before the 14-way output.
    for _ in range(2):
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train,
              y_train,
              batch_size=32,
              nb_epoch=5,
              validation_data=[x_test, y_test])

    true_classes = np.argwhere(y_test)[:, 1]
    print(accuracy_score(true_classes, model.predict_classes(x_test)))
class languageIdentification(object):
    """
    Using characters as features, each encoded by sklearn OneHotEncoder
    Languages are encoded into vectors using sklearn LabelBinarizer
    """
    def __init__(self, trainFile, devFile, testFile, d=100, yita=0.1):
        """Load the three data files and set up the network parameters.

        ``d`` and ``yita`` are stored and passed on to ``setParameters``.
        The input width is five slots per distinct character seen while
        loading, plus one extra slot.
        """
        self.d, self.yita = d, yita
        self.languages = {"ENGLISH": 1, "FRENCH": 3, "ITALIAN": 2}
        self.punctuations = [
            ".", "'", ":", ",", "-", "...", "!", "_", "(", ")", "?", '"',
            ";", "/", "\\", "{", "}", "[", "]", "|", "<", ">", "+", "=",
            "@", "#", "$", "%", "^", "&", "*",
        ]
        self.noPunctuation = False

        # Binarizer for the three language codes used in ``self.languages``.
        label_binarizer = LabelBinarizer()
        label_binarizer.fit([1, 2, 3])
        self.answerLables = label_binarizer

        # Character vocabulary; filled by ``lineProc`` during ``Initialize``.
        self.c = set()
        self.Initialize(trainFile, devFile, testFile)

        self.input = len(self.c) * 5 + 1
        self.setParameters(d, yita)

    def Initialize(self, trainFileName, devFileName, testFileName):
        """Read the train/dev/test files and build the encoded training matrix.

        Every line is turned into 5-character windows by ``lineProc``, which
        also adds each character it sees to ``self.c``.  Training windows are
        shuffled together with their labels, label-encoded per character and
        one-hot encoded into ``self.train``; the labels are binarized into
        ``self.trainResult``.  Dev and test windows are kept unencoded in
        ``self.devFeatures`` / ``self.testFeatures``.
        """
        trainList = []
        trainResult = []
        self.testFeatures = []
        self.devFeatures = []
        self.trainFeatures = []
        self.train = []
        #self.dev = []
        #self.test = []
        self.devResult = []
        self.rawResult = []

        print "train feature processing..."
        with open(trainFileName) as trainFile:
            for line in trainFile:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                # The text before the first space is the language name;
                # lines where that space comes too early are skipped.
                space = line.find(" ")
                if space < 5:
                    continue
                answer, train = line[:space].upper(), line[space + 1:]
                li, ans = self.lineProc(train, answer, True)
                # Flat window/label lists feed training; the per-line
                # windows and one label per line are kept for scoring.
                trainList += li
                trainResult += ans
                self.trainFeatures.append(li)
                self.rawResult.append(self.languages[answer])

        with open(devFileName) as devFile:
            for line in devFile:
                line = line.decode('utf-8').strip()
                if not line:
                    continue
                space = line.find(" ")
                if space < 5:
                    continue
                answer, train = line[:space].upper(), line[space + 1:]
                li = self.lineProc(train, answer, False)
                self.devFeatures.append(li)
                self.devResult.append(self.languages[answer])

        with open(testFileName) as testFile:
            for line in testFile:
                # NOTE(review): this check runs before ``strip()``, so a line
                # holding only a newline is not skipped here.
                if not line:
                    continue
                # The test file is decoded as latin-1, unlike train/dev (utf-8).
                line = line.decode('latin-1').strip()
                test = self.lineProc(line, "", False)
                self.testFeatures.append(test)

        trainList, trainResult = self.FisherYatesShuffle(
            trainList, trainResult)
        trainResult = np.array(trainResult)
        self.trainResult = self.answerLables.fit_transform(trainResult)

        # Characters are mapped to integer codes over the full vocabulary
        # collected from all three files.
        self.trainLabels = preprocessing.LabelEncoder()
        featureList = list(self.c)

        self.trainLabels.fit(featureList)
        #print self.trainLabels.classes_
        length = len(self.c)
        print "feature length:", length
        self.v = preprocessing.OneHotEncoder(n_values=length)

        trainList = np.array(trainList)
        # Encode the flattened characters, then restore the window layout.
        self.train = self.trainLabels.transform(
            trainList.ravel()).reshape(*trainList.shape)

        self.train = self.v.fit_transform(self.train).toarray()
        print "train shape", self.train.shape

    def directPredict(self, featureList, type):
        """Predict every entry of ``featureList`` and score the predictions.

        ``type`` selects the gold labels: ``"train"`` uses ``self.rawResult``,
        ``"dev"`` uses ``self.devResult`` and ``"test"`` uses
        ``self.testResult``.  Returns ``(prediction, accuracy)``.

        Raises ``KeyError`` for any other ``type`` and ``AttributeError`` if
        the selected label list has not been set yet.
        """
        # Attribute names are looked up with getattr instead of eval-ing
        # source strings; an unknown split fails before any prediction work.
        gold_attribute = {
            "train": "rawResult",
            "dev": "devResult",
            "test": "testResult",
        }[type]
        prediction = self.predictAll(featureList)
        accuracy = self.evaluate(prediction, getattr(self, gold_attribute))

        return prediction, accuracy

    def devProcess(self, epoch, initial=True):
        trainAccuracy = []
        devAccuracy = []

        if initial:
            print "initial predictions..."
            initial_train = self.directPredict(self.trainFeatures, "train")[1]
            trainAccuracy.append(initial_train)
            print "initial train accuracy: ", initial_train

            initial_dev = self.directPredict(self.devFeatures, "dev")[1]
            print "initial dev accuracy: ", initial_dev
            devAccuracy.append(initial_dev)

        for i in xrange(epoch):
            print "************************************epoch:", i + 1, "************************************"
            self.trainNN(1)
            trainac = self.directPredict(self.trainFeatures, "train")[1]
            print "train accuracy:", trainac
            trainAccuracy.append(trainac)

            devac = self.directPredict(self.devFeatures, "dev")[1]
            print "dev accuracy:", devac
            devAccuracy.append(devac)

        if initial:
            x = [i for i in xrange(epoch + 1)]
            pl.plot(x, trainAccuracy, 'r--', x, devAccuracy, 'bs')
            pl.show()

    def getTestResult(self):
        """Load the gold test labels into ``self.testResult``.

        Each line of ``languageIdentification.data/test_solutions`` is split
        on single spaces; the second field is the language name, which is
        mapped through ``self.languages``.

        Raises ``IndexError`` for a line without a second field and
        ``KeyError`` for an unknown language name.
        """
        # The mapping is read from this instance rather than from a global
        # object, and the file is closed even if a line fails to parse.
        with open('languageIdentification.data/test_solutions', 'r') as test_results:
            self.testResult = []
            for line in test_results:
                language = line.strip().split(" ")[1].upper()
                self.testResult.append(self.languages[language])

    def setParameters(self, d, yita):
        self.d = d
        self.yita = yita
        self.hidden = d
        self.output = 3

        self.ai = np.array([1.0] * self.input)
        self.ah = np.array([1.0] * (self.hidden + 1))
        self.ao = [1.0] * self.output

        self.wi = np.random.uniform(size=(self.input, self.hidden))
        self.wo = np.random.randn(self.hidden + 1, self.output)

        self.ci = np.zeros((self.input, self.hidden))
        self.co = np.zeros((self.hidden + 1, self.output))

    def resetParameters(self):
        self.ai = np.array([1.0] * self.input)
        self.ah = np.array([1.0] * (self.hidden + 1))
        self.ao = [1.0] * self.output

        self.ci = np.zeros((self.input, self.hidden))
        self.co = np.zeros((self.hidden + 1, self.output))

    def lineProc(self, line, answer, isTraining=True):
        """Split ``line`` into overlapping 5-character windows.

        Every character of the unpadded line is added to ``self.c``.  Lines
        shorter than five characters are right-padded with spaces.  Returns
        ``(windows, labels)`` when ``isTraining`` is true, with one language
        code per window, otherwise just ``windows``.
        """
        self.c.update(line)

        missing = 5 - len(line)
        if missing > 0:
            line += " " * missing

        windows = [list(line[start:start + 5])
                   for start in xrange(len(line) - 4)]
        if not isTraining:
            return windows

        code = self.languages[answer]
        return (windows, [code] * len(windows))

    def FisherYatesShuffle(self, train, result):
        """Shuffle ``train`` and ``result`` in place with the same swaps.

        Walks from the last index down to 1, swapping each slot with a
        randomly chosen earlier (or same) slot in both sequences, so pairs
        stay aligned.  Returns full slices of both sequences.
        """
        for upper in xrange(len(train) - 1, 0, -1):
            pick = randint(0, upper)
            for sequence in (train, result):
                sequence[upper], sequence[pick] = sequence[pick], sequence[upper]
        return train[:], result[:]

    def feedForward(self, inputs):
        self.resetParameters()
        for i in range(self.input - 1):
            self.ai[i] = inputs[i]

        self.ah[:self.hidden] = np.dot(self.ai, self.wi)
        self.ah[-1] = 1
        self.ah = self.sigmoid(self.ah)

        self.ao = np.dot(self.ah, self.wo)

        self.ao = self.softMax(self.ao)
        return self.ao[:]

    def softMax(self, out):
        """Return the softmax of ``out`` along its first axis.

        The maximum is subtracted before exponentiating.  This leaves the
        result mathematically unchanged but keeps ``exp`` from overflowing,
        so large scores no longer produce NaN.  An empty input yields an
        empty float array.
        """
        scores = np.asarray(out, dtype=float)
        if scores.size == 0:
            return scores
        shifted = np.exp(scores - np.max(scores, axis=0))
        return shifted / np.sum(shifted, axis=0)

    def backPropagate(self, result):
        # p(L, y) = y - y_hat
        d4 = self.ao - np.array(result)
        # kronecker delta: P(L, y_hat) = P(L, y) * P(y, y_hat)
        #print "before tune:", self.ao, result
        d3 = np.array([0.0] * self.output)
        for j in xrange(self.output):
            for i in xrange(self.output):
                if i == j:
                    d3[j] += d4[i] * self.ao[i] * (1 - self.ao[j])
                else:
                    d3[j] += d4[i] * self.ao[i] * -self.ao[j]
        # p(L, ah) = P(L, y) * P(y, y_hat) * p(y_hat, ah)
        d2 = np.dot(self.wo, d3)
        # p(L, ah_hat) = p(L, y) * P(y, y_hat) * p(y_hat, ah) * P(ah, ah_hat)
        d1 = d2 * self.partialDerivativeSigmoid(self.ah)
        # p(L, W2) = p(L, y) * p(y, y_hat) * p(y_hat, W2)
        D2 = self.yita * np.outer(self.ah, d3)
        self.wo -= D2 + self.co
        self.co = D2
        # p(L, w1) = p(L, y) * P(y, y_hat) * p(y_hat, ah) * P(ah, ah_hat) * P(ah_hat, w1)
        D1 = self.yita * np.outer(self.ai, d1[1:])
        self.wi -= D1 + self.ci
        self.ci = D1

        error = 1.0 / 2 * np.dot(d4, d4)

        return error

    def trainNN(self, epoch=3):
        """Run ``epoch`` passes of per-sample training over ``self.train``.

        After each pass the summed error is printed and the activations and
        update buffers are reset.
        """
        for _ in xrange(epoch):
            error = 0.0
            for row, entry in enumerate(self.train):
                res = self.trainResult[row]
                self.feedForward(entry)
                error += self.backPropagate(res)
            print "error:", error
            self.resetParameters()

    def predict(self, test):
        result = Counter()
        for entry in test:
            r = self.feedForward(entry)
            #print r
            idx = np.argmax(r) + 1
            result[idx] += 1

        return result.most_common(1)[0][0]

    def partialDerivativeSigmoid(self, out):
        """Return ``out * (1 - out)``, the sigmoid derivative in terms of its output."""
        complement = 1.0 - out
        return out * 1.0 * complement

    def sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
        # Input is used as given (no clipping).
        denominator = 1 + np.exp(-x)
        return 1.0 / denominator

    def evaluate(self, predictions, golden):
        """Return ``accuracy_score`` with ``golden`` as truth and ``predictions`` as output."""
        return accuracy_score(golden, predictions)

    def predictAll(self, features):
        """Predict one language code per entry of ``features``.

        Each entry is a collection of character windows; it is label-encoded
        with ``self.trainLabels``, one-hot encoded with ``self.v`` and passed
        to ``predict``.
        """
        def classify(windows):
            grid = np.array(windows)
            # Encode the flattened characters, then restore the window layout.
            codes = self.trainLabels.transform(grid.ravel()).reshape(*grid.shape)
            return self.predict(self.v.transform(codes).toarray())

        return [classify(windows) for windows in features]

    def testResultOutput(self, testFile, testPrediction):
        inverse = {1: "ENGLISH", 3: "FRENCH", 2: "ITALIAN"}
        testFile = open(testFileName, 'r')
        with open('./languageIdentification.output', 'w') as output:
            i = 0
            for line in testFile.readlines():
                output.write(line.strip() + " " + inverse[testPrediction[i]] +
                             '\n')
                i += 1
예제 #40
0
def test_label_binarizer():
    """Check LabelBinarizer on one-, two- and multi-class string labels."""
    # A lone class is encoded as a single column of the negative label.
    labels = ["pos"] * 4
    want = np.array([[0], [0], [0], [0]])

    # Dense output.
    binarizer = LabelBinarizer(sparse_output=False)
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Sparse output carries the same values once densified.
    binarizer = LabelBinarizer(sparse_output=True)
    encoded = binarizer.fit_transform(labels)
    assert_true(issparse(encoded))
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, encoded.toarray())
    assert_array_equal(binarizer.inverse_transform(encoded.toarray()), labels)

    # Two classes collapse into one 0/1 column.
    binarizer = LabelBinarizer(sparse_output=False)
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0], [1], [1], [0]])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, encoded)

    # A two-column indicator must invert to the same labels.
    two_column = np.array([[1, 0],
                           [0, 1],
                           [0, 1],
                           [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_column), labels)

    # More than two classes: one column per class.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([[0, 0, 0, 1],
                     [0, 0, 1, 0],
                     [0, 1, 0, 0],
                     [0, 0, 1, 0],
                     [1, 0, 0, 0]])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
예제 #41
0
def main():
    """Train a CNN on the "radio_img" images and print evaluation metrics.

    Loads samples and labels through ``loader.load_img``, appends a channel
    axis, one-hot encodes the labels, holds out 25% of the data, fits the
    model returned by ``cnn_model`` using balanced class weights, and prints
    the confusion matrix, classification report, accuracy and Cohen's kappa
    for the held-out split. Waits on ``input`` before returning.
    """
    samples, labels, _ = loader.load_img("radio_img")

    # Add the trailing (color) dimension. ``axis=-1`` always addresses the
    # new last position; an index beyond it is rejected by current NumPy.
    samples = np.expand_dims(samples, axis=-1)

    print("shape = {}".format(samples.shape))
    inputShape = (samples.shape[1], samples.shape[2], samples.shape[3])
    print("inputShape = {}".format(inputShape))

    # Weights. ``classes`` and ``y`` are passed by keyword, which both older
    # and current scikit-learn releases accept.
    class_weights = class_weight.compute_class_weight('balanced',
                                                      classes=np.unique(labels),
                                                      y=labels)
    d_class_weights = dict(enumerate(class_weights))
    print("weights {}".format(d_class_weights))

    # One-hot encoding.
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print("Classes: {}".format(classesNum))

    # Split into training and test sets.
    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(samples,
                                    labels,
                                    test_size=0.25,
                                    random_state=42)

    model = cnn_model(inputShape, classesNum)

    ## checkpoints
    #    checkpt1 = ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5', save_best_only=True)
    #    checkpt2 = EarlyStopping(monitor='val_loss', patience=3)

    EPOCHS = 20
    BATCH = 50
    model.fit(
        trainSamples,
        trainLabels,
        batch_size=BATCH,
        epochs=EPOCHS,
        class_weight=d_class_weights,
        verbose=1,
        #callbacks = [checkpt1,checkpt2],
        validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)

    # Collapse one-hot rows / score rows to class indices once and reuse
    # them for every metric below.
    trueClasses = testLabels.argmax(axis=1)
    predictedClasses = cnnResults.argmax(axis=1)

    print(confusion_matrix(trueClasses, predictedClasses))
    print(classification_report(trueClasses, predictedClasses))
    cnnAcc = accuracy_score(trueClasses, predictedClasses)
    print("Accuracy CNN: {:.2f}".format(cnnAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(trueClasses, predictedClasses)))
    input("")
def dbpedia_convgemb(sample=None, n_procs=None):
    """Train a single-convolution classifier over frozen word embeddings.

    Loads the DBpedia frame, takes a stratified train/test split, encodes
    the 'title' and 'abstract' sentences as padded token-id sequences,
    fits the model and prints its accuracy on the test split.

    ``sample`` is forwarded to ``get_dbpedia_data`` as ``size``; ``n_procs``
    falls back to ``cpu_count()`` when falsy.
    """
    n_procs = n_procs or cpu_count()

    df = get_dbpedia_data(size=sample)

    # Full data uses a fixed hold-out size; a sample scales it per class.
    test_size = 5000 * 14
    if sample:
        class_counts = df.category.value_counts().values
        test_size = int(round(np.sum(5000 * class_counts / 45000)))

    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_idx, test_idx = next(iter(splitter))
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

    def to_padded_ids(docs):
        # Known tokens map to id + 1 so that 0 is left for the padding word;
        # tokens missing from the vocabulary are dropped.
        id_rows = [[vocab.token2id[tok] + 1
                    for tok in sent if tok in vocab.token2id]
                   for sent in docs]
        padded = pad_sentences(id_rows, max_length=100, padding_word=0)
        return np.array(padded)

    train_docs = DataframeSentences(train_df,
                                    cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()

    x_train = to_padded_ids(train_docs)
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df,
                                   cols=['title', 'abstract'],
                                   flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    embedding = Embedding(5001,
                          300,
                          input_length=100,
                          dropout=.2,
                          weights=[emb_weights],
                          trainable=False)
    model.add(embedding)
    convolution = Convolution1D(nb_filter=50,
                                filter_length=3,
                                border_mode='valid',
                                activation='relu',
                                subsample_length=1)
    model.add(convolution)
    # Pool over the whole remaining sequence length.
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    true_classes = np.argwhere(y_test)[:, 1]
    predicted_classes = model.predict_classes(x_test)
    print(accuracy_score(true_classes, predicted_classes))
예제 #43
0
def main():
    """Train an MLP on ``datasetA_3c.csv`` and report and plot the results.

    The first CSV column is used as the label and the remaining columns as
    features. Labels are one-hot encoded, a three-hidden-layer network is
    fitted, and the confusion matrix, classification report, accuracy and
    Cohen's kappa are printed. The model is evaluated on the same samples it
    was trained on (the train/test split below is disabled). Finally the
    training loss and accuracy per epoch are plotted.
    """
    # Load data.
    csv_path = "datasetA_3c.csv"
    dataframe = pandas.read_csv(csv_path)
    dataset = dataframe.values
    samples = np.array(dataset[:, 1:])
    labels = np.array(dataset[:, 0]).astype(str)

    print("Class distribution:")
    print(Counter(labels))

    ### choose k best attributes
    #    from sklearn.feature_selection.univariate_selection import SelectKBest
    #    newSamples = SelectKBest(k=100).fit_transform(samples, labels)
    #    print(newSamples.shape)
    #    samples = newSamples

    ### Calculate weights for unbalanced classes
    #    from sklearn.utils import class_weight
    #    d_class_weights = None
    #    class_weights = class_weight.compute_class_weight('balanced',np.unique(labels),labels)
    #    print("Class weights:")
    #    print(class_weights)
    #    d_class_weights = dict(enumerate(class_weights))

    ### Normalize samples
    #    from sklearn.preprocessing.data import normalize
    #    normalize(samples)

    ## Convert to one-hot encoding.
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print ("Classes: {}".format(classesNum))

    trainSamples = samples
    trainLabels = labels
    testSamples = samples
    testLabels = labels

    ### Division into training and test samples
    #    from sklearn.model_selection._split import train_test_split
    #    (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels, test_size=0.25, random_state=42)

    model = Sequential()
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(classesNum, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer="adam",metrics=['accuracy'])

    EPOCHS=50
    BATCH=50
    H = model.fit(trainSamples, trainLabels, batch_size=BATCH, epochs=EPOCHS
              #,class_weight=d_class_weights
              #,validation_data=(testSamples,testLabels)
              #,validation_split=0.1
              )
    mlpResults = model.predict(testSamples)

    # Collapse one-hot rows / score rows to class indices once and reuse
    # them for every metric below.
    trueClasses = testLabels.argmax(axis=1)
    predictedClasses = mlpResults.argmax(axis=1)

    print(confusion_matrix(trueClasses, predictedClasses))
    print(classification_report(trueClasses, predictedClasses,target_names=lb.classes_))
    print("MLP Accuracy: {:.2f}".format(accuracy_score(trueClasses, predictedClasses)))
    print("Cohen's Kappa {:.2f}".format(cohen_kappa_score(trueClasses, predictedClasses)))

    history = H.history
    # Depending on the Keras version the accuracy metric is recorded under
    # "acc" or "accuracy"; accept either so plotting does not KeyError.
    trainAcc = history["acc"] if "acc" in history else history["accuracy"]

    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, history["loss"], label="train_loss")
    plt.plot(N, trainAcc, label="train_acc")
    #plt.plot(N, H.history["val_loss"], label="val_loss")
    #plt.plot(N, H.history["val_acc"], label="val_acc")

    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.show()
def dbpedia_smallcharconv(sample=None, n_procs=None):
    """Train and evaluate a character-level convolutional DBpedia classifier.

    Loads the DBpedia frame, takes a stratified train/test split, encodes
    each document ('title' and 'abstract' joined by a newline) as a sequence
    of ``CHAR_MAP`` indices padded to 1014 entries, fits the model for five
    epochs and prints its accuracy on the test split.

    Args:
        sample: Forwarded to ``get_dbpedia_data`` as ``size``; when truthy
            the test-set size is scaled from the per-category counts.
        n_procs: Defaults to ``cpu_count()`` when falsy; it is not used
            further in this function.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    # Characters outside CHAR_MAP share one extra index past its end.
    unknown_id = len(CHAR_MAP)
    # Train and test must be padded with the same symbol (the space
    # character); padding them differently would feed the model test
    # sequences that do not look like its training sequences.
    pad_id = CHAR_MAP.index(' ')
    # Position of the first occurrence of every character, built once so
    # encoding does not rescan CHAR_MAP for each character.
    char_ids = {}
    for position, char in enumerate(CHAR_MAP):
        char_ids.setdefault(char, position)

    def encode(frame):
        """Return the padded index matrix for the documents in ``frame``."""
        texts = frame[['title', 'abstract']].apply(
            lambda cols: u'\n'.join(cols), axis=1).values
        docs = [[char_ids.get(c, unknown_id) for c in text] for text in texts]
        return np.array(pad_sentences(docs, max_length=1014, padding_word=pad_id))

    binarizer = LabelBinarizer()

    x_train = encode(train_df)
    y_train = binarizer.fit_transform(train_df.category.values)

    x_test = encode(test_df)
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    # Four identical narrow convolutions before the last pooling step.
    for _ in range(4):
        model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                                activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    model.fit(x_train, y_train, batch_size=64, nb_epoch=5, validation_data=(x_test, y_test))

    print(accuracy_score(np.argwhere(y_test)[:,1], model.predict_classes(x_test)))
예제 #45
0
def test_label_binarizer():
    """Check LabelBinarizer on one-, two- and multi-class string labels."""
    # With a single class every sample gets the negative label.
    samples = ["pos", "pos", "pos", "pos"]
    target = np.array([[0, 0, 0, 0]]).T

    # Same expectations for dense and sparse output; sparse results are
    # densified before they are compared or inverted.
    for sparse in (False, True):
        encoder = LabelBinarizer(sparse_output=sparse)
        result = encoder.fit_transform(samples)
        if sparse:
            assert issparse(result)
            result = result.toarray()
        assert_array_equal(encoder.classes_, ["pos"])
        assert_array_equal(target, result)
        assert_array_equal(encoder.inverse_transform(result), samples)

    # Two classes yield one 0/1 column.
    encoder = LabelBinarizer(sparse_output=False)
    samples = ["neg", "pos", "pos", "neg"]
    target = np.array([[0, 1, 1, 0]]).T
    result = encoder.fit_transform(samples)
    assert_array_equal(encoder.classes_, ["neg", "pos"])
    assert_array_equal(target, result)

    # A two-column indicator must invert to the same labels.
    indicator = np.array([[1, 0],
                          [0, 1],
                          [0, 1],
                          [1, 0]])
    assert_array_equal(encoder.inverse_transform(indicator), samples)

    # More than two classes yield one column per class.
    samples = ["spam", "ham", "eggs", "ham", "0"]
    target = np.array([[0, 0, 0, 1],
                       [0, 0, 1, 0],
                       [0, 1, 0, 0],
                       [0, 0, 1, 0],
                       [1, 0, 0, 0]])
    result = encoder.fit_transform(samples)
    assert_array_equal(encoder.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(target, result)
    assert_array_equal(encoder.inverse_transform(result), samples)
def prepocess_label(y):
    """Return ``y`` binarized by a freshly fitted ``LabelBinarizer``."""
    binarizer = LabelBinarizer()
    encoded = binarizer.fit_transform(y)
    return encoded
def dbpedia_smallwordconv(sample=None, n_procs=None):
    """Train and evaluate a word-level convolutional DBpedia classifier.

    Loads the DBpedia frame, takes a stratified train/test split, encodes
    the 'title' and 'abstract' sentences as padded token-id sequences, fits
    a six-convolution model for five epochs and prints its accuracy on the
    test split.

    ``sample`` is forwarded to ``get_dbpedia_data`` as ``size``; ``n_procs``
    falls back to ``cpu_count()`` when falsy.
    """
    n_procs = n_procs or cpu_count()

    df = get_dbpedia_data(size=sample)

    # Full data uses a fixed hold-out size; a sample scales it per class.
    test_size = 5000 * 14
    if sample:
        class_counts = df.category.value_counts().values
        test_size = int(round(np.sum(5000 * class_counts / 45000)))

    logging.info('creating train test split ...')
    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_idx, test_idx = next(iter(splitter))
    train_df, test_df = df.iloc[train_idx], df.iloc[test_idx]

    def to_padded_ids(docs):
        # Known tokens map to id + 1 so that 0 is left for the padding word;
        # tokens missing from the vocabulary are dropped.
        id_rows = [[vocab.token2id[tok] + 1
                    for tok in sent if tok in vocab.token2id]
                   for sent in docs]
        return np.array(pad_sentences(id_rows, max_length=100, padding_word=0))

    def wide_conv(width):
        # All convolutions share every setting except the filter length.
        return Convolution1D(nb_filter=300, filter_length=width,
                             border_mode='valid', activation='relu',
                             subsample_length=1)

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'], flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()

    x_train = to_padded_ids(train_docs)
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'], flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    # Two wide convolutions, each followed by pooling.
    for _ in range(2):
        model.add(wide_conv(7))
        model.add(MaxPooling1D(pool_length=3, stride=1))
    # Four narrow convolutions, pooled once at the end.
    for _ in range(4):
        model.add(wide_conv(3))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    # Two fully connected blocks before the 14-way output layer.
    for _ in range(2):
        model.add(Dense(1024, activation='relu'))
        model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5, validation_data=[x_test, y_test])

    true_classes = np.argwhere(y_test)[:, 1]
    predicted_classes = model.predict_classes(x_test)
    print(accuracy_score(true_classes, predicted_classes))
예제 #48
0
def test_label_binarizer_column_y():
    """Compare LabelBinarizer on a list of 1-item lists vs. a column array.

    The same labels are fitted once as a nested Python list and once as the
    ``np.array`` built from it, first with two distinct labels and then with
    three. The encoded outputs are compared against fixed matrices, and the
    ``multilabel_`` / ``indicator_matrix_`` attributes are read through
    ``assert_warns(DeprecationWarning, getattr, ...)``.
    """
    # first for binary classification vs multi-label with 1 possible class
    # lists are multi-label, array is multi-class :-/
    inp_list = [[1], [2], [1]]
    inp_array = np.array(inp_list)

    # Expected encodings: one column per label for the list input, a single
    # 0/1 column for the array input.
    multilabel_indicator = np.array([[1, 0], [0, 1], [1, 0]])
    binaryclass_array = np.array([[0], [1], [0]])

    lb_1 = LabelBinarizer()
    out_1 = lb_1.fit_transform(inp_list)

    lb_2 = LabelBinarizer()
    out_2 = lb_2.fit_transform(inp_array)

    assert_array_equal(out_1, multilabel_indicator)
    # NOTE(review): this relies on assert_warns returning the value of the
    # wrapped getattr call, so the truthiness checks apply to the attribute
    # itself -- confirm against the assert_warns helper in use.
    assert_true(assert_warns(DeprecationWarning, getattr, lb_1, "multilabel_"))
    assert_false(assert_warns(DeprecationWarning, getattr, lb_1,
                              "indicator_matrix_"))

    assert_array_equal(out_2, binaryclass_array)
    assert_false(assert_warns(DeprecationWarning, getattr, lb_2,
                              "multilabel_"))

    # second for multiclass classification vs multi-label with multiple
    # classes
    inp_list = [[1], [2], [1], [3]]
    inp_array = np.array(inp_list)

    # the indicator matrix output is the same in this case
    indicator = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]])

    lb_1 = LabelBinarizer()
    out_1 = lb_1.fit_transform(inp_list)

    lb_2 = LabelBinarizer()
    out_2 = lb_2.fit_transform(inp_array)

    assert_array_equal(out_1, out_2)
    assert_true(assert_warns(DeprecationWarning, getattr, lb_1, "multilabel_"))

    assert_array_equal(out_2, indicator)
    assert_false(assert_warns(DeprecationWarning, getattr, lb_2,
                              "multilabel_"))