def test_label_binarizer_set_label_encoding():
    """Check that custom ``neg_label``/``pos_label`` values round-trip."""
    # Two-class case where the positive label is encoded as 0.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=0)
    labels = np.array([0, 1, 1, 0])
    want = np.array([[-2], [0], [0], [-2]])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Multi-class case: each row holds +2 in one column and -2 elsewhere.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)
    labels = np.array([3, 2, 1, 2, 0])
    want = np.array([
        [-2, -2, -2, +2],
        [-2, -2, +2, -2],
        [-2, +2, -2, -2],
        [-2, -2, +2, -2],
        [+2, -2, -2, -2],
    ])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
예제 #2
0
def test_label_binarizer_unseen_labels():
    """Labels that were not part of ``fit`` must transform to all-zero rows."""
    binarizer = LabelBinarizer()

    fitted_output = binarizer.fit_transform(['b', 'd', 'e'])
    want = np.array([[1, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1]])
    assert_array_equal(want, fitted_output)

    # 'a', 'c' and 'f' were not in the fitted labels: their rows stay empty.
    transformed = binarizer.transform(['a', 'b', 'c', 'd', 'e', 'f'])
    want = np.array([[0, 0, 0],
                     [1, 0, 0],
                     [0, 0, 0],
                     [0, 1, 0],
                     [0, 0, 1],
                     [0, 0, 0]])
    assert_array_equal(want, transformed)
예제 #3
0
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    # A binarizer fitted on a single class must reject tuple-style labels.
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    assert_raises(ValueError, fitted.transform, [(2, 3), (0,), (0, 2)])

    # A fresh binarizer given empty input.
    fresh = LabelBinarizer()
    for method in (fresh.transform, fresh.inverse_transform):
        assert_raises(ValueError, method, [])

    # Constructor argument combinations that must be refused.
    bad_constructor_kwargs = (
        dict(neg_label=2, pos_label=1),
        dict(neg_label=2, pos_label=2),
        dict(neg_label=1, pos_label=2, sparse_output=True),
    )
    for kwargs in bad_constructor_kwargs:
        assert_raises(ValueError, LabelBinarizer, **kwargs)

    # Fail on y_type
    assert_raises(ValueError,
                  _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]),
                  output_type="foo",
                  classes=[1, 2],
                  threshold=0)

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    assert_raises(ValueError, LabelBinarizer().fit_transform, seq_of_seqs)

    # Fail on the number of classes
    assert_raises(ValueError,
                  _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]),
                  output_type="foo",
                  classes=[1, 2, 3],
                  threshold=0)

    # Fail on the dimension of 'binary'
    assert_raises(ValueError,
                  _inverse_binarize_thresholding,
                  y=np.array([[1, 2, 3], [2, 1, 3]]),
                  output_type="binary",
                  classes=[1, 2, 3],
                  threshold=0)

    # Fail on multioutput data
    multioutput = np.array([[1, 3], [2, 1]])
    assert_raises(ValueError, LabelBinarizer().fit, multioutput)
    assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]),
                  [1, 2, 3])
예제 #4
0
def test_label_binarizer_column_y():
    """Column-shaped ``y``: lists are multi-label, arrays are multi-class."""

    def fit_both(rows):
        # Binarize the same rows once as a list of lists and once as an array.
        as_array = np.array(rows)
        list_lb = LabelBinarizer()
        list_out = list_lb.fit_transform(rows)
        array_lb = LabelBinarizer()
        array_out = array_lb.fit_transform(as_array)
        return list_lb, list_out, array_lb, array_out

    # First: binary classification vs multi-label with 1 possible class.
    list_lb, list_out, array_lb, array_out = fit_both([[1], [2], [1]])

    assert_array_equal(list_out, np.array([[1, 0], [0, 1], [1, 0]]))
    assert_true(list_lb.multilabel_)

    assert_array_equal(array_out, np.array([[0], [1], [0]]))
    assert_false(array_lb.multilabel_)

    # Second: multiclass classification vs multi-label with multiple
    # classes. The indicator matrix output is the same in this case.
    list_lb, list_out, array_lb, array_out = fit_both([[1], [2], [1], [3]])

    assert_array_equal(list_out, array_out)
    assert_true(list_lb.multilabel_)

    indicator = np.array([[1, 0, 0], [0, 1, 0], [1, 0, 0], [0, 0, 1]])
    assert_array_equal(array_out, indicator)
    assert_false(array_lb.multilabel_)
예제 #5
0
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    # Reading ``multilabel_`` must emit a DeprecationWarning and be falsy.
    multilabel_flag = assert_warns(DeprecationWarning, getattr, fitted,
                                   "multilabel_")
    assert_false(multilabel_flag)

    assert_raises(ValueError, fitted.transform, [(2, 3), (0,), (0, 2)])

    # A fresh binarizer given empty input.
    fresh = LabelBinarizer()
    for method in (fresh.transform, fresh.inverse_transform):
        assert_raises(ValueError, method, [])

    # neg_label=2 combined with pos_label 1 or 2 must be refused, both by
    # the function and by the estimator constructor.
    y = np.array([[0, 1, 0], [1, 1, 1]])
    classes = np.arange(3)
    for pos_label in (1, 2):
        assert_raises(ValueError, label_binarize, y, classes,
                      multilabel=True, neg_label=2, pos_label=pos_label)

    for pos_label in (1, 2):
        assert_raises(ValueError, LabelBinarizer, neg_label=2,
                      pos_label=pos_label)

    assert_raises(ValueError, LabelBinarizer,
                  neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    assert_raises(ValueError,
                  _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]),
                  output_type="foo",
                  classes=[1, 2],
                  threshold=0)

    # Fail on the number of classes
    assert_raises(ValueError,
                  _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]),
                  output_type="foo",
                  classes=[1, 2, 3],
                  threshold=0)

    # Fail on the dimension of 'binary'
    assert_raises(ValueError,
                  _inverse_binarize_thresholding,
                  y=np.array([[1, 2, 3], [2, 1, 3]]),
                  output_type="binary",
                  classes=[1, 2, 3],
                  threshold=0)
예제 #6
0
def test_label_binarizer_iris():
    """Compare per-column SGD classifiers with a single multiclass SGD fit.

    One ``SGDClassifier`` is trained per binarized column; the stacked
    decision values are mapped back to labels with ``inverse_transform`` and
    the resulting accuracy is compared with that of one classifier trained
    directly on ``iris.target``.
    """
    binarizer = LabelBinarizer()
    indicator = binarizer.fit_transform(iris.target)

    # One binary classifier per class column.
    per_class = []
    for column in range(len(binarizer.classes_)):
        per_class.append(SGDClassifier().fit(iris.data, indicator[:, column]))

    decision_values = np.array(
        [clf.decision_function(iris.data) for clf in per_class]).T
    predicted = binarizer.inverse_transform(decision_values)
    binarized_accuracy = np.mean(iris.target == predicted)

    direct = SGDClassifier().fit(iris.data, iris.target)
    direct_accuracy = np.mean(iris.target == direct.predict(iris.data))

    assert_almost_equal(binarized_accuracy, direct_accuracy)
예제 #7
0
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    one_class = np.array([0, 0, 0, 0])
    lb = LabelBinarizer().fit(one_class)

    # A binarizer fitted on a single class must reject tuple-style labels.
    multi_label = [(2, 3), (0, ), (0, 2)]
    with pytest.raises(ValueError):
        lb.transform(multi_label)

    # A fresh binarizer given empty input.
    lb = LabelBinarizer()
    with pytest.raises(ValueError):
        lb.transform([])
    with pytest.raises(ValueError):
        lb.inverse_transform([])

    # Constructor argument combinations that must be refused.
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=1)
    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=2, pos_label=2)

    with pytest.raises(ValueError):
        LabelBinarizer(neg_label=1, pos_label=2, sparse_output=True)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo",
                                       classes=[1, 2],
                                       threshold=0)

    # Sequence of seq type should raise ValueError
    y_seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(y_seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=csr_matrix([[1, 2], [2, 1]]),
                                       output_type="foo",
                                       classes=[1, 2, 3],
                                       threshold=0)

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(y=np.array([[1, 2, 3], [2, 1, 3]]),
                                       output_type="binary",
                                       classes=[1, 2, 3],
                                       threshold=0)

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        # ``classes`` is passed by keyword: where it is keyword-only, a
        # positional value raises TypeError before any validation happens,
        # so the ValueError under test would never be reached.
        label_binarize(np.array([[1, 3], [2, 1]]), classes=[1, 2, 3])
예제 #8
0
def test_label_binarizer():
    """Exercise one-class, two-class and multi-class binarization."""
    # One-class case defaults to negative label.
    single_class = ["pos", "pos", "pos", "pos"]
    zeros_column = np.array([[0], [0], [0], [0]])

    # For dense case:
    dense_lb = LabelBinarizer(sparse_output=False)
    dense_out = dense_lb.fit_transform(single_class)
    assert_array_equal(dense_lb.classes_, ["pos"])
    assert_array_equal(zeros_column, dense_out)
    assert_array_equal(dense_lb.inverse_transform(dense_out), single_class)

    # For sparse case:
    sparse_lb = LabelBinarizer(sparse_output=True)
    sparse_out = sparse_lb.fit_transform(single_class)
    assert_true(issparse(sparse_out))
    assert_array_equal(sparse_lb.classes_, ["pos"])
    assert_array_equal(zeros_column, sparse_out.toarray())
    assert_array_equal(sparse_lb.inverse_transform(sparse_out.toarray()),
                       single_class)

    # The same dense binarizer is refitted for the two remaining cases.
    lb = LabelBinarizer(sparse_output=False)

    # Two-class case
    two_class = ["neg", "pos", "pos", "neg"]
    encoded = lb.fit_transform(two_class)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(np.array([[0], [1], [1], [0]]), encoded)

    # A two-column indicator matrix must also invert to the same labels.
    two_columns = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(two_columns), two_class)

    # Multi-class case
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ])
    encoded = lb.fit_transform(multi_class)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(want, encoded)
    assert_array_equal(lb.inverse_transform(encoded), multi_class)
    def __init__(self, trainFile, devFile, testFile, d=100, yita=0.1):
        """Store defaults, load the data files and configure the parameters.

        ``trainFile``, ``devFile`` and ``testFile`` are forwarded unchanged
        to ``Initialize``; ``d`` and ``yita`` are stored on the instance and
        forwarded to ``setParameters``.
        """
        self.d = d
        self.yita = yita
        self.languages = {
            "ENGLISH": 1,
            "FRENCH": 3,
            "ITALIAN": 2,
        }
        self.punctuations = [
            ".", "'", ":", ",", "-", "...", "!", "_", "(", ")", "?", '"',
            ";", "/", "\\", "{", "}", "[", "]", "|", "<", ">", "+", "=",
            "@", "#", "$", "%", "^", "&", "*",
        ]
        self.noPunctuation = False

        # The binarizer is fitted on 1, 2 and 3, the ids used in
        # ``self.languages``.
        label_encoder = LabelBinarizer()
        label_encoder.fit([1, 2, 3])
        self.answerLables = label_encoder

        self.c = set()

        # NOTE(review): presumably Initialize populates ``self.c``; it must
        # run before the input size below is derived - confirm.
        self.Initialize(trainFile, devFile, testFile)

        self.input = 1 + 5 * len(self.c)
        self.setParameters(d, yita)
예제 #10
0
def test_label_binarizer_multilabel():
    """Check multilabel input given as tuples, indicator matrix and lists."""
    binarizer = LabelBinarizer()

    # Input as a list of tuples.
    tuple_labels = [(2, 3), (1, ), (1, 2)]
    indicator_mat = np.array([[0, 1, 1],
                              [1, 0, 0],
                              [1, 1, 0]])
    encoded = binarizer.fit_transform(tuple_labels)
    assert_true(binarizer.multilabel_)
    assert_array_equal(indicator_mat, encoded)
    assert_equal(binarizer.inverse_transform(encoded), tuple_labels)

    # Input as a label indicator matrix.
    binarizer.fit(indicator_mat)
    decoded = binarizer.inverse_transform(indicator_mat)
    assert_array_equal(indicator_mat, decoded)

    # Regression test for the two-class multilabel case.
    binarizer = LabelBinarizer()
    list_labels = [[1, 0], [0], [1], [0, 1]]
    encoded = binarizer.fit_transform(list_labels)
    assert_true(binarizer.multilabel_)
    assert_array_equal(np.array([[1, 1], [1, 0], [0, 1], [1, 1]]), encoded)
    # Label order inside each sample is not significant: compare as sets.
    decoded_sets = [set(row) for row in binarizer.inverse_transform(encoded)]
    original_sets = [set(row) for row in list_labels]
    assert_equal(decoded_sets, original_sets)
예제 #11
0
def fit_binarizers(all_values):
    """Fit one binarizer per categorical feature column.

    ``all_values`` is a sequence of contexts, each an indexable sequence of
    feature values. The type of the first value in a column decides how the
    column is handled:

    * string   -> a ``LabelBinarizer`` fitted on the column values;
    * list     -> a ``MultiLabelBinarizer`` fitted on the column values plus
      an extra ``("__unk__",)`` entry used as the default for unknown values;
    * anything else (ints/floats) -> the column is left as it is.

    Returns a dict mapping column index to its fitted binarizer. Empty input
    yields an empty dict.
    """
    if len(all_values) == 0:
        return {}

    # ``unicode`` only exists on Python 2; referencing it unconditionally
    # raises NameError on Python 3 as soon as a column is not a ``str``.
    try:
        string_types = (str, unicode)  # noqa: F821
    except NameError:
        string_types = (str,)

    binarizers = {}
    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        first_value = cur_features[0]
        # only categorical values need to be binarized, ints/floats are left
        # as they are
        if isinstance(first_value, string_types):
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif isinstance(first_value, list):
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            cur_features.append(("__unk__",))
            mlb.fit([tuple(x) for x in cur_features])
            binarizers[f] = mlb
    return binarizers
예제 #12
0
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    Both sparse and dense outputs are exercised. For each one the binarized
    result is compared with ``expected``, inverted again and compared with
    the original ``y``.
    """
    for sparse_output in (True, False):
        options = dict(neg_label=neg_label,
                       pos_label=pos_label,
                       sparse_output=sparse_output)

        # With sparse output, pos_label == 0 or a non-zero neg_label is
        # expected to be refused.
        if sparse_output and (pos_label == 0 or neg_label != 0):
            assert_raises(ValueError, label_binarize, y, classes, **options)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes, **options)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        target_type = type_of_target(y)
        if target_type == "multiclass":
            inversed = _inverse_binarize_multiclass(binarized, classes=classes)
        else:
            # Threshold halfway between the two label values.
            midpoint = (neg_label + pos_label) / 2.
            inversed = _inverse_binarize_thresholding(binarized,
                                                      output_type=target_type,
                                                      classes=classes,
                                                      threshold=midpoint)

        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        lb = LabelBinarizer(**options)
        binarized = lb.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        round_trip = lb.inverse_transform(binarized)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert_equal(issparse(round_trip), issparse(y))
예제 #13
0
def getLoss(w, x, y, lam):
    """Return the regularized softmax cross-entropy loss and its gradient.

    Parameters
    ----------
    w : weight array applied to ``x`` via ``np.dot(x, w)``.
    x : training samples, one row per example.
    y : class labels; binarized here with ``LabelBinarizer``.
    lam : L2 regularization strength.

    Returns
    -------
    (loss, grad) : the scalar loss and the gradient with respect to ``w``.
    """
    m = x.shape[0]  # number of training examples

    # Convert the class coding into an indicator (one-hot style) matrix.
    y_mat = LabelBinarizer().fit_transform(y)

    # Raw class scores for the current weights, then their probabilities.
    scores = np.dot(x, w)
    prob = softmax(scores)

    # Float literals keep the scale factors from truncating under Python 2
    # integer division (``-1 / m`` and ``lam / 2`` with integer operands).
    inv_m = -1.0 / m
    loss = inv_m * np.sum(y_mat * np.log(prob)) + (lam / 2.0) * np.sum(w * w)
    grad = inv_m * np.dot(x.T, (y_mat - prob)) + lam * w
    return loss, grad
예제 #14
0
def test_label_binarizer():
    """Check two-class and multi-class binarization and their inverses."""
    binarizer = LabelBinarizer()

    # Two-class case: a single output column.
    two_class = ["neg", "pos", "pos", "neg"]
    encoded = binarizer.fit_transform(two_class)
    assert_false(binarizer.multilabel_)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(np.array([[0], [1], [1], [0]]), encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), two_class)

    # Multi-class case: the same binarizer is refitted.
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ])
    encoded = binarizer.fit_transform(multi_class)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_false(binarizer.multilabel_)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), multi_class)
예제 #15
0
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """Shorter and simpler version of log_loss which supports sample_weight.

    ``y_pred`` is clipped to ``[eps, 1 - eps]`` and renormalized per row
    before the weighted negative log-likelihood is computed.
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred, sample_weight)
    y_true = column_or_1d(y_true)

    indicator = LabelBinarizer().fit_transform(y_true)
    # A single indicator column gets its complement prepended so that there
    # are two columns.
    if indicator.shape[1] == 1:
        indicator = numpy.append(1 - indicator, indicator, axis=1)

    # Clipping
    clipped = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    indicator, clipped = check_arrays(indicator, clipped)

    # Renormalize
    row_totals = clipped.sum(axis=1)
    clipped /= row_totals[:, numpy.newaxis]

    weighted_log_lik = (indicator * numpy.log(clipped) *
                        sample_weight[:, numpy.newaxis])
    return -weighted_log_lik.sum() / numpy.sum(sample_weight)
예제 #16
0
def test_label_binarizer():
    """Check one-, two- and multi-class binarization.

    After every fit, reading ``multilabel_`` must emit a DeprecationWarning
    and yield a falsy value.
    """

    def assert_deprecated_flag_is_false(estimator):
        flag = assert_warns(DeprecationWarning, getattr, estimator,
                            "multilabel_")
        assert_false(flag)

    lb = LabelBinarizer()

    # One-class case defaults to negative label.
    one_class = ["pos", "pos", "pos", "pos"]
    encoded = lb.fit_transform(one_class)
    assert_deprecated_flag_is_false(lb)
    assert_array_equal(lb.classes_, ["pos"])
    assert_array_equal(np.array([[0], [0], [0], [0]]), encoded)
    assert_array_equal(lb.inverse_transform(encoded), one_class)

    # Two-class case
    two_class = ["neg", "pos", "pos", "neg"]
    encoded = lb.fit_transform(two_class)
    assert_deprecated_flag_is_false(lb)
    assert_array_equal(lb.classes_, ["neg", "pos"])
    assert_array_equal(np.array([[0], [1], [1], [0]]), encoded)

    # A two-column indicator matrix must also invert to the same labels.
    two_columns = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(lb.inverse_transform(two_columns), two_class)

    # Multi-class case
    multi_class = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([[0, 0, 0, 1], [0, 0, 1, 0], [0, 1, 0, 0],
                     [0, 0, 1, 0], [1, 0, 0, 0]])
    encoded = lb.fit_transform(multi_class)
    assert_array_equal(lb.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_deprecated_flag_is_false(lb)
    assert_array_equal(want, encoded)
    assert_array_equal(lb.inverse_transform(encoded), multi_class)
def dbpedia_smallcharconv(sample=None, n_procs=None):
    """Train and evaluate a character-level 1D CNN on the DBpedia data.

    Parameters
    ----------
    sample : forwarded to ``get_dbpedia_data(size=...)``; when truthy the
        test-set size is scaled from the category counts instead of using
        the fixed ``5000 * 14``.
    n_procs : defaults to ``cpu_count()`` when falsy. It is not used further
        in this function.

    The function prints the model summary and the test accuracy and returns
    nothing.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)

    if sample:
        test_size = int(
            round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    # Characters missing from CHAR_MAP share one extra index.
    unknown_index = len(CHAR_MAP)
    # Train and test must be padded with the same symbol. The original code
    # padded the training data with the space index but the test data with
    # 0, so the model was evaluated on inputs it had never been trained on.
    padding_index = CHAR_MAP.index(' ')

    def encode_documents(frame):
        # Join title and abstract, map characters to indices, pad to 1014.
        texts = frame[['title', 'abstract']].apply(
            lambda cols: u'\n'.join(cols), axis=1).values
        docs = [[
            CHAR_MAP.index(c) if c in CHAR_MAP else unknown_index
            for c in text
        ] for text in texts]
        return np.array(
            pad_sentences(docs, max_length=1014, padding_word=padding_index))

    # Named ``binarizer`` rather than ``bin`` to avoid shadowing the builtin.
    binarizer = LabelBinarizer()

    x_train = encode_documents(train_df)
    y_train = binarizer.fit_transform(train_df.category.values)

    x_test = encode_documents(test_df)
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(
        Embedding(len(CHAR_MAP) + 1,
                  len(CHAR_MAP) + 1,
                  input_length=1014,
                  weights=[char_embedding()],
                  trainable=False))
    model.add(
        Convolution1D(nb_filter=256,
                      filter_length=7,
                      border_mode='valid',
                      activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(
        Convolution1D(nb_filter=256,
                      filter_length=7,
                      border_mode='valid',
                      activation='relu',
                      subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    # Four identical width-3 convolutions followed by a final pooling step.
    for _ in range(4):
        model.add(
            Convolution1D(nb_filter=256,
                          filter_length=3,
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    print(model.summary())

    model.fit(x_train,
              y_train,
              batch_size=64,
              nb_epoch=5,
              validation_data=(x_test, y_test))

    # Column index of the non-zero entry in each row is the true class id.
    print(
        accuracy_score(
            np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
예제 #18
0
# Pair every image sample with the CSV sample(s) carrying the same name,
# keeping the image order.
samples_paired = [
    samplesCSV[csv_idx]
    for img_idx in range(samplesIMG.shape[0])
    for csv_idx in range(samplesCSV.shape[0])
    if namesCSV[csv_idx] == namesIMG[img_idx]
]

samplesCSV = np.array(samples_paired)
# Insert a new axis at position 3 of the image array.
samplesIMG = np.expand_dims(samplesIMG, axis=3)

print("Paired")
for title, paired in (("IMG", samplesIMG), ("CSV", samplesCSV)):
    print("Samples {}: {}".format(title, len(paired)))

# One-hot encoding of the labels.
lb = LabelBinarizer()
labels = lb.fit_transform(labels)
numClasses = labels.shape[1]

inputShape = (108, 192, 1)  # samplesIMG.shape

# Model for images: two conv/relu/pool stages, then dropout and flatten.
cnnmodel = Sequential()
image_layers = [
    Conv2D(16, (3, 3), padding="same", input_shape=inputShape),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(32, (3, 3), padding="same"),
    Activation("relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
]
for layer in image_layers:
    cnnmodel.add(layer)
예제 #19
0
def main():
    """Train an MLP on ``datasetA_3c.csv`` and report its performance.

    The first CSV column holds the class label, the remaining columns the
    features. The model is trained and evaluated on the same samples (the
    train/test split below is disabled), then the confusion matrix,
    classification report, accuracy and Cohen's kappa are printed and the
    training curves are plotted.
    """
    # load data
    csv_path = "datasetA_3c.csv"
    dataframe = pandas.read_csv(csv_path)
    dataset = dataframe.values
    samples = np.array(dataset[:, 1:])
    labels = np.array(dataset[:, 0]).astype(str)

    print("Class distribution:")
    print(Counter(labels))

    ### choose k best attributes
    #    from sklearn.feature_selection.univariate_selection import SelectKBest
    #    newSamples = SelectKBest(k=100).fit_transform(samples, labels)
    #    print(newSamples.shape)
    #    samples = newSamples

    ### Calculate weights for unbalanced classes
    #    from sklearn.utils import class_weight
    #    d_class_weights = None
    #    class_weights = class_weight.compute_class_weight('balanced',np.unique(labels),labels)
    #    print("Class weights:")
    #    print(class_weights)
    #    d_class_weights = dict(enumerate(class_weights))

    ### Normalize samples
    #    from sklearn.preprocessing.data import normalize
    #    normalize(samples)

    ## convert to one-hot encoding
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print("Classes: {}".format(classesNum))

    trainSamples = samples
    trainLabels = labels
    testSamples = samples
    testLabels = labels

    ### Division into training and test samples
    #    from sklearn.model_selection._split import train_test_split
    #    (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels, test_size=0.25, random_state=42)

    model = Sequential()
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(classesNum, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 50
    BATCH = 50
    H = model.fit(trainSamples, trainLabels, batch_size=BATCH, epochs=EPOCHS
                  #,class_weight=d_class_weights
                  #,validation_data=(testSamples,testLabels)
                  #,validation_split=0.1
                  )
    mlpResults = model.predict(testSamples)

    # Class indices are needed by every metric below: compute them once.
    true_classes = testLabels.argmax(axis=1)
    predicted_classes = mlpResults.argmax(axis=1)

    print(confusion_matrix(true_classes, predicted_classes))
    print(classification_report(true_classes, predicted_classes,
                                target_names=lb.classes_))
    print("MLP Accuracy: {:.2f}".format(
        accuracy_score(true_classes, predicted_classes)))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(true_classes, predicted_classes)))

    # Depending on the Keras version the accuracy metric is recorded under
    # "acc" or "accuracy"; accept either instead of failing with KeyError.
    accuracy_key = "acc" if "acc" in H.history else "accuracy"

    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, H.history["loss"], label="train_loss")
    plt.plot(N, H.history[accuracy_key], label="train_acc")
    #plt.plot(N, H.history["val_loss"], label="val_loss")
    #plt.plot(N, H.history["val_acc"], label="val_acc")

    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.show()
def prepocess_label(y):
    """Binarize the labels ``y`` with a freshly fitted ``LabelBinarizer``."""
    binarizer = LabelBinarizer()
    return binarizer.fit_transform(y)
예제 #21
0
def main():
    """Compare a decision tree and a 1D CNN on fixed-length sequences.

    Samples are loaded from ``"data"``, cut into sequences of length 100 and
    the labels are binarized. A decision tree is trained on the flattened
    sequences, a CNN on the unflattened ones; for both, the confusion matrix,
    classification report, accuracy and Cohen's kappa are printed for a 25%
    hold-out split.
    """
    separator = "=" * 20

    print("Loading samples and labels")
    samples, labels, _ = load_files("data")
    print("Loaded {} samples".format(samples.shape[0]))

    sequence_dim = 100
    print("Converting to sequences of length {}".format(sequence_dim))
    samples, labels = make_sequences(samples, labels, sequence_dim)

    print("Number of samples from sequences: {}".format(samples.shape[0]))

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    # The decision tree gets each sequence flattened into a single row.
    flatSamples = samples.reshape(samples.shape[0], -1)
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        flatSamples, labels, test_size=0.25, random_state=42)

    print(separator)
    print("Building DecisionTree model")
    model = DecisionTreeClassifier()
    model.fit(trainSamples, trainLabels)
    treeResults = model.predict(testSamples)

    # Class indices for the metrics: position of the largest entry per row.
    expected_classes = testLabels.argmax(axis=1)
    tree_classes = treeResults.argmax(axis=1)

    print(confusion_matrix(expected_classes, tree_classes))
    print(classification_report(expected_classes, tree_classes))
    treeAcc = accuracy_score(expected_classes, tree_classes)
    print("Accuracy Tree: {:.2f}".format(treeAcc))
    tree_kappa = cohen_kappa_score(expected_classes, tree_classes)
    print("Cohen's Kappa {:.2f}".format(tree_kappa))

    print(separator)
    print("Building CNN model")

    # Same split parameters, this time on the unflattened sequences.
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        samples, labels, test_size=0.25, random_state=42)
    inputShape = (samples.shape[1], samples.shape[2])

    model = Sequential()
    cnn_layers = [
        Conv1D(32, 10, padding="same", input_shape=inputShape),
        Activation("relu"),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(64, 10, padding="same"),
        Activation("relu"),
        BatchNormalization(),
        Dropout(0.2),
        Conv1D(128, 10, padding="same"),
        Activation("relu"),
        Dropout(0.2),
        Flatten(input_shape=inputShape),
        Dense(128, activation='sigmoid'),
        Dense(64, activation='sigmoid'),
        Dense(labels.shape[1], activation='softmax'),
    ]
    for layer in cnn_layers:
        model.add(layer)

    model.compile(loss='categorical_crossentropy',
                  optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 10
    BATCH = 128
    model.fit(trainSamples,
              trainLabels,
              batch_size=BATCH,
              epochs=EPOCHS,
              validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)

    expected_classes = testLabels.argmax(axis=1)
    cnn_classes = cnnResults.argmax(axis=1)

    print(confusion_matrix(expected_classes, cnn_classes))
    print(classification_report(expected_classes,
                                cnn_classes,
                                target_names=lb.classes_))
    cnn_accuracy = accuracy_score(expected_classes, cnn_classes)
    print("CNN Accuracy: {:.2f}".format(cnn_accuracy))
    cnn_kappa = cohen_kappa_score(expected_classes, cnn_classes)
    print("Cohen's Kappa {:.2f}".format(cnn_kappa))

    # Wait for Enter before returning.
    input("")
 def __init__(self):
     """Create the private label encoder with dense output."""
     encoder = LabelBinarizer(sparse_output=False)
     self.__encoder = encoder
예제 #23
0
def main():
    """Train a CNN on the ``radio_img`` images and report its performance.

    Images are loaded, given a trailing channel axis, and split 75/25 into
    training and test sets. The model from ``cnn_model`` is trained with
    balanced class weights; the confusion matrix, classification report,
    accuracy and Cohen's kappa on the test split are printed.
    """
    samples, labels, _ = loader.load_img("radio_img")

    # Add the fourth dimension (color) as the LAST axis. The original
    # ``axis=4`` is out of range for a 3-D array, which current NumPy
    # rejects; ``axis=-1`` appends the same trailing axis.
    samples = np.expand_dims(samples, axis=-1)

    print("shape = {}".format(samples.shape))
    inputShape = (samples.shape[1], samples.shape[2], samples.shape[3])
    print("inputShape = {}".format(inputShape))

    # Class weights, computed on the raw labels before they are binarized.
    # ``classes`` and ``y`` are passed by keyword: newer scikit-learn
    # releases no longer accept them positionally.
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    d_class_weights = dict(enumerate(class_weights))
    print("weights {}".format(d_class_weights))

    # one-hot encoding
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print("Classes: {}".format(classesNum))

    # split to training and test
    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(samples,
                                    labels,
                                    test_size=0.25,
                                    random_state=42)

    model = cnn_model(inputShape, classesNum)

    ## checkpoints
    #    checkpt1 = ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5', save_best_only=True)
    #    checkpt2 = EarlyStopping(monitor='val_loss', patience=3)

    EPOCHS = 20
    BATCH = 50
    model.fit(
        trainSamples,
        trainLabels,
        batch_size=BATCH,
        epochs=EPOCHS,
        class_weight=d_class_weights,
        verbose=1,
        #callbacks = [checkpt1,checkpt2],
        validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)

    # Class indices are needed by every metric below: compute them once.
    true_classes = testLabels.argmax(axis=1)
    predicted_classes = cnnResults.argmax(axis=1)

    print(confusion_matrix(true_classes, predicted_classes))
    print(classification_report(true_classes, predicted_classes))
    cnnAcc = accuracy_score(true_classes, predicted_classes)
    print("Accuracy CNN: {:.2f}".format(cnnAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(true_classes, predicted_classes)))

    # Wait for Enter before returning.
    input("")
def dbpedia_convgemb(sample=None, n_procs=None):
    """Train a single-convolution text classifier on DBpedia and print accuracy.

    The frame returned by ``get_dbpedia_data`` is split into stratified
    train/test parts, every document is turned into a fixed-length sequence
    of vocabulary ids, and an Embedding -> Convolution1D -> max-pool -> Dense
    network is fitted on the training part. The embedding layer is
    initialised from ``load_w2v_weights(vocab)`` and is not trainable.

    Args:
        sample: forwarded to ``get_dbpedia_data(size=...)``. When truthy, the
            test size is derived from the per-category counts of the sampled
            frame; otherwise ``5000 * 14`` documents are held out.
        n_procs: currently unused; kept so existing callers keep working.

    Returns:
        None. The test-set accuracy is printed.
    """
    df = get_dbpedia_data(size=sample)

    if sample:
        # Hold out 5000/45000 of every category present in the sample.
        test_size = int(
            round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df,
                                    cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()

    def encode(docs):
        # Ids are shifted by one because 0 is the padding value; tokens
        # missing from the vocabulary are dropped.
        token2id = vocab.token2id
        return np.array(
            pad_sentences(
                [[token2id[tok] + 1 for tok in s if tok in token2id]
                 for s in docs],
                max_length=100,
                padding_word=0))

    x_train = encode(train_docs)
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df,
                                   cols=['title', 'abstract'],
                                   flatten=True)
    x_test = encode(test_docs)
    y_test = binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(
        Embedding(5001,
                  300,
                  input_length=100,
                  dropout=.2,
                  weights=[emb_weights],
                  trainable=False))
    model.add(
        Convolution1D(nb_filter=50,
                      filter_length=3,
                      border_mode='valid',
                      activation='relu',
                      subsample_length=1))
    # The pool window is as long as the current model output along axis 1.
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    # NOTE(review): sigmoid + binary_crossentropy scores the 14 outputs
    # independently; softmax + categorical_crossentropy is the usual choice
    # for one-label-per-document data. Left as is to keep the model unchanged.
    model.add(Dense(14, activation='sigmoid'))

    # 'accuracy' combined with binary_crossentropy is reported per output
    # unit; 'categorical_accuracy' matches the argmax-based score printed
    # below.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train)

    # argmax keeps exactly one label per row, so the true labels always line
    # up with the predictions, even if a row of y_test were all zeros.
    print(
        accuracy_score(y_test.argmax(axis=1), model.predict_classes(x_test)))
def dbpedia_smallwordconv(sample=None, n_procs=None):
    """Train a six-convolution word-level classifier on DBpedia.

    The frame returned by ``get_dbpedia_data`` is split into stratified
    train/test parts, every document is turned into a fixed-length sequence
    of vocabulary ids, and a network made of an Embedding layer, six
    Convolution1D layers (three of them followed by max-pooling) and two
    1024-unit Dense layers is fitted for 5 epochs, using the test part as
    validation data.

    Args:
        sample: forwarded to ``get_dbpedia_data(size=...)``. When truthy, the
            test size is derived from the per-category counts of the sampled
            frame; otherwise ``5000 * 14`` documents are held out.
        n_procs: currently unused; kept so existing callers keep working.

    Returns:
        None. The test-set accuracy is printed.
    """
    df = get_dbpedia_data(size=sample)

    if sample:
        # Hold out 5000/45000 of every category present in the sample.
        test_size = int(
            round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = DataframeSentences(train_df,
                                    cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    binarizer = LabelBinarizer()

    def encode(docs):
        # Ids are shifted by one because 0 is the padding value; tokens
        # missing from the vocabulary are dropped.
        token2id = vocab.token2id
        return np.array(
            pad_sentences(
                [[token2id[tok] + 1 for tok in s if tok in token2id]
                 for s in docs],
                max_length=100,
                padding_word=0))

    x_train = encode(train_docs)
    y_train = binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df,
                                   cols=['title', 'abstract'],
                                   flatten=True)
    x_test = encode(test_docs)
    y_test = binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))

    # (filter_length, followed_by_pooling) for each convolution, in order.
    conv_stack = (
        (7, True),
        (7, True),
        (3, False),
        (3, False),
        (3, False),
        (3, True),
    )
    for filter_length, pooled in conv_stack:
        model.add(
            Convolution1D(nb_filter=300,
                          filter_length=filter_length,
                          border_mode='valid',
                          activation='relu',
                          subsample_length=1))
        if pooled:
            model.add(MaxPooling1D(pool_length=3, stride=1))

    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    # NOTE(review): sigmoid + binary_crossentropy scores the 14 outputs
    # independently; softmax + categorical_crossentropy is the usual choice
    # for one-label-per-document data. Left as is to keep the model unchanged.
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['categorical_accuracy'])

    # validation_data is passed as the documented (x_val, y_val) tuple.
    model.fit(x_train,
              y_train,
              batch_size=32,
              nb_epoch=5,
              validation_data=(x_test, y_test))

    # argmax keeps exactly one label per row, so the true labels always line
    # up with the predictions, even if a row of y_test were all zeros.
    print(
        accuracy_score(y_test.argmax(axis=1), model.predict_classes(x_test)))
예제 #26
0
def test_label_binarizer_multilabel_unlabeled():
    """A sample with an empty label list must binarize to an all-zero row."""
    label_sets = [[1, 2], [1], []]
    expected = np.array([
        [1, 1],
        [1, 0],
        [0, 0],
    ])

    binarizer = LabelBinarizer()
    got = binarizer.fit_transform(label_sets)

    assert_array_equal(got, expected)
예제 #27
0
 def __init__(self, *args, **kwargs):
     """Create a ``LabelBinarizer`` from the given arguments as ``self.encoder``.

     All positional and keyword arguments are forwarded unchanged to the
     ``LabelBinarizer`` constructor.
     """
     self.encoder = LabelBinarizer(*args, **kwargs)