def test_label_binarizer_set_label_encoding():
    """Custom ``neg_label``/``pos_label`` values are emitted and round-trip."""
    # Two-class input, with 0 used as the positive marker.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=0)
    labels = np.array([0, 1, 1, 0])
    want = np.array([[-2], [0], [0], [-2]])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Multi-class input with symmetric -2 / +2 markers.
    binarizer = LabelBinarizer(neg_label=-2, pos_label=2)
    labels = np.array([3, 2, 1, 2, 0])
    want = np.array([
        [-2, -2, -2, 2],
        [-2, -2, 2, -2],
        [-2, 2, -2, -2],
        [-2, -2, 2, -2],
        [2, -2, -2, -2],
    ])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
def test_label_binarizer_unseen_labels():
    """Labels not seen during ``fit`` are encoded as all-zero rows."""
    binarizer = LabelBinarizer()

    fitted_rows = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ])
    assert_array_equal(fitted_rows, binarizer.fit_transform(['b', 'd', 'e']))

    # 'a', 'c' and 'f' were never fitted, so their rows are expected empty.
    transformed_rows = np.array([
        [0, 0, 0],
        [1, 0, 0],
        [0, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0],
    ])
    assert_array_equal(
        transformed_rows,
        binarizer.transform(['a', 'b', 'c', 'd', 'e', 'f']))
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    # A binarizer fitted on a single class must reject multilabel input.
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    assert_raises(ValueError, fitted.transform, [(2, 3), (0,), (0, 2)])

    # A binarizer that was never fitted must reject empty input.
    unfitted = LabelBinarizer()
    assert_raises(ValueError, unfitted.transform, [])
    assert_raises(ValueError, unfitted.inverse_transform, [])

    # Constructor parameter combinations that must be refused.
    for bad_params in (
            dict(neg_label=2, pos_label=1),
            dict(neg_label=2, pos_label=2),
            dict(neg_label=1, pos_label=2, sparse_output=True),
    ):
        assert_raises(ValueError, LabelBinarizer, **bad_params)

    # Fail on y_type
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
                  classes=[1, 2], threshold=0)

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    assert_raises(ValueError, LabelBinarizer().fit_transform, seq_of_seqs)

    # Fail on the number of classes
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
                  classes=[1, 2, 3], threshold=0)

    # Fail on the dimension of 'binary'
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary",
                  classes=[1, 2, 3], threshold=0)

    # Fail on multioutput data
    multioutput = np.array([[1, 3], [2, 1]])
    assert_raises(ValueError, LabelBinarizer().fit, multioutput)
    assert_raises(ValueError, label_binarize, np.array([[1, 3], [2, 1]]),
                  [1, 2, 3])
def test_label_binarizer_column_y():
    """Column-shaped ``y``: nested lists are multilabel, arrays are not."""
    # Case 1: binary classification vs. multi-label with one label per row.
    # The same values are read as multi-label when given as nested lists and
    # as multi-class when given as an array.
    as_list = [[1], [2], [1]]
    as_array = np.array(as_list)

    list_binarizer = LabelBinarizer()
    from_list = list_binarizer.fit_transform(as_list)
    array_binarizer = LabelBinarizer()
    from_array = array_binarizer.fit_transform(as_array)

    assert_array_equal(from_list, np.array([[1, 0], [0, 1], [1, 0]]))
    assert_true(list_binarizer.multilabel_)
    assert_array_equal(from_array, np.array([[0], [1], [0]]))
    assert_false(array_binarizer.multilabel_)

    # Case 2: multiclass classification vs. multi-label with several
    # classes; here both inputs give the same indicator matrix.
    as_list = [[1], [2], [1], [3]]
    as_array = np.array(as_list)
    want = np.array([
        [1, 0, 0],
        [0, 1, 0],
        [1, 0, 0],
        [0, 0, 1],
    ])

    list_binarizer = LabelBinarizer()
    from_list = list_binarizer.fit_transform(as_list)
    array_binarizer = LabelBinarizer()
    from_array = array_binarizer.fit_transform(as_array)

    assert_array_equal(from_list, from_array)
    assert_true(list_binarizer.multilabel_)
    assert_array_equal(from_array, want)
    assert_false(array_binarizer.multilabel_)
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError"""
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))

    # Reading ``multilabel_`` must warn about deprecation and be falsy here.
    flag = assert_warns(DeprecationWarning, getattr, fitted, "multilabel_")
    assert_false(flag)

    # Multilabel input is rejected by a binarizer fitted on one class.
    assert_raises(ValueError, fitted.transform, [(2, 3), (0,), (0, 2)])

    # A binarizer that was never fitted must reject empty input.
    unfitted = LabelBinarizer()
    assert_raises(ValueError, unfitted.transform, [])
    assert_raises(ValueError, unfitted.inverse_transform, [])

    # neg_label must not be >= pos_label, for the function ...
    y = np.array([[0, 1, 0], [1, 1, 1]])
    classes = np.arange(3)
    for pos_label in (1, 2):
        assert_raises(ValueError, label_binarize, y, classes,
                      multilabel=True, neg_label=2, pos_label=pos_label)

    # ... and for the estimator constructor.
    for bad_params in (
            dict(neg_label=2, pos_label=1),
            dict(neg_label=2, pos_label=2),
            dict(neg_label=1, pos_label=2, sparse_output=True),
    ):
        assert_raises(ValueError, LabelBinarizer, **bad_params)

    # Fail on y_type
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
                  classes=[1, 2], threshold=0)

    # Fail on the number of classes
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
                  classes=[1, 2, 3], threshold=0)

    # Fail on the dimension of 'binary'
    assert_raises(ValueError, _inverse_binarize_thresholding,
                  y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary",
                  classes=[1, 2, 3], threshold=0)
def test_label_binarizer_iris():
    """Compare per-class SGD on binarized iris targets with multiclass SGD."""
    binarizer = LabelBinarizer()
    indicator = binarizer.fit_transform(iris.target)

    # One binary classifier per indicator column, fitted in class order.
    per_class = []
    for column in range(len(binarizer.classes_)):
        per_class.append(SGDClassifier().fit(iris.data, indicator[:, column]))

    scores = [clf.decision_function(iris.data) for clf in per_class]
    decoded = binarizer.inverse_transform(np.array(scores).T)
    ovr_accuracy = np.mean(iris.target == decoded)

    # Reference: a single classifier trained on the raw targets.
    reference = SGDClassifier().fit(iris.data, iris.target)
    direct_accuracy = np.mean(iris.target == reference.predict(iris.data))

    assert_almost_equal(ovr_accuracy, direct_accuracy)
def test_label_binarizer_errors():
    """Check that invalid arguments yield ValueError."""
    # A binarizer fitted on a single class must reject multilabel input.
    fitted = LabelBinarizer().fit(np.array([0, 0, 0, 0]))
    with pytest.raises(ValueError):
        fitted.transform([(2, 3), (0, ), (0, 2)])

    # A binarizer that was never fitted must reject empty input.
    unfitted = LabelBinarizer()
    with pytest.raises(ValueError):
        unfitted.transform([])
    with pytest.raises(ValueError):
        unfitted.inverse_transform([])

    # Constructor parameter combinations that must be refused.
    for bad_params in (
            dict(neg_label=2, pos_label=1),
            dict(neg_label=2, pos_label=2),
            dict(neg_label=1, pos_label=2, sparse_output=True),
    ):
        with pytest.raises(ValueError):
            LabelBinarizer(**bad_params)

    # Fail on y_type
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
            classes=[1, 2], threshold=0)

    # Sequence of seq type should raise ValueError
    seq_of_seqs = [[], [1, 2], [3], [0, 1, 3], [2]]
    with pytest.raises(ValueError):
        LabelBinarizer().fit_transform(seq_of_seqs)

    # Fail on the number of classes
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=csr_matrix([[1, 2], [2, 1]]), output_type="foo",
            classes=[1, 2, 3], threshold=0)

    # Fail on the dimension of 'binary'
    with pytest.raises(ValueError):
        _inverse_binarize_thresholding(
            y=np.array([[1, 2, 3], [2, 1, 3]]), output_type="binary",
            classes=[1, 2, 3], threshold=0)

    # Fail on multioutput data
    with pytest.raises(ValueError):
        LabelBinarizer().fit(np.array([[1, 3], [2, 1]]))
    with pytest.raises(ValueError):
        label_binarize(np.array([[1, 3], [2, 1]]), [1, 2, 3])
def test_label_binarizer():
    """Dense and sparse binarization of one-, two- and multi-class labels."""
    # One-class case: every sample is encoded with the negative label.
    labels = ["pos", "pos", "pos", "pos"]
    want = np.array([[0], [0], [0], [0]])

    # ... dense output
    binarizer = LabelBinarizer(sparse_output=False)
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # ... sparse output
    binarizer = LabelBinarizer(sparse_output=True)
    encoded = binarizer.fit_transform(labels)
    assert_true(issparse(encoded))
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, encoded.toarray())
    assert_array_equal(binarizer.inverse_transform(encoded.toarray()), labels)

    binarizer = LabelBinarizer(sparse_output=False)

    # Two-class case: a single output column.
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0], [1], [1], [0]])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, encoded)

    # A two-column indicator matrix is also accepted when inverting.
    two_columns = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_columns), labels)

    # Multi-class case: one column per class, in sorted class order.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
def __init__(self, trainFile, devFile, testFile, d=100, yita=0.1):
    """Set up lookup tables, load the data files and configure the model.

    ``trainFile``, ``devFile`` and ``testFile`` are forwarded unchanged to
    ``self.Initialize``; ``d`` and ``yita`` are stored on the instance and
    forwarded to ``self.setParameters``.
    """
    self.d, self.yita = d, yita

    # Language name -> numeric label.
    self.languages = {
        "ENGLISH": 1,
        "FRENCH": 3,
        "ITALIAN": 2,
    }
    self.punctuations = [
        ".", "'", ":", ",", "-", "...", "!", "_", "(", ")", "?", '"', ";",
        "/", "\\", "{", "}", "[", "]", "|", "<", ">", "+", "=", "@", "#",
        "$", "%", "^", "&", "*",
    ]
    self.noPunctuation = False

    # Binarizer for the three numeric labels used in ``self.languages``.
    label_encoder = LabelBinarizer()
    label_encoder.fit([1, 2, 3])
    self.answerLables = label_encoder

    # NOTE(review): ``self.c`` is presumably filled by ``Initialize`` (its
    # size is read right afterwards) - confirm against that method.
    self.c = set()
    self.Initialize(trainFile, devFile, testFile)
    self.input = 5 * len(self.c) + 1
    self.setParameters(d, yita)
def test_label_binarizer_multilabel():
    """Multilabel input is binarized into an indicator matrix and back."""
    binarizer = LabelBinarizer()

    # Input given as a list of label tuples.
    label_tuples = [(2, 3), (1, ), (1, 2)]
    indicator = np.array([
        [0, 1, 1],
        [1, 0, 0],
        [1, 1, 0],
    ])
    encoded = binarizer.fit_transform(label_tuples)
    assert_true(binarizer.multilabel_)
    assert_array_equal(indicator, encoded)
    assert_equal(binarizer.inverse_transform(encoded), label_tuples)

    # Input given directly as a label indicator matrix.
    binarizer.fit(indicator)
    assert_array_equal(indicator, binarizer.inverse_transform(indicator))

    # Regression test for the two-class multilabel case.
    binarizer = LabelBinarizer()
    label_lists = [[1, 0], [0], [1], [0, 1]]
    want = np.array([[1, 1], [1, 0], [0, 1], [1, 1]])
    encoded = binarizer.fit_transform(label_lists)
    assert_true(binarizer.multilabel_)
    assert_array_equal(want, encoded)

    # Label order inside a sample is not significant, so compare as sets.
    decoded_sets = [set(row) for row in binarizer.inverse_transform(encoded)]
    assert_equal(decoded_sets, [set(row) for row in label_lists])
# String types to binarize: ``unicode`` only exists on Python 2, so referring
# to it unconditionally raises NameError on Python 3.
try:
    _STRING_TYPES = (str, unicode)  # noqa: F821 - Python 2 only
except NameError:
    _STRING_TYPES = (str,)


def fit_binarizers(all_values):
    """Fit one binarizer per categorical feature position.

    Parameters
    ----------
    all_values : sequence of sequences
        One inner sequence ("context") per sample. The number of features is
        taken from the first context, and the type of each feature is decided
        from that feature's value in the first context.

    Returns
    -------
    dict
        Maps a feature index to a fitted ``LabelBinarizer`` (string features)
        or ``MultiLabelBinarizer`` (list features). Features of any other
        type (ints, floats, ...) get no entry and are left as they are.
        An empty ``all_values`` yields an empty dict.
    """
    binarizers = {}
    if not all_values:
        # Nothing to inspect; the original indexed all_values[0] and crashed.
        return binarizers

    for f in range(len(all_values[0])):
        cur_features = [context[f] for context in all_values]
        first = cur_features[0]

        # only categorical values need to be binarized
        if isinstance(first, _STRING_TYPES):
            lb = LabelBinarizer()
            lb.fit(cur_features)
            binarizers[f] = lb
        elif isinstance(first, list):
            mlb = MultiLabelBinarizer()
            # default feature for unknown values
            cur_features.append(("__unk__",))
            mlb.fit([tuple(x) for x in cur_features])
            binarizers[f] = mlb
    return binarizers
def check_binarized_results(y, classes, pos_label, neg_label, expected):
    """Check ``label_binarize`` and ``LabelBinarizer`` against ``expected``.

    Both dense and sparse output are exercised. Sparse output combined with
    ``pos_label == 0`` or a non-zero ``neg_label`` is only checked to raise
    ``ValueError``. Otherwise the binarized result, its sparsity and the
    inverse transformation are all verified.
    """
    for sparse_output in (True, False):
        unsupported = sparse_output and (pos_label == 0 or neg_label != 0)
        if unsupported:
            assert_raises(ValueError, label_binarize, y, classes,
                          neg_label=neg_label, pos_label=pos_label,
                          sparse_output=sparse_output)
            continue

        # check label_binarize
        binarized = label_binarize(y, classes, neg_label=neg_label,
                                   pos_label=pos_label,
                                   sparse_output=sparse_output)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        # check inverse
        y_type = type_of_target(y)
        if y_type != "multiclass":
            midpoint = (neg_label + pos_label) / 2.
            inversed = _inverse_binarize_thresholding(
                binarized, output_type=y_type, classes=classes,
                threshold=midpoint)
        else:
            inversed = _inverse_binarize_multiclass(binarized,
                                                    classes=classes)
        assert_array_equal(toarray(inversed), toarray(y))

        # Check label binarizer
        estimator = LabelBinarizer(neg_label=neg_label, pos_label=pos_label,
                                   sparse_output=sparse_output)
        binarized = estimator.fit_transform(y)
        assert_array_equal(toarray(binarized), expected)
        assert_equal(issparse(binarized), sparse_output)

        round_trip = estimator.inverse_transform(binarized)
        assert_array_equal(toarray(round_trip), toarray(y))
        assert_equal(issparse(round_trip), issparse(y))
def getLoss(w, x, y, lam):
    """Return the L2-regularised softmax cross-entropy loss and its gradient.

    Parameters
    ----------
    w : weights; ``np.dot(x, w)`` gives the raw class scores.
    x : inputs; ``x.shape[0]`` is used as the number of training examples.
    y : class labels, binarized here with a freshly fitted ``LabelBinarizer``.
    lam : regularisation strength.

    Returns
    -------
    (loss, grad) : the scalar loss and its gradient with respect to ``w``.

    NOTE(review): the binarized label matrix must have the same shape as the
    softmax output for the element-wise operations below - confirm that
    callers always pass labels covering every score column.
    """
    m = x.shape[0]  # number of training examples

    # Convert the class labels into an indicator (one-hot style) matrix.
    y_mat = LabelBinarizer().fit_transform(y)

    scores = np.dot(x, w)   # raw class scores for the current weights
    prob = softmax(scores)  # class probabilities

    # Float literals keep the divisions exact even where ``/`` on two ints
    # would truncate (Python 2 without ``from __future__ import division``).
    data_loss = (-1.0 / m) * np.sum(y_mat * np.log(prob))
    reg_loss = (lam / 2.0) * np.sum(w * w)
    loss = data_loss + reg_loss

    grad = (-1.0 / m) * np.dot(x.T, (y_mat - prob)) + lam * w
    return loss, grad
def test_label_binarizer():
    """Two-class and multi-class labels binarize and invert correctly."""
    binarizer = LabelBinarizer()

    # Two-class case: a single output column.
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0], [1], [1], [0]])
    encoded = binarizer.fit_transform(labels)
    assert_false(binarizer.multilabel_)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Multi-class case: one column per class, in sorted class order.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    assert_false(binarizer.multilabel_)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
def _log_loss(y_true, y_pred, eps=1e-10, sample_weight=None):
    """Shorter and simpler version of log_loss that supports sample_weight.

    ``y_pred`` is clipped to ``[eps, 1 - eps]`` and renormalised per row
    before the weighted negative log-likelihood is averaged over the total
    sample weight.
    """
    sample_weight = check_sample_weight(y_true, sample_weight=sample_weight)
    y_true, y_pred, sample_weight = check_arrays(y_true, y_pred,
                                                 sample_weight)
    y_true = column_or_1d(y_true)

    # Indicator matrix of the true labels; a single column (binary case) is
    # expanded to two complementary columns.
    binarizer = LabelBinarizer()
    targets = binarizer.fit_transform(y_true)
    if targets.shape[1] == 1:
        targets = numpy.append(1 - targets, targets, axis=1)

    # Clipping
    clipped = numpy.clip(y_pred, eps, 1 - eps)

    # Check if dimensions are consistent.
    targets, clipped = check_arrays(targets, clipped)

    # Renormalize
    clipped /= clipped.sum(axis=1)[:, numpy.newaxis]

    weights = sample_weight[:, numpy.newaxis]
    weighted_log_likelihood = (targets * numpy.log(clipped) * weights).sum()
    return -weighted_log_likelihood / numpy.sum(sample_weight)
def test_label_binarizer():
    """One-, two- and multi-class binarization; ``multilabel_`` is deprecated."""
    binarizer = LabelBinarizer()

    # One-class case: every sample is encoded with the negative label.
    labels = ["pos", "pos", "pos", "pos"]
    want = np.array([[0], [0], [0], [0]])
    encoded = binarizer.fit_transform(labels)
    flag = assert_warns(DeprecationWarning, getattr, binarizer, "multilabel_")
    assert_false(flag)
    assert_array_equal(binarizer.classes_, ["pos"])
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)

    # Two-class case: a single output column.
    labels = ["neg", "pos", "pos", "neg"]
    want = np.array([[0], [1], [1], [0]])
    encoded = binarizer.fit_transform(labels)
    flag = assert_warns(DeprecationWarning, getattr, binarizer, "multilabel_")
    assert_false(flag)
    assert_array_equal(binarizer.classes_, ["neg", "pos"])
    assert_array_equal(want, encoded)

    # A two-column indicator matrix is also accepted when inverting.
    two_columns = np.array([[1, 0], [0, 1], [0, 1], [1, 0]])
    assert_array_equal(binarizer.inverse_transform(two_columns), labels)

    # Multi-class case: one column per class, in sorted class order.
    labels = ["spam", "ham", "eggs", "ham", "0"]
    want = np.array([
        [0, 0, 0, 1],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [1, 0, 0, 0],
    ])
    encoded = binarizer.fit_transform(labels)
    assert_array_equal(binarizer.classes_, ['0', 'eggs', 'ham', 'spam'])
    flag = assert_warns(DeprecationWarning, getattr, binarizer, "multilabel_")
    assert_false(flag)
    assert_array_equal(want, encoded)
    assert_array_equal(binarizer.inverse_transform(encoded), labels)
def dbpedia_smallcharconv(sample=None, n_procs=None):
    """Train and evaluate a character-level 1D CNN on the DBpedia data.

    Parameters
    ----------
    sample : forwarded to ``get_dbpedia_data(size=...)``; when truthy the
        test-set size is scaled from the per-category counts, otherwise
        ``5000 * 14`` test documents are used.
    n_procs : defaults to ``cpu_count()`` when falsy.
        NOTE(review): the value is not used anywhere in this function.

    The summary of the model and the accuracy on the test split are printed;
    nothing is returned.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    if sample:
        test_size = int(
            round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    # Character -> index table built once instead of scanning CHAR_MAP twice
    # per character. ``setdefault`` keeps the first position of a repeated
    # character, matching ``CHAR_MAP.index``.
    char_to_index = {}
    for position, char in enumerate(CHAR_MAP):
        char_to_index.setdefault(char, position)
    unknown_index = len(CHAR_MAP)  # shared index for out-of-map characters
    # Train and test must be padded with the SAME symbol. Previously the test
    # set was padded with 0 while training used the space index, so the model
    # was evaluated on inputs preprocessed differently from its training data.
    pad_index = CHAR_MAP.index(' ')

    def encode(frame):
        """Turn title + abstract of each row into a padded index sequence."""
        texts = frame[['title', 'abstract']].apply(
            lambda cols: u'\n'.join(cols), axis=1).values
        docs = [[char_to_index.get(c, unknown_index) for c in text]
                for text in texts]
        return np.array(
            pad_sentences(docs, max_length=1014, padding_word=pad_index))

    label_binarizer = LabelBinarizer()
    x_train = encode(train_df)
    y_train = label_binarizer.fit_transform(train_df.category.values)
    x_test = encode(test_df)
    y_test = label_binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(
        Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                  weights=[char_embedding()], trainable=False))
    model.add(
        Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                      activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(
        Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    # Four identical narrow convolutions before the final pooling step.
    for _ in range(4):
        model.add(
            Convolution1D(nb_filter=256, filter_length=3,
                          border_mode='valid', activation='relu',
                          subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    print(model.summary())

    model.fit(x_train, y_train, batch_size=64, nb_epoch=5,
              validation_data=[x_test, y_test])

    # Column index of the set bit in each indicator row = true class index.
    print(
        accuracy_score(
            np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
# find pairs for samplesIMG in samplesCSV samples_paired = [] for i in range(samplesIMG.shape[0]): for j in range(samplesCSV.shape[0]): if namesCSV[j] == namesIMG[i]: samples_paired.append(samplesCSV[j]) samplesCSV = np.array(samples_paired) samplesIMG = np.expand_dims(samplesIMG, axis=3) print("Paired") print("Samples IMG: {}".format(len(samplesIMG))) print("Samples CSV: {}".format(len(samplesCSV))) # one-hot encoding lb = LabelBinarizer() labels = lb.fit_transform(labels) numClasses = labels.shape[1] inputShape = (108, 192, 1) #samplesIMG.shape #model for images cnnmodel = Sequential() cnnmodel.add(Conv2D(16, (3, 3), padding="same", input_shape=inputShape)) cnnmodel.add(Activation("relu")) cnnmodel.add(MaxPooling2D(pool_size=(2, 2))) cnnmodel.add(Conv2D(32, (3, 3), padding="same")) cnnmodel.add(Activation("relu")) cnnmodel.add(MaxPooling2D(pool_size=(2, 2))) cnnmodel.add(Dropout(0.25)) cnnmodel.add(Flatten())
def main():
    """Train an MLP on ``datasetA_3c.csv`` and report metrics on the same data.

    The first CSV column is used as the class label and the remaining columns
    as features. The model is evaluated on the data it was trained on (the
    train/test split is disabled below), so the printed metrics describe the
    fit, not generalisation. Finally the training loss/accuracy curves are
    plotted.
    """
    # load data
    file = "datasetA_3c.csv"
    dataframe = pandas.read_csv(file)
    dataset = dataframe.values
    samples = dataset[:, 1:]
    labels = dataset[:, 0]
    samples = np.array(samples)
    labels = np.array(labels)
    labels = labels.astype(str)
    print("Class distribution:")
    print(Counter(labels))

    ### choose k best attributes
    # from sklearn.feature_selection.univariate_selection import SelectKBest
    # newSamples = SelectKBest(k=100).fit_transform(samples, labels)
    # print(newSamples.shape)
    # samples = newSamples

    ### Calculate weights for unbalanced classes
    # from sklearn.utils import class_weight
    # d_class_weights = None
    # class_weights = class_weight.compute_class_weight('balanced',np.unique(labels),labels)
    # print("Class weights:")
    # print(class_weights)
    # d_class_weights = dict(enumerate(class_weights))

    ### Normalize samples
    # from sklearn.preprocessing.data import normalize
    # normalize(samples)

    ## convert to one-hot encoding
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print ("Classes: {}".format(classesNum))

    # Training and test data are deliberately the same set here.
    trainSamples = samples
    trainLabels = labels
    testSamples = samples
    testLabels = labels

    ### Division into training and test samples
    # from sklearn.model_selection._split import train_test_split
    # (trainSamples, testSamples, trainLabels, testLabels) = train_test_split(samples, labels, test_size=0.25, random_state=42)

    model = Sequential()
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(250, activation='sigmoid'))
    model.add(Dense(classesNum, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 50
    BATCH = 50
    H = model.fit(trainSamples, trainLabels, batch_size=BATCH, epochs=EPOCHS
                  #,class_weight=d_class_weights
                  #,validation_data=(testSamples,testLabels)
                  #,validation_split=0.1
                  )

    mlpResults = model.predict(testSamples)

    # Class indices are computed once and reused by every metric below.
    trueClasses = testLabels.argmax(axis=1)
    predictedClasses = mlpResults.argmax(axis=1)
    print(confusion_matrix(trueClasses, predictedClasses))
    print(classification_report(trueClasses, predictedClasses,
                                target_names=lb.classes_))
    print("MLP Accuracy: {:.2f}".format(
        accuracy_score(trueClasses, predictedClasses)))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(trueClasses, predictedClasses)))

    # The history key for the 'accuracy' metric is "acc" on older Keras and
    # "accuracy" on newer releases; a hard-coded "acc" raises KeyError there.
    history = H.history
    accKey = "acc" if "acc" in history else "accuracy"

    N = np.arange(0, EPOCHS)
    plt.style.use("ggplot")
    plt.figure()
    plt.plot(N, history["loss"], label="train_loss")
    plt.plot(N, history[accKey], label="train_acc")
    #plt.plot(N, H.history["val_loss"], label="val_loss")
    #plt.plot(N, H.history["val_acc"], label="val_acc")
    plt.title("Training Loss and Accuracy")
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend()
    plt.show()
def prepocess_label(y):
    """Binarize ``y`` with a freshly fitted ``LabelBinarizer``."""
    binarizer = LabelBinarizer()
    return binarizer.fit_transform(y)
def main():
    """Compare a decision tree and a 1D CNN on sequences loaded from ``data``.

    Samples are cut into sequences of length 100 and the labels binarized.
    A decision tree is trained on the flattened sequences and a CNN on the
    unflattened ones, both with the same 75/25 split settings. Confusion
    matrix, classification report, accuracy and Cohen's kappa are printed
    for each model. The function then waits for Enter before returning.
    """
    print("Loading samples and labels")
    samples, labels, _ = load_files("data")
    print("Loaded {} samples".format(samples.shape[0]))

    sequence_dim = 100
    print("Converting to sequences of length {}".format(sequence_dim))
    samples, labels = make_sequences(samples, labels, sequence_dim)
    print("Number of samples from sequences: {}".format(samples.shape[0]))

    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)

    # ---- Decision tree on flattened samples -------------------------------
    flatSamples = samples.reshape(samples.shape[0], -1)
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        flatSamples, labels, test_size=0.25, random_state=42)

    print("=" * 20)
    print("Building DecisionTree model")
    model = DecisionTreeClassifier()
    model.fit(trainSamples, trainLabels)
    treeResults = model.predict(testSamples)

    # Indicator rows -> class indices, computed once for all metrics.
    expected = testLabels.argmax(axis=1)
    predicted = treeResults.argmax(axis=1)
    print(confusion_matrix(expected, predicted))
    print(classification_report(expected, predicted))
    treeAcc = accuracy_score(expected, predicted)
    print("Accuracy Tree: {:.2f}".format(treeAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, predicted)))

    # ---- CNN on the sequences ---------------------------------------------
    print("=" * 20)
    print("Building CNN model")
    trainSamples, testSamples, trainLabels, testLabels = train_test_split(
        samples, labels, test_size=0.25, random_state=42)
    inputShape = (samples.shape[1], samples.shape[2])

    model = Sequential()
    model.add(Conv1D(32, 10, padding="same", input_shape=inputShape))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Conv1D(64, 10, padding="same"))
    model.add(Activation("relu"))
    model.add(BatchNormalization())
    model.add(Dropout(0.2))
    model.add(Conv1D(128, 10, padding="same"))
    model.add(Activation("relu"))
    model.add(Dropout(0.2))
    model.add(Flatten(input_shape=inputShape))
    model.add(Dense(128, activation='sigmoid'))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer="adam",
                  metrics=['accuracy'])

    EPOCHS = 10
    BATCH = 128
    model.fit(trainSamples, trainLabels, batch_size=BATCH, epochs=EPOCHS,
              validation_data=(testSamples, testLabels))
    cnnResults = model.predict(testSamples)

    expected = testLabels.argmax(axis=1)
    predicted = cnnResults.argmax(axis=1)
    print(confusion_matrix(expected, predicted))
    print(classification_report(expected, predicted,
                                target_names=lb.classes_))
    print("CNN Accuracy: {:.2f}".format(accuracy_score(expected, predicted)))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, predicted)))

    # Block until Enter is pressed.
    input("")
def __init__(self):
    """Create the private label encoder, configured for dense output."""
    dense_encoder = LabelBinarizer(sparse_output=False)
    self.__encoder = dense_encoder
def main():
    """Train and evaluate the CNN from ``cnn_model`` on ``radio_img`` images.

    A trailing channel axis is added to the loaded samples, balanced class
    weights are computed from the raw labels, and the labels are binarized.
    The data is split 75/25 and the model trained for 20 epochs. Confusion
    matrix, classification report, accuracy and Cohen's kappa for the test
    split are printed. The function then waits for Enter before returning.
    """
    samples, labels, _ = loader.load_img("radio_img")

    # Add the fourth dimension (color) as a trailing axis. The original
    # ``axis=4`` is out of range for 3-D input and errors on current NumPy;
    # ``axis=-1`` appends the axis, which is what the old lenient behaviour
    # amounted to.
    samples = np.expand_dims(samples, axis=-1)
    print("shape = {}".format(samples.shape))
    inputShape = (samples.shape[1], samples.shape[2], samples.shape[3])
    print("inputShape = {}".format(inputShape))

    # weights
    # ``classes`` and ``y`` are passed by keyword: they are keyword-only in
    # current scikit-learn and keywords are accepted by older releases too.
    class_weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(labels), y=labels)
    # Index i corresponds to the i-th entry of np.unique(labels).
    d_class_weights = dict(enumerate(class_weights))
    print("weights {}".format(d_class_weights))

    # one-hot encoding
    lb = LabelBinarizer()
    labels = lb.fit_transform(labels)
    classesNum = labels.shape[1]
    print("Classes: {}".format(classesNum))

    # split to training and test
    (trainSamples, testSamples, trainLabels,
     testLabels) = train_test_split(samples, labels, test_size=0.25,
                                    random_state=42)

    model = cnn_model(inputShape, classesNum)

    ## checkpoints
    # checkpt1 = ModelCheckpoint(filepath='model.{epoch:02d}-{val_loss:.2f}.h5', save_best_only=True)
    # checkpt2 = EarlyStopping(monitor='val_loss', patience=3)

    EPOCHS = 20
    BATCH = 50
    model.fit(
        trainSamples,
        trainLabels,
        batch_size=BATCH,
        epochs=EPOCHS,
        class_weight=d_class_weights,
        verbose=1,
        #callbacks = [checkpt1,checkpt2],
        validation_data=(testSamples, testLabels))

    cnnResults = model.predict(testSamples)

    # Indicator rows -> class indices, computed once for all metrics.
    expected = testLabels.argmax(axis=1)
    predicted = cnnResults.argmax(axis=1)
    print(confusion_matrix(expected, predicted))
    print(classification_report(expected, predicted))
    cnnAcc = accuracy_score(expected, predicted)
    print("Accuracy CNN: {:.2f}".format(cnnAcc))
    print("Cohen's Kappa {:.2f}".format(
        cohen_kappa_score(expected, predicted)))

    # Block until Enter is pressed.
    input("")
def dbpedia_convgemb(sample=None, n_procs=None):
    """Train a small word-level CNN with frozen pre-loaded embeddings.

    ``sample`` is forwarded to ``get_dbpedia_data(size=...)`` and selects how
    the test-set size is computed. ``n_procs`` defaults to ``cpu_count()``.
    The accuracy on the test split is printed; nothing is returned.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    test_size = 5000 * 14
    if sample:
        per_category = df.category.value_counts().values
        test_size = int(round(np.sum(5000 * per_category / 45000)))

    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(splitter))
    train_df, test_df = df.iloc[train_split], df.iloc[test_split]

    def to_padded_ids(docs):
        """Map known tokens to ``id + 1`` and pad each document to 100."""
        id_docs = [[vocab.token2id[tok] + 1
                    for tok in s
                    if tok in vocab.token2id]
                   for s in docs]
        return np.array(
            pad_sentences(id_docs, max_length=100, padding_word=0))

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)

    label_binarizer = LabelBinarizer()
    x_train = to_padded_ids(train_docs)
    y_train = label_binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'],
                                   flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = label_binarizer.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(
        Embedding(5001, 300, input_length=100, dropout=.2,
                  weights=[emb_weights], trainable=False))
    model.add(
        Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                      activation='relu', subsample_length=1))
    # Pool over the whole remaining sequence length.
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(x_train, y_train)

    true_classes = np.argwhere(y_test)[:, 1]
    print(accuracy_score(true_classes, model.predict_classes(x_test)))
def dbpedia_smallwordconv(sample=None, n_procs=None):
    """Train and evaluate a word-level 1D CNN on the DBpedia data.

    ``sample`` is forwarded to ``get_dbpedia_data(size=...)`` and selects how
    the test-set size is computed. ``n_procs`` defaults to ``cpu_count()``.
    The accuracy on the test split is printed; nothing is returned.
    """
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    test_size = 5000 * 14
    if sample:
        per_category = df.category.value_counts().values
        test_size = int(round(np.sum(5000 * per_category / 45000)))

    logging.info('creating train test split ...')
    splitter = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(splitter))
    train_df, test_df = df.iloc[train_split], df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')

    def to_padded_ids(docs):
        """Map known tokens to ``id + 1`` and pad each document to 100."""
        id_docs = [[vocab.token2id[tok] + 1
                    for tok in s
                    if tok in vocab.token2id]
                   for s in docs]
        return np.array(
            pad_sentences(id_docs, max_length=100, padding_word=0))

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)

    label_binarizer = LabelBinarizer()
    x_train = to_padded_ids(train_docs)
    y_train = label_binarizer.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'],
                                   flatten=True)
    x_test = to_padded_ids(test_docs)
    y_test = label_binarizer.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100))
    model.add(
        Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(
        Convolution1D(nb_filter=300, filter_length=7, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(
        Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(
        Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(
        Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(
        Convolution1D(nb_filter=300, filter_length=3, border_mode='valid',
                      activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3, stride=1))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])

    model.fit(x_train, y_train, batch_size=32, nb_epoch=5,
              validation_data=[x_test, y_test])

    true_classes = np.argwhere(y_test)[:, 1]
    print(accuracy_score(true_classes, model.predict_classes(x_test)))
def test_label_binarizer_multilabel_unlabeled():
    """Check that LabelBinarizer can handle an unlabeled sample"""
    binarizer = LabelBinarizer()
    # The last sample carries no label and must map to an all-zero row.
    label_lists = [[1, 2], [1], []]
    want = np.array([
        [1, 1],
        [1, 0],
        [0, 0],
    ])
    encoded = binarizer.fit_transform(label_lists)
    assert_array_equal(encoded, want)
def __init__(self, *args, **kwargs):
    """Build the wrapped ``LabelBinarizer``, forwarding every argument."""
    wrapped = LabelBinarizer(*args, **kwargs)
    self.encoder = wrapped