def test_dataset_works(self):
    data_home = dt.get_data_home(data_home=None, subdirectory='test')
    for set_name, variant in random.sample(
            ALL_SET_TEST_CASES, NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST):
        X, y, feature_names, label_names = dt.load_dataset(
            set_name=set_name, variant=variant, data_home=data_home)
        self.assertEqual(len(X.shape), 2)
        self.assertEqual(len(y.shape), 2)
        self.assertEqual(len(feature_names), X.shape[1])
        self.assertEqual(len(label_names), y.shape[1])
        self.assertEqual(X.shape[0], y.shape[0])
    dt.clear_data_home(data_home)

def test_dataset_works(self):
    data_home = dt.get_data_home(data_home=None, subdirectory='test')
    for set_name, variant in random.sample(
            ALL_SET_TEST_CASES, NUMBER_OF_SETS_TO_DOWNLOAD_IN_TEST):
        # this variant of load_dataset returns a dict rather than a tuple
        data_dump = dt.load_dataset(set_name=set_name, variant=variant,
                                    data_home=data_home)
        self.assertIsInstance(data_dump, dict)
        self.assertIn('X', data_dump)
        self.assertIn('y', data_dump)
        self.assertEqual(len(data_dump['X'].shape), 2)
        self.assertEqual(len(data_dump['y'].shape), 2)
    dt.clear_data_home(data_home)

# Fragment: the body of the CCA_Loss call, computing the embedding-alignment
# objective on a batch of feature/label embeddings.
num_instance = emb_feature.shape[0]
# c1 aligns the two embeddings; c2/c3 push each embedding's Gram matrix
# towards the identity (an orthonormality regularisation).
c1 = emb_feature - emb_label
c2 = tf.linalg.matmul(emb_feature, emb_feature, transpose_b=True) - tf.eye(num_instance)
c3 = tf.linalg.matmul(emb_label, emb_label, transpose_b=True) - tf.eye(num_instance)
loss = tf.linalg.trace(tf.matmul(c1, c1, transpose_a=True)) \
    + self.regularization_factor * tf.linalg.trace(
        tf.matmul(c2, c2, transpose_a=True) + tf.matmul(c3, c3, transpose_a=True))
return loss

#%%
X, y, feature_names, label_names = load_dataset('tmc2007_500', 'train')

#%%
c2ae = C2AE([512, 512], [512], [22])
cl = CCA_Loss()
max_iter = 1000
# optimizer = K.optimizers.Adam(learning_rate=0.001)
optimizer = K.optimizers.RMSprop(learning_rate=0.001)

#%%
tmc2007 = tf.data.Dataset.from_tensor_slices(
    (np.array(X.todense(), dtype=np.float32),
     np.array(y.todense(), dtype=np.float32)))

#%%
BATCH_SIZE = 1000
tmc_data = tmc2007.shuffle(buffer_size=15000)
tmc_data_batch = tmc_data.batch(BATCH_SIZE)
feature, label = next(iter(tmc_data_batch))
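#%%
# Sketch of a training step driving the loss above; not part of the original
# snippet. It assumes C2AE exposes `feature_encoder` and `label_encoder`
# sub-networks (hypothetical names) producing the embeddings that CCA_Loss
# compares, and that `cl` is callable on those two embeddings; adapt the
# calls to the real C2AE / CCA_Loss interfaces.
@tf.function
def train_step(feature, label):
    with tf.GradientTape() as tape:
        emb_feature = c2ae.feature_encoder(feature)  # hypothetical attribute
        emb_label = c2ae.label_encoder(label)        # hypothetical attribute
        loss = cl(emb_feature, emb_label)
    grads = tape.gradient(loss, c2ae.trainable_variables)
    optimizer.apply_gradients(zip(grads, c2ae.trainable_variables))
    return loss

for step, (feature, label) in enumerate(tmc_data_batch.repeat().take(max_iter)):
    loss = train_step(feature, label)
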
def splitted_tmc(self, N, Km, Ksm, shuffle=False, verbose=True, focus="pref"):
    """Split tmc2007_500 into N (context, preference, response) shards."""
    X_tr, y_tr, feature_names, label_names = load_dataset('tmc2007_500', 'train')
    X_te, y_te, _, _ = load_dataset('tmc2007_500', 'test')
    X_tr = X_tr.todense()
    X_te = X_te.todense()
    y_tr = y_tr.todense()
    y_te = y_te.todense()
    if verbose:
        print("Shape of Train Data: ", X_tr.shape)
        print("Shape of Test Data: ", X_te.shape)
        print("Shape of Train Labels: ", y_tr.shape)
        print("Shape of Test Labels: ", y_te.shape)
    X = np.concatenate((X_tr, X_te), axis=0)
    y = np.concatenate((y_tr, y_te), axis=0)
    most_feat, least_feat = self.partition_features(Km, X, y)
    if verbose:
        print("\nMost Relevant ", Km, " Features:", most_feat)
        # print("\nLeast Relevant ", X.shape[1] - Km, " Features:", least_feat)
    # mask the top-Km features and rank again to find the second-most-relevant set
    red_X = X.copy()
    red_X[:, most_feat] = 1
    red_most_feat, red_least_feat = self.partition_features(Ksm, red_X, y)
    if verbose:
        print("\nSecond Most Relevant ", Ksm, " Features:", red_most_feat)
        # print("\nLeast Relevant ", X.shape[1] - Ksm, " Features:", red_least_feat)
    if focus == "pref":
        pref = X[:, most_feat]
        context = X[:, red_most_feat]
    elif focus == "context":
        context = X[:, most_feat]
        pref = X[:, red_most_feat]
    if verbose:
        print("\n Preferences Shape: ", pref.shape)
        print("\n Contexts Shape: ", context.shape)
    if shuffle:
        c = list(zip(context, pref, y))
        np.random.shuffle(c)
        context, pref, y = zip(*c)
        context = np.array(context).squeeze(axis=1)
        pref = np.array(pref).squeeze(axis=1)
        y = np.array(y).squeeze(axis=1)
    else:
        context = np.array(context)
        pref = np.array(pref)
        y = np.array(y)
    if verbose:
        print("\n Contexts Shape: ", context.shape)
        print("\n Preferences Shape: ", pref.shape)
        print("\n Actions Shape: ", y.shape)
    if verbose:
        plt.rcParams["figure.figsize"] = 16, 4
        plt.bar(range(context.shape[1]), np.asarray(context.sum(axis=0)),
                label="Contexts")
        plt.legend(prop={'size': 20})
        plt.show()
        plt.bar(range(pref.shape[1]), np.asarray(pref.sum(axis=0)),
                label="Preferences")
        plt.legend(prop={'size': 20})
        plt.show()
        plt.bar(range(y.shape[1]), np.asarray(y.sum(axis=0)), label="Responses")
        plt.legend(prop={'size': 20})
        plt.show()
    sp_context = np.array_split(context, N)
    sp_pref = np.array_split(pref, N)
    sp_response = np.array_split(y, N)
    return sp_context, sp_pref, sp_response

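# `partition_features` is used above and below but not shown in these
# snippets; this is a purely illustrative sketch of one plausible
# implementation (assumes relevance is measured as mutual information
# between each feature and the label columns, summed over labels):
from sklearn.feature_selection import mutual_info_classif

def partition_features(self, K, X, y):
    X, y = np.asarray(X), np.asarray(y)
    scores = sum(mutual_info_classif(X, y[:, j]) for j in range(y.shape[1]))
    order = np.argsort(scores)[::-1]
    # indices of the K most relevant features, then the remainder
    return order[:K].tolist(), order[K:].tolist()
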
def splitted_mediamill(self, N, red_K, shuffle=False, verbose=True, focus="pref"):
    """Split mediamill into N (context, preference, response) shards."""
    X_tr, y_tr, feature_names, label_names = load_dataset('mediamill', 'train')
    X_te, y_te, _, _ = load_dataset('mediamill', 'test')
    X_tr = X_tr.todense()
    X_te = X_te.todense()
    y_tr = y_tr.todense()
    y_te = y_te.todense()
    if verbose:
        print("Shape of Train Data: ", X_tr.shape)
        print("Shape of Test Data: ", X_te.shape)
        print("Shape of Train Labels: ", y_tr.shape)
        print("Shape of Test Labels: ", y_te.shape)
    X = np.concatenate((X_tr, X_te), axis=0)
    y = np.concatenate((y_tr, y_te), axis=0)
    # keep only labels that occur in more than 100 instances
    y = y[:, np.asarray(y.sum(axis=0) > 100)[0]]
    if verbose:
        print("Shape of All Data:", X.shape)
        print("Shape of All Labels:", y.shape)
    K = y.shape[1]
    most_feat, least_feat = self.partition_features(K, X, y)
    if verbose:
        print("\nMost Relevant ", K, " Features:", most_feat)
        print("\nLeast Relevant ", X.shape[1] - K, " Features:", least_feat)
    if focus == "pref":
        # binarise features at 0.45, then filter columns by support
        pref = (X[:, most_feat] > 0.45).astype(float)
        context = (X[:, least_feat] > 0.45).astype(float)
        if verbose:
            print("\n Preferences Shape: ", pref.shape)
            print("\n Contexts Shape: ", context.shape)
        pref = pref[:, np.asarray(pref.sum(axis=0) > 2400)[0]]
        y = y[:, np.asarray(y.sum(axis=0) > 450)[0]]
        context = context[:, np.asarray(
            np.logical_and(context.sum(axis=0) > 2000,
                           context.sum(axis=0) < 40000))[0]]
    elif focus == "context":
        context = (X[:, most_feat] > 0.45).astype(float)
        pref = (X[:, least_feat] > 0.45).astype(float)
        if verbose:
            print("\n Preferences Shape: ", pref.shape)
            print("\n Contexts Shape: ", context.shape)
        pref = pref[:, np.asarray(pref.sum(axis=0) > 500)[0]]
        y = y[:, np.asarray(y.sum(axis=0) > 450)[0]]
        context = context[:, np.asarray(
            np.logical_and(context.sum(axis=0) > 9999,
                           context.sum(axis=0) < 29000))[0]]
    if shuffle:
        c = list(zip(context, pref, y))
        np.random.shuffle(c)
        context, pref, y = zip(*c)
        context = np.array(context).squeeze(axis=1)
        pref = np.array(pref).squeeze(axis=1)
        y = np.array(y).squeeze(axis=1)
    else:
        context = np.array(context)
        pref = np.array(pref)
        y = np.array(y)
    if verbose:
        print("\n Contexts Shape: ", context.shape)
        print("\n Preferences Shape: ", pref.shape)
        print("\n Actions Shape: ", y.shape)
    # cluster similar labels, then merge each cluster into one response column
    matrix_clusterer = MatrixLabelSpaceClusterer(clusterer=KMeans(n_clusters=red_K))
    similar_ys = matrix_clusterer.fit_predict(context, y)
    if verbose:
        print("Similar Labels: ", similar_ys)
    y_red = np.zeros((y.shape[0], red_K))
    for k, lbs in enumerate(similar_ys):
        for lb in lbs:
            y_red[:, k] += y[:, lb]
    y_red = (y_red >= 1).astype(float)
    if verbose:
        plt.rcParams["figure.figsize"] = 16, 4
        plt.bar(range(context.shape[1]), np.asarray(context.sum(axis=0)),
                label="Contexts")
        plt.legend(prop={'size': 20})
        plt.show()
        plt.bar(range(pref.shape[1]), np.asarray(pref.sum(axis=0)),
                label="Preferences")
        plt.legend(prop={'size': 20})
        plt.show()
        plt.bar(range(y_red.shape[1]), np.asarray(y_red.sum(axis=0)),
                label="Responses")
        plt.legend(prop={'size': 20})
        plt.show()
    sp_context = np.array_split(context, N)
    sp_pref = np.array_split(pref, N)
    sp_response = np.array_split(y_red, N)
    return sp_context, sp_pref, sp_response

# Adapted from http://scikit.ml/multilabeldnn.html
import numpy
import sklearn.metrics as metrics
from skmultilearn.dataset import load_dataset
from keras.models import Sequential
from keras.layers import Dense
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import LabelPowerset
from skmultilearn.ext import Keras
from sklearn.metrics import accuracy_score

X_train, y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')


def create_model_single_class(input_dim, output_dim):
    # create model: per-label sigmoid outputs for Binary Relevance
    model = Sequential()
    model.add(Dense(12, input_dim=input_dim, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(output_dim, activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model


def create_model_multiclass(input_dim, output_dim):
    # Body completed following the cited tutorial's pattern: softmax over
    # label combinations for use with LabelPowerset.
    model = Sequential()
    model.add(Dense(8, input_dim=input_dim, activation='relu'))
    model.add(Dense(output_dim, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

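# Hedged usage sketch following the tutorial cited at the top
# (http://scikit.ml/multilabeldnn.html): wrap the single-class builder in
# skmultilearn's Keras adapter under Binary Relevance.
KERAS_PARAMS = dict(epochs=10, batch_size=100, verbose=0)
clf = BinaryRelevance(classifier=Keras(create_model_single_class, False, KERAS_PARAMS),
                      require_dense=[True, True])
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print(accuracy_score(y_test, y_pred))
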
import jax
import numpy as np
from sklearn.preprocessing import StandardScaler
from skmultilearn.dataset import load_dataset

from MCRegressor import MCRegressor
from StochasticILE import StochasticILEMLClassifier

dataset = "emotions"

# load dataset
X_train, y_train, _, _ = load_dataset(dataset, "undivided")

# need to transform to dense arrays
y_train = y_train.toarray()
X_train = X_train.toarray()

# normalise data
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)

# add an additional column of 1s to the data for the bias
X_train = np.c_[X_train, np.ones(X_train.shape[0])]

# Bound parameters (these are checked before fitting)
alpha = 0.25
t = 0.5
kappa = 22

# training parameters

from skmultilearn.problem_transform import LabelPowerset, BinaryRelevance, ClassifierChain
from skmultilearn.ensemble import RakelD, RakelO
from skmultilearn.adapt import MLkNN
from skmultilearn.dataset import load_dataset
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import multilabel_confusion_matrix, accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold

## Loading the emotions data
Xtrain, Ytrain, nom_variable, nom_label = load_dataset('emotions', 'train')  # train data
Xtest, Ytest, _, _ = load_dataset('emotions', 'test')  # test data
## The data are described at the following link: http://scikit.ml/tutorial.html
# Xtrain, Xtest, Ytrain, Ytest are loaded as scipy sparse matrices
'''We could work directly with sparse matrices in sklearn and skmultilearn'''

## Converting the sparse matrices to inspect the data
datatrain = pd.DataFrame(Xtrain.toarray(), columns=nom_variable)
print(datatrain.shape)  # (391, 72)
# print(datatrain.head(5))
# print(datatrain.info())
# print(datatrain.describe())
datatrainy = pd.DataFrame(Ytrain.toarray(), columns=nom_label)
print(datatrainy.shape)  # 6 outputs, hence 6 non-exclusive labels

## data preprocessing
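# The snippet breaks off at the preprocessing step; below is a plausible
# continuation given the imports above (a sketch, not the original code):
# drop near-constant features, then fit a Binary Relevance model with an
# SVC base classifier and score it on the test split.
selector = VarianceThreshold(threshold=0.01)
Xtrain_red = selector.fit_transform(Xtrain.toarray())
Xtest_red = selector.transform(Xtest.toarray())
clf = BinaryRelevance(classifier=SVC(), require_dense=[True, True])
clf.fit(Xtrain_red, Ytrain)
print(accuracy_score(Ytest, clf.predict(Xtest_red)))
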
def import_testing_set(dataset_name):
    dataset = load_dataset(dataset_name, "test")
    return cons_multilabel_dataset_mock(dataset)

def load_given_dataset(dataset):
    if dataset.lower() == "20ng":
        return load_custom_dataset("20ng")
    if dataset.lower() == "test":
        return load_custom_dataset("test")
    return load_dataset(dataset, 'undivided')

def load_given_dataset(d):
    if d.lower() == "20ng":
        return load_custom_dataset(d.lower())
    return load_dataset(d, 'undivided')

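# Usage note: skmultilearn's load_dataset returns a 4-tuple
# (X, y, feature_names, label_names), so callers of either variant of
# load_given_dataset unpack it the same way (load_custom_dataset is
# assumed to mirror that contract).
X, y, feature_names, label_names = load_given_dataset("emotions")
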
def get_dataset(dataset, ranked_features=None, reduce_dim=None, num_features=None):
    # Load a multi-label dataset from https://www.openml.org/d/40597
    # X, Y = fetch_openml('yeast', version=4, return_X_y=True)
    if dataset == "yeast":
        # NOTE: fetch_mldata was removed in scikit-learn 0.22; on modern
        # versions use fetch_openml('yeast', version=4) as in the comment
        # above and adapt the target handling accordingly.
        data = fetch_mldata("yeast")
        X = data["data"]
        Y = data["target"].transpose().toarray()
        train_input, test_input, train_labels, test_labels = train_test_split(
            X, Y, test_size=0.2, random_state=0)
        if reduce_dim is not None and reduce_dim < train_input.shape[1]:
            from sklearn.decomposition import PCA

            pca = PCA(n_components=reduce_dim)
            pca.fit(train_input)
            train_input = pca.transform(train_input)
            # project the test set with the PCA fitted on the training set
            # (refitting on the test data would leak information)
            test_input = pca.transform(test_input)
        return train_input, test_input, train_labels, test_labels
    elif dataset == "emotions":
        train_input, train_labels, feature_names, label_names = load_dataset(
            "emotions", "train")
        test_input, test_labels, _, _ = load_dataset("emotions", "test")
        fimp = Fimp(f_name=str(Path(os.path.dirname(__file__)).parent)
                    + "/data/multilabel/emotions.fimp")
        # note: computed but never used below
        indices = np.asarray(fimp.get_attr_indices())[range(-1, 2)] - 1
        return (
            train_input.toarray(),
            test_input.toarray(),
            train_labels.toarray(),
            test_labels.toarray(),
        )
    elif dataset == "xor":
        dataIn, dataOut = get_xor()
        return train_test_split(dataIn, dataOut, random_state=17)
    elif dataset == "moons":
        dataIn, dataOut = make_moons(n_samples=20000, shuffle=True, noise=0.1,
                                     random_state=17)
        dataOut = np.vstack((dataOut, dataOut, dataOut)).T
        return train_test_split(dataIn, dataOut, random_state=17)
    elif dataset == "sys_multilabel":
        dataIn, dataOut = get_sy_multilabel()
        return train_test_split(dataIn, dataOut, random_state=17)
    else:
        train_input, train_labels, test_input, test_labels = get_categorical_data(
            dataset)
        # np.float / np.int were removed in NumPy 1.24; use the builtins
        train_input = np.asarray(train_input, dtype=float)
        train_labels = np.asarray(train_labels, dtype=int)
        test_input = np.asarray(test_input, dtype=float)
        test_labels = np.asarray(test_labels, dtype=int)
        return train_input, test_input, train_labels, test_labels
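
# Usage sketch grounded in the branches above: every path returns
# (train inputs, test inputs, train labels, test labels) as dense arrays.
train_X, test_X, train_y, test_y = get_dataset("emotions")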