def train_test_split(self, feature_matrix, target_matrix, test_size=0.2):
    '''
    Split features/targets into train and test sets with a stratified
    shuffle split, so label combinations keep equal proportions in both
    sets. The multi-label target matrix is first collapsed to a
    multi-class vector via LabelPowerset so stratification can operate
    on it.

    Input:
        feature_matrix : feature matrix with rare classes filtered out
        target_matrix  : target matrix with rare classes filtered out
        test_size      : fraction held out for testing (default 0.2)
    Output:
        train_x, train_y, test_x, test_y
    '''
    lp = LabelPowerset()
    # BUG FIX: test_size was hardcoded to 0.2 here, silently ignoring
    # the caller-supplied parameter; pass the parameter through instead.
    sss_level_1 = StratifiedShuffleSplit(lp.transform(target_matrix),
                                         n_iter=1,
                                         test_size=test_size,
                                         random_state=123)
    # n_iter=1, so this loop body executes exactly once.
    for train_ix, test_ix in sss_level_1:
        train_x = feature_matrix.iloc[train_ix, :]
        train_y = target_matrix.iloc[train_ix, :]
        test_x = feature_matrix.iloc[test_ix, :]
        test_y = target_matrix.iloc[test_ix, :]
    return train_x, train_y, test_x, test_y
class MyLabelPowerSetFeatureSelect():
    """Feature selection for multi-label data via the LabelPowerset transform.

    The multi-label target is collapsed to a multi-class problem
    (LabelPowerset over a Gaussian naive Bayes base classifier); the
    features are then ranked with chi-squared SelectKBest against that
    multi-class target.
    """

    def __init__(self, k=2):
        # GENERALIZED: k was hardcoded to 2 inside fit(); it is now a
        # constructor parameter with the same default, so existing
        # callers are unaffected.
        self.k = k

    def fit(self, X, y):
        # Gaussian naive Bayes is used as the base classifier.
        self.LabelPowerSetObject = LabelPowerset(GaussianNB())
        # Fit the powerset model on the raw multi-label data.
        self.LabelPowerSetObject.fit(X, y)
        # Multi-label -> multi-class transformed targets.
        y_transformed = self.LabelPowerSetObject.transform(y)
        # Instantiate the SelectKBest object and select the k best
        # features by chi-squared score.
        self.X_new = SelectKBest(chi2, k=self.k)
        self.X_transformed = self.X_new.fit_transform(X, y_transformed)
        # Remember the indices of the kept attributes for transform().
        self.selected_attributes_indices = self.X_new.get_support(indices=True)
        return self

    def transform(self, X):
        # Keep only the feature columns chosen during fit().
        return X[:, self.selected_attributes_indices]

    def predict(self, X):
        return self.LabelPowerSetObject.predict(X)

    def predict_proba(self, X):
        return self.LabelPowerSetObject.predict_proba(X)
def filter_rare_classes(self, feature_matrix, target_matrix):
    '''
    Drop samples whose label combination occurs only once in the dataset.

    A stratified train/test split requires at least 2 instances per
    class, so label combinations (LabelPowerset classes) with a single
    occurrence are filtered out.

    Input:
        feature_matrix : matrix of features
        target_matrix  : matrix containing the target labels
    Output:
        feature_matrix, target_matrix : filtered
    '''
    lp = LabelPowerset()
    # Collapse the multi-label matrix to one multi-class id per sample.
    multi_class_target_labels = lp.transform(target_matrix)
    # Pairs of (class id, occurrence count).
    classes_vc = np.asarray(
        np.unique(multi_class_target_labels, return_counts=True)).T
    # Keep only classes observed more than once.
    class_to_keep = classes_vc[np.where(classes_vc[:, 1] > 1)][:, 0]
    # IMPROVED: vectorized membership test replaces the original Python
    # loop with a redundant `True if ... else False` ternary and an
    # O(n*m) `in` check per sample; produces the same boolean mask.
    mask = np.isin(multi_class_target_labels, class_to_keep)
    feature_matrix = feature_matrix[mask]
    target_matrix = target_matrix[mask]
    return feature_matrix, target_matrix
def multi_labelTo_multi_class(Y, model):
    """Collapse a multi-label target matrix into a multi-class vector.

    Returns a two-element list: the LabelPowerset wrapper around
    *model*, and the transformed (multi-class) targets. A
    one-dimensional Y is not multi-label and is returned unchanged.
    """
    # Guard: nothing to transform for a 1-D target.
    if Y.ndim == 1:
        print("This is not a multi-label problem!!!!!!")
        return Y
    # LabelPowerset supplies the transform() that performs the actual
    # multi-label -> multi-class conversion.
    transclf = LabelPowerset(classifier=model, require_dense=[False, True])
    return [transclf, transclf.transform(Y)]
def resampling_data(self, X, y):
    """Random-oversample a multi-label dataset.

    The multi-label target y is collapsed to multi-class via
    LabelPowerset, balanced with RandomOverSampler, then mapped back to
    the multi-label representation.

    Returns:
        (X_resampled, y_resampled)
    """
    lp = LabelPowerset()
    ros = RandomOverSampler(random_state=42)
    # Multi-label (ML) -> multi-class (MC) transformation.
    yt = lp.transform(y)
    # FIX: fit_sample() was deprecated and removed in imbalanced-learn
    # 0.8; fit_resample() is the supported API (already used elsewhere
    # in this codebase).
    X_resampled, y_resampled = ros.fit_resample(X, yt)
    # Invert the ML-MC transformation to recreate the ML set.
    y_resampled = lp.inverse_transform(y_resampled)
    return X_resampled, y_resampled
def load_ucmerced_dataset(
        data_dir='/home/rishabh/Downloads/multi-label-analysis-master'
                 '/src/graphcnn/setup/dataset'):
    """Load the UC Merced multi-label graph dataset from .mat files.

    GENERALIZED: the dataset directory was hardcoded in three absolute
    paths; it is now a parameter whose default preserves the original
    behavior.

    Returns:
        features, edges, labels, weight_mat, label_freq, index, classes
    """
    dataset = scipy.io.loadmat(data_dir + '/dataset1.mat')
    dataset = dataset['dataset1']
    edges = np.squeeze(dataset['edges'])  # adjacency matrix
    index = np.squeeze(dataset['index'])  # image index to keep track
    classes = np.squeeze(dataset['class'])  # image class number to keep track
    # Load features in which NaN values have been replaced.
    features = scipy.io.loadmat(data_dir + '/features.mat')
    features = features['features']
    features = features['val']
    features = features[0]
    # Sanity check: report any feature rows that still contain NaNs.
    for i in range(0, len(features)):
        if np.isnan(features[i]).any():
            print('features %d have NaN:' % i, np.isnan(features[i]).any())
    # Load the multi-label matrix; transpose to one row per sample.
    labels = scipy.io.loadmat(data_dir + '/LandUse_multilabels.mat')
    labels = labels['labels']
    labels = np.transpose(labels, (1, 0))
    # Class weights: inverse frequency of each LabelPowerset class,
    # assigned per-sample.
    lp = LabelPowerset()
    trans_labels = lp.transform(labels)
    unique, counts = np.unique(trans_labels, return_counts=True)
    class_freq = 1.0 / counts
    weight_mat = np.zeros((np.shape(trans_labels)))
    for i in range(len(weight_mat)):
        weight_mat[i] = class_freq[np.where(trans_labels[i] == unique)]
    # Label weights: relative frequency of each individual label.
    sum_labels = np.sum(labels, axis=0, dtype=np.float32)
    sum_tot = np.sum(sum_labels, dtype=np.float32)
    label_freq = np.true_divide(sum_labels, sum_tot)
    return features, edges, labels, weight_mat, label_freq, index, classes
def multiple_smote(X, y):
    """Oversample multi-label samples with SMOTE.

    (Original docstring, translated from Chinese: "oversample
    multi-label samples".) The multi-label target is collapsed to
    multi-class with LabelPowerset, SMOTE-resampled, then inverted
    back to a dense multi-label array.
    """
    y = np.array(y)
    lp = LabelPowerset()
    oversampler = SMOTE(k_neighbors=2)
    # Multi-label (ML) -> multi-class (MC) transformation.
    yt = lp.transform(y)
    X_resampled, y_resampled = oversampler.fit_resample(X, yt)
    # Invert the ML-MC transformation to recreate the ML set;
    # inverse_transform yields a sparse matrix, so densify it.
    y_resampled = lp.inverse_transform(y_resampled)
    return X_resampled, y_resampled.toarray()
def label_fscore(self, subset=False):
    """Return a dict mapping class support counts to per-class F-scores.

    With subset=True the stored true/pred lists are compared as whole
    label-sets (subset accuracy style); binarized labels are first
    collapsed to multi-class via LabelPowerset. With subset=False the
    per-label binarized representation is scored instead.
    """
    if subset:
        true_list, pred_list = self.true_list, self.pred_list
        if self._is_binarized:
            # Transform multilabel to multiclass for subset measurement:
            # concatenating first guarantees both halves share one
            # powerset encoding, then split back into true/pred halves.
            lp = LabelPowerset()
            transformed = lp.transform(np.concatenate((true_list, pred_list)))
            true_list, pred_list = np.split(transformed, 2)
    else:
        true_list, pred_list = self._binarized_labels()
    prec, rec, fscore, count = skm.precision_recall_fscore_support(
        true_list, pred_list)
    fscores_dict = {}
    for c, f in zip(count, fscore):
        # label = class_by_count[c]  # for when remove_multi_labeled is used
        # Skip classes with zero support.
        if c == 0:
            continue
        # NOTE(review): the dict is keyed by the class's support count
        # `c`, not by a class label — two classes with equal support
        # would overwrite each other. The commented-out class_by_count
        # lookup suggests this is a known simplification; confirm.
        fscores_dict[c] = f
    return fscores_dict
def resample_multilabel(data, target):
    """
    Apply LP-transformation to create balanced classes, then convert
    back to multilabel targets.

    HADS anxiety/depression severity codes (0/1/2) are converted to
    label strings, binarized, powerset-transformed, oversampled to
    'not majority', and mapped back.

    Returns:
        (data_resampled, multilabel string array (n, 2),
         LP-transformed resampled targets)
    """
    target = target.astype(int)

    def convert_hads_to_str(hads_data, hads_type):
        # Map HADS severity codes (0/1/2) to '<type>_<severity>' strings.
        hads_strs = []
        for val in hads_data:
            if val == 0:
                str_convert = '%s_normal' % hads_type
            elif val == 1:
                str_convert = '%s_borderline' % hads_type
            elif val == 2:
                str_convert = '%s_abnormal' % hads_type
            hads_strs.append(str_convert)
        return hads_strs

    def convert_str_to_hads(hads_tuples):
        # Inverse of convert_hads_to_str for (anxiety, depression) tuples.
        hads_array = np.ndarray(shape=(len(hads_tuples), 2))
        for t, tup in enumerate(hads_tuples):
            # FIX: loop variable renamed from `str`, which shadowed the
            # builtin.
            for s, label in enumerate(tup):
                if '_normal' in label:
                    hads_array[t, s] = 0
                elif '_borderline' in label:
                    hads_array[t, s] = 1
                elif '_abnormal' in label:
                    hads_array[t, s] = 2
        return hads_array

    anx_strings = convert_hads_to_str(target[:, 0], 'anxiety')
    dep_strings = convert_hads_to_str(target[:, 1], 'depression')
    multilabel_hads = [(anx_strings[n], dep_strings[n])
                       for n in range(len(anx_strings))]
    mlb = preprocessing.MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(multilabel_hads)
    from skmultilearn.problem_transform import LabelPowerset
    lp = LabelPowerset()
    target_lp_transformed = lp.transform(binary_matrix)
    # `seed` is expected to be defined at module level.
    resampler = RandomOverSampler(sampling_strategy='not majority',
                                  random_state=seed)
    # FIX: fit_sample() was removed in imbalanced-learn 0.8; use
    # fit_resample().
    data_resampled, target_lp_transformed_resampled = resampler.fit_resample(
        data, target_lp_transformed)
    binary_matrix_resampled = lp.inverse_transform(
        target_lp_transformed_resampled)
    target_resampled_multilabel = mlb.inverse_transform(
        binary_matrix_resampled)
    target_resampled_multilabel_array = convert_str_to_hads(
        target_resampled_multilabel)
    anx_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 0], 'anxiety')
    dep_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 1], 'depression')
    target_resampled_multilabel_df = pd.DataFrame()
    target_resampled_multilabel_df['anxiety'] = anx_resampled_to_str
    target_resampled_multilabel_df['depression'] = dep_resampled_to_str
    return (data_resampled, target_resampled_multilabel_df.values,
            target_lp_transformed_resampled)
def lp_framework(self):
    """Replace each cluster of label columns in self.universe with one
    LabelPowerset-transformed column, then reindex self.labels.

    Reads: self.clustering_labels, self.labels, self.universe,
    self.attributes. Mutates: self.universe, self.labels.
    (The many print() calls are debug output.)
    """
    # Group label indices by their cluster assignment: clusters[i] holds
    # the labels whose clustering_labels value equals i.
    clusters = []
    for i in range(len(np.unique(self.clustering_labels))):
        cluster = []
        for j in range(len(self.clustering_labels)):
            if i == self.clustering_labels[j]:
                cluster.append(self.labels[j])
        clusters.append(cluster)
    print(self.labels)
    print(clusters)
    transformed_labels = []
    # NOTE: "Tramsformer" typo kept — renaming is out of scope here.
    LabelPowersetTramsformer = LabelPowerset()
    for cluster in clusters:
        print(cluster)
        print(self.universe[:, cluster].shape)
        # print(self.universe[:, cluster])
        print(len(cluster))
        if len(cluster) == 1:
            # Single-label cluster: keep the raw column values.
            transformed_labels.append(
                list(self.universe[:, cluster].reshape(-1)))
        else:
            # Multi-label cluster: collapse the columns to one
            # powerset-class id per row. The repeated transform() calls
            # below are debug prints of the same computation.
            # print(self.universe[:, cluster])
            print(
                LabelPowersetTramsformer.transform(self.universe[:, cluster]))
            print(
                type(
                    LabelPowersetTramsformer.transform(
                        self.universe[:, cluster]).shape))
            print(
                list(
                    LabelPowersetTramsformer.transform(
                        self.universe[:, cluster])))
            tmp = list(
                LabelPowersetTramsformer.transform(self.universe[:, cluster]))
            print(
                type(
                    list(
                        LabelPowersetTramsformer.transform(
                            self.universe[:, cluster]))))
            transformed_labels.append(tmp)
    # print(len(transformed_labels))
    # print(len(transformed_labels[0]))
    # print(transformed_labels[0])
    # print(transformed_labels[0][0])
    # print(transformed_labels[0][0][0])
    print(np.array(transformed_labels).shape)
    print("universe shape:" + str(self.universe.shape))
    print(self.universe[:, self.attributes].shape)
    print(np.array(transformed_labels).T.shape)
    # Rebuild the universe: attribute columns followed by one column per
    # cluster (transformed_labels is per-cluster, hence the transpose).
    self.universe = np.append(self.universe[:, self.attributes],
                              np.array(transformed_labels).T,
                              axis=1)
    print(self.universe.shape)
    print(self.attributes)
    # New label indices: everything after the attribute columns.
    self.labels = [
        i for i in range(len(self.attributes), self.universe.shape[1])
    ]
    print(self.labels)
    pass
# ---- Prepare train/test text and label arrays ----
# Collapse runs of whitespace inside each document, then shape the text
# into an (n, 1) object array as the downstream example converter expects.
train_text_ori = train_df['alltext'].tolist()
train_text_ori = [' '.join(t.split()) for t in train_text_ori]
train_text_ori = np.array(train_text_ori, dtype=object)[:, np.newaxis]
# Label columns: everything between column 3 and the last column.
train_label_ori = train_df.values[:, 3:-1]
test_text = test_df['alltext'].tolist()
test_text = [' '.join(t.split()) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = test_df.values[:, 3:-1]

## Upsampling of data for each label
print('Before upsampling: ', train_text_ori.shape, train_label_ori.shape,
      test_text.shape, test_label.shape)
# LabelPowerset (`lp`) collapses the multi-label matrix to multi-class so
# the oversampler (`ros`) can balance it; the result is inverted back to
# a dense multi-label array.
yt = lp.transform(train_label_ori.astype('int'))
# FIX: fit_sample() was removed in imbalanced-learn 0.8; fit_resample()
# is the supported API.
train_text, y_resampled = ros.fit_resample(train_text_ori.astype('str'), yt)
train_label = lp.inverse_transform(y_resampled).toarray()
print('After Up-sampling', train_text.shape, train_label.shape,
      test_text.shape, test_label.shape)

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_label)
def load_ucmerced_dataset():
    # Load the pattern graph dataset from .mat files and derive class /
    # label weights.
    # NOTE(review): this variant uses Python 2 print statements; it will
    # not parse under Python 3.
    # Returns: features, edges, labels, weight_mat, label_freq, index,
    # classes.
    dataset = scipy.io.loadmat('dataset/newtotpatt.mat')  #dataset1.mat, totgra.mat
    dataset = dataset['newtotpatt']  #dataset1, totgra
    #dataset = np.transpose(dataset);
    edges = np.squeeze(dataset['edges'])  #adjacecny matrix
    index = np.squeeze(dataset['index'])  # image index to keep track
    classes = np.squeeze(dataset['class'])  #image class number to keep track
    #'''
    #loading features in which NaN values have been replaced
    features1 = scipy.io.loadmat('dataset/features1n.mat')  #normfeatures1.mat
    features1 = features1['features1n']
    features1 = np.squeeze(features1['val'])
    features2 = scipy.io.loadmat('dataset/features2n.mat')  #normfeatures1.mat
    features2 = features2['features2n']
    features2 = np.squeeze(features2['val'])
    features3 = scipy.io.loadmat('dataset/features3n.mat')  #normfeatures1.mat
    features3 = features3['features3n']
    features3 = np.squeeze(features3['val'])
    # Stack the three feature sets into one array.
    features = np.concatenate((features1, features2))
    features = np.concatenate((features, features3))
    #features = np.squeeze(dataset(features))
    #features = features['val']
    #features = features[0]
    print type(features[0])
    print features.shape
    # Sanity check: report any feature rows that still contain NaNs.
    for i in range(0, len(features)):
        if np.isnan(features[i]).any() == True:
            print('features %d have NaN:' % i, np.isnan(features[i]).any())
    # Dead code below is kept inert inside a bare string literal: an
    # alternative h5py-based loader for the same features.
    '''
    f = h5py.File('dataset/pattfeatures.mat','r')#dataset1.mat
    test = f['features'] #dataset1
    #dataset = np.transpose(dataset);
    test = np.squeeze(test['val']) #adjacecny matrix
    st=test
    for i in range(30400):
        st[i] = test[i]
    features = f[st]
    #index = np.squeeze(dataset['index'][:]) # image index to keep track
    #val = np.array(val)
    #index = np.array(index)
    #classes = np.squeeze(dataset['class']) #image class number to keep track
    #features = features['val']
    #features = test
    #features = f[features]
    #str1 = ''.join(features(i) for i in features[:])
    #features = np.array([features])
    #features[0] = np.array(features[0])
    print type(features[0])
    print features[0].shape
    #for i in range(0,len(features)):
    #    if np.isnan(features[i]).any() == True:
    #        print('features %d have NaN:'% i,np.isnan(features[i]).any())
    '''
    #loading multi-labels
    labels = scipy.io.loadmat('dataset/pattlabels.mat')  #LandUse_multilabels
    labels = labels['labels']
    #labels = np.transpose(labels)
    #loading positive-labels
    #pos = scipy.io.loadmat('dataset/Indexes.mat') ##labels
    #pos = pos['ind']
    #pos = np.transpose(pos)
    #neg = scipy.io.loadmat('dataset/Index-.mat') ##labels
    #neg = labels['neg_ind']
    #neg = np.transpose(neg)
    #load pairs
    #pairs = scipy.io.loadmat('dataset/pair.mat')
    # Calculating class weights: inverse frequency of each LabelPowerset
    # class, assigned per-sample.
    lp = LabelPowerset()
    trans_labels = lp.transform(labels)
    unique, counts = np.unique(trans_labels, return_counts=True)
    class_freq = 1.0 / counts
    weight_mat = np.zeros((np.shape(trans_labels)))
    for i in range(len(weight_mat)):
        weight_mat[i] = class_freq[np.where(trans_labels[i] == unique)]
    # Calculating label weights: relative frequency of each label.
    sum_labels = np.sum(labels, axis=0, dtype=np.float32)
    sum_tot = np.sum(sum_labels, dtype=np.float32)
    label_freq = np.true_divide(sum_labels, sum_tot)
    return features, edges, labels, weight_mat, label_freq, index, classes