Exemplo n.º 1
0
    def train_test_split(self, feature_matrix, target_matrix, test_size=0.2):
        '''
        Split features and targets into one stratified train/test partition.

        The multi-label targets are first encoded as label-powerset classes,
        and the shuffle split is stratified on those classes so that train
        and test keep the same proportion of label combinations.

        Input:
            feature_matrix : Feature matrix with rare classes filtered out
                             (indexed with ``.iloc``, so a pandas object)
            target_matrix : Target matrix with rare classes filtered out
                            (indexed with ``.iloc``, so a pandas object)
            test_size : share of the rows assigned to the test set,
                        default is 20%

        Output:
            train_x, train_y, test_x, test_y
        '''
        lp = LabelPowerset()
        # NOTE(review): this is the legacy splitter signature (labels passed
        # to the constructor, ``n_iter``); confirm which scikit-learn module
        # the file imports StratifiedShuffleSplit from before upgrading.
        sss_level_1 = StratifiedShuffleSplit(lp.transform(target_matrix),
                                             n_iter=1,
                                             test_size=test_size,
                                             random_state=123)

        # n_iter=1, so the splitter yields exactly one (train, test) pair.
        train_ix, test_ix = next(iter(sss_level_1))

        train_x = feature_matrix.iloc[train_ix, :]
        train_y = target_matrix.iloc[train_ix, :]

        test_x = feature_matrix.iloc[test_ix, :]
        test_y = target_matrix.iloc[test_ix, :]

        return train_x, train_y, test_x, test_y
class MyLabelPowerSetFeatureSelect():
    """Label-powerset classifier bundled with a chi2 feature selector.

    ``fit`` trains a ``LabelPowerset`` wrapper around Gaussian naive Bayes
    on the full feature matrix and, separately, fits a ``SelectKBest``
    (chi2, k=2) selector against the powerset-encoded targets.

    NOTE(review): the classifier is trained on all columns of ``X``, so
    ``predict`` / ``predict_proba`` expect the same columns, not the
    reduced matrix produced by ``transform``.
    """

    def fit(self, X, y):
        """Fit the classifier and the feature selector; return ``self``."""
        # Gaussian naive Bayes is the base classifier of the powerset model.
        self.LabelPowerSetObject = powerset_model = LabelPowerset(GaussianNB())
        powerset_model.fit(X, y)

        # Multi-label targets encoded as single multi-class ids.
        encoded_targets = powerset_model.transform(y)

        # Keep the two best-scoring features under the chi2 test.
        self.X_new = selector = SelectKBest(chi2, k=2)
        self.X_transformed = selector.fit_transform(X, encoded_targets)

        # Column positions of the retained features, reused by transform().
        self.selected_attributes_indices = selector.get_support(indices=True)

        return self

    def transform(self, X):
        """Return only the columns of ``X`` chosen during ``fit``."""
        kept_columns = self.selected_attributes_indices
        return X[:, kept_columns]

    def predict(self, X):
        """Delegate prediction to the fitted label-powerset classifier."""
        classifier = self.LabelPowerSetObject
        return classifier.predict(X)

    def predict_proba(self, X):
        """Delegate probability estimates to the fitted classifier."""
        classifier = self.LabelPowerSetObject
        return classifier.predict_proba(X)
Exemplo n.º 3
0
    def filter_rare_classes(self, feature_matrix, target_matrix):
        '''
        Drop rows whose label combination occurs only once in the dataset.

        A stratified train/test split needs at least 2 instances of every
        class, so label combinations (label-powerset classes) that appear
        a single time are filtered out of both matrices.

        Input :
            Feature Matrix : matrix of features
            Target Matrix : matrix containing the target labels
        Output :
            Feature Matrix : Filtered
            Target Matrix : Filtered

        '''
        lp = LabelPowerset()
        multi_class_target_labels = lp.transform(target_matrix)

        # Count how often each powerset class occurs and keep the ones
        # that occur more than once.
        classes, counts = np.unique(multi_class_target_labels,
                                    return_counts=True)
        class_to_keep = classes[counts > 1]

        # One vectorised membership test instead of a Python-level scan of
        # class_to_keep for every row; converted to a plain list of bools
        # so the indexing below behaves as it did before.
        mask = np.isin(multi_class_target_labels, class_to_keep).tolist()

        feature_matrix = feature_matrix[mask]
        target_matrix = target_matrix[mask]

        return feature_matrix, target_matrix
Exemplo n.º 4
0
def multi_labelTo_multi_class(Y, model):
    """Encode multi-label targets as multi-class ids via LabelPowerset.

    If ``Y`` is one-dimensional it is not a multi-label target: a notice
    is printed and ``Y`` itself is returned unchanged. Otherwise the
    result is a two-element list ``[transformer, encoded_Y]`` where
    ``transformer`` is the ``LabelPowerset`` built around ``model``.
    """
    if Y.ndim == 1:
        print("This is not a multi-label problem!!!!!!")
        return Y

    # LabelPowerset provides the transform() that actually performs the
    # multi-label to multi-class conversion.
    powerset = LabelPowerset(classifier=model, require_dense=[False, True])
    encoded = powerset.transform(Y)
    return [powerset, encoded]
    def resampling_data(self, X, y):
        """Randomly oversample a multi-label dataset.

        The multi-label targets ``y`` are encoded as label-powerset classes,
        ``X`` is oversampled against those classes with a
        ``RandomOverSampler`` (fixed ``random_state=42``), and the resampled
        classes are decoded back into multi-label form.

        Returns:
            (X_resampled, y_resampled), where ``y_resampled`` is whatever
            ``LabelPowerset.inverse_transform`` returns.
        """
        lp = LabelPowerset()
        ros = RandomOverSampler(random_state=42)

        # Multi-label (ML) to multi-class (MC) transformation.
        yt = lp.transform(y)

        # fit_resample replaces the fit_sample alias, which newer
        # imbalanced-learn releases no longer provide.
        X_resampled, y_resampled = ros.fit_resample(X, yt)

        # Invert the ML-MC transformation to recreate the ML set.
        y_resampled = lp.inverse_transform(y_resampled)

        return X_resampled, y_resampled
Exemplo n.º 6
0
def load_ucmerced_dataset(dataset_dir=None):
    """Load the graph dataset, node features, labels and sample weights.

    Args:
        dataset_dir: directory holding ``dataset1.mat``, ``features.mat``
            and ``LandUse_multilabels.mat``. Defaults to the location the
            function has always read from.

    Returns:
        Tuple ``(features, edges, labels, weight_mat, label_freq, index,
        classes)``:
            features   -- ``features['val'][0]`` from ``features.mat``
            edges      -- squeezed ``edges`` field (adjacency matrix)
            labels     -- multi-label matrix, transposed from the file
            weight_mat -- per-sample weight, the inverse frequency of the
                          sample's label-powerset class
            label_freq -- per-label share of all positive label entries
            index      -- squeezed ``index`` field (image index)
            classes    -- squeezed ``class`` field (image class number)
    """
    import os

    if dataset_dir is None:
        dataset_dir = ('/home/rishabh/Downloads/multi-label-analysis-master'
                       '/src/graphcnn/setup/dataset')

    dataset = scipy.io.loadmat(os.path.join(dataset_dir, 'dataset1.mat'))
    dataset = dataset['dataset1']
    edges = np.squeeze(dataset['edges'])  # adjacency matrix
    index = np.squeeze(dataset['index'])  # image index to keep track
    classes = np.squeeze(dataset['class'])  # image class number to keep track

    # Features in which NaN values are expected to have been replaced;
    # report any entry that still contains one.
    features = scipy.io.loadmat(os.path.join(dataset_dir, 'features.mat'))
    features = features['features']['val'][0]
    for i, feature in enumerate(features):
        has_nan = np.isnan(feature).any()
        if has_nan:
            print('features %d have NaN:' % i, has_nan)

    # Multi-labels, transposed from the layout stored in the file.
    labels = scipy.io.loadmat(
        os.path.join(dataset_dir, 'LandUse_multilabels.mat'))
    labels = labels['labels']
    labels = np.transpose(labels, (1, 0))

    # Class weights: every sample gets 1 / (size of its powerset class).
    # return_inverse maps each sample to its class position directly,
    # which avoids a per-row lookup and assigning a size-1 array into a
    # scalar slot.
    lp = LabelPowerset()
    trans_labels = np.asarray(lp.transform(labels))
    _, inverse, counts = np.unique(trans_labels,
                                   return_inverse=True,
                                   return_counts=True)
    class_freq = 1.0 / counts
    weight_mat = class_freq[inverse.reshape(trans_labels.shape)]

    # Label weights: share of each label among all positive entries.
    sum_labels = np.sum(labels, axis=0, dtype=np.float32)
    sum_tot = np.sum(sum_labels, dtype=np.float32)
    label_freq = np.true_divide(sum_labels, sum_tot)

    return features, edges, labels, weight_mat, label_freq, index, classes
Exemplo n.º 7
0
def multiple_smote(X, y):
    """
    Oversample multi-label samples with SMOTE.

    The label matrix is encoded as label-powerset classes, SMOTE
    (``k_neighbors=2``) resamples ``X`` against those classes, and the
    resampled classes are decoded back to a dense multi-label array.
    """
    label_matrix = np.array(y)
    powerset = LabelPowerset()
    smote = SMOTE(k_neighbors=2)

    # Multi-label (ML) to multi-class (MC) transformation.
    class_ids = powerset.transform(label_matrix)

    features_resampled, class_ids_resampled = smote.fit_resample(X, class_ids)

    # Undo the ML-MC transformation; the result is a sparse matrix, so it
    # is densified before being returned.
    labels_resampled = powerset.inverse_transform(class_ids_resampled)

    return features_resampled, labels_resampled.toarray()
Exemplo n.º 8
0
 def label_fscore(self, subset=False):
     """Return F-scores from ``precision_recall_fscore_support`` as a dict.

     With ``subset=False`` the scores are computed on the output of
     ``self._binarized_labels()``. With ``subset=True`` they are computed
     on ``self.true_list`` / ``self.pred_list``; if ``self._is_binarized``
     is set, both lists are first encoded together as label-powerset
     classes so that whole label sets are compared.

     The dict is keyed by each entry's support count; entries with a
     support of 0 are left out.
     NOTE(review): two entries with the same support share a key, so the
     later one wins -- confirm this is intended.
     """
     if not subset:
         y_true, y_pred = self._binarized_labels()
     else:
         y_true, y_pred = self.true_list, self.pred_list
         if self._is_binarized:
             # Encode truth and predictions in one call so both halves
             # share the same multilabel-to-multiclass mapping.
             powerset = LabelPowerset()
             stacked = np.concatenate((y_true, y_pred))
             y_true, y_pred = np.split(powerset.transform(stacked), 2)

     scores = skm.precision_recall_fscore_support(y_true, y_pred)
     fscore, count = scores[2], scores[3]

     # Skip zero-support entries (happens when remove_multi_labeled is used).
     return {c: f for c, f in zip(count, fscore) if c != 0}
def resample_multilabel(data, target):
    """
    Apply LP-transformation to create balanced classes, then convert back to multilabel targets

    ``target`` is cast to int; column 0 holds the anxiety level and column 1
    the depression level, each coded 0 (normal), 1 (borderline) or
    2 (abnormal). The two levels are turned into a label pair, binarized,
    encoded as label-powerset classes and randomly oversampled
    (``sampling_strategy='not majority'``). The random state comes from the
    module-level name ``seed``.

    Returns:
        (data_resampled, targets, lp_classes) where ``targets`` is an
        (n, 2) array of ``'anxiety_<level>'`` / ``'depression_<level>'``
        strings and ``lp_classes`` are the resampled label-powerset classes.

    Raises:
        ValueError: if a level is not 0, 1 or 2, or a resampled label
            cannot be mapped back to a level.
    """
    from skmultilearn.problem_transform import LabelPowerset

    # Position in this tuple is the numeric level code.
    levels = ('normal', 'borderline', 'abnormal')

    def convert_hads_to_str(hads_data, hads_type):
        """Map level codes to '<hads_type>_<level>' strings."""
        hads_strs = []
        for val in hads_data:
            # Reject unknown codes explicitly instead of silently reusing
            # the string built for the previous value.
            if val not in (0, 1, 2):
                raise ValueError(
                    'unexpected %s level: %r' % (hads_type, val))
            hads_strs.append('%s_%s' % (hads_type, levels[int(val)]))
        return hads_strs

    def convert_str_to_hads(hads_tuples):
        """Map (anxiety, depression) label tuples back to level codes."""
        # NaN marks slots no label filled in, so they are caught later
        # rather than carrying uninitialised memory.
        hads_array = np.full((len(hads_tuples), 2), np.nan)
        for t, tup in enumerate(hads_tuples):
            for s, label in enumerate(tup):
                for code, level in enumerate(levels):
                    if ('_' + level) in label:
                        hads_array[t, s] = code
                        break
        return hads_array

    target = target.astype(int)

    anx_strings = convert_hads_to_str(target[:, 0], 'anxiety')
    dep_strings = convert_hads_to_str(target[:, 1], 'depression')
    multilabel_hads = list(zip(anx_strings, dep_strings))
    mlb = preprocessing.MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(multilabel_hads)

    lp = LabelPowerset()
    target_lp_transformed = lp.transform(binary_matrix)

    resampler = RandomOverSampler(sampling_strategy='not majority',
                                  random_state=seed)
    # fit_resample replaces the fit_sample alias, which newer
    # imbalanced-learn releases no longer provide.
    data_resampled, target_lp_transformed_resampled = resampler.fit_resample(
        data, target_lp_transformed)
    binary_matrix_resampled = lp.inverse_transform(
        target_lp_transformed_resampled)

    # Back from the binarized form to label tuples, then to level codes.
    target_resampled_multilabel = mlb.inverse_transform(
        binary_matrix_resampled)
    target_resampled_multilabel_array = convert_str_to_hads(
        target_resampled_multilabel)

    anx_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 0], 'anxiety')
    dep_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 1], 'depression')
    target_resampled_multilabel_df = pd.DataFrame()
    target_resampled_multilabel_df['anxiety'] = anx_resampled_to_str
    target_resampled_multilabel_df['depression'] = dep_resampled_to_str

    return data_resampled, target_resampled_multilabel_df.values, target_lp_transformed_resampled
Exemplo n.º 10
0
    def lp_framework(self):
        """Replace each cluster of label columns by one label-powerset column.

        Labels in ``self.labels`` are grouped by ``self.clustering_labels``
        (cluster ids are taken to be 0..k-1). A cluster with a single label
        keeps that column of ``self.universe``; a larger cluster is encoded
        with ``LabelPowerset.transform`` into one multi-class column.
        ``self.universe`` is then rebuilt as the attribute columns followed
        by one column per cluster, and ``self.labels`` is reset to the
        positions of those new columns.

        Intermediate values are printed for debugging. Returns ``None``.
        """
        # Group the label column indices by cluster id.
        clusters = []
        for i in range(len(np.unique(self.clustering_labels))):
            cluster = [
                self.labels[j]
                for j, cluster_id in enumerate(self.clustering_labels)
                if i == cluster_id
            ]
            clusters.append(cluster)
        print(self.labels)
        print(clusters)

        transformed_labels = []
        powerset = LabelPowerset()
        for cluster in clusters:
            print(cluster)
            cluster_columns = self.universe[:, cluster]
            print(cluster_columns.shape)
            print(len(cluster))
            if len(cluster) == 1:
                transformed_labels.append(list(cluster_columns.reshape(-1)))
            else:
                # Encode once and reuse the result for the debug output
                # and for the stored column.
                encoded = powerset.transform(cluster_columns)
                tmp = list(encoded)
                print(encoded)
                print(type(encoded.shape))
                print(tmp)
                print(type(tmp))
                transformed_labels.append(tmp)

        # One row per cluster; transposed below to one column per cluster.
        stacked = np.array(transformed_labels)
        print(stacked.shape)
        print("universe shape:" + str(self.universe.shape))
        attribute_columns = self.universe[:, self.attributes]
        print(attribute_columns.shape)
        print(stacked.T.shape)
        self.universe = np.append(attribute_columns, stacked.T, axis=1)
        print(self.universe.shape)
        print(self.attributes)

        # The new label columns sit directly after the attribute columns.
        self.labels = list(range(len(self.attributes),
                                 self.universe.shape[1]))
        print(self.labels)
		# NOTE(review): excerpt of a larger function; train_df, test_df, lp, ros
		# and the helper functions used below are defined outside this excerpt.
		# Collapse runs of whitespace in every text and store the texts as an
		# (n, 1) object column.
		train_text_ori = train_df['alltext'].tolist()
		train_text_ori = [' '.join(t.split()) for t in train_text_ori]
		train_text_ori = np.array(train_text_ori, dtype=object)[:, np.newaxis]
		# Labels are taken from column 3 up to, but not including, the last column.
		train_label_ori = train_df.values[:,3:-1]

		# Same preparation for the test frame.
		test_text = test_df['alltext'].tolist()
		test_text = [' '.join(t.split()) for t in test_text]
		test_text = np.array(test_text, dtype=object)[:, np.newaxis]
		test_label = test_df.values[:,3:-1]

		
		## Upsampling of data for each label
		print('Before upsampling: ',train_text_ori.shape,train_label_ori.shape,test_text.shape,test_label.shape)

		# Encode the label matrix with lp.transform, resample the texts against
		# the encoded labels, then decode the labels back to a dense matrix.
		# presumably lp is a LabelPowerset and ros a RandomOverSampler -- verify
		# against the part of the function that is not shown here.
		# NOTE(review): if ros comes from imbalanced-learn, fit_sample is the
		# older name of fit_resample -- confirm the installed version has it.
		yt = lp.transform(train_label_ori.astype('int'))
		train_text, y_resampled = ros.fit_sample(train_text_ori.astype('str'), yt)

		train_label = lp.inverse_transform(y_resampled).toarray()

		# train_text=train_text_ori
		# train_label = train_label_ori

		print('After Up-sampling',train_text.shape,train_label.shape,test_text.shape,test_label.shape)


		# Instantiate tokenizer
		tokenizer = create_tokenizer_from_hub_module()

		# Convert data to InputExample format
		train_examples = convert_text_to_examples(train_text, train_label)
def load_ucmerced_dataset(dataset_dir='dataset'):
    """Load the graph dataset, node features, labels and sample weights.

    Args:
        dataset_dir: directory holding ``newtotpatt.mat``,
            ``features1n.mat``, ``features2n.mat``, ``features3n.mat`` and
            ``pattlabels.mat``. Defaults to the relative ``dataset``
            directory the function has always read from.

    Returns:
        Tuple ``(features, edges, labels, weight_mat, label_freq, index,
        classes)``:
            features   -- the squeezed ``val`` fields of the three feature
                          files, concatenated in order
            edges      -- squeezed ``edges`` field (adjacency matrix)
            labels     -- multi-label matrix as stored in the file
            weight_mat -- per-sample weight, the inverse frequency of the
                          sample's label-powerset class
            label_freq -- per-label share of all positive label entries
            index      -- squeezed ``index`` field (image index)
            classes    -- squeezed ``class`` field (image class number)
    """
    import os

    dataset = scipy.io.loadmat(os.path.join(dataset_dir, 'newtotpatt.mat'))
    dataset = dataset['newtotpatt']
    edges = np.squeeze(dataset['edges'])  # adjacency matrix
    index = np.squeeze(dataset['index'])  # image index to keep track
    classes = np.squeeze(dataset['class'])  # image class number to keep track

    # Features in which NaN values are expected to have been replaced,
    # stored across three files whose variable name equals the file stem.
    feature_parts = []
    for stem in ('features1n', 'features2n', 'features3n'):
        part = scipy.io.loadmat(os.path.join(dataset_dir, stem + '.mat'))
        feature_parts.append(np.squeeze(part[stem]['val']))
    features = np.concatenate(feature_parts)

    print(type(features[0]))
    print(features.shape)
    # Report any feature entry that still contains a NaN.
    for i, feature in enumerate(features):
        has_nan = np.isnan(feature).any()
        if has_nan:
            print('features %d have NaN:' % i, has_nan)

    # Multi-labels, used in the layout stored in the file (no transpose).
    labels = scipy.io.loadmat(os.path.join(dataset_dir, 'pattlabels.mat'))
    labels = labels['labels']

    # Class weights: every sample gets 1 / (size of its powerset class).
    # return_inverse maps each sample to its class position directly,
    # which avoids a per-row lookup and assigning a size-1 array into a
    # scalar slot.
    lp = LabelPowerset()
    trans_labels = np.asarray(lp.transform(labels))
    _, inverse, counts = np.unique(trans_labels,
                                   return_inverse=True,
                                   return_counts=True)
    class_freq = 1.0 / counts
    weight_mat = class_freq[inverse.reshape(trans_labels.shape)]

    # Label weights: share of each label among all positive entries.
    sum_labels = np.sum(labels, axis=0, dtype=np.float32)
    sum_tot = np.sum(sum_labels, dtype=np.float32)
    label_freq = np.true_divide(sum_labels, sum_tot)

    return features, edges, labels, weight_mat, label_freq, index, classes