# Example 1
def multi_classTo_multi_multi(Y, model):
    """Invert a label-powerset multi-class target back into multi-label form.

    If ``Y`` already has two or more dimensions it is treated as an
    existing multi-label matrix and returned untouched; otherwise a
    :class:`LabelPowerset` wrapper around ``model`` decodes each class
    id back into its label combination.
    """
    if Y.ndim >= 2:
        # Nothing to invert — already one column per label.
        print("This is already a multi-label problem!!!!!!")
        return Y
    lp_wrapper = LabelPowerset(classifier=model, require_dense=[False, True])
    return lp_wrapper.inverse_transform(Y)
    def resampling_data(self, X, y):
        """Random-oversample a multi-label dataset via the label-powerset trick.

        Each unique label combination in ``y`` is encoded as a single
        multi-class target, the minority classes are randomly
        oversampled, and the result is decoded back to multi-label form.

        Parameters:
            X: feature matrix.
            y: multi-label indicator matrix.

        Returns:
            Tuple ``(X_resampled, y_resampled)`` with balanced label
            combinations; ``y_resampled`` is whatever
            ``LabelPowerset.inverse_transform`` yields (a sparse matrix).
        """
        lp = LabelPowerset()
        ros = RandomOverSampler(random_state=42)

        # Multi-label (ML) -> multi-class (MC) transformation.
        yt = lp.transform(y)
        # fit_sample was removed in imbalanced-learn 0.8; fit_resample is
        # the current API (the rest of this file already uses it).
        X_resampled, y_resampled = ros.fit_resample(X, yt)
        # Invert the ML-MC transformation to recreate the ML set.
        y_resampled = lp.inverse_transform(y_resampled)

        return X_resampled, y_resampled
# Example 3
def multiple_smote(X, y):
    """Oversample a multi-label dataset with SMOTE.

    (Original docstring, translated from Chinese: "oversample
    multi-label samples".)  Label combinations are collapsed into
    single classes with a label powerset, SMOTE balances them, and the
    classes are expanded back into a dense multi-label matrix.
    """
    label_matrix = np.array(y)
    lp = LabelPowerset()
    sampler = SMOTE(k_neighbors=2)

    # ML -> MC: each unique label combination becomes one class id.
    encoded = lp.transform(label_matrix)
    X_resampled, encoded_resampled = sampler.fit_resample(X, encoded)

    # MC -> ML: decode class ids back into label sets (sparse matrix).
    resampled_labels = lp.inverse_transform(encoded_resampled)
    return X_resampled, resampled_labels.toarray()
def resample_multilabel(data, target):
    """
    Apply LP-transformation to create balanced classes, then convert back to multilabel targets.

    ``target`` is expected to be an (n_samples, 2) array whose columns are
    the anxiety and depression HADS categories coded 0 (normal),
    1 (borderline) or 2 (abnormal).

    Returns:
        Tuple ``(data_resampled, target_multilabel_values,
        target_lp_transformed_resampled)`` where the second element is an
        (m, 2) array of category strings and the third the resampled
        LP-encoded class vector.

    Raises:
        ValueError: if ``target`` contains a category outside {0, 1, 2}.
    """
    target = target.astype(int)

    def convert_hads_to_str(hads_data, hads_type):
        # Map 0/1/2 category codes to '<type>_normal'/'_borderline'/'_abnormal'.
        suffixes = {0: 'normal', 1: 'borderline', 2: 'abnormal'}
        hads_strs = []
        for val in hads_data:
            if val not in suffixes:
                # Previously an unexpected code silently reused the prior
                # string (or raised UnboundLocalError) — fail loudly instead.
                raise ValueError('Unexpected HADS category: %r' % (val,))
            hads_strs.append('%s_%s' % (hads_type, suffixes[val]))
        return hads_strs

    def convert_str_to_hads(hads_tuples):
        # Inverse of convert_hads_to_str for (anxiety, depression) pairs.
        hads_array = np.ndarray(shape=(len(hads_tuples), 2))
        for t, tup in enumerate(hads_tuples):
            # Note: loop variable renamed from `str` (shadowed the builtin).
            for s, label_str in enumerate(tup):
                if '_normal' in label_str:
                    hads_array[t, s] = 0
                elif '_borderline' in label_str:
                    hads_array[t, s] = 1
                elif '_abnormal' in label_str:
                    hads_array[t, s] = 2
        return hads_array

    anx_strings = convert_hads_to_str(target[:, 0], 'anxiety')
    dep_strings = convert_hads_to_str(target[:, 1], 'depression')
    multilabel_hads = [(anx_strings[n], dep_strings[n])
                       for n in range(len(anx_strings))]
    mlb = preprocessing.MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(multilabel_hads)

    from skmultilearn.problem_transform import LabelPowerset
    lp = LabelPowerset()
    target_lp_transformed = lp.transform(binary_matrix)

    resampler = RandomOverSampler(sampling_strategy='not majority',
                                  random_state=seed)
    # fit_sample was removed in imbalanced-learn 0.8; fit_resample is the API.
    data_resampled, target_lp_transformed_resampled = resampler.fit_resample(
        data, target_lp_transformed)
    binary_matrix_resampled = lp.inverse_transform(
        target_lp_transformed_resampled)

    # Decode the binary indicator matrix back to (anxiety, depression) tuples,
    # then to the numeric codes, then to strings for the output DataFrame.
    target_resampled_multilabel = mlb.inverse_transform(
        binary_matrix_resampled)
    target_resampled_multilabel_array = convert_str_to_hads(
        target_resampled_multilabel)

    anx_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 0], 'anxiety')
    dep_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 1], 'depression')
    target_resampled_multilabel_df = pd.DataFrame()
    target_resampled_multilabel_df['anxiety'] = anx_resampled_to_str
    target_resampled_multilabel_df['depression'] = dep_resampled_to_str

    return data_resampled, target_resampled_multilabel_df.values, target_lp_transformed_resampled
		# Reshape raw training texts to an (n_samples, 1) object column vector.
		train_text_ori = np.array(train_text_ori, dtype=object)[:, np.newaxis]
		# Assumes the label columns occupy positions 3..-2 of train_df — TODO confirm against the dataframe schema.
		train_label_ori = train_df.values[:,3:-1]

		test_text = test_df['alltext'].tolist()
		# Collapse internal whitespace runs in each test document to single spaces.
		test_text = [' '.join(t.split()) for t in test_text]
		test_text = np.array(test_text, dtype=object)[:, np.newaxis]
		test_label = test_df.values[:,3:-1]

		
		## Upsampling of data for each label
		print('Before upsampling: ',train_text_ori.shape,train_label_ori.shape,test_text.shape,test_label.shape)

		# Label-powerset encode the multi-label targets into single class ids.
		yt = lp.transform(train_label_ori.astype('int'))
		# NOTE(review): fit_sample was removed in imbalanced-learn 0.8 — fit_resample is the current API; confirm the pinned imblearn version.
		train_text, y_resampled = ros.fit_sample(train_text_ori.astype('str'), yt)

		# Decode oversampled class ids back into a dense multi-label matrix.
		train_label = lp.inverse_transform(y_resampled).toarray()

		# train_text=train_text_ori
		# train_label = train_label_ori

		print('After Up-sampling',train_text.shape,train_label.shape,test_text.shape,test_label.shape)


		# Instantiate tokenizer
		tokenizer = create_tokenizer_from_hub_module()

		# Convert data to InputExample format
		train_examples = convert_text_to_examples(train_text, train_label)
		test_examples = convert_text_to_examples(test_text, test_label)

		# Convert to features