from skmultilearn.problem_transform import LabelPowerset


def multi_classTo_multi_multi(Y, model):
    """Convert multi-class targets Y back into a multi-label indicator matrix."""
    num_of_labels = Y.ndim
    if num_of_labels >= 2:
        print("This is already a multi-label problem!")
        return Y
    transclf = LabelPowerset(classifier=model, require_dense=[False, True])
    # Note: inverse_transform relies on the label-combination mapping that
    # LabelPowerset builds during transform(), so the transformer must have
    # seen the original multi-label data before this call can succeed.
    return transclf.inverse_transform(Y)
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import RandomOverSampler


def resampling_data(self, X, y):
    """Random-oversample a dataset with features X and multi-label targets y."""
    lp = LabelPowerset()
    ros = RandomOverSampler(random_state=42)
    # Apply the multi-label (ML) to multi-class (MC) transformation.
    yt = lp.transform(y)
    # fit_sample was removed from imbalanced-learn; fit_resample is the current API.
    X_resampled, y_resampled = ros.fit_resample(X, yt)
    # Invert the ML-MC transformation to recreate the multi-label set.
    y_resampled = lp.inverse_transform(y_resampled)
    return X_resampled, y_resampled
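# Illustration of the LabelPowerset round trip used above (a sketch, not part
# of the original source): each unique label combination is mapped to a single
# multi-class id, the rows are resampled, and the ids are mapped back to the
# binary label matrix.
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import RandomOverSampler

X_toy = np.arange(12).reshape(6, 2)
y_toy = np.array([[1, 0], [1, 0], [1, 0], [1, 0], [0, 1], [1, 1]])

lp_toy = LabelPowerset()
y_mc = lp_toy.transform(y_toy)                      # one class id per combination, e.g. [0 0 0 0 1 2]
X_res, y_mc_res = RandomOverSampler(random_state=0).fit_resample(X_toy, y_mc)
y_ml_res = lp_toy.inverse_transform(y_mc_res)       # back to a (sparse) label matrix
print(y_ml_res.toarray())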
import numpy as np
from skmultilearn.problem_transform import LabelPowerset
from imblearn.over_sampling import SMOTE


def multiple_smote(X, y):
    """Oversample a multi-label dataset (features X, multi-label targets y)."""
    y = np.array(y)
    lp = LabelPowerset()
    # oversampler = ADASYN(random_state=1994, n_neighbors=2)
    oversampler = SMOTE(k_neighbors=2)
    # Apply the multi-label (ML) to multi-class (MC) transformation.
    yt = lp.transform(y)
    X_resampled, y_resampled = oversampler.fit_resample(X, yt)
    # Invert the ML-MC transformation to recreate the multi-label set.
    y_resampled = lp.inverse_transform(y_resampled)
    # inverse_transform returns a sparse matrix; convert it to a dense array.
    return X_resampled, y_resampled.toarray()
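# Usage sketch (an assumption, not from the original source): a small synthetic
# multi-label set with three label combinations. SMOTE(k_neighbors=2) needs
# every label combination to occur at least 3 times, which this toy data satisfies.
import numpy as np

rng = np.random.default_rng(0)
X_demo = rng.normal(size=(60, 5))
y_demo = np.array([[1, 0, 0]] * 40 + [[0, 1, 0]] * 15 + [[0, 1, 1]] * 5)

X_bal, y_bal = multiple_smote(X_demo, y_demo)
print(X_demo.shape, X_bal.shape)   # (60, 5) -> (120, 5): minority combinations upsampled to 40 each
print(y_bal.sum(axis=0))           # per-label counts after oversampling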
import numpy as np
import pandas as pd
from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler


def resample_multilabel(data, target):
    """
    Apply the LP transformation to create balanced classes,
    then convert back to multilabel targets.
    """
    target = target.astype(int)

    def convert_hads_to_str(hads_data, hads_type):
        hads_strs = []
        for val in hads_data:
            if val == 0:
                str_convert = '%s_normal' % hads_type
            elif val == 1:
                str_convert = '%s_borderline' % hads_type
            elif val == 2:
                str_convert = '%s_abnormal' % hads_type
            hads_strs.append(str_convert)
        return hads_strs

    def convert_str_to_hads(hads_tuples):
        hads_array = np.ndarray(shape=(len(hads_tuples), 2))
        for t, tup in enumerate(hads_tuples):
            for s, label in enumerate(tup):
                if '_normal' in label:
                    hads_array[t, s] = 0
                elif '_borderline' in label:
                    hads_array[t, s] = 1
                elif '_abnormal' in label:
                    hads_array[t, s] = 2
        return hads_array

    anx_strings = convert_hads_to_str(target[:, 0], 'anxiety')
    dep_strings = convert_hads_to_str(target[:, 1], 'depression')
    multilabel_hads = [(anx_strings[n], dep_strings[n])
                       for n in range(len(anx_strings))]
    mlb = preprocessing.MultiLabelBinarizer()
    binary_matrix = mlb.fit_transform(multilabel_hads)

    from skmultilearn.problem_transform import LabelPowerset
    lp = LabelPowerset()
    target_lp_transformed = lp.transform(binary_matrix)

    # `seed` is assumed to be defined in the enclosing scope.
    resampler = RandomOverSampler(sampling_strategy='not majority',
                                  random_state=seed)
    data_resampled, target_lp_transformed_resampled = resampler.fit_resample(
        data, target_lp_transformed)

    binary_matrix_resampled = lp.inverse_transform(
        target_lp_transformed_resampled)
    target_resampled_multilabel = mlb.inverse_transform(
        binary_matrix_resampled)
    target_resampled_multilabel_array = convert_str_to_hads(
        target_resampled_multilabel)

    anx_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 0], 'anxiety')
    dep_resampled_to_str = convert_hads_to_str(
        target_resampled_multilabel_array[:, 1], 'depression')
    target_resampled_multilabel_df = pd.DataFrame()
    target_resampled_multilabel_df['anxiety'] = anx_resampled_to_str
    target_resampled_multilabel_df['depression'] = dep_resampled_to_str

    return (data_resampled, target_resampled_multilabel_df.values,
            target_lp_transformed_resampled)
train_text_ori = np.array(train_text_ori, dtype=object)[:, np.newaxis]
train_label_ori = train_df.values[:, 3:-1]

test_text = test_df['alltext'].tolist()
test_text = [' '.join(t.split()) for t in test_text]
test_text = np.array(test_text, dtype=object)[:, np.newaxis]
test_label = test_df.values[:, 3:-1]

# Upsample the training data per label combination.
# (`lp` and `ros` are assumed to be a LabelPowerset and an imbalanced-learn
# oversampler created earlier in the script.)
print('Before upsampling: ', train_text_ori.shape, train_label_ori.shape,
      test_text.shape, test_label.shape)
yt = lp.transform(train_label_ori.astype('int'))
train_text, y_resampled = ros.fit_resample(train_text_ori.astype('str'), yt)
train_label = lp.inverse_transform(y_resampled).toarray()
# train_text = train_text_ori
# train_label = train_label_ori
print('After upsampling: ', train_text.shape, train_label.shape,
      test_text.shape, test_label.shape)

# Instantiate the tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_label)
test_examples = convert_text_to_examples(test_text, test_label)

# Convert to features