import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor

import data_utils


def do_transform(dataset, mrs, test_data):
    # Training data
    id_df, bi_df, mrs_df, nih_df = data_utils.get_tsr(mrs, '')
    # Using the correlation matrix is equivalent to standardizing each of
    # the variables (to mean 0 and standard deviation 1). We want PCA on
    # the covariance matrix, so the data is deliberately left unscaled.
    # scaled_bi, scaler = data_utils.scale(bi_df)
    bi_df_pca, pca = data_utils.pca_reduction(bi_df)

    # Testing data: project the test features into the PCA space fitted
    # on the training data
    labels = test_data[['label']]
    test_bi = test_data.drop(['label'], axis=1)
    # test_bi_scaled = scaler.transform(test_bi.values)
    test_bi_pca = pca.transform(test_bi.values)
    test_bi_pca_df = pd.DataFrame(data=test_bi_pca, index=test_data.index,
                                  columns=['pca_1', 'pca_2'])
    test_bi_pca_df['label'] = labels.values
    return test_bi_pca_df
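# data_utils is an external helper module not shown in this file. The sketch
# below is an assumption of what data_utils.pca_reduction roughly does (a
# two-component PCA fitted on the raw, unscaled features, i.e. PCA on the
# covariance matrix, as the comments above suggest). The name
# _pca_reduction_sketch and the signature are hypothetical, not the actual
# data_utils API.
def _pca_reduction_sketch(df, n_components=2):
    """Reduce a feature DataFrame to its first principal components."""
    from sklearn.decomposition import PCA
    pca = PCA(n_components=n_components)
    reduced = pca.fit_transform(df.values)  # fit on the training data only
    reduced_df = pd.DataFrame(reduced, index=df.index,
                              columns=[f'pca_{i + 1}'
                                       for i in range(n_components)])
    return reduced_df, pca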
def score_cutoff(data, clf):
    # NOTE: only the tail of this function survived in the source; the
    # signature is inferred from the call site in __main__, and the score
    # and threshold computation below is a hedged reconstruction from the
    # surviving comments, not the original body.
    clf.fit(data.values)
    X_scores = -clf.negative_outlier_factor_  # higher score = more outlying
    # Per-mRS quantile for the cutoff: 0:95, 1:90, 2:90, 3:90, 4:90, 5:85
    threshold = pd.Series(X_scores).quantile(0.85)
    plt.plot(sorted(X_scores))  # sorted score curve used to locate the elbow
    # plt.title('mRS-3')
    plt.annotate('Elbow point', xy=(1104, threshold),
                 arrowprops=dict(arrowstyle='->'),
                 xytext=(900, threshold + 0.09))
    print(threshold)
    return threshold


if __name__ == '__main__':
    mrs = 5
    test_dataset = 'tnk'
    id_df, bi_df, mrs_df, nih_df = data_utils.get_tsr(mrs, 'is')
    bi_df_unique = bi_df.drop_duplicates()
    bi_df_pca, pca = data_utils.pca_reduction(bi_df)
    bi_df_pca_unique = bi_df_pca.drop_duplicates()

    k_neighbors = 10
    outliers_fraction = 0.01
    # Training: fit a Local Outlier Factor model on the PCA-reduced data
    clf = LocalOutlierFactor(n_neighbors=k_neighbors,
                             contamination=outliers_fraction)
    # Determine the LOF score cutoff from the elbow of the sorted scores
    cutoff = score_cutoff(bi_df_pca_unique, clf)
    # label_data below consumes these labels, so this call must stay active
    labels = make_score_label(bi_df_pca_unique, clf, cutoff)
    data_labeled_all, data_labeled_unique = data_utils.label_data(
        bi_df, bi_df_pca_unique, labels)
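# make_score_label is called in __main__ above but defined elsewhere; it is
# not part of this file. A minimal sketch of what it presumably does,
# assuming each point's LOF score is compared against the elbow cutoff. The
# name _make_score_label_sketch and the 0/1 labelling convention are
# assumptions, not the actual helper.
def _make_score_label_sketch(data, clf, cutoff):
    """Label each row 1 (outlier) or 0 (inlier) from its LOF score."""
    clf.fit(data.values)
    x_scores = -clf.negative_outlier_factor_  # higher score = more outlying
    return (pd.Series(x_scores, index=data.index) > cutoff).astype(int)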