def DrawFeatureRelationshipAccordingToCsvFile(file_path, selected_feature_name_list, label_name_list, store_path=''):
    '''
    Help draw the feature relationship among different features according to the path of the data container.

    :param file_path: the file path of the csv file
    :param selected_feature_name_list: the features that would be drawn (1, 2 or 3 names)
    :param label_name_list: the name of the label. e.g. ['non-cancer', 'cancer']
    :param store_path: The store path, supporting jpg and eps format.
    :return: None. A message is printed and nothing is drawn when the feature list has an
             unsupported length or names a feature that is not in the csv file.
    '''
    # Validate before touching the file: only 1, 2 or 3 features can be drawn, so there
    # is no point loading the data for any other request.
    if not 1 <= len(selected_feature_name_list) <= 3:
        print(
            "Please check the length of the feature list. It can only show the relationship of the 1, 2, or 3 features"
        )
        return

    data_container = DataContainer()
    data_container.Load(file_path)
    data_container.UsualNormalize()
    data, label, feature_name, _ = data_container.GetData()

    # list() so that .index is available whatever sequence type GetData hands back.
    feature_name = list(feature_name)
    try:
        index = [feature_name.index(t) for t in selected_feature_name_list]
    except ValueError:
        # list.index raises ValueError for a name that is not present.
        print('The selected feature is not in the data container.')
        return

    # One column of the data matrix per selected feature.
    result_data = [data[:, sub_index] for sub_index in index]

    DrawValueRelationship(result_data, selected_feature_name_list, label,
                          label_name_list, store_path)
def GenerateDescription():
    '''
    Run Description on the example feature matrix and one stored pipeline.

    All paths are hard-coded: the feature csv and the two report folders are relative to
    the working directory, the pipeline info csv is an absolute path.

    :return: None
    '''
    feature_csv = r'..\..\Example\numeric_feature.csv'
    pipeline_csv = r'C:\MyCode\FAEGitHub\FAE\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM\pipeline_info.csv'
    # NOTE(review): meaning of the two folders is inferred from their names only - confirm
    # against Description.Run.
    temp_folder = r'..\..\Example\report_temp'
    report_folder = r'..\..\Example\report'

    container = DataContainer()
    container.Load(feature_csv)

    pipeline = OnePipeline()
    pipeline.LoadPipeline(pipeline_csv)

    reporter = Description()
    reporter.Run(container, pipeline, temp_folder, report_folder)
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply a stored model to a new feature matrix and evaluate the prediction.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder: The trained model path
    :param result_save_path: Folder for the outputs. When it is not empty the metric is stored
                             through CrossValidation.SaveResult and test_predict.npy,
                             test_label.npy and test_info.csv are written into it.
    :return: the metric returned by EstimateMetirc for the prediction and the label
    '''
    train_info = LoadTrainInfo(model_folder)

    # Normalization: load the new cases and apply the stored normalizer.
    container = DataContainer()
    container.Load(NewDataCsv)
    container = train_info['normalizer'].Transform(container)

    # Model: column 1 of predict_proba is kept as the prediction.
    classifier = train_info['classifier']
    classifier.SetDataContainer(container)
    model = classifier.GetModel()
    predict = model.predict_proba(container.GetArray())[:, 1]

    label = container.GetLabel()
    case_name = container.GetCaseName()

    # Header row followed by one row per case.
    rows = [['CaseName', 'Pred', 'Label']]
    rows.extend([case_name[i], predict[i], label[i]] for i in range(len(label)))

    metric = EstimateMetirc(predict, label)
    info = {}
    info.update(metric)

    cv = CrossValidation()

    print(metric)
    print('\t')

    if not result_save_path:
        return metric

    cv.SaveResult(info, result_save_path)
    np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
    np.save(os.path.join(result_save_path, 'test_label.npy'), label)
    info_path = os.path.join(result_save_path, 'test_info.csv')
    with open(info_path, 'w', newline='') as csvfile:
        csv.writer(csvfile).writerows(rows)

    return metric
os.path.join(store_folder, 'train_numeric_feature.csv')) df_training = pd.DataFrame( data=self._training_index, columns=['training_index'], index=[case_name[index] for index in self._training_index]) df_training.to_csv(os.path.join(store_folder, 'training_index.csv'), sep=',', quotechar='"') test_data_container.Save( os.path.join(store_folder, 'test_numeric_feature.csv')) df_testing = pd.DataFrame( data=testing_index_list, columns=['training_index'], index=[case_name[index] for index in testing_index_list]) df_testing.to_csv(os.path.join(store_folder, 'testing_index.csv'), sep=',', quotechar='"') return train_data_container, test_data_container if __name__ == '__main__': data = DataContainer() data.Load(r'..\..\Example\numeric_feature.csv') data_separator = DataSeparate() data_separator.Run(data, store_folder=r'..\..\Example')
if not is_test: data_container.Save( os.path.join(store_folder, 'zero_center_unit_training_feature.csv')) self.Save(store_path=os.path.join( store_folder, 'zero_center_unit_normalization_training.csv'), feature_name=data_container.GetFeatureName()) else: data_container.Save( os.path.join( store_folder, 'zero_center_unit_normalized_testing_feature.csv')) return data_container def GetDescription(self): text = "We applied the normalization on the feature matrix. Each feature vector was subtracted by the mean " \ "value of the vector and was divided by the length of it. " return text if __name__ == '__main__': from FAE.DataContainer.DataContainer import DataContainer data_container = DataContainer() file_path = os.path.abspath(r'..\..\Example\numeric_feature.csv') print(file_path) data_container.Load(file_path) normalizer = NormalizerZeroCenterAndUnit() normalizer.Run(data_container, store_folder=r'..\..\Example\one_pipeline')
class PrepareConnection(QWidget, Ui_Prepare):
    '''
    Widget to load a feature csv file, show it in a table, remove the non-valid
    values, and save the data (optionally separated by DataSeparate).
    '''

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
        self.checkSeparate.clicked.connect(self.SetSeparateStatus)
        self.spinBoxSeparate.setEnabled(False)
        self.buttonSave.clicked.connect(self.CheckAndSave)

    def UpdateTable(self):
        '''
        Fill the table and the information text from the data container.
        Nothing is changed when the data array is empty.
        '''
        data_array = self.data_container.GetArray()
        if data_array.size == 0:
            return

        # Query the container once instead of once per table cell.
        case_name = self.data_container.GetCaseName()
        label = self.data_container.GetLabel()
        feature_name = self.data_container.GetFeatureName()

        # Column 0 shows the label, the features follow.
        header_name = ['Label'] + list(feature_name)

        self.tableFeature.setRowCount(len(case_name))
        self.tableFeature.setColumnCount(len(header_name))
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, case_name)))

        for row_index in range(len(case_name)):
            self.tableFeature.setItem(row_index, 0,
                                      QTableWidgetItem(str(label[row_index])))
            for col_index in range(1, len(header_name)):
                self.tableFeature.setItem(
                    row_index, col_index,
                    QTableWidgetItem(str(data_array[row_index, col_index - 1])))

        text = "The number of cases: {:d}\n".format(len(case_name))
        text += "The number of features: {:d}\n".format(len(feature_name))
        if len(np.unique(label)) == 2:
            # The larger of the two label values is counted as positive.
            positive_number = len(np.where(label == np.max(label))[0])
            negative_number = len(label) - positive_number
            text += "The number of positive samples: {:d}\n".format(positive_number)
            text += "The number of negative samples: {:d}\n".format(negative_number)
        self.textInformation.setText(text)

    def LoadData(self):
        '''
        Ask for a csv file and load it into the data container. A cancelled dialog
        does nothing; a failed load is reported and leaves the table untouched.
        '''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            # The dialog was cancelled, there is nothing to load.
            return

        try:
            self.data_container.Load(file_name)
        except Exception as reason:
            # Report the reason instead of a bare 'Error' so that the user can act on it.
            print('Error: ' + str(reason))
            QMessageBox.warning(self, "Warning", str(reason), QMessageBox.Ok)
            return

        self.UpdateTable()

    def RemoveNonValidValue(self):
        '''Remove the non-valid cases or features, depending on the checked radio button.'''
        if self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveUneffectiveCases()
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveUneffectiveFeatures()
        self.UpdateTable()

    def SetSeparateStatus(self):
        '''The spin box is only usable while the separate check box is checked.'''
        self.spinBoxSeparate.setEnabled(self.checkSeparate.isChecked())

    def CheckAndSave(self):
        '''
        Check the data and save it. Empty data or data with non-valid numbers is rejected
        with a warning (the first non-valid cell is selected in the table). Otherwise the
        data is either separated into a chosen folder or saved as one csv file.
        '''
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data", QMessageBox.Ok)
        elif self.data_container.HasNonValidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items", QMessageBox.Ok)
            non_valid_number_index = self.data_container.FindNonValidNumberIndex()
            # Temporarily switch the edit trigger while the cell is selected, then restore it.
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            # +1 because column 0 of the table is the label.
            self.tableFeature.setCurrentCell(non_valid_number_index[0],
                                             non_valid_number_index[1] + 1)
            self.tableFeature.setEditTriggers(old_edit_triggers)
        elif self.checkSeparate.isChecked():
            percentage_testing_data = self.spinBoxSeparate.value()
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate(percentage_testing_data)
                data_separate.Run(self.data_container, folder_name)
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name != '':
                self.data_container.Save(file_name)
self.classifier.Save(store_folder) self.SaveResult(info, store_folder) return train_cv_metric, val_cv_metric, test_metric, all_train_metric if __name__ == '__main__': from FAE.DataContainer.DataContainer import DataContainer from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter from FAE.FeatureAnalysis.Classifier import SVM, LR, LDA, LRLasso, GaussianProcess, NaiveBayes, DecisionTree, RandomForest, AE, AdaBoost import numpy as np train_data_container = DataContainer() train_data_container.Load( r'C:\MyCode\FAEGitHub\FAE\Example\withoutshape\non_balance_features.csv' ) normalizer = NormalizerZeroCenter() train_data_container = normalizer.Run(train_data_container) data = train_data_container.GetArray() label = np.asarray(train_data_container.GetLabel()) # param_list = [ # {"hidden_layer_sizes": [(30,), (100,)], # "solver": ["adam"], # "alpha": [0.0001, 0.001], # "learning_rate_init": [0.001, 0.01]} # ] # from sklearn.model_selection import ParameterGrid
class PrepareConnection(QWidget, Ui_Prepare):
    '''
    Widget to load a feature csv file, show it in a table, remove the non-valid values,
    and save the data after removing the same-value features, optionally separating it
    (by percentage or by a testing reference) and balancing the training part.
    '''

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)

        self.__testing_ref_data_container = DataContainer()
        self.checkSeparate.clicked.connect(self.SetSeparateStatus)
        self.spinBoxSeparate.setEnabled(False)
        self.logger = eclog(os.path.split(__file__)[-1]).GetLogger()

        self.loadTestingReference.setEnabled(False)
        self.clearTestingReference.setEnabled(False)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def UpdateTable(self):
        '''
        Fill the table and the information text from the data container.
        Nothing is changed when the data array is empty.
        '''
        data_array = self.data_container.GetArray()
        if data_array.size == 0:
            return

        # Query the container once instead of once per table cell.
        case_name = self.data_container.GetCaseName()
        label = self.data_container.GetLabel()
        feature_name = self.data_container.GetFeatureName()

        # Column 0 shows the label, the features follow.
        header_name = ['Label'] + list(feature_name)

        self.tableFeature.setRowCount(len(case_name))
        self.tableFeature.setColumnCount(len(header_name))
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, case_name)))

        for row_index in range(len(case_name)):
            self.tableFeature.setItem(row_index, 0,
                                      QTableWidgetItem(str(label[row_index])))
            for col_index in range(1, len(header_name)):
                self.tableFeature.setItem(
                    row_index, col_index,
                    QTableWidgetItem(str(data_array[row_index, col_index - 1])))

        text = "The number of cases: {:d}\n".format(len(case_name))
        text += "The number of features: {:d}\n".format(len(feature_name))
        if len(np.unique(label)) == 2:
            # The larger of the two label values is counted as positive.
            positive_number = len(np.where(label == np.max(label))[0])
            negative_number = len(label) - positive_number
            text += "The number of positive samples: {:d}\n".format(positive_number)
            text += "The number of negative samples: {:d}\n".format(negative_number)
        self.textInformation.setText(text)

    def LoadData(self):
        '''
        Ask for a csv file and load it into the data container. The table is refreshed and
        the remove/save buttons are enabled only after a successful load.
        '''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            # The dialog was cancelled, there is nothing to load.
            return

        try:
            self.data_container.Load(file_name)
        except OSError as reason:
            # Logger.log() needs a level as first argument; error() is what is meant here.
            self.logger.error('Open SCV file Error, The reason is ' + str(reason))
            QMessageBox.about(self, 'Load data Error', str(reason))
            print('Error!' + str(reason))
            return
        except ValueError:
            self.logger.error('Open SCV file ' + file_name +
                              ' Failed. because of value error.')
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
            return

        self.logger.info('Open the file ' + file_name + ' Succeed.')
        self.UpdateTable()
        self.buttonRemove.setEnabled(True)
        self.buttonSave.setEnabled(True)

    def LoadTestingReferenceDataContainer(self):
        '''Ask for a csv file and load it as the testing reference used for the separation.'''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            # The dialog was cancelled, keep the current state.
            return

        try:
            self.__testing_ref_data_container.Load(file_name)
        except OSError as reason:
            # Logger.log() needs a level as first argument; error() is what is meant here.
            self.logger.error('Load Testing Reference Error: ' + str(reason))
            print('Error!' + str(reason))
        except ValueError:
            self.logger.error('Open SCV file ' + file_name +
                              ' Failed. because of value error.')
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
        else:
            # With a reference loaded the percentage is not used any more.
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(True)
            self.spinBoxSeparate.setEnabled(False)

    def ClearTestingReferenceDataContainer(self):
        '''Drop the testing reference and allow loading another one.'''
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def RemoveNonValidValue(self):
        '''Remove the non-valid cases or features, depending on the checked radio button.'''
        if self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveUneffectiveCases()
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveUneffectiveFeatures()
        self.UpdateTable()

    def SetSeparateStatus(self):
        '''Enable the separation widgets only while the separate check box is checked.'''
        is_separate = self.checkSeparate.isChecked()
        self.spinBoxSeparate.setEnabled(is_separate)
        self.loadTestingReference.setEnabled(is_separate)
        self.clearTestingReference.setEnabled(False)

    def _FocusCell(self, row, column):
        '''Select one table cell, temporarily switching the edit trigger to CurrentChanged.'''
        old_edit_triggers = self.tableFeature.editTriggers()
        self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
        self.tableFeature.setCurrentCell(row, column)
        self.tableFeature.setEditTriggers(old_edit_triggers)

    def CheckAndSave(self):
        '''
        Check the data and save it.

        Empty data, a non-binary label, or non-valid numbers are rejected with a warning
        (the offending cell is selected in the table). Otherwise the same-value features
        are removed, and the data is either separated into a chosen folder (the training
        part is then balanced and stored there) or balanced and stored as one csv file.
        '''
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data", QMessageBox.Ok)
            return

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels", QMessageBox.Ok)
            # Column 0 of the table is the label.
            self._FocusCell(self.data_container.FindNonValidLabelIndex(), 0)
            return

        if self.data_container.HasNonValidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items", QMessageBox.Ok)
            non_valid_number_index = self.data_container.FindNonValidNumberIndex()
            # +1 because column 0 of the table is the label.
            self._FocusCell(non_valid_number_index[0], non_valid_number_index[1] + 1)
            return

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(self.data_container)

        # DataBalance() is used when none of the sampling radio buttons is checked.
        data_balance = DataBalance()
        if self.radioDownSampling.isChecked():
            data_balance = DownSampling()
        elif self.radioUpSampling.isChecked():
            data_balance = UpSampling()
        elif self.radioSmote.isChecked():
            data_balance = SmoteSampling()

        if not self.checkSeparate.isChecked():
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name != '':
                data_balance.Run(self.data_container, store_path=file_name)
            return

        folder_name = QFileDialog.getExistingDirectory(self, "Save data")
        if folder_name == '':
            return

        data_separate = DataSeparate.DataSeparate()
        try:
            if self.__testing_ref_data_container.IsEmpty():
                testing_data_percentage = self.spinBoxSeparate.value()
                training_data_container, _ = data_separate.RunByTestingPercentage(
                    self.data_container, testing_data_percentage, folder_name)
            else:
                training_data_container, _ = data_separate.RunByTestingReference(
                    self.data_container, self.__testing_ref_data_container,
                    folder_name)
                if training_data_container.IsEmpty():
                    QMessageBox.information(
                        self, 'Error',
                        'The testing data does not match, please check the testing data '
                        'really exists in current data')
                    return None
            data_balance.Run(training_data_container, store_path=folder_name)
        except Exception as e:
            # Boundary of a GUI slot: log and show the reason instead of crashing the widget.
            content = 'PrepareConnection, splitting failed: '
            self.logger.error('{}{}'.format(content, str(e)))
            QMessageBox.about(self, content, str(e))
input_data_container = data_container for fs in self.__selector_list: output = fs.Run(input_data_container, store_folder) input_data_container = output return output ################################################################ if __name__ == '__main__': import os print(os.getcwd()) from FAE.DataContainer.DataContainer import DataContainer data_container = DataContainer() print(os.path.abspath(r'..\..\Example\numeric_feature.csv')) data_container.Load(r'..\..\Example\numeric_feature.csv') # data_container.UsualNormalize() print(data_container.GetArray().shape) print(data_container.GetFeatureName()) fs = FeatureSelectBySubName(['shape', 'ADC']) output = fs.Run(data_container) print(output.GetFeatureName()) # fs1 = RemoveNonNumericFeature() # fs1.SetDataContainer(data_container) # non_number_data_container = fs1.Run() # # fs2 = FeatureSelectByANOVA(10)
input_data_container = output return output def SaveInfo(self, store_folder, all_features): for fs in self.__selector_list: fs.SaveInfo(store_folder, all_features) def SaveDataContainer(self, data_container, store_folder, store_key): for fs in self.__selector_list: fs.SaveDataContainer(data_container, store_folder, store_key) ################################################################ if __name__ == '__main__': from FAE.DataContainer.DataContainer import DataContainer from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter from FAE.FeatureAnalysis.DimensionReduction import DimensionReductionByPCC dc = DataContainer() pcc = DimensionReductionByPCC() fs = FeatureSelectByKruskalWallis(selected_feature_number=5) dc.Load(r'..\..\Demo\train_numeric_feature.csv') dc = NormalizerZeroCenter.Run(dc) dc = pcc.Run(dc) print(dc.GetArray().shape) dc = fs.Run(dc) print(dc.GetArray().shape)
vif_dict[exog] = vif # calculate tolerance tolerance = 1 - r_squared tolerance_dict[exog] = tolerance # return VIF DataFrame df_vif = pd.DataFrame({'VIF': vif_dict, 'Tolerance': tolerance_dict}) return df_vif if __name__ == '__main__': data_path = r'..\..\Demo\train_numeric_feature.csv' from FAE.DataContainer.DataContainer import DataContainer from FAE.FeatureAnalysis.Normalizer import NormalizerZeroCenter pca = DimensionReductionByPCA() dc = DataContainer() dc.Load(data_path) dc = NormalizerZeroCenter.Run(dc) # dc = pca.Run(dc) df = pd.DataFrame(dc.GetArray(), index=dc.GetCaseName(), columns=dc.GetFeatureName()) dr = DimensionReductionByVIF() new_df = dr.CalculateVIF(df) print(dc.GetArray().shape, new_df.shape)
from copy import deepcopy

# pandas is needed by FeatureEncodingOneHot itself, not only by the script below, so it
# must be imported at module level.
import pandas as pd

from FAE.DataContainer.DataContainer import DataContainer


class FeatureEncodingOneHot():
    '''One-hot encode selected feature columns of a DataContainer.'''

    def __init__(self):
        pass

    def OneHotOneColumn(self, data_container, feature_list):
        '''
        Replace the given features by their one-hot (dummy) columns.

        :param data_container: the DataContainer holding the features
        :param feature_list: the names of the features to encode; each one must be in
                             data_container.GetFeatureName()
        :return: a new DataContainer whose frame is pd.get_dummies of the original frame
        :raises AssertionError: when a name of feature_list is not a feature of the container
        '''
        info = data_container.GetFrame()
        feature_name = data_container.GetFeatureName()
        for feature in feature_list:
            assert feature in feature_name, \
                '{} is not a feature of the data container'.format(feature)

        new_info = pd.get_dummies(info, columns=feature_list)
        new_data = DataContainer()
        new_data.SetFrame(new_info)
        return new_data


if __name__ == '__main__':
    data = DataContainer()
    data.Load(r'c:\Users\yangs\Desktop\test.csv')
    info = data.GetFrame()
    new_info = pd.get_dummies(info, columns=['bGs', 'PIRADS', 't2score', 'DWIscore', 'MR_stage'])
    new_info.to_csv(r'c:\Users\yangs\Desktop\test_onehot.csv')
test_auc_info[index] for index in column_list ] test_df.loc[feature_selector.GetName() + '-' + classifier.GetName()] = test_save_info test_df.to_csv(test_store_path) # return val_return_list, test_return_list if __name__ == '__main__': print(os.getcwd()) from DataContainer.DataContainer import DataContainer import pandas as pd data_container = DataContainer() data_container.Load(r'..\tempResult\NumericFeature.csv') data_container.UsualNormalize() df = pd.DataFrame(columns=column_list) # Set Feature Selector List feature_selector_list = [] feature_selector_list.append( FeatureSelectPipeline([ RemoveSameFeatures(), RemoveCosSimilarityFeatures(), FeatureSelectByANOVA() ])) feature_selector_list.append( FeatureSelectPipeline([ RemoveSameFeatures(),
cv_val_metric['{}_{}'.format(CV_VAL, AUC_STD)] self._AddOneMetric(cv_val_metric, os.path.join(cls_store_folder, 'metrics.csv')) self._MergeOneMetric(cv_val_metric, CV_VAL, model_name) self.total_metric[CV_TRAIN].to_csv(os.path.join(store_folder, '{}_results.csv'.format(CV_TRAIN))) self.total_metric[CV_VAL].to_csv(os.path.join(store_folder, '{}_results.csv'.format(CV_VAL))) if __name__ == '__main__': manager = PipelinesManager() index_dict = Index2Dict() train = DataContainer() test = DataContainer() train.Load(r'C:\Users\yangs\Desktop\train_numeric_feature.csv') test.Load(r'C:\Users\yangs\Desktop\test_numeric_feature.csv') faps = PipelinesManager(balancer=index_dict.GetInstantByIndex('UpSampling'), normalizer_list=[index_dict.GetInstantByIndex('Mean')], dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')], feature_selector_list=[index_dict.GetInstantByIndex('ANOVA')], feature_selector_num_list=list(np.arange(1, 18)), classifier_list=[index_dict.GetInstantByIndex('SVM')], cross_validation=index_dict.GetInstantByIndex('5-Fold')) # for total, num in faps.RunWithoutCV(train, store_folder=r'..\..\Demo\db2-1'): # print(total, num) for total, num, group in faps.RunWithCV(train, store_folder=r'..\..\Demo\db1'): print(total, num, group) for total, num in faps.MergeCvResult(store_folder=r'..\..\Demo\db2-1'):
class PrepareConnection(QWidget, Ui_Prepare):
    '''
    Widget to load a feature csv file, show it in a table, remove or export the invalid
    values, and save the data, optionally separated randomly (with an optional clinical
    reference) or by a testing reference.
    '''
    close_signal = pyqtSignal(bool)

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)
        self.data_container = DataContainer()
        self._filename = os.path.split(__file__)[-1]

        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

        self.__testing_ref_data_container = DataContainer()
        self.__clinical_ref = pd.DataFrame()

        self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
        self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
        self.checkUseClinicRef.clicked.connect(self.RandomSeparateButtonUpdates)
        self.loadTestingReference.clicked.connect(
            self.LoadTestingReferenceDataContainer)
        self.clearTestingReference.clicked.connect(
            self.ClearTestingReferenceDataContainer)
        self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
        self.clearClinicRef.clicked.connect(self.ClearClinicalRef)

        self.buttonSave.clicked.connect(self.CheckAndSave)

    def closeEvent(self, QCloseEvent):
        '''Emit close_signal(True) and accept the close event.'''
        self.close_signal.emit(True)
        QCloseEvent.accept()

    def UpdateTable(self):
        '''
        Fill the table and the information text from the frame of the data container.
        At most 100 columns are shown; when the frame has more, the 100th shown column is
        a '...' placeholder.
        '''
        frame = self.data_container.GetFrame()
        row_count, total_col = frame.shape

        header_name = list(frame.columns)
        # Builtin min keeps a plain int for the Qt calls. Only truncate when there really
        # are more than 100 columns, so that a frame of exactly 100 columns is shown fully.
        is_truncated = total_col > 100
        min_col = min(total_col, 100)
        if is_truncated:
            header_name = header_name[:100]
            header_name[-1] = '...'

        self.tableFeature.setRowCount(row_count)
        self.tableFeature.setColumnCount(min_col)
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, frame.index)))

        for row_index in range(row_count):
            for col_index in range(min_col):
                if is_truncated and col_index == 99:
                    cell_text = '...'
                else:
                    cell_text = str(frame.iloc[row_index, col_index])
                self.tableFeature.setItem(row_index, col_index,
                                          QTableWidgetItem(cell_text))

        text = "The number of cases: {:d}\n".format(row_count)

        # To process Label temporally: the label column may be named 'label' or 'Label'.
        if 'label' in frame.columns:
            label_name = 'label'
        elif 'Label' in frame.columns:
            label_name = 'Label'
        else:
            label_name = ''

        # The label column is not a feature.
        feature_number = total_col - 1 if label_name else total_col
        text += "The number of features: {:d}\n".format(feature_number)

        if label_name:
            # np.int was only an alias of the builtin int and is gone from NumPy >= 1.24.
            labels = np.asarray(frame[label_name].values, dtype=int)
            if len(np.unique(labels)) == 2:
                # The larger of the two label values is counted as positive.
                positive_number = len(np.where(labels == np.max(labels))[0])
                negative_number = len(labels) - positive_number
                text += "The number of positive samples: {:d}\n".format(positive_number)
                text += "The number of negative samples: {:d}\n".format(negative_number)
        self.textInformation.setText(text)

    def SetButtonsState(self, state):
        '''Enable or disable the widgets that need loaded data.'''
        self.buttonRemoveAndExport.setEnabled(state)
        self.buttonSave.setEnabled(state)
        self.checkExport.setEnabled(state)
        self.radioRemoveNone.setEnabled(state)
        self.radioRemoveNonvalidCases.setEnabled(state)
        self.radioRemoveNonvalidFeatures.setEnabled(state)
        self.radioSplitRandom.setEnabled(state)
        self.radioSplitRef.setEnabled(state)

    def LoadData(self):
        '''Ask for a csv file and load it; on success refresh the table and enable the buttons.'''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            return

        try:
            if self.data_container.Load(file_name, is_update=True):
                self.UpdateTable()
                self.SetButtonsState(True)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load CSV Error: {}'.format(reason))
            QMessageBox.about(self, 'Load data Error', str(reason))
            print('Error!' + str(reason))
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'Open CSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')

    def LoadTestingReferenceDataContainer(self):
        '''Ask for a csv file and load it as the testing reference used for the separation.'''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            return

        try:
            self.__testing_ref_data_container.Load(file_name)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(True)
            self.spinBoxSeparate.setEnabled(False)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load Testing Ref Error: {}'.format(reason))
            print('Error!' + str(reason))
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'Open CSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')

    def ClearTestingReferenceDataContainer(self):
        '''Drop the testing reference and allow loading another one.'''
        self.__testing_ref_data_container = DataContainer()
        self.loadTestingReference.setEnabled(True)
        self.clearTestingReference.setEnabled(False)
        self.spinBoxSeparate.setEnabled(False)

    def LoadClinicalRef(self):
        '''
        Ask for a csv file of clinical features and keep it as clinical reference.
        The reference is only stored when its index equals the case names of the data;
        a rejected file leaves the previous reference untouched.

        :return: None
        '''
        dlg = QFileDialog()
        file_name, _ = dlg.getOpenFileName(self, 'Open SCV file',
                                           filter="csv files (*.csv)")
        if not file_name:
            return None

        try:
            # Validate a local frame first, so that a mismatched file is never left in
            # self.__clinical_ref (where a non-empty frame counts as a loaded reference).
            clinical_ref = pd.read_csv(file_name, index_col=0)
            if list(clinical_ref.index) != list(self.data_container.GetCaseName()):
                QMessageBox.information(
                    self, 'Error',
                    'The index of clinical features is not consistent to the data')
                return None
            self.__clinical_ref = clinical_ref
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(True)
        except OSError as reason:
            eclog(self._filename).GetLogger().error(
                'Load Clinical Ref Error: {}'.format(reason))
            QMessageBox.information(self, 'Error', 'Can not Open the Files')
        except ValueError:
            eclog(self._filename).GetLogger().error(
                'OpenCSV Error: {}'.format(file_name))
            QMessageBox.information(self, 'Error',
                                    'The selected data file mismatch.')
        return None

    def ClearClinicalRef(self):
        '''Drop the clinical reference and allow loading another one.'''
        self.__clinical_ref = pd.DataFrame()
        self.loadClinicRef.setEnabled(True)
        self.clearClinicRef.setEnabled(False)

    def RemoveInvalidValue(self):
        '''
        Call RemoveInvalid on the data with the mode of the checked radio button and
        refresh the table. When the export check box is checked the user picks a folder
        and 'invalid_feature.csv' inside it is passed as store path; cancelling that
        dialog cancels the whole action.
        '''
        if self.data_container.IsEmpty():
            return

        store_path = ''
        if self.checkExport.isChecked():
            folder_name = QFileDialog.getExistingDirectory(self, "Save Invalid data")
            if not folder_name:
                # Cancelled: do not fall back to writing into the working directory.
                return
            store_path = os.path.join(folder_name, 'invalid_feature.csv')

        # The radio buttons are alternatives, so exactly one mode is applied.
        if self.radioRemoveNone.isChecked():
            self.data_container.RemoveInvalid(store_path=store_path,
                                              remove_index=REMOVE_NONE)
        elif self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveInvalid(store_path=store_path,
                                              remove_index=REMOVE_CASE)
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveInvalid(store_path=store_path,
                                              remove_index=REMOVE_FEATURE)
        self.UpdateTable()

    def ChangeSeparateMethod(self):
        '''Enable the widgets that belong to the checked separation method.'''
        if self.radioSplitRandom.isChecked():
            self.spinBoxSeparate.setEnabled(True)
            self.checkUseClinicRef.setEnabled(True)
            self.loadTestingReference.setEnabled(False)
            self.clearTestingReference.setEnabled(False)
        elif self.radioSplitRef.isChecked():
            self.spinBoxSeparate.setEnabled(False)
            self.checkUseClinicRef.setEnabled(False)
            # Either load or clear is available, depending on whether a reference exists.
            has_reference = not self.__testing_ref_data_container.IsEmpty()
            self.loadTestingReference.setEnabled(not has_reference)
            self.clearTestingReference.setEnabled(has_reference)
        self.RandomSeparateButtonUpdates()

    def RandomSeparateButtonUpdates(self):
        '''Enable the load/clear buttons of the clinical reference according to the check box.'''
        if self.checkUseClinicRef.isChecked():
            has_clinical_ref = self.__clinical_ref.size > 0
            self.loadClinicRef.setEnabled(not has_clinical_ref)
            self.clearClinicRef.setEnabled(has_clinical_ref)
        else:
            self.loadClinicRef.setEnabled(False)
            self.clearClinicRef.setEnabled(False)

    def _FocusCell(self, row, column):
        '''Select one table cell, temporarily switching the edit trigger to CurrentChanged.'''
        old_edit_triggers = self.tableFeature.editTriggers()
        self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
        self.tableFeature.setCurrentCell(row, column)
        self.tableFeature.setEditTriggers(old_edit_triggers)

    def CheckAndSave(self):
        '''
        Check the data and save it.

        Empty data, a non-binary label, or invalid numbers are rejected with a warning
        (the offending cell is selected in the table). Otherwise the same-value features
        are removed, and the data is either separated into a chosen folder or saved as
        one csv file.
        '''
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data", QMessageBox.Ok)
            return

        if not self.data_container.IsBinaryLabel():
            QMessageBox.warning(self, "Warning", "There are not 2 Labels", QMessageBox.Ok)
            self._FocusCell(self.data_container.FindInvalidLabelIndex(), 0)
            return

        if self.data_container.HasInvalidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items", QMessageBox.Ok)
            non_valid_number_index = self.data_container.FindInvalidNumberIndex()
            self._FocusCell(non_valid_number_index[0], non_valid_number_index[1] + 1)
            return

        remove_features_with_same_value = RemoveSameFeatures()
        self.data_container = remove_features_with_same_value.Run(self.data_container)

        if not (self.radioSplitRandom.isChecked() or self.radioSplitRef.isChecked()):
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name:
                self.data_container.Save(file_name)
            return

        folder_name = QFileDialog.getExistingDirectory(self, "Save data")
        if folder_name == '':
            return

        data_separate = DataSeparate.DataSeparate()
        try:
            # NOTE(review): the method is chosen by whether a testing reference is loaded,
            # not by the checked radio button - confirm this is intended.
            if self.__testing_ref_data_container.IsEmpty():
                testing_data_percentage = self.spinBoxSeparate.value()
                if self.__clinical_ref.size == 0:
                    training_data_container, _ = data_separate.RunByTestingPercentage(
                        self.data_container, testing_data_percentage,
                        store_folder=folder_name)
                else:
                    training_data_container, _ = data_separate.RunByTestingPercentage(
                        self.data_container, testing_data_percentage,
                        clinic_df=self.__clinical_ref, store_folder=folder_name)
            else:
                training_data_container, _ = data_separate.RunByTestingReference(
                    self.data_container, self.__testing_ref_data_container,
                    folder_name)
                if training_data_container.IsEmpty():
                    QMessageBox.information(
                        self, 'Error',
                        'The testing data does not match, please check the testing data '
                        'really exists in current data')
                    return None

            # Show the result folder. os.startfile only exists on Windows and takes the
            # path as one argument, so no shell command is built from the folder name.
            if hasattr(os, 'startfile'):
                os.startfile(os.path.normpath(folder_name))
        except Exception as e:
            # Boundary of a GUI slot: log and show the reason instead of crashing the widget.
            content = 'PrepareConnection, splitting failed: '
            eclog(self._filename).GetLogger().error('Split Error: ' + str(e))
            QMessageBox.about(self, content, str(e))
training_index_list.append(index) train_data_container = self.__SetNewData(data_container, training_index_list) test_data_container = self.__SetNewData(data_container, testing_index_list) if store_folder: train_data_container.Save( os.path.join(store_folder, 'train_numeric_feature.csv')) test_data_container.Save( os.path.join(store_folder, 'test_numeric_feature.csv')) return train_data_container, test_data_container if __name__ == '__main__': data = DataContainer() data.Load(r'..\..\Example\numeric_feature.csv') data_separator = DataSeparate() data_separator.Run(data, store_folder=r'..\..\Example\separate_test') ref_data_container = DataContainer() ref_data_container.Load( r'..\..\Example\separate_test\train_numeric_feature.csv') data_separator.training_ref_data_container = ref_data_container data_separator.Run(data, store_folder=r'..\..\Example\separate_test\reload')
FeatureSelectByPCA() ])) # Set Classifier List classifier_list = [] classifier_list.append(SVM()) classifier_list.append(AE(max_iter=1000)) classifier_list.append(RandomForest()) classifier_list.append(LDA()) cv = CrossValidationOnFeatureNumber('5-folder') data_container = DataContainer() if os.path.exists(r'Example\numeric_feature.csv'): data_path = r'Example\numeric_feature.csv' # Run by Console elif os.path.exists(r'numeric_feature.csv'): data_path = r'numeric_feature.csv' # Run by PyCharm data_container.Load(data_path) data_container.UsualAndL2Normalize() fae = FeatureAnalysisExplore(feature_selector_list=feature_selector_list, classifier_list=classifier_list, cv=cv, max_feature_number=20) if os.path.exists(r'Result'): store_path = r'Result' # Run By PyCharm elif os.path.exists(r'Example\Result'): store_path = r'Example\Result' # Run By Console fae.Run(data_container, store_folder=store_path)
testing_index_list.append(index) else: training_index_list.append(index) train_data_container = self.__SetNewData(data_container, training_index_list) test_data_container = self.__SetNewData(data_container, testing_index_list) if store_folder: train_data_container.Save( os.path.join(store_folder, 'train_numeric_feature.csv')) test_data_container.Save( os.path.join(store_folder, 'test_numeric_feature.csv')) return train_data_container, test_data_container if __name__ == '__main__': clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0) container = DataContainer() container.Load(r'..\..\Demo\simulated_feature.csv') separator = DataSeparate() train, test = separator.RunByTestingPercentage(container, 0.3, clinic_df=clinics) print(train.GetArray().shape, test.GetArray().shape) print(separator.clinic_split_result)
self.__cv.SetClassifier(self.__classifier) train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run(raw_train_data_container, raw_test_data_conainer, store_folder, is_hyper_parameter) if store_folder: self.SavePipeline(len(raw_train_data_container.GetFeatureName()), os.path.join(store_folder, 'pipeline_info.csv')) return train_cv_metric, val_cv_metric, test_metric, all_train_metric if __name__ == '__main__': index_dict = Index2Dict() train = DataContainer() test = DataContainer() train.Load(r'..\..\Demo\zero_center_normalized_training_feature.csv') test.Load(r'..\..\Demo\zero_center_normalized_testing_feature.csv') faps = FeatureAnalysisPipelines(balancer=index_dict.GetInstantByIndex('NoneBalance'), normalizer_list=[index_dict.GetInstantByIndex('None')], dimension_reduction_list=[index_dict.GetInstantByIndex('PCC')], feature_selector_list=[index_dict.GetInstantByIndex('RFE')], feature_selector_num_list=[15], classifier_list=[index_dict.GetInstantByIndex('LR')], cross_validation=index_dict.GetInstantByIndex('5-Folder')) for temp in faps.Run(train, test, store_folder=r'..\..\Demo\db2-2'): print(temp) print('Done')