def RunByTestingReference(self, data_container, testing_ref_data_container, store_folder=''):
    '''Split data_container into training/testing parts according to the case
    names listed in testing_ref_data_container.

    :param data_container: DataContainer holding all cases.
    :param testing_ref_data_container: DataContainer whose case names define the testing set.
    :param store_folder: If non-empty, the split containers are saved there as csv files.
    :return: (train_data_container, test_data_container); a pair of empty
        DataContainers when the reference cases are not a subset of data_container.
    '''
    training_index_list, testing_index_list = [], []
    # TODO: assert data_container include all cases which is in the training_ref_data_container.
    all_name_list = data_container.GetCaseName()
    testing_name_list = testing_ref_data_container.GetCaseName()

    # Every reference testing case must exist in the full data set.
    # (Sets give O(1) membership instead of the previous O(n) list scans.)
    all_name_set = set(all_name_list)
    for testing_name in testing_name_list:
        if testing_name not in all_name_set:
            print(
                'The data container and the training data container are not consistent.'
            )
            return DataContainer(), DataContainer()

    # Partition indices: cases named in the reference go to testing.
    testing_name_set = set(testing_name_list)
    for index, name in enumerate(all_name_list):
        if name in testing_name_set:
            testing_index_list.append(index)
        else:
            training_index_list.append(index)

    train_data_container = self.__SetNewData(data_container, training_index_list)
    test_data_container = self.__SetNewData(data_container, testing_index_list)

    if store_folder:
        train_data_container.Save(
            os.path.join(store_folder, 'train_numeric_feature.csv'))
        test_data_container.Save(
            os.path.join(store_folder, 'test_numeric_feature.csv'))

    return train_data_container, test_data_container
def DrawFeatureRelationshipAccordingToCsvFile(file_path, selected_feature_name_list, label_name_list, store_path=''):
    '''
    Help draw the feature relationship among different features according to
    the path of the data container.

    :param file_path: the file path of the csv file
    :param selected_feature_name_list: the features that would be drawn (1 to 3 features)
    :param label_name_list: the name of the label. e.g. ['non-cancer', 'cancer']
    :param store_path: The store path, supporting jpg and eps format.
    :return: None
    '''
    data_container = DataContainer()
    data_container.Load(file_path)
    data_container.UsualNormalize()
    data, label, feature_name, case_name = data_container.GetData()

    if len(selected_feature_name_list) > 3 or len(selected_feature_name_list) < 1:
        print(
            "Please check the length of the feature list. It can only show the relationship of the 1, 2, or 3 features"
        )
        # Fix: previously fell through and tried to draw an invalid feature count.
        return

    try:
        index = [feature_name.index(t) for t in selected_feature_name_list]
    except ValueError:
        # list.index raises ValueError for a name that is not present.
        print('The selected feature is not in the data container.')
        return

    result_data = [data[:, sub_index] for sub_index in index]

    DrawValueRelationship(result_data, selected_feature_name_list, label,
                          label_name_list, store_path)
def GenerateDescription():
    """Build the example report: load the demo features and a stored pipeline,
    then render the description into the report folder."""
    container = DataContainer()
    container.Load(r'..\..\Example\numeric_feature.csv')

    pipeline = OnePipeline()
    pipeline.LoadPipeline(r'C:\MyCode\FAEGitHub\FAE\Example\report_temp\NormUnit_Cos_ANOVA_5_SVM\pipeline_info.csv')

    report = Description()
    report.Run(container, pipeline,
               r'..\..\Example\report_temp',
               r'..\..\Example\report')
def OneHotOneColumn(self, data_container, feature_list):
    """Return a new DataContainer with the requested columns one-hot encoded."""
    frame = data_container.GetFrame()
    known_features = data_container.GetFeatureName()

    # Every requested column must already exist as a feature.
    for candidate in feature_list:
        assert (candidate in known_features)

    encoded_frame = pd.get_dummies(frame, columns=feature_list)

    encoded_container = DataContainer()
    encoded_container.SetFrame(encoded_frame)
    return encoded_container
def __init__(self, parent=None):
    # Qt widget constructor: build the generated UI and wire button signals.
    # NOTE: setupUi must run first so the widgets referenced below exist.
    super(PrepareConnection, self).__init__(parent)
    self.setupUi(self)

    # Holds the feature table currently loaded by the user.
    self.data_container = DataContainer()

    self.buttonLoad.clicked.connect(self.LoadData)
    self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
    self.checkSeparate.clicked.connect(self.SetSeparateStatus)
    # The separate-percentage spin box stays disabled until separation is enabled.
    self.spinBoxSeparate.setEnabled(False)
    self.buttonSave.clicked.connect(self.CheckAndSave)
def Run(self, data_container, store_folder=''):
    """Drop every non-numeric (object dtype) column and return the result.

    When store_folder is an existing directory, the filtered features and the
    selection info are also written there as csv files.
    """
    numeric_frame = data_container.GetFrame().select_dtypes(include=None, exclude=['object'])

    result = DataContainer()
    result.SetFrame(numeric_frame)

    if store_folder and os.path.isdir(store_folder):
        result.Save(os.path.join(store_folder, 'numeric_feature.csv'))
        SaveSelectInfo(result,
                       os.path.join(store_folder, 'feature_select_info.csv'),
                       is_merge=False)

    return result
def __SetNewData(self, data_container, case_index):
    """Build a DataContainer restricted to the cases at the given indices."""
    array, label, feature_name, case_name = data_container.GetData()

    sub_container = DataContainer(
        array=array[case_index, :],
        label=label[case_index],
        case_name=[case_name[i] for i in case_index],
        feature_name=feature_name)
    # Keep the internal DataFrame in sync with the sliced arrays.
    sub_container.UpdateFrameByData()
    return sub_container
def Run(self, store_path):
    """Resample the stored container with the wrapped model and return a new
    container of synthetic cases; save it when store_path is non-empty."""
    data, label, feature_name, label_name = self.GetDataContainer().GetData()
    data_resampled, label_resampled = self.__model.fit_sample(data, label)

    # Synthetic rows do not map to real cases, so give them generated names.
    case_names = ['Generate' + str(row) for row in range(data_resampled.shape[0])]

    resampled_container = DataContainer(
        data_resampled, label_resampled,
        self.GetDataContainer().GetFeatureName(), case_names)

    if store_path != '':
        resampled_container.Save(store_path)
    return resampled_container
def RunOneModel(self, data_container, feature_selector, classifier, cv, test_data_container=None, store_folder=''):
    '''
    Run one feature-selector/classifier combination through cross validation.

    :param data_container: The implement of the DataContainer.
    :param feature_selector: The implement of the FeatureSelector.
    :param classifier: The implement of the Classifier
    :param cv: The implement of the CrossValidation
    :param test_data_container: Optional independent testing DataContainer.
        Defaults to None, which is replaced by a fresh empty DataContainer.
        (The previous `=DataContainer()` default was a shared mutable default
        argument, built once at definition time.)
    :param store_folder: The path of the store folder.
    :return: The metric of the validation data and of the testing data.
    '''
    if test_data_container is None:
        test_data_container = DataContainer()

    feature_selector.SetDataContainer(data_container)
    selected_data_container = feature_selector.Run(store_folder)

    cv.SetClassifier(classifier)
    cv.SetDataContainer(selected_data_container)
    train_metric, val_metric, test_metric = cv.Run(
        data_container,
        test_data_container=test_data_container,
        store_folder=store_folder)

    return val_metric, test_metric
def Run(self, store_path=''):
    """Randomly over-sample the stored data container and return the result;
    save it when store_path is non-empty."""
    data, label, feature_name, label_name = self.GetDataContainer().GetData()

    sampler = RandomOverSampler(random_state=0)
    data_resampled, label_resampled = sampler.fit_sample(data, label)

    # Recover the original case name for every (possibly duplicated) row.
    case_names = [
        self.GetCaseNameFromAllCaseNames(data_resampled[row, :])
        for row in range(data_resampled.shape[0])
    ]

    resampled_container = DataContainer(
        data_resampled, label_resampled,
        self.GetDataContainer().GetFeatureName(), case_names)

    if store_path != '':
        resampled_container.Save(store_path)
    return resampled_container
def Run(self, data_container, store_path=''):
    """SMOTE-style resample of data_container; optionally save the result.

    A directory store_path gets a default 'smote_features.csv' file name,
    otherwise store_path is used as the file path directly.
    """
    data, label, feature_name, label_name = data_container.GetData()
    data_resampled, label_resampled = self.__model.fit_sample(data, label)

    # Synthetic rows do not correspond to real cases; generate names for them.
    case_names = ['Generate' + str(row) for row in range(data_resampled.shape[0])]

    resampled_container = DataContainer(data_resampled, label_resampled,
                                        data_container.GetFeatureName(),
                                        case_names)

    if store_path != '':
        if os.path.isdir(store_path):
            resampled_container.Save(os.path.join(store_path, 'smote_features.csv'))
        else:
            resampled_container.Save(store_path)
    return resampled_container
def GenerateTrainingAndTestingData(csv_file_path, training_index=None, testing_percentage=0.3, is_store_index=False):
    '''
    Separate the data container into a training part and a testing part.

    :param csv_file_path: The file path of the data container
    :param training_index: The index of the training data set. This is usually
        to compare with different combination of the sequences. Default is
        None, which is treated as an empty list. (The previous `=[]` default
        was a mutable default argument.)
    :param testing_percentage: The percentage of data set is used to separate
        for testing data set. Default is 30%
    :param is_store_index: To store or not. Default is False.
    :return: None. The split containers are written to 'training/feature.csv'
        and 'testing/feature.csv' next to csv_file_path.
    '''
    if training_index is None:
        training_index = []

    data_container = DataContainer()
    data, label, feature_name, case_name = data_container.LoadAndGetData(csv_file_path)

    # Create the output folders next to the source csv.
    folder_path = os.path.split(csv_file_path)[0]
    training_folder = os.path.join(folder_path, 'training')
    testing_folder = os.path.join(folder_path, 'testing')
    if not os.path.exists(training_folder):
        os.mkdir(training_folder)
    if not os.path.exists(testing_folder):
        os.mkdir(testing_folder)

    if is_store_index:
        store_folder = os.path.split(csv_file_path)[0]
        # A bare file name has no directory part; fall back to the cwd.
        if store_folder == '':
            store_folder = os.path.abspath('')
    else:
        store_folder = ''

    output = SeperateDataToTrainingAndTesting(
        data, testing_percentage, label,
        training_index_list=training_index,
        store_folder=store_folder)

    training_data_contrainer = DataContainer(
        output['training_data'], output['training_label'], feature_name,
        [case_name[temp] for temp in output['training_index']])
    training_data_contrainer.Save(os.path.join(training_folder, 'feature.csv'))

    testing_data_contrainer = DataContainer(
        output['testing_data'], output['testing_label'], feature_name,
        [case_name[temp] for temp in output['testing_index']])
    testing_data_contrainer.Save(os.path.join(testing_folder, 'feature.csv'))
def Run(self, data_container, test_data_container=None, store_folder=''):
    '''
    Evaluate every feature-selector/classifier combination with the configured
    cross validation and collect the summary metrics per model into csv files.

    :param data_container: DataContainer used for cross validation.
    :param test_data_container: Optional independent testing DataContainer.
        Defaults to None, replaced by a fresh empty DataContainer. (The
        previous `=DataContainer()` default was a shared mutable default
        argument.)
    :param store_folder: Folder where per-model results and the summary csv
        files ('val_result.csv', 'test_result.csv') are stored.
    '''
    if test_data_container is None:
        test_data_container = DataContainer()

    column_list = [
        'sample_number', 'positive_number', 'negative_number', 'auc',
        'auc 95% CIs', 'accuracy', 'feature_number', 'Yorden Index',
        'sensitivity', 'specificity', 'positive predictive value',
        'negative predictive value'
    ]
    df = pd.DataFrame(columns=column_list)
    test_df = pd.DataFrame(columns=column_list)

    for feature_selector in self.__feature_selector_list:
        for classifier in self.__classifier_list:
            print(feature_selector.GetName() + '-' + classifier.GetName() + ':')
            self.__cv.SetClassifier(classifier)
            self.__cv.SetFeatureSelector(feature_selector)
            self.__cv.SetMaxFeatureNumber(self.__max_feature_number)

            model_store_folder = os.path.join(
                store_folder,
                feature_selector.GetName() + '-' + classifier.GetName())
            if not os.path.exists(model_store_folder):
                os.mkdir(model_store_folder)

            val_return_list, test_return_list = self.__cv.Run(
                data_container,
                test_data_container=test_data_container,
                store_folder=model_store_folder,
                metric_name_list=('auc', 'accuracy'))

            if store_folder and os.path.isdir(store_folder):
                # The first returned entry corresponds to the 'auc' metric.
                val_auc_info = val_return_list[0]
                store_path = os.path.join(store_folder, 'val_result.csv')
                save_info = [val_auc_info[index] for index in column_list]
                df.loc[feature_selector.GetName() + '-' + classifier.GetName()] = save_info
                # Re-written after every model so partial results survive a crash.
                df.to_csv(store_path)

                if test_data_container.GetArray().size > 0:
                    test_auc_info = test_return_list[0]
                    test_store_path = os.path.join(store_folder, 'test_result.csv')
                    test_save_info = [
                        test_auc_info[index] for index in column_list
                    ]
                    test_df.loc[feature_selector.GetName() + '-' + classifier.GetName()] = test_save_info
                    test_df.to_csv(test_store_path)
def Generate(self, data_container):
    """Yield (train_container, val_container) pairs, one per CV fold."""
    array = data_container.GetArray()
    label = data_container.GetLabel()
    feature_name = data_container.GetFeatureName()
    case_name = data_container.GetCaseName()

    for train_index, val_index in self._cv.split(array, label):
        train_fold = DataContainer(
            array=array[train_index, :],
            label=label[train_index],
            feature_name=feature_name,
            case_name=[case_name[i] for i in train_index])
        val_fold = DataContainer(
            array=array[val_index, :],
            label=label[val_index],
            feature_name=feature_name,
            case_name=[case_name[i] for i in val_index])
        yield (train_fold, val_fold)
def Run(self, data_container, store_path=''):
    """Randomly over-sample data_container; optionally save the result.

    A directory store_path gets the default file name
    'upsampling_features.csv'; otherwise store_path is the file path itself.
    """
    data, label, feature_name, label_name = data_container.GetData()

    sampler = RandomOverSampler(random_state=0)
    data_resampled, label_resampled = sampler.fit_sample(data, label)

    # Map every resampled row back to the case it was copied from.
    case_names = [
        self.GetCaseNameFromAllCaseNames(data_container, data_resampled[row, :])
        for row in range(data_resampled.shape[0])
    ]

    resampled_container = DataContainer(data_resampled, label_resampled,
                                        data_container.GetFeatureName(),
                                        case_names)

    if store_path != '':
        if os.path.isdir(store_path):
            resampled_container.Save(
                os.path.join(store_path, 'upsampling_features.csv'))
        else:
            resampled_container.Save(store_path)
    return resampled_container
def Run(self, data_container, store_path=''):
    """Resample data_container with the configured model; optionally save.

    A directory store_path gets the file name '<name>_features.csv'; any
    other non-empty store_path is used as the file path directly.
    """
    data, label, feature_name, label_name = data_container.GetData()
    data_resampled, label_resampled = self._model.fit_sample(data, label)

    # Map every resampled row back to the case it was derived from.
    case_names = [
        self.GetCaseNameFromAllCaseNames(data_container, data_resampled[row, :])
        for row in range(data_resampled.shape[0])
    ]

    resampled_container = DataContainer(data_resampled, label_resampled,
                                        data_container.GetFeatureName(),
                                        case_names)

    if store_path != '':
        if os.path.isdir(store_path):
            resampled_container.Save(
                os.path.join(store_path, '{}_features.csv'.format(self._name)))
        else:
            resampled_container.Save(store_path)
    return resampled_container
def __init__(self, parent=None):
    # Qt widget constructor: build the generated UI and wire signals.
    # NOTE: setupUi must run first so the widgets referenced below exist.
    super(PrepareConnection, self).__init__(parent)
    self.setupUi(self)

    # Holds the feature table currently loaded by the user.
    self.data_container = DataContainer()

    self.buttonLoad.clicked.connect(self.LoadData)
    self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
    self.loadTestingReference.clicked.connect(
        self.LoadTestingReferenceDataContainer)
    self.clearTestingReference.clicked.connect(
        self.ClearTestingReferenceDataContainer)
    # Empty until the user loads a reference split to reproduce.
    self.__testing_ref_data_container = DataContainer()

    self.checkSeparate.clicked.connect(self.SetSeparateStatus)
    # The separate-percentage spin box stays disabled until separation is enabled.
    self.spinBoxSeparate.setEnabled(False)

    self.logger = eclog(os.path.split(__file__)[-1]).GetLogger()

    # Reference buttons stay disabled until the separate option is checked.
    self.loadTestingReference.setEnabled(False)
    self.clearTestingReference.setEnabled(False)
    self.buttonSave.clicked.connect(self.CheckAndSave)
def Run(self, train_data_container, test_data_container=DataContainer(), store_folder='', is_hyper_parameter=False):
    '''Run the full pipeline: normalization -> dimension reduction -> feature
    selection -> cross validation; each fitted step is also applied to the
    independent testing container when one is provided.

    NOTE(review): `test_data_container=DataContainer()` is a mutable default
    argument shared across calls — confirm the default instance is never
    mutated (the body only deep-copies it).

    :param train_data_container: DataContainer used for training/CV.
    :param test_data_container: Optional independent testing DataContainer.
    :param store_folder: If non-empty, intermediate results and the pipeline
        info are stored here (the folder is created when missing).
    :param is_hyper_parameter: Forwarded to the CV step.
    :return: (train_cv_metric, val_cv_metric, test_metric, all_train_metric)
    '''
    # Work on deep copies so the caller's containers are never modified.
    raw_train_data_container = deepcopy(train_data_container)
    raw_test_data_conainer = deepcopy(test_data_container)
    if store_folder:
        if not os.path.exists(store_folder):
            os.mkdir(store_folder)

    # Sanity check only: execution continues even when this prints.
    if not (self.__cv and self.__classifier):
        print('Give CV method and classifier')

    if self.__normalizer:
        raw_train_data_container = self.__normalizer.Run(
            raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            # is_test=True: apply the normalization fitted on training data.
            raw_test_data_conainer = self.__normalizer.Run(
                raw_test_data_conainer, store_folder, is_test=True)

    if self.__dimension_reduction:
        raw_train_data_container = self.__dimension_reduction.Run(
            raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            raw_test_data_conainer = self.__dimension_reduction.Transform(
                raw_test_data_conainer)

    if self.__feature_selector:
        raw_train_data_container = self.__feature_selector.Run(
            raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            # Restrict the testing data to the features selected on training data.
            selected_feature_name = raw_train_data_container.GetFeatureName(
            )
            fs = FeatureSelector()
            raw_test_data_conainer = fs.SelectFeatureByName(
                raw_test_data_conainer, selected_feature_name)

    self.__cv.SetClassifier(self.__classifier)
    train_cv_metric, val_cv_metric, test_metric, all_train_metric = self.__cv.Run(
        raw_train_data_container, raw_test_data_conainer, store_folder,
        is_hyper_parameter)

    if store_folder:
        self.SavePipeline(len(raw_train_data_container.GetFeatureName()),
                          os.path.join(store_folder, 'pipeline_info.csv'))

    return train_cv_metric, val_cv_metric, test_metric, all_train_metric
def __init__(self, parent=None):
    # Qt widget constructor: build the generated UI and wire signals.
    # NOTE: setupUi must run first so the widgets referenced below exist.
    super(PrepareConnection, self).__init__(parent)
    self.setupUi(self)

    # Holds the feature table currently loaded by the user.
    self.data_container = DataContainer()
    self._filename = os.path.split(__file__)[-1]

    self.buttonLoad.clicked.connect(self.LoadData)
    self.buttonRemoveAndExport.clicked.connect(self.RemoveInvalidValue)

    # Empty until the user loads a reference split / clinical reference file.
    self.__testing_ref_data_container = DataContainer()
    self.__clinical_ref = pd.DataFrame()

    # Two mutually exclusive split modes: random vs. by reference file.
    self.radioSplitRandom.clicked.connect(self.ChangeSeparateMethod)
    self.radioSplitRef.clicked.connect(self.ChangeSeparateMethod)
    self.checkUseClinicRef.clicked.connect(
        self.RandomSeparateButtonUpdates)
    self.loadTestingReference.clicked.connect(
        self.LoadTestingReferenceDataContainer)
    self.clearTestingReference.clicked.connect(
        self.ClearTestingReferenceDataContainer)
    self.loadClinicRef.clicked.connect(self.LoadClinicalRef)
    self.clearClinicRef.clicked.connect(self.ClearClinicalRef)
    self.buttonSave.clicked.connect(self.CheckAndSave)
def TestNewData(NewDataCsv, model_folder, result_save_path=''):
    '''
    Apply a trained model to a new radiomics feature matrix.

    :param NewDataCsv: New radiomics feature matrix csv file path
    :param model_folder: The trained model path
    :param result_save_path: optional folder where predictions, labels and
        per-case results are stored
    :return: classification result metric dict
    '''
    train_info = LoadTrainInfo(model_folder)

    new_data_container = DataContainer()
    new_data_container.Load(NewDataCsv)
    # Apply the normalization that was fitted on the training data.
    new_data_container = train_info['normalizer'].Transform(new_data_container)

    # Model prediction: probability of the positive class.
    train_info['classifier'].SetDataContainer(new_data_container)
    model = train_info['classifier'].GetModel()
    predict = model.predict_proba(new_data_container.GetArray())[:, 1]

    label = new_data_container.GetLabel()
    case_name = new_data_container.GetCaseName()
    test_result_info = [['CaseName', 'Pred', 'Label']]
    for one_case_name, one_pred, one_label in zip(case_name, predict, label):
        test_result_info.append([one_case_name, one_pred, one_label])

    metric = EstimateMetirc(predict, label)
    info = {}
    info.update(metric)
    cv = CrossValidation()
    print(metric)
    print('\t')

    if result_save_path:
        cv.SaveResult(info, result_save_path)
        np.save(os.path.join(result_save_path, 'test_predict.npy'), predict)
        np.save(os.path.join(result_save_path, 'test_label.npy'), label)
        with open(os.path.join(result_save_path, 'test_info.csv'), 'w',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(test_result_info)

    return metric
def Run(self, train_data_container, test_data_container=DataContainer(), store_folder=''):
    '''Run the pipeline (normalization -> dimension reduction -> feature
    selection -> cross validation) on deep copies of the given containers.

    NOTE(review): `test_data_container=DataContainer()` is a mutable default
    argument shared across calls — confirm the default instance is never
    mutated (the body only deep-copies it).

    :param train_data_container: DataContainer used for training/CV.
    :param test_data_container: Optional independent testing DataContainer.
    :param store_folder: If non-empty, intermediate results are stored here
        (the folder is created when missing).
    :return: (train_metric, val_metric, test_metric)
    '''
    # Work on deep copies so the caller's containers are never modified.
    raw_train_data_container = deepcopy(train_data_container)
    raw_test_data_conainer = deepcopy(test_data_container)
    if store_folder:
        if not os.path.exists(store_folder):
            os.mkdir(store_folder)

    # Sanity check only: execution continues even when this prints.
    if not (self.__cv and self.__classifier):
        print('Give CV method and classifier')

    if self.__normalizer:
        raw_train_data_container = self.__normalizer.Run(
            raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            raw_test_data_conainer = self.__normalizer.Run(
                raw_test_data_conainer)

    if self.__dimension_reduction:
        raw_train_data_container = self.__dimension_reduction.Run(
            raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            raw_test_data_conainer = self.__dimension_reduction.Transform(
                raw_test_data_conainer)

    if self.__feature_selector:
        raw_train_data_container = self.__feature_selector.Run(
            raw_train_data_container, store_folder)
        if not test_data_container.IsEmpty():
            # Restrict the testing data to the features selected on training data.
            selected_feature_name = raw_train_data_container.GetFeatureName(
            )
            fs = FeatureSelector()
            raw_test_data_conainer = fs.SelectFeatureByName(
                raw_test_data_conainer, selected_feature_name)

    self.__cv.SetClassifier(self.__classifier)
    train_metric, val_metric, test_metric = self.__cv.Run(
        raw_train_data_container, raw_test_data_conainer, store_folder)

    return train_metric, val_metric, test_metric
def Run(self, data_container, test_data_container=DataContainer(), store_folder='', is_hyper_parameter=False):
    '''Cross-validate the classifier over every candidate hyper-parameter set,
    pick the parameters with the best validation AUC, refit on all training
    data, and (optionally) evaluate on the independent testing container.

    NOTE(review): `test_data_container=DataContainer()` is a mutable default
    argument shared across calls — confirm the default instance is never
    mutated.
    NOTE(review): the train/val prediction and label accumulators below are
    NOT reset inside the parameter loop, so metrics for later parameters also
    include the folds of earlier parameters — confirm this is intended.

    :param data_container: DataContainer used for CV and the final refit.
    :param test_data_container: Optional independent testing DataContainer.
    :param store_folder: If non-empty, predictions, per-fold csv files and the
        per-parameter AUC table are stored here.
    :param is_hyper_parameter: When True (and only the default parameter set
        is configured), candidate parameters are loaded from disk.
    :return: (train_cv_metric, val_cv_metric, test_metric, all_train_metric)
    '''
    train_pred_list, train_label_list, val_pred_list, val_label_list = [], [], [], []
    data = data_container.GetArray()
    label = data_container.GetLabel()
    case_name = data_container.GetCaseName()

    # Per-parameter summaries used to pick the best hyper-parameter set.
    param_metric_train_auc = []
    param_metric_val_auc = []
    param_all = []

    if len(self.classifier_parameter_list) == 1 and is_hyper_parameter:
        self.AutoLoadClassifierParameterList(
            relative_path=r'HyperParameters\Classifier')

    for parameter in self.classifier_parameter_list:
        # Fresh classifier per parameter set.
        self.SetDefaultClassifier()
        self.classifier.SetModelParameter(parameter)

        train_cv_info = [['CaseName', 'Group', 'Pred', 'Label']]
        val_cv_info = [['CaseName', 'Group', 'Pred', 'Label']]
        group_index = 0
        for train_index, val_index in self.__cv.split(data, label):
            group_index += 1
            train_data = data[train_index, :]
            train_label = label[train_index]
            val_data = data[val_index, :]
            val_label = label[val_index]

            self.classifier.SetData(train_data, train_label)
            self.classifier.Fit()

            train_prob = self.classifier.Predict(train_data)
            val_prob = self.classifier.Predict(val_data)

            # Record per-case predictions together with the fold index.
            for index in range(len(train_index)):
                train_cv_info.append([
                    case_name[train_index[index]],
                    str(group_index), train_prob[index], train_label[index]
                ])
            for index in range(len(val_index)):
                val_cv_info.append([
                    case_name[val_index[index]],
                    str(group_index), val_prob[index], val_label[index]
                ])

            train_pred_list.extend(train_prob)
            train_label_list.extend(train_label)
            val_pred_list.extend(val_prob)
            val_label_list.extend(val_label)

        total_train_label = np.asarray(train_label_list, dtype=np.uint8)
        total_train_pred = np.asarray(train_pred_list, dtype=np.float32)
        train_cv_metric = EstimateMetirc(total_train_pred, total_train_label,
                                         'train')
        total_val_label = np.asarray(val_label_list, dtype=np.uint8)
        total_val_pred = np.asarray(val_pred_list, dtype=np.float32)
        val_cv_metric = EstimateMetirc(total_val_pred, total_val_label, 'val')

        param_metric_train_auc.append(float(train_cv_metric['train_auc']))
        param_metric_val_auc.append(float(val_cv_metric['val_auc']))
        param_all.append({
            'total_train_label': total_train_label,
            'total_train_pred': total_train_pred,
            'train_metric': train_cv_metric,
            'train_cv_info': deepcopy(train_cv_info),
            'total_val_label': total_val_label,
            'total_val_pred': total_val_pred,
            'val_metric': val_cv_metric,
            'val_cv_info': deepcopy(val_cv_info)
        })

    # find the best parameter (highest validation AUC)
    index = np.argmax(param_metric_val_auc)
    total_train_label = param_all[index]['total_train_label']
    total_train_pred = param_all[index]['total_train_pred']
    train_cv_metric = param_all[index]['train_metric']
    train_cv_info = param_all[index]['train_cv_info']
    total_val_label = param_all[index]['total_val_label']
    total_val_pred = param_all[index]['total_val_pred']
    val_cv_metric = param_all[index]['val_metric']
    val_cv_info = param_all[index]['val_cv_info']

    # Refit the best configuration on the complete training data.
    self.SetDefaultClassifier()
    self.classifier.SetModelParameter(
        self.classifier_parameter_list[index])
    self.classifier.SetDataContainer(data_container)
    self.classifier.Fit()
    all_train_pred = self.classifier.Predict(data_container.GetArray())
    all_train_label = data_container.GetLabel()
    all_train_metric = EstimateMetirc(all_train_pred, all_train_label,
                                      'all_train')

    # Empty dict means "no independent testing data was supplied".
    test_metric = {}
    if test_data_container.GetArray().size > 0:
        test_data = test_data_container.GetArray()
        test_label = test_data_container.GetLabel()
        test_case_name = test_data_container.GetCaseName()
        test_pred = self.classifier.Predict(test_data)
        test_metric = EstimateMetirc(test_pred, test_label, 'test')

    if store_folder:
        if not os.path.exists(store_folder):
            os.mkdir(store_folder)

        # Save the Parameter:
        if self.classifier_parameter_list[0] != {}:
            with open(os.path.join(store_folder,
                                   'Classifier_Param_Result.csv'),
                      'w', newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['Param', 'Train AUC', 'Val AUC'])
                for param, param_index in zip(
                        self.classifier_parameter_list,
                        range(len(self.classifier_parameter_list))):
                    writer.writerow([
                        self._GetNameOfParamDict(param),
                        param_metric_train_auc[param_index],
                        param_metric_val_auc[param_index]
                    ])

        info = {}
        info.update(train_cv_metric)
        info.update(val_cv_metric)
        info.update(all_train_metric)

        np.save(os.path.join(store_folder, 'train_predict.npy'),
                total_train_pred)
        np.save(os.path.join(store_folder, 'train_label.npy'),
                total_train_label)
        np.save(os.path.join(store_folder, 'val_predict.npy'),
                total_val_pred)
        np.save(os.path.join(store_folder, 'val_label.npy'),
                total_val_label)
        np.save(os.path.join(store_folder, 'all_train_predict.npy'),
                all_train_pred)
        np.save(os.path.join(store_folder, 'all_train_label.npy'),
                all_train_label)

        with open(os.path.join(store_folder, 'train_cv5_info.csv'), 'w',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(train_cv_info)
        with open(os.path.join(store_folder, 'val_cv5_info.csv'), 'w',
                  newline='') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(val_cv_info)

        if test_data_container.GetArray().size > 0:
            info.update(test_metric)
            np.save(os.path.join(store_folder, 'test_predict.npy'),
                    test_pred)
            np.save(os.path.join(store_folder, 'test_label.npy'),
                    test_label)
            test_result_info = [['CaseName', 'Pred', 'Label']]
            for index in range(len(test_label)):
                test_result_info.append([
                    test_case_name[index], test_pred[index],
                    test_label[index]
                ])
            with open(os.path.join(store_folder, 'test_info.csv'), 'w',
                      newline='') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerows(test_result_info)

        self.classifier.Save(store_folder)
        self.SaveResult(info, store_folder)

    return train_cv_metric, val_cv_metric, test_metric, all_train_metric
os.path.join(store_folder, 'train_numeric_feature.csv')) df_training = pd.DataFrame( data=self._training_index, columns=['training_index'], index=[case_name[index] for index in self._training_index]) df_training.to_csv(os.path.join(store_folder, 'training_index.csv'), sep=',', quotechar='"') test_data_container.Save( os.path.join(store_folder, 'test_numeric_feature.csv')) df_testing = pd.DataFrame( data=testing_index_list, columns=['training_index'], index=[case_name[index] for index in testing_index_list]) df_testing.to_csv(os.path.join(store_folder, 'testing_index.csv'), sep=',', quotechar='"') return train_data_container, test_data_container if __name__ == '__main__': data = DataContainer() data.Load(r'..\..\Example\numeric_feature.csv') data_separator = DataSeparate() data_separator.Run(data, store_folder=r'..\..\Example')
def Run(self, data_container, test_data_container=DataContainer(), store_folder=''):
    '''Cross-validate the configured classifier on data_container, refit it on
    all data, and (optionally) evaluate on the independent testing container.

    NOTE(review): `test_data_container=DataContainer()` is a mutable default
    argument shared across calls — confirm the default instance is never
    mutated.

    :param data_container: DataContainer used for cross validation and refit.
    :param test_data_container: Optional independent testing DataContainer.
    :param store_folder: If non-empty, predictions, ROC plots, fold indices
        and the fitted classifier are saved here.
    :return: (train_metric, val_metric, test_metric); test_metric is {} when
        no testing data was supplied.
    '''
    train_pred_list, train_label_list, val_pred_list, val_label_list = [], [], [], []
    data = data_container.GetArray()
    label = data_container.GetLabel()

    # Records the order in which cases fell into validation folds.
    val_index_store = []
    for train_index, val_index in self.__cv.split(data, label):
        val_index_store.extend(val_index)
        train_data = data[train_index, :]
        train_label = label[train_index]
        val_data = data[val_index, :]
        val_label = label[val_index]

        self.__classifier.SetData(train_data, train_label)
        self.__classifier.Fit()

        train_prob = self.__classifier.Predict(train_data)
        val_prob = self.__classifier.Predict(val_data)

        train_pred_list.extend(train_prob)
        train_label_list.extend(train_label)
        val_pred_list.extend(val_prob)
        val_label_list.extend(val_label)

    total_train_label = np.asarray(train_label_list, dtype=np.uint8)
    total_train_pred = np.asarray(train_pred_list, dtype=np.float32)
    train_metric = EstimateMetirc(total_train_pred, total_train_label,
                                  'train')
    total_label = np.asarray(val_label_list, dtype=np.uint8)
    total_pred = np.asarray(val_pred_list, dtype=np.float32)
    val_metric = EstimateMetirc(total_pred, total_label, 'val')

    # Refit on the complete data set before testing/saving.
    self.__classifier.SetDataContainer(data_container)
    self.__classifier.Fit()

    # Empty dict means "no independent testing data was supplied".
    test_metric = {}
    if test_data_container.GetArray().size > 0:
        test_data = test_data_container.GetArray()
        test_label = test_data_container.GetLabel()
        test_pred = self.__classifier.Predict(test_data)
        test_metric = EstimateMetirc(test_pred, test_label, 'test')

    if store_folder:
        if not os.path.exists(store_folder):
            os.mkdir(store_folder)
        info = {}
        info.update(train_metric)
        info.update(val_metric)

        np.save(os.path.join(store_folder, 'train_predict.npy'),
                total_train_pred)
        np.save(os.path.join(store_folder, 'val_predict.npy'), total_pred)
        np.save(os.path.join(store_folder, 'train_label.npy'),
                total_train_label)
        np.save(os.path.join(store_folder, 'val_label.npy'), total_label)

        cv_info_path = os.path.join(store_folder, 'cv_info.csv')
        df = pd.DataFrame(data=val_index_store)
        df.to_csv(cv_info_path)

        DrawROCList(total_train_pred, total_train_label,
                    store_path=os.path.join(store_folder, 'train_ROC.jpg'),
                    is_show=False)
        DrawROCList(total_pred, total_label,
                    store_path=os.path.join(store_folder, 'val_ROC.jpg'),
                    is_show=False)

        if test_data_container.GetArray().size > 0:
            info.update(test_metric)
            np.save(os.path.join(store_folder, 'test_predict.npy'),
                    test_pred)
            np.save(os.path.join(store_folder, 'test_label.npy'),
                    test_label)
            DrawROCList(test_pred, test_label,
                        store_path=os.path.join(store_folder,
                                                'test_ROC.jpg'),
                        is_show=False)

        self.__classifier.Save(store_folder)
        self.SaveResult(info, store_folder)

    return train_metric, val_metric, test_metric
testing_index_list.append(index) else: training_index_list.append(index) train_data_container = self.__SetNewData(data_container, training_index_list) test_data_container = self.__SetNewData(data_container, testing_index_list) if store_folder: train_data_container.Save( os.path.join(store_folder, 'train_numeric_feature.csv')) test_data_container.Save( os.path.join(store_folder, 'test_numeric_feature.csv')) return train_data_container, test_data_container if __name__ == '__main__': clinics = pd.read_csv(r'..\..\Demo\simulated_clinics.csv', index_col=0) container = DataContainer() container.Load(r'..\..\Demo\simulated_feature.csv') separator = DataSeparate() train, test = separator.RunByTestingPercentage(container, 0.3, clinic_df=clinics) print(train.GetArray().shape, test.GetArray().shape) print(separator.clinic_split_result)
def Run(self, data_container, test_data_container=DataContainer(), store_folder='', metric_name_list=('auc', 'accuracy')):
    '''Sweep the number of selected features from 1 to the configured maximum,
    run the parent cross validation at each feature count, and summarize the
    best results per metric.

    NOTE(review): `test_data_container=DataContainer()` is a mutable default
    argument shared across calls — confirm the default instance is never
    mutated.

    :param data_container: DataContainer used for feature selection and CV.
    :param test_data_container: Optional independent testing DataContainer.
    :param store_folder: Folder for per-feature-number sub-folders and the
        metric-vs-feature-number plots.
    :param metric_name_list: Metric keys to track across feature counts.
    :return: (val_return_list, test_return_max_val_list,
        test_return_max_test_list), one summary dict per tracked metric.
    '''
    train_metric_list = []
    val_metric_list = []
    test_metric_list = []
    for feature_number in range(1, self.__max_feature_number + 1):
        LoadWaitBar(self.__max_feature_number, feature_number)

        # Each feature count gets its own sub-folder of results.
        feature_store_folder = os.path.join(
            store_folder, 'feature_' + str(feature_number))
        if not os.path.exists(feature_store_folder):
            os.mkdir(feature_store_folder)

        self.__feature_selector.SetSelectedFeatureNumber(feature_number)
        feature_selected_data_container = self.__feature_selector.Run(
            data_container, feature_store_folder)
        # feature_selected_data_container.UsualAndL2Normalize()

        train_metric, val_metric, test_metric = super(
            CrossValidationOnFeatureNumber, self).Run(
                feature_selected_data_container,
                test_data_container=test_data_container,
                store_folder=feature_store_folder)
        train_metric_list.append(train_metric)
        val_metric_list.append(val_metric)
        test_metric_list.append(test_metric)

    # Collect each tracked metric as train/val/test curves over feature count.
    metric_list = []
    for metric in metric_name_list:
        metric_ditc = {'train': [], 'val': [], 'test': [], 'name': metric}
        for feature_number in range(self.__max_feature_number):
            metric_ditc['train'].append(
                float(train_metric_list[feature_number]['train_' + metric]))
            metric_ditc['val'].append(
                float(val_metric_list[feature_number]['val_' + metric]))
            # An empty first test metric means no testing data was supplied.
            if test_metric_list[0] != {}:
                metric_ditc['test'].append(
                    float(test_metric_list[feature_number]['test_' + metric]))
        metric_list.append(metric_ditc)

    # Save the Relationship v.s. number of features
    if store_folder and os.path.isdir(store_folder):
        for metric_dict in metric_list:
            if test_metric_list[0] != {}:
                DrawCurve(range(1, self.__max_feature_number + 1),
                          [metric_dict['train'], metric_dict['val'],
                           metric_dict['test']],
                          xlabel='# Features', ylabel=metric_dict['name'],
                          name_list=['train', 'validation', 'test'],
                          is_show=False,
                          store_path=os.path.join(
                              store_folder,
                              metric_dict['name'] + '_FeatureNum.jpg'))
            else:
                DrawCurve(range(1, self.__max_feature_number + 1),
                          [metric_dict['train'], metric_dict['val']],
                          xlabel='# Features', ylabel=metric_dict['name'],
                          name_list=['train', 'validation'],
                          is_show=False,
                          store_path=os.path.join(
                              store_folder,
                              metric_dict['name'] + '_FeatureNum.jpg'))

    val_return_list = []
    test_return_max_val_list = []
    test_return_max_test_list = []
    for metric_dict in metric_list:
        # Validation summary at the feature count with the best validation value.
        # The 'val_'/'test_' key prefixes are stripped (key[4:] / key[5:]).
        metric_info = {}
        new_info = val_metric_list[np.argmax(metric_dict['val'])]
        metric_info['feature_number'] = np.argmax(metric_dict['val']) + 1
        for key in new_info.keys():
            metric_info[key[4:]] = new_info[key]
        val_return_list.append(
            dict(sorted(metric_info.items(), key=lambda item: item[0])))

        if test_metric_list[0] != {}:
            # Max the validation
            test_metric_info = {}
            test_new_info = test_metric_list[np.argmax(metric_dict['val'])]
            test_metric_info['feature_number'] = np.argmax(
                metric_dict['val']) + 1
            for key in test_new_info.keys():
                test_metric_info[key[5:]] = test_new_info[key]
            test_return_max_val_list.append(
                dict(sorted(test_metric_info.items(),
                            key=lambda item: item[0])))

            # Max the testing data
            test_metric_info = {}
            test_new_info = test_metric_list[np.argmax(metric_dict['test'])]
            test_metric_info['feature_number'] = np.argmax(
                metric_dict['test']) + 1
            for key in test_new_info.keys():
                test_metric_info[key[5:]] = test_new_info[key]
            test_return_max_test_list.append(
                dict(sorted(test_metric_info.items(),
                            key=lambda item: item[0])))

    return val_return_list, test_return_max_val_list, test_return_max_test_list
#TODO: Add verbose parameter to show the removed feature name in each selector def Run(self, data_container, store_folder=''): input_data_container = data_container for fs in self.__selector_list: output = fs.Run(input_data_container, store_folder) input_data_container = output return output ################################################################ if __name__ == '__main__': import os print(os.getcwd()) from FAE.DataContainer.DataContainer import DataContainer data_container = DataContainer() print(os.path.abspath(r'..\..\Example\numeric_feature.csv')) data_container.Load(r'..\..\Example\numeric_feature.csv') # data_container.UsualNormalize() print(data_container.GetArray().shape) print(data_container.GetFeatureName()) fs = FeatureSelectBySubName(['shape', 'ADC']) output = fs.Run(data_container) print(output.GetFeatureName()) # fs1 = RemoveNonNumericFeature() # fs1.SetDataContainer(data_container) # non_number_data_container = fs1.Run()
                # NOTE(review): this is the tail of a DataSeparate run method whose
                # head lies above this chunk — presumably cases absent from the
                # testing reference go to the training split; confirm against the head.
                training_index_list.append(index)

        # Build one new container per split from the collected case indices.
        train_data_container = self.__SetNewData(data_container, training_index_list)
        test_data_container = self.__SetNewData(data_container, testing_index_list)

        if store_folder:
            train_data_container.Save(
                os.path.join(store_folder, 'train_numeric_feature.csv'))
            test_data_container.Save(
                os.path.join(store_folder, 'test_numeric_feature.csv'))

        return train_data_container, test_data_container


if __name__ == '__main__':
    # Demo: split the example feature file once, then reproduce the same split
    # by reusing the stored training set as the separation reference.
    data = DataContainer()
    data.Load(r'..\..\Example\numeric_feature.csv')

    data_separator = DataSeparate()
    data_separator.Run(data, store_folder=r'..\..\Example\separate_test')

    ref_data_container = DataContainer()
    ref_data_container.Load(
        r'..\..\Example\separate_test\train_numeric_feature.csv')
    data_separator.training_ref_data_container = ref_data_container
    data_separator.Run(data, store_folder=r'..\..\Example\separate_test\reload')
class PrepareConnection(QWidget, Ui_Prepare):
    '''
    GUI page for loading a feature csv file, removing non-valid cases/features,
    and saving the (optionally train/test separated) data.
    '''

    def __init__(self, parent=None):
        super(PrepareConnection, self).__init__(parent)
        self.setupUi(self)

        self.data_container = DataContainer()

        # Wire the widgets to their handlers.
        self.buttonLoad.clicked.connect(self.LoadData)
        self.buttonRemove.clicked.connect(self.RemoveNonValidValue)
        self.checkSeparate.clicked.connect(self.SetSeparateStatus)
        self.spinBoxSeparate.setEnabled(False)
        self.buttonSave.clicked.connect(self.CheckAndSave)

    def UpdateTable(self):
        '''Refresh the feature table and the summary text from the data container.'''
        if self.data_container.GetArray().size == 0:
            return

        case_name_list = self.data_container.GetCaseName()
        self.tableFeature.setRowCount(len(case_name_list))
        header_name = deepcopy(self.data_container.GetFeatureName())
        header_name.insert(0, 'Label')  # first column shows the label
        self.tableFeature.setColumnCount(len(header_name))
        self.tableFeature.setHorizontalHeaderLabels(header_name)
        self.tableFeature.setVerticalHeaderLabels(list(map(str, case_name_list)))

        label = self.data_container.GetLabel()
        array = self.data_container.GetArray()
        for row_index in range(len(case_name_list)):
            for col_index in range(len(header_name)):
                if col_index == 0:
                    item = QTableWidgetItem(str(label[row_index]))
                else:
                    # Column 0 is the label, so the feature matrix is shifted by one.
                    item = QTableWidgetItem(str(array[row_index, col_index - 1]))
                self.tableFeature.setItem(row_index, col_index, item)

        text = "The number of cases: {:d}\n".format(len(case_name_list))
        text += "The number of features: {:d}\n".format(
            len(self.data_container.GetFeatureName()))
        if len(np.unique(label)) == 2:
            # Binary task: the larger label value is treated as the positive class.
            positive_number = len(np.where(label == np.max(label))[0])
            negative_number = len(label) - positive_number
            assert (positive_number + negative_number == len(label))
            text += "The number of positive samples: {:d}\n".format(positive_number)
            text += "The number of negative samples: {:d}\n".format(negative_number)
        self.textInformation.setText(text)

    def LoadData(self):
        '''Ask the user for a csv file and load it into the data container.'''
        dlg = QFileDialog()
        # Fix: caption typo 'Open SCV file' -> 'Open CSV file'.
        file_name, _ = dlg.getOpenFileName(self, 'Open CSV file',
                                           filter="csv files (*.csv)")
        try:
            self.data_container.Load(file_name)
        except Exception:
            # Fix: narrowed from a bare 'except:' which also swallowed
            # KeyboardInterrupt/SystemExit; keep the original best-effort message.
            print('Error')
        self.UpdateTable()

    def RemoveNonValidValue(self):
        '''Drop either the non-valid cases or the non-valid features, per radio choice.'''
        if self.radioRemoveNonvalidCases.isChecked():
            self.data_container.RemoveUneffectiveCases()
        elif self.radioRemoveNonvalidFeatures.isChecked():
            self.data_container.RemoveUneffectiveFeatures()
        self.UpdateTable()

    def SetSeparateStatus(self):
        '''Enable the testing-percentage spin box only when separation is requested.'''
        if self.checkSeparate.isChecked():
            self.spinBoxSeparate.setEnabled(True)
        else:
            self.spinBoxSeparate.setEnabled(False)

    def CheckAndSave(self):
        '''Validate the data and save it, either as one csv or as a train/test split.'''
        if self.data_container.IsEmpty():
            QMessageBox.warning(self, "Warning", "There is no data", QMessageBox.Ok)
        elif self.data_container.HasNonValidNumber():
            QMessageBox.warning(self, "Warning", "There are nan items", QMessageBox.Ok)
            # Jump the table selection to the first non-valid cell (+1 skips the
            # label column) so the user can see and edit it.
            non_valid_number_Index = self.data_container.FindNonValidNumberIndex()
            old_edit_triggers = self.tableFeature.editTriggers()
            self.tableFeature.setEditTriggers(QAbstractItemView.CurrentChanged)
            self.tableFeature.setCurrentCell(non_valid_number_Index[0],
                                             non_valid_number_Index[1] + 1)
            self.tableFeature.setEditTriggers(old_edit_triggers)
        elif self.checkSeparate.isChecked():
            percentage_testing_data = self.spinBoxSeparate.value()
            folder_name = QFileDialog.getExistingDirectory(self, "Save data")
            if folder_name != '':
                data_separate = DataSeparate.DataSeparate(percentage_testing_data)
                data_separate.Run(self.data_container, folder_name)
        else:
            file_name, _ = QFileDialog.getSaveFileName(
                self, "Save data", filter="csv files (*.csv)")
            if file_name != '':
                self.data_container.Save(file_name)
        # NOTE(review): this is the tail of NormalizerZeroCenterAndUnit.Run; the
        # method head (and the definitions of is_test / store_folder) lies above
        # this chunk.
        if not is_test:
            # Training pass: store the normalized training features and the
            # normalization parameters so testing data can reuse them.
            data_container.Save(
                os.path.join(store_folder, 'zero_center_unit_training_feature.csv'))
            self.Save(store_path=os.path.join(
                store_folder, 'zero_center_unit_normalization_training.csv'),
                feature_name=data_container.GetFeatureName())
        else:
            # Testing pass: only the normalized features are stored.
            data_container.Save(
                os.path.join(
                    store_folder, 'zero_center_unit_normalized_testing_feature.csv'))

        return data_container

    def GetDescription(self):
        # Human-readable description of this normalizer, used for reporting.
        text = "We applied the normalization on the feature matrix. Each feature vector was subtracted by the mean " \
               "value of the vector and was divided by the length of it. "
        return text


if __name__ == '__main__':
    # Demo: normalize the example feature file and store the result.
    from FAE.DataContainer.DataContainer import DataContainer
    data_container = DataContainer()
    file_path = os.path.abspath(r'..\..\Example\numeric_feature.csv')
    print(file_path)
    data_container.Load(file_path)

    normalizer = NormalizerZeroCenterAndUnit()
    normalizer.Run(data_container, store_folder=r'..\..\Example\one_pipeline')