def __train_test_val_split(self):
    """Split the aligned vectors into train, validation and test sets.

    The target vector is used as ``y``. The predictor matrix ``X`` has one
    column per binary and continuous field that is listed in
    ``Config.DATA_FIELDS_IN_ANALYSIS`` (binary fields first). 10% of the
    rows are held out for testing, then 20% of the remaining rows are held
    out for validation; both splits use ``random_state=42``.

    Stores ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``,
    ``y_test`` and ``predictors_names`` on the instance.
    """
    aligned = self.__same_length_vectors
    y = aligned[Data_Fields.get_target()]

    # Column order of X: selected binary fields, then selected continuous ones.
    predictors_names = [
        name for name in Data_Fields.get_binary_vars()
        if name in Config.DATA_FIELDS_IN_ANALYSIS
    ]
    predictors_names += [
        name for name in Data_Fields.get_continuous_vars()
        if name in Config.DATA_FIELDS_IN_ANALYSIS
    ]

    columns = [aligned[name] for name in predictors_names]
    X = np.stack(columns, axis=1)

    # First carve out the test set, then split the rest into train/validation.
    X_rest, X_test, y_rest, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(
        X_rest, y_rest, test_size=0.2, random_state=42)

    self.X_train, self.X_val, self.X_test = X_train, X_val, X_test
    self.y_train, self.y_val, self.y_test = y_train, y_val, y_test
    self.predictors_names = predictors_names
def __init__(self):
    """Populate the class-level configuration constants.

    Every value is written to the class (``self.__class__``), not to the
    instance. Directory constants are hard-coded absolute paths or derived
    from them; the thresholds are read through ``self.load_yaml``.
    """
    cls = self.__class__

    # Base locations (machine-specific absolute paths).
    work_dir = 'C:/Users/normy/corona_classifier_files/classification'
    common_files_dir = 'C:/Users/normy/PycharmProjects/corona_classifier/common_files'
    cls.WORK_DIR = work_dir
    cls.COMMON_FILES_DIR = common_files_dir
    cls.RAW_DATA_PATH = 'C:/Users/normy/PycharmProjects/covidclinicaldata/data'

    # Locations derived from the base directories / this module's folder.
    yaml_file_dir = os.path.join(os.path.dirname(__file__), 'yaml_files')
    common_yaml_file_dir = os.path.join(common_files_dir, 'yaml_files')
    cls.PICKLE_PATH = os.path.join(work_dir, 'pickle_files')
    cls.YAML_FILE_DIR = yaml_file_dir
    cls.COMMON_YAML_FILE_DIR = common_yaml_file_dir
    cls.OUTPUT_PATH = os.path.join(work_dir, 'outputs')

    # Fields taking part in the analysis: target, then binary, then continuous.
    fields_in_analysis = [Data_Fields.get_target()]
    fields_in_analysis.extend(Data_Fields.get_binary_vars())
    fields_in_analysis.extend(Data_Fields.get_continuous_vars())
    cls.DATA_FIELDS_IN_ANALYSIS = fields_in_analysis

    # Constants backed by the 'static_values' YAML document.
    static_values = self.load_yaml(yaml_file_dir, 'static_values')
    yaml_backed_constants = (
        ('DATA_FIELD_MISSING_VALUES_THRESHOLD',
         'data_field_missing_values_threshold'),
        ('BOOTSTRAP_PATIENT_ENLARGEMENT_SIZE',
         'bootstrap_patient_enlargement_size'),
        ('MODEL_THRESHOLDS', 'model_thresholds'),
    )
    for attribute, key in yaml_backed_constants:
        setattr(cls, attribute, static_values[key])

    cls.CONTINUOUS_FIELDS_THRESHOLDS = self.load_yaml(
        common_yaml_file_dir, 'continuous_fields_thresholds')
def __binary_one_hot_encoding(self):
    """Recode the binary fields and the target of every patient to 0/1.

    For each patient in ``self.__patients`` and each field returned by
    ``Data_Fields.get_binary_vars()`` plus ``Data_Fields.get_target()``:

    * ``None`` is left untouched (missing value),
    * ``'TRUE'`` and ``'Positive'`` become ``1``,
    * any other value becomes ``0``.

    The patients are modified in place. Note that the recoding is not
    idempotent: an already encoded ``1`` is not one of the accepted
    strings, so a second pass would turn it into ``0``.
    """
    # Build a new list instead of appending to the one handed out by
    # get_binary_vars(): if that getter returns a shared list, appending
    # the target to it would change the binary fields seen by every other
    # caller (and grow the list on each call).
    data_fields = [*Data_Fields.get_binary_vars(), Data_Fields.get_target()]

    for patient in self.__patients:
        for field in data_fields:
            value = getattr(patient, field)
            if value is None:
                continue
            setattr(patient, field, 1 if value in ('TRUE', 'Positive') else 0)
def __get_vectors_for_analysis(self):
    """Narrow ``self.__vectors`` down to the vectors used by this analysis.

    A vector is kept when its ``field_name`` is the target or one of the
    continuous fields, and is also listed in
    ``Config.DATA_FIELDS_IN_ANALYSIS``. The filtered list replaces
    ``self.__vectors`` and is also stored under the
    ``'continuous_vectors'`` key of ``self.__graph_vectors``.
    """
    candidate_fields = [
        Data_Fields.get_target(),
        *Data_Fields.get_continuous_vars(),
    ]
    self.__vectors = [
        candidate for candidate in self.__vectors
        if candidate.field_name in candidate_fields
        and candidate.field_name in Config.DATA_FIELDS_IN_ANALYSIS
    ]
    # The stored list still contains the target vector if it passed the filter.
    self.__graph_vectors['continuous_vectors'] = self.__vectors
def __get_vector_dict(self, remove_missing_values: bool):
    """Map each non-target field name to its data vector.

    Args:
        remove_missing_values: when true, each entry is the vector's
            ``vector_without_missing_values`` attribute; otherwise its
            ``vector`` attribute.

    Returns:
        A dict keyed by ``field_name`` for every item of
        ``self.__vectors`` except the target field.
    """
    # Pick the attribute to read once; it is the same for every vector.
    if remove_missing_values:
        source_attribute = 'vector_without_missing_values'
    else:
        source_attribute = 'vector'

    vectors_by_field = {}
    for item in self.__vectors:
        name = item.field_name
        if name != Data_Fields.get_target():
            vectors_by_field[name] = getattr(item, source_attribute)
    return vectors_by_field
def __average_by_target_for_age_groups(self):
    """Build the 'average_by_target' report table, split by age group.

    For every continuous field other than ``'age'`` that is listed in
    ``Config.DATA_FIELDS_IN_ANALYSIS``, the target, age and field vectors
    are aligned through ``self.__get_same_length_vectors``. Rows with
    age >= 18 form the adult group and the remaining rows the child group;
    each group is split into target == 1 (positive) and target == 0
    (negative) values. Per field the table receives the plain means, the
    bootstrap means, the bootstrap significance of the positive/negative
    difference and the group size, for adults and for children.

    The resulting table (a ``defaultdict(list)`` with one list per column)
    is stored in ``self.__report_tables['average_by_target']``.
    """
    table = defaultdict(list)
    analysis_vectors = self.__get_analysis_vector_dict()

    for feature in Data_Fields.get_continuous_vars():
        # Age defines the groups, so it is never reported as a feature.
        if feature == 'age' or feature not in Config.DATA_FIELDS_IN_ANALYSIS:
            continue

        target_name = Data_Fields.get_target()
        age_name = Data_Fields.AGE.field_name
        aligned = self.__get_same_length_vectors(vector_list=[
            analysis_vectors[target_name],
            analysis_vectors[age_name],
            analysis_vectors[feature],
        ])

        adult_mask = aligned[age_name] >= 18
        child_mask = ~adult_mask
        target_values = aligned[target_name]
        feature_values = aligned[feature]

        adult_target = target_values[adult_mask]
        adult_response = feature_values[adult_mask]
        child_target = target_values[child_mask]
        child_response = feature_values[child_mask]

        adult_positive = adult_response[adult_target == 1]
        adult_negative = adult_response[adult_target == 0]
        child_positive = child_response[child_target == 1]
        child_negative = child_response[child_target == 0]

        # NOTE: the bootstrap helpers presumably draw random resamples, so
        # the call order below (both significances first, then the four
        # means) is kept exactly as it was to leave seeded runs unchanged.
        adult_significance = self.bootstrap_difference_in_mean_of_two_groups(
            adult_positive, adult_negative)
        child_significance = self.bootstrap_difference_in_mean_of_two_groups(
            child_positive, child_negative)

        (adult_positive_boot, adult_negative_boot,
         child_positive_boot, child_negative_boot) = [
            self.__calc_bootstrap_mean(
                group, iterations=Config.BOOTSTRAP_ITERATIONS)
            for group in (adult_positive, adult_negative,
                          child_positive, child_negative)
        ]

        adult_count = len(adult_positive) + len(adult_negative)
        child_count = len(child_positive) + len(child_negative)

        # Dict order defines the column order of the report table.
        row = {
            'feature': feature,
            'adult corona positive regular AVG': np.mean(adult_positive),
            'adult corona negative regular AVG': np.mean(adult_negative),
            'adult corona positive bootstrap AVG': adult_positive_boot,
            'adult corona negative bootstrap AVG': adult_negative_boot,
            'adult bootstrap significance': adult_significance,
            'adult count': adult_count,
            'child corona positive AVG': np.mean(child_positive),
            'child corona negative AVG': np.mean(child_negative),
            'child corona positive bootstrap AVG': child_positive_boot,
            'child corona negative bootstrap AVG': child_negative_boot,
            'child bootstrap significance': child_significance,
            'child count': child_count,
        }
        for column, value in row.items():
            table[column].append(value)

    self.__report_tables['average_by_target'] = table