def __train_test_val_split(self):
    """Assemble the predictor matrix and split it into train/val/test sets.

    Predictor columns are the binary fields followed by the continuous
    fields, each restricted to names present in
    ``Config.DATA_FIELDS_IN_ANALYSIS``. The target vector is looked up
    under ``Data_Fields.get_target()``.

    The data is split twice with ``random_state=42``: first
    ``test_size=0.1`` is held out as the test set, then ``test_size=0.2``
    of the remainder is held out as the validation set.
    NOTE(review): ``train_test_split`` is presumably scikit-learn's, where
    a float ``test_size`` is a proportion -- confirm against the imports.

    Stores ``X_train``, ``X_val``, ``X_test``, ``y_train``, ``y_val``,
    ``y_test`` and ``predictors_names`` on the instance; returns nothing.
    """

    def keep_analysed(candidates):
        # Preserve the candidates' order, dropping names not under analysis.
        return [
            name for name in candidates
            if name in Config.DATA_FIELDS_IN_ANALYSIS
        ]

    target = self.__same_length_vectors[Data_Fields.get_target()]

    selected_binary = keep_analysed(Data_Fields.get_binary_vars())
    selected_continuous = keep_analysed(Data_Fields.get_continuous_vars())
    column_names = selected_binary + selected_continuous

    # One vector per predictor, stacked along axis 1 (one column per
    # predictor when the vectors are 1-D -- TODO confirm their shape).
    columns = tuple(self.__same_length_vectors[name] for name in column_names)
    features = np.stack(columns, axis=1)

    # First split: hold out the test set.
    features_rest, features_test, target_rest, target_test = train_test_split(
        features, target, test_size=0.1, random_state=42)
    # Second split: carve the validation set out of what is left.
    features_train, features_val, target_train, target_val = train_test_split(
        features_rest, target_rest, test_size=0.2, random_state=42)

    # Publish results only after both splits have succeeded.
    self.X_train, self.X_val, self.X_test = (
        features_train, features_val, features_test)
    self.y_train, self.y_val, self.y_test = (
        target_train, target_val, target_test)
    self.predictors_names = column_names
def __init__(self):
    """Populate the configuration constants as class-level attributes.

    Every value is assigned on ``self.__class__`` rather than on the
    instance, so it is readable through the class once any instance has
    been constructed. Sets directory paths, the list of data fields used
    in the analysis, and values read from the ``static_values`` and
    ``continuous_fields_thresholds`` YAML files via ``self.load_yaml``.
    """
    cls = self.__class__

    # Machine-specific base locations.
    cls.WORK_DIR = 'C:/Users/normy/corona_classifier_files/classification'
    cls.COMMON_FILES_DIR = 'C:/Users/normy/PycharmProjects/corona_classifier/common_files'
    cls.RAW_DATA_PATH = 'C:/Users/normy/PycharmProjects/covidclinicaldata/data'

    # Locations derived from the bases above and from this module's folder.
    cls.PICKLE_PATH = os.path.join(self.WORK_DIR, 'pickle_files')
    module_dir = os.path.dirname(__file__)
    cls.YAML_FILE_DIR = os.path.join(module_dir, 'yaml_files')
    cls.COMMON_YAML_FILE_DIR = os.path.join(self.COMMON_FILES_DIR, 'yaml_files')
    cls.OUTPUT_PATH = os.path.join(self.WORK_DIR, 'outputs')

    # Target first, then binary variables, then continuous variables.
    fields_in_analysis = [Data_Fields.get_target()]
    fields_in_analysis += list(Data_Fields.get_binary_vars())
    fields_in_analysis += list(Data_Fields.get_continuous_vars())
    cls.DATA_FIELDS_IN_ANALYSIS = fields_in_analysis

    # Values loaded from the module-local YAML file.
    static_values = self.load_yaml(self.YAML_FILE_DIR, 'static_values')
    cls.DATA_FIELD_MISSING_VALUES_THRESHOLD = static_values[
        'data_field_missing_values_threshold']
    cls.BOOTSTRAP_PATIENT_ENLARGEMENT_SIZE = static_values[
        'bootstrap_patient_enlargement_size']
    cls.MODEL_THRESHOLDS = static_values['model_thresholds']

    # Thresholds loaded from the shared YAML directory.
    cls.CONTINUOUS_FIELDS_THRESHOLDS = self.load_yaml(
        self.COMMON_YAML_FILE_DIR, 'continuous_fields_thresholds')
def __binary_one_hot_encoding(self):
    """Recode the binary fields and the target of every patient to 0/1.

    For each patient in ``self.__patients`` and each field named by
    ``Data_Fields.get_binary_vars()`` plus ``Data_Fields.get_target()``:

    * ``None`` is left untouched,
    * ``'TRUE'`` or ``'Positive'`` becomes ``1``,
    * any other value becomes ``0``.

    Patients are modified in place; nothing is returned.

    NOTE: the mapping is not idempotent -- an already-encoded ``1`` is not
    one of the accepted strings, so a second call would turn it into ``0``.
    """
    # Build a fresh list instead of appending to the one returned by
    # get_binary_vars(): if that accessor hands back a shared list, an
    # in-place append would permanently add the target to the binary
    # variables seen by every later caller.
    data_fields = [*Data_Fields.get_binary_vars(), Data_Fields.get_target()]
    positive_markers = ('TRUE', 'Positive')

    for patient in self.__patients:
        for field in data_fields:
            value = getattr(patient, field)
            if value is None:
                # Missing values stay missing.
                continue
            setattr(patient, field, 1 if value in positive_markers else 0)