def missing_state_and_imputing(dataset):
    """Experiment: add binary missing-value indicator columns, impute the
    '0' placeholders with each column's most frequent value, label-encode
    the non-numeric features, and persist the result as an artifact.
    """
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    # The string '0' marks a missing value throughout this pipeline.
    imp = SimpleImputer(missing_values='0', strategy="most_frequent")

    # BUG FIX: the original computed `.mean().mean()` (a scalar) and then
    # took `[0]`, yielding a single column *name*; the loop below then
    # iterated over the characters of that string. Select every column
    # that contains at least one '0' placeholder, matching the sibling
    # `imputing` and `missing_state` functions.
    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])
    print(zero_values)
    for feature in zero_values:
        print(feature)
        # 0 = value was missing, 1 = value was present (original encoding).
        dataset[f'{feature}_is_missing'] = dataset[feature].apply(
            lambda value: 0 if value == '0' else 1
        )
    dataset[zero_values] = imp.fit_transform(dataset[zero_values])

    # Stringify and label-encode every remaining object-typed column.
    dtypes_df = dataset.dtypes.to_frame(name='dtype')
    dtypes_df = dtypes_df[dtypes_df['dtype'] == 'object']
    non_numeric_features = list(dtypes_df.index)
    dataset[non_numeric_features] = dataset[non_numeric_features].applymap(str)
    dataset[non_numeric_features] = label_encode(dataset[non_numeric_features])

    dataset.to_csv("datasets/experiments/missing_state_and_imputing.csv", index=False)
    log_artifact("datasets/experiments/missing_state_and_imputing.csv")
def clean_dataset(path):
    """Load the raw dataset, run the standard cleaning pipeline, and write
    the cleaned frame to ``path`` as CSV (without the index column)."""
    cleaned = data_utils.get_dataset()
    # Apply each cleaning step in the pipeline's canonical order.
    for step in (
        data_utils.clean_afec_dpto,
        data_utils.clean_riesgo_vida,
        data_utils.clean_cie_10,
        data_utils.remove_features,
    ):
        cleaned = step(cleaned)
    cleaned.to_csv(path, index=False)
def risk_cases_encoder(dataset):
    """Experiment: derive boolean risk-indicator features from the complaint
    motive, special-population flag, age range and CIE-10 diagnosis, then
    target-encode the features against RIESGO_VIDA and persist the result."""
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)

    # Keyword lists that flag a record as risk-related.
    mot_esp_cases = [
        'referencia',
        'contra_referencia',
        'urgencias',
        'entrega de medicamentos',
        'citas de consulta medica especializada',
        'procedimientos y/o servicios',
        'enfermedades raras o hu',
    ]
    afec_edad_cases = [
        'de 6 a 12 años',
        'de 0 a 5 años',
        'de 13 a 17 años',
        'mayor de 63 años',
    ]
    cie10_cases = ['vih', 'tumores malignos', 'maternas', 'trasplantados']

    dataset['CASO_RIESGO'] = dataset['MOTIVO_ESPECIFICO'].apply(
        lambda v: contains(v, mot_esp_cases)
    )
    # Anything other than 'no aplica' counts as a special population.
    dataset['POBESPECIAL'] = dataset['AFEC_POBESPECIAL'].apply(
        lambda v: False if v == 'no aplica' else True
    )
    dataset['EDAD_RIESGO'] = dataset['AFEC_EDADR'].apply(
        lambda v: contains(v, afec_edad_cases)
    )
    dataset['CIE10_RIESGO'] = dataset['CIE_10'].apply(
        lambda v: contains(v, cie10_cases)
    )

    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index().drop(['index'], axis=1)

    # Target-encode features against the label, then re-attach the label.
    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded = data_utils.encode_features(features, labels)
    encoded['RIESGO_VIDA'] = labels

    encoded.to_csv("datasets/experiments/risk_cases_encoder.csv", index=False)
    log_artifact("datasets/experiments/risk_cases_encoder.csv")
def basic(dataset):
    """Baseline experiment: standard cleaning, label-encode every column,
    then write the dataset to CSV and register it as an artifact."""
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    # BUG FIX: the original called `dataset.drop(...)` without assigning the
    # result (DataFrame.drop is not in-place by default), so the leftover
    # 'index' column was silently label-encoded and written out. Assign the
    # result, matching every sibling experiment function.
    dataset = dataset.drop(['index'], axis=1)
    dataset = label_encode(dataset)
    dataset.to_csv("datasets/experiments/basic.csv", index=False)
    log_artifact("datasets/experiments/basic.csv")
def target_encoder(dataset):
    """Experiment: standard cleaning, then target-encode the features
    against RIESGO_VIDA and persist the encoded dataset as an artifact."""
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    # Target-encode features against the label, then re-attach the label.
    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded_features = data_utils.encode_features(features, labels)
    encoded_features['RIESGO_VIDA'] = labels

    encoded_features.to_csv("datasets/experiments/target_encoder.csv", index=False)
    # CONSISTENCY FIX: every other experiment registers its CSV via
    # log_artifact after writing it; this function was missing the call.
    log_artifact("datasets/experiments/target_encoder.csv")
def imputing(dataset):
    """Experiment: impute the '0' missing-value placeholders with each
    column's most frequent value, label-encode, and persist as an artifact."""
    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.clean_cie_10(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    # Columns where the '0' missing-value placeholder appears at least once.
    zero_values = list(dataset.columns[dataset.eq('0').mean() > 0])
    # BUG FIX: the original called fillna() directly, but the missing marker
    # in this pipeline is the *string* '0', not NaN (see the sibling
    # functions using SimpleImputer(missing_values='0')), so nothing was
    # ever imputed. Convert the placeholder to NaN first, then fill each
    # column with its mode (mode() skips NaN, so the placeholder never wins).
    dataset[zero_values] = dataset[zero_values].replace('0', np.nan)
    dataset[zero_values] = dataset[zero_values].apply(
        lambda col: col.fillna(col.mode()[0]), axis=0
    )

    dataset = dataset.applymap(str)
    dataset = label_encode(dataset)
    dataset.to_csv("datasets/experiments/imputing.csv", index=False)
    log_artifact("datasets/experiments/imputing.csv")
def missing_state(dataset):
    """Experiment: add a binary ``*_is_missing`` indicator for every column
    containing the '0' placeholder, label-encode the original feature
    columns only, and persist the result as an artifact."""
    for step in (
        data_utils.clean_afec_dpto,
        data_utils.clean_riesgo_vida,
        data_utils.clean_cie_10,
        data_utils.remove_features,
    ):
        dataset = step(dataset)
    dataset = dataset.reset_index().drop(['index'], axis=1)

    # Columns where the '0' missing marker occurs at least once.
    with_placeholder = set(dataset.columns[dataset.eq('0').mean() > 0])
    for column in with_placeholder:
        # 1 = value was the '0' placeholder, 0 = real value.
        dataset[f'{column}_is_missing'] = dataset[column].apply(
            lambda v: 1 if v == '0' else 0
        )

    # Encode only the original features; indicator columns are already 0/1.
    original_columns = [c for c in dataset.columns if '_is_missing' not in c]
    dataset[original_columns] = label_encode(dataset[original_columns])

    dataset.to_csv("datasets/experiments/missing_state.csv", index=False)
    log_artifact("datasets/experiments/missing_state.csv")
def target_encoder_only_complains(dataset):
    """Same as target_encoder, but restricted to actual complaints: rows
    that are pure information requests are dropped before cleaning."""
    # Keep only rows that are not information requests.
    keep = (
        (dataset['PQR_TIPOPETICION'] != 'peticion de informacion')
        & (dataset['PQR_TIPOPETICION'] != 'consulta y/o solicitud de informacion')
    )
    dataset = dataset[keep]

    for step in (
        data_utils.clean_afec_dpto,
        data_utils.clean_riesgo_vida,
        data_utils.clean_cie_10,
        data_utils.remove_features,
    ):
        dataset = step(dataset)
    dataset = dataset.reset_index().drop(['index'], axis=1)

    # Target-encode features against the label, then re-attach the label.
    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded = data_utils.encode_features(features, labels)
    encoded['RIESGO_VIDA'] = labels

    encoded.to_csv("datasets/experiments/target_encoder_only_complains.csv", index=False)
    log_artifact("datasets/experiments/target_encoder_only_complains.csv")
def normalizing(dataset):
    """Experiment: label-encode, then log1p-transform and min-max scale the
    feature columns before persisting the dataset as an artifact."""
    for step in (
        data_utils.clean_afec_dpto,
        data_utils.clean_riesgo_vida,
        data_utils.clean_cie_10,
        data_utils.remove_features,
    ):
        dataset = step(dataset)
    dataset = dataset.reset_index().drop(['index'], axis=1)
    dataset = label_encode(dataset)

    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    cols = features.columns
    # log(x + 1) compresses the label-encoded magnitudes before scaling
    # everything into [0, 1].
    features[cols] = features[cols].apply(lambda x: np.log(x + 1))
    features[cols] = MinMaxScaler().fit_transform(features[cols])

    dataset = features
    dataset['RIESGO_VIDA'] = labels.values
    dataset.to_csv("datasets/experiments/normalizing.csv", index=False)
    log_artifact("datasets/experiments/normalizing.csv")
def naive(dataset):
    """Naive baseline: clean only the target column, drop the case-status
    column, label-encode everything, and persist as an artifact."""
    cleaned = data_utils.clean_riesgo_vida(dataset)
    cleaned = cleaned.drop(['PQR_ESTADO'], axis=1)
    cleaned = label_encode(cleaned)
    cleaned.to_csv("datasets/experiments/naive.csv", index=False)
    log_artifact("datasets/experiments/naive.csv")
def cie10_only_complains(dataset):
    """CIE-10 enrichment experiment restricted to actual complaints.

    Drops pure information requests, left-joins the CIE-10 catalogue on the
    (lower-cased) diagnosis description, derives sex/age consistency
    features from the catalogue limits, target-encodes the features against
    RIESGO_VIDA, and persists the result as an experiment artifact.
    """
    # Keep only rows that are not information requests.
    dataset = dataset[
        (dataset['PQR_TIPOPETICION'] != 'peticion de informacion')
        & (dataset['PQR_TIPOPETICION'] != 'consulta y/o solicitud de informacion')
    ]

    # Join the CIE-10 catalogue on the diagnosis description.
    cie10_df = pd.read_csv('datasets/CIE10.csv', sep=';')
    cie10_df['DESCRIPCION_COD_CIE_10_04'] = cie10_df['DESCRIPCION_COD_CIE_10_04'].apply(
        lambda value: value.lower()
    )
    dataset = pd.merge(
        left=dataset,
        right=cie10_df,
        how='left',
        left_on='CIE_10',
        right_on='DESCRIPCION_COD_CIE_10_04',
    )
    dataset = dataset.drop(
        ['CIE_10', 'NOMBRE_CAPITULO', 'DESCRIPCION_COD_CIE_10_03', 'DESCRIPCION_COD_CIE_10_04'],
        axis=1,
    )

    # Unmatched rows get a sentinel value and are then discarded.
    cie10_columns = [
        'CAPITULO',
        'COD_CIE_10_03',
        'COD_CIE_10_04',
        'SEXO',
        'LIMITE_INFERIOR_EDAD',
        'LIMITE_SUPERIOR_EDAD',
    ]
    dataset[cie10_columns] = dataset[cie10_columns].replace(np.nan, 'no_cie10', regex=True)
    dataset = dataset[dataset['CAPITULO'] != 'no_cie10']

    # Consistency features: does the patient's sex/age fit the CIE-10 code?
    dataset['CIE10_SEXO'] = dataset['SEXO'].apply(cie10_sexo)
    dataset['LIMITE_INFERIOR_EDAD_Y'] = dataset['LIMITE_INFERIOR_EDAD'].apply(to_year)
    dataset['LIMITE_SUPERIOR_EDAD_Y'] = dataset['LIMITE_SUPERIOR_EDAD'].apply(to_year)
    dataset['AFEC_EDADR_INF'] = dataset['AFEC_EDADR'].apply(get_edad_inf)
    dataset['AFEC_EDADR_SUP'] = dataset['AFEC_EDADR'].apply(get_edad_sup)
    dataset['CIE10_RANGO_EDAD'] = dataset.apply(in_range, axis=1)

    # Raw limit columns are redundant once the derived features exist.
    dataset = dataset.drop(
        [
            'SEXO',
            'LIMITE_INFERIOR_EDAD',
            'LIMITE_SUPERIOR_EDAD',
            'LIMITE_INFERIOR_EDAD_Y',
            'LIMITE_SUPERIOR_EDAD_Y',
            'AFEC_EDADR',
        ],
        axis=1,
    )
    # DEAD CODE REMOVED: the original evaluated `dataset[[...]].head()` here
    # and discarded the result — a leftover notebook inspection cell with no
    # effect on the pipeline.

    dataset = data_utils.clean_afec_dpto(dataset)
    dataset = data_utils.clean_riesgo_vida(dataset)
    dataset = data_utils.remove_features(dataset)
    dataset = dataset.reset_index()
    dataset = dataset.drop(['index'], axis=1)

    # Target-encode features against the label, then re-attach the label.
    labels = dataset[['RIESGO_VIDA']]
    features = dataset.drop(['RIESGO_VIDA'], axis=1)
    encoded_features = data_utils.encode_features(features, labels)
    encoded_features['RIESGO_VIDA'] = labels
    encoded_features.to_csv("datasets/experiments/cie10_only_complains.csv", index=False)
    log_artifact("datasets/experiments/cie10_only_complains.csv")