def test_data_columns(self):
    """Check the column names reported after loading ``self.csv_path``.

    The value returned by ``get_data_columns()`` must equal the expected
    list of column names below.
    """
    expected_columns = [
        'POZO',
        'FECHA PRUEBAS',
        'mes',
        'año',
        'BBPD',
        'BNPD',
        '% AGUA',
        'E_FLUJO',
        'NU_COORD_UTM ESTE',
        'NU_COORD_UTM NORTE',
        '°API',
    ]
    data_engineering = DataEngineering()
    data_engineering.load_data(self.csv_path)
    # Use the TestCase assertion (as the sibling load tests do): it is not
    # stripped under ``python -O`` and reports a diff on failure.
    self.assertEqual(data_engineering.get_data_columns(), expected_columns)
def test_set_get_label(self):
    """Check that the label starts as ``None`` and round-trips via ``set_label``.

    ``get_label()`` is queried on a fresh instance (before any data is
    loaded) and again after ``load_data`` and ``set_label`` have been called.
    """
    label = "BNPD"
    data_engineering = DataEngineering()
    # TestCase assertions are used instead of bare ``assert`` so the checks
    # survive ``python -O`` and match the style of the sibling load tests.
    self.assertIsNone(data_engineering.get_label())
    data_engineering.load_data(self.csv_path)
    data_engineering.set_label(label)
    self.assertEqual(data_engineering.get_label(), label)
def test_set_get_features(self):
    """Check that features start as ``None`` and round-trip via ``set_features``.

    ``get_features()`` is queried on a fresh instance (before any data is
    loaded) and again after ``load_data`` and ``set_features`` have been
    called.
    """
    features = ["POZO", "mes", "BBPD"]
    data_engineering = DataEngineering()
    # TestCase assertions are used instead of bare ``assert`` so the checks
    # survive ``python -O`` and match the style of the sibling load tests.
    self.assertIsNone(data_engineering.get_features())
    data_engineering.load_data(self.csv_path)
    data_engineering.set_features(features)
    self.assertEqual(data_engineering.get_features(), features)
def test_load_data(self):
    """After ``load_data(self.csv_path)``, ``get_data()`` must not be ``None``."""
    engine = DataEngineering()
    engine.load_data(self.csv_path)
    loaded = engine.get_data()
    self.assertIsNotNone(loaded)
def test_split_data(self):
    """Check that ``split_data`` leaves all four train/test attributes set.

    Two derived columns ("antiguedad" and "flujo") are added to the loaded
    data, the label and features are configured, and ``split_data()`` is
    called. Afterwards ``x_train``, ``x_test``, ``y_train`` and ``y_test``
    must all be non-``None``.
    """
    features = [
        "flujo",
        "NU_COORD_UTM ESTE",
        "NU_COORD_UTM NORTE",
        "°API",
        "antiguedad",
    ]
    label = "BBPD"

    data_engineering = DataEngineering()
    data_engineering.load_data(self.csv_path)
    data = data_engineering.get_data()

    # "antiguedad": difference between the largest "año" value and each row's.
    max_date = data["año"].max()
    age = max_date - data["año"]
    data_engineering.add_column("antiguedad", age)

    # "flujo": category codes of the "E_FLUJO" column.
    flow_data = data["E_FLUJO"].copy().astype("category").cat.codes
    data_engineering.add_column("flujo", flow_data)

    data_engineering.set_label(label)
    data_engineering.set_features(features)
    data_engineering.split_data()

    # TestCase assertions are used instead of bare ``assert`` so the checks
    # survive ``python -O`` and match the style of the sibling load tests.
    self.assertIsNotNone(data_engineering.x_train)
    self.assertIsNotNone(data_engineering.x_test)
    self.assertIsNotNone(data_engineering.y_train)
    self.assertIsNotNone(data_engineering.y_test)
def test_load_data_empty(self):
    """With nothing loaded, ``get_data()`` on a fresh instance must be ``None``."""
    untouched = DataEngineering()
    current_data = untouched.get_data()
    self.assertIsNone(current_data)
from model.utils.data_engineering import DataEngineering
from model.prediction_model.regression import Regression

# Load the CSV into a DataEngineering instance, then run its cleaning step.
csv_path = "data/area_01.csv"
data_e = DataEngineering()
data_e.load_data(csv_path)
data_e.clean_data()

# Derived feature "age": the largest "año" value minus each row's "año".
max_date = data_e.get_data()["año"].max()
age = max_date - data_e.get_data()["año"]
data_e.add_column("age", age)

# Derived feature "flow": category codes of the "E_FLUJO" column.
flow_data = (
    data_e.get_data()["E_FLUJO"]
    .copy()
    .astype("category")
    .cat.codes
)
data_e.add_column("flow", flow_data)

# Choose the target column (label) and the input columns (features).
label = "BBPD"
features = [
    "flow",
    "NU_COORD_UTM ESTE",
    "NU_COORD_UTM NORTE",
    "°API",
    "age",
]
data_e.set_features(features)
data_e.set_label(label)

# Produce the train/test split.
data_e.split_data()

# Build the regression model from the prepared DataEngineering instance.
model = Regression(data_e)