def train_model(config_path: Text, base_config_path: Text):
    """Train a model from a train-stage config plus the shared base config.

    Args:
        config_path {Text}: path to the train-stage config
        base_config_path {Text}: path to the base (shared) config
    """
    # Context managers ensure the config file handles are closed promptly
    # (the original `yaml.load(open(...))` leaked them).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    with open(base_config_path) as base_config_file:
        base_config = yaml.load(base_config_file, Loader=yaml.FullLoader)

    estimator_name = config['estimator_name']
    param_grid = config['estimators'][estimator_name]['param_grid']
    cv = config['cv']
    class_weight = config['class_weight']
    target_column = base_config['featurize']['target_column']
    train_df = get_dataset(base_config['split_train_test']['train_csv'],
                           base_config['featurize']['dataset_sep'])
    valid_df = get_dataset(base_config['split_train_test']['valid_csv'],
                           base_config['featurize']['dataset_sep'])
    sampler = base_config['split_train_test']['sampler']
    sampler_rate = base_config['split_train_test']['sampler_rate']

    # TODO: exercise placeholder — the training code must assign `model`
    # (a fitted search object; `best_score_` is read below) before use.
    ''' Your code is here !!! '''

    print(model.best_score_)  # NOTE(review): `model` is undefined until the placeholder is filled in

    model_name = base_config['base']['model']['model_name']
    models_folder = base_config['base']['model']['models_folder']
    joblib.dump(model, os.path.join(models_folder, model_name))
def train_model(config_path: Text):
    """Train a model as described by a single pipeline config and save it.

    Args:
        config_path {Text}: path to the pipeline config
    """
    # `with` closes the config handle; the original `open(...)` without
    # a close leaked the file descriptor.
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    estimator_name = config['train']['estimator_name']
    param_grid = config['train']['estimators'][estimator_name]['param_grid']
    cv = config['train']['cv']
    target_column = config['featurize']['target_column']
    train_df = get_dataset(config['split_train_test']['train_csv'])

    model = train(
        df=train_df,
        target_column=target_column,
        estimator_name=estimator_name,
        param_grid=param_grid,
        cv=cv
    )
    # `train` appears to return a fitted search object exposing best_score_
    print(model.best_score_)

    model_name = config['base']['model']['model_name']
    models_folder = config['base']['model']['models_folder']
    joblib.dump(
        model,
        os.path.join(models_folder, model_name)
    )
def evaluate_model(config_path: Text, base_config_path: Text):
    """Evaluate the saved model on the test split, write a metrics report,
    and log results to the experiment tracker.

    Args:
        config_path {Text}: path to the evaluate-stage config
        base_config_path {Text}: path to the base (shared) config
    """
    # Close config handles deterministically (original leaked them).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    with open(base_config_path) as base_config_file:
        base_config = yaml.load(base_config_file, Loader=yaml.FullLoader)

    target_column = base_config['featurize']['target_column']
    test_df = get_dataset(base_config['split_train_test']['test_csv'],
                          base_config['featurize']['dataset_sep'])
    model_name = base_config['base']['model']['model_name']
    models_folder = base_config['base']['model']['models_folder']
    model = joblib.load(os.path.join(models_folder, model_name))

    # TODO: exercise placeholder — compute `f1`, `precision`, `recall`,
    # `accuracy` and the confusion matrix `cm` (all read below).
    ''' Your code is here !!! '''

    test_report = {
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy
    }
    filepath = os.path.join(
        base_config['base']['experiments']['experiments_folder'],
        config['metrics_file'])
    # Write via a context manager so the report is flushed and the handle
    # closed even on error (original used json.dump(fp=open(...)) and leaked).
    with open(filepath, 'w') as report_file:
        json.dump(obj=test_report, fp=report_file, indent=2)

    # TODO: exercise placeholder — create the `experiment` tracking object
    # (presumably a Comet-style experiment; verify against the caller).
    ''' Your code is here !!! '''

    experiment.log_confusion_matrix(matrix=cm.tolist())
    experiment.log_metrics(test_report)
    experiment.log_dataset_hash(test_df)
def featurize(config_path: Text):
    """Extract features from the raw dataset and write the featured CSV.

    Args:
        config_path {Text}: path to the featurize-stage config
    """
    # `with` closes the config handle (original `open(...)` leaked it).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    dataset = get_dataset(config['dataset_csv'], config['dataset_sep'])
    featured_dataset = extract_features(dataset,
                                        config['features_columns_range'],
                                        config['target_column'])
    # Fixed local typo: filpath -> filepath
    filepath = config['featured_dataset_csv']
    featured_dataset.to_csv(filepath, index=False, sep=config['dataset_sep'])
def featurize(config_path: Text):
    """Extract features from the raw dataset and write the featured CSV.

    Args:
        config_path {Text}: path to the featurize-stage config
    """
    # `with` closes the config handle (original `open(...)` leaked it).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    dataset = get_dataset(config['dataset_csv'])
    featured_dataset = extract_features(dataset)
    # Fixed local typo: filpath -> filepath
    filepath = config['featured_dataset_csv']
    featured_dataset.to_csv(filepath, index=False)
def data_load(config_path: Text) -> None:
    """Load raw data and persist it as a CSV file.

    Args:
        config_path {Text}: path to config
    """
    cfg = load_config(config_path)
    raw_dataset = get_dataset()
    out_csv = cfg.data_load.dataset_csv
    raw_dataset.to_csv(out_csv, index=False)
    print(f'Data saved to: {out_csv}')
def split_dataset(config_path: Text, base_config_path: Text):
    """Split the featured dataset into train/test/valid CSVs.

    Args:
        config_path {Text}: path to the split-stage config
        base_config_path {Text}: path to the base (shared) config
    """
    # Close config handles deterministically (original leaked them).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    with open(base_config_path) as base_config_file:
        base_config = yaml.load(base_config_file, Loader=yaml.FullLoader)

    dataset = get_dataset(base_config['featurize']['featured_dataset_csv'],
                          base_config['featurize']['dataset_sep'])
    random_state = base_config['base']['random_state']
    base_dataset = get_dataset(base_config['featurize']['dataset_csv'],
                               base_config['featurize']['dataset_sep'])
    base_dataset = base_dataset[base_config['featurize']
                                ['features_columns_range']]
    # Numeric feature columns of the raw dataset (selected but only used
    # inside the placeholder below).
    var_numeric = base_dataset.select_dtypes([np.number]).columns

    test_size = config['test_size']
    valid_size = config['valid_size']
    train_csv_path = config['train_csv']
    test_csv_path = config['test_csv']
    valid_csv_path = config['valid_csv']

    # TODO: exercise placeholder — must assign `train_dataset`,
    # `test_dataset`, and `valid_dataset` (all written to CSV below).
    ''' Your code is here !!! '''

    train_dataset.to_csv(train_csv_path,
                         sep=base_config['featurize']['dataset_sep'],
                         encoding='utf-8', index=False)
    test_dataset.to_csv(test_csv_path,
                        sep=base_config['featurize']['dataset_sep'],
                        encoding='utf-8', index=False)
    valid_dataset.to_csv(valid_csv_path,
                         sep=base_config['featurize']['dataset_sep'],
                         encoding='utf-8', index=False)
def evaluate_model(config_path: Text):
    """Evaluate the saved model on the test split, write the metrics file,
    and log parameters/metrics/artifacts to MLflow.

    Args:
        config_path {Text}: path to the pipeline config
    """
    # `with` closes the config handle (original `open(...)` leaked it).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    estimator_name = config['train']['estimator_name']
    target_column = config['featurize']['target_column']
    test_df = get_dataset(config['split_train_test']['test_csv'])
    model_name = config['base']['model']['model_name']
    models_folder = config['base']['model']['models_folder']
    model = joblib.load(os.path.join(models_folder, model_name))

    f1, cm = evaluate(df=test_df, target_column=target_column, clf=model)
    test_report = {'f1_score': f1, 'confusion_matrix': cm.tolist()}
    print(test_report)

    filepath = os.path.join(
        config['base']['experiments']['experiments_folder'],
        config['evaluate']['metrics_file'])
    # Write via a context manager so the report is flushed and closed
    # (original json.dump(fp=open(...)) leaked the write handle).
    with open(filepath, 'w') as report_file:
        json.dump(obj=test_report, fp=report_file, indent=2)

    # Logging into mlflow
    client = MlflowClient()
    # NOTE(review): MlflowClient.list_experiments is deprecated in newer
    # mlflow releases (search_experiments replaces it) — confirm the pinned
    # mlflow version before upgrading.
    experiments = client.list_experiments(
    )  # returns a list of mlflow.entities.Experiment
    mlflow.set_experiment(estimator_name)
    print(experiments)

    with mlflow.start_run() as run:
        print(run)
        print(run.info)
        print(run.info.run_uuid)
        param_grid = config['train']['estimators'][estimator_name][
            'param_grid']
        log_param(key='estimator', value=estimator_name)
        log_param(key='cv', value=config['train']['cv'])
        for param, value in param_grid.items():
            log_param(key=param, value=value)
        log_metric(key='f1_score', value=f1)
        log_artifact(local_path='models/model.joblib')
        log_artifact(local_path='src/features/features.py')
        log_artifact(local_path='src/train/train.py')
        log_artifact(local_path='src/pipelines/train.py')
def split_dataset(config_path: Text):
    """Split the featured dataset into train/test CSVs.

    Args:
        config_path {Text}: path to the pipeline config
    """
    # `with` closes the config handle (original `open(...)` leaked it).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)

    dataset = get_dataset(config['featurize']['featured_dataset_csv'])
    target_column = config['featurize']['target_column']
    random_state = config['base']['random_state']
    test_size = config['split_train_test']['test_size']
    train_csv_path = config['split_train_test']['train_csv']
    test_csv_path = config['split_train_test']['test_csv']

    dataset = transform_targets_to_numerics(dataset,
                                            target_column=target_column)
    train_dataset, test_dataset = split_dataset_in_train_test(
        dataset, test_size=test_size, random_state=random_state)

    train_dataset.to_csv(train_csv_path, index=False)
    test_dataset.to_csv(test_csv_path, index=False)
def evaluate_model(config_path: Text, base_config_path: Text):
    """Evaluate the saved model on the test split and write a metrics report.

    Args:
        config_path {Text}: path to the evaluate-stage config
        base_config_path {Text}: path to the base (shared) config
    """
    # Close config handles deterministically (original leaked them).
    with open(config_path) as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    with open(base_config_path) as base_config_file:
        base_config = yaml.load(base_config_file, Loader=yaml.FullLoader)

    target_column = base_config['featurize']['target_column']
    test_df = get_dataset(base_config['split_train_test']['test_csv'])
    model_name = base_config['base']['model']['model_name']
    models_folder = base_config['base']['model']['models_folder']
    model = joblib.load(os.path.join(models_folder, model_name))

    f1, cm = evaluate(df=test_df, target_column=target_column, clf=model)
    test_report = {
        'f1_score': f1,
        'confusion_matrix': cm.tolist()
    }
    print(test_report)

    filepath = os.path.join(
        base_config['base']['experiments']['experiments_folder'],
        config['metrics_file'])
    # Write via a context manager so the report is flushed and closed
    # (original json.dump(fp=open(...)) leaked the write handle).
    with open(filepath, 'w') as report_file:
        json.dump(obj=test_report, fp=report_file, indent=2)