def main(artifact_location: str, production_ready: bool = False) -> None:
    """Train the model and persist artifacts to the requested location.

    Args:
        artifact_location: Raw value convertible to ``ArtifactLocation``
            (e.g. local / S3 / S3+MLflow), selecting where trained
            artifacts are stored.
        production_ready: Only used for the MLflow location — when True the
            run is tagged live, otherwise it is tagged as a candidate.
    """
    art_loc = ArtifactLocation(artifact_location)
    data_dict = load_and_preprocess_data(art_loc)

    if art_loc == ArtifactLocation.LOCAL:
        _ = train_and_persist(data_dict)
    elif art_loc == ArtifactLocation.S3:
        _ = train_and_persist(data_dict)
        s3 = boto3.client("s3")
        # Upload both persisted artifacts under the same S3 prefix.
        for artifact in (
            Config.FEATURE_ENGINEERING_ARTIFACT,
            Config.CLASSIFIER_ARTIFACT,
        ):
            s3.upload_file(
                f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{artifact}",
                Bucket=Config.BUCKET_NAME,
                Key=f"{Config.S3_ARTIFACTS_DIR}/{artifact}",
            )
    # BUG FIX: the original wrote `elif ArtifactLocation.S3_MLFLOW:` which
    # tests the truthiness of the enum member itself (always true), so this
    # branch ran for ANY location not matched above. Compare against art_loc.
    elif art_loc == ArtifactLocation.S3_MLFLOW:
        mlflow.set_tracking_uri(Config.TRACKING_URI)
        # MLflow experiment tracking
        with mlflow.start_run(experiment_id=Config.EXPERIMENT_ID):
            training_metadata = train_and_persist(data_dict)
            logging.info(mlflow.get_artifact_uri())
            for k, v in training_metadata["params"][
                    "feature_engineering"].items():
                mlflow.log_param(str(k), str(v))
            for k, v in training_metadata["params"]["classifier"].items():
                mlflow.log_param(str(k), str(v))
            mlflow.log_metric("training accuracy",
                              training_metadata["accuracy"]["train"])
            mlflow.log_metric("test accuracy",
                              training_metadata["accuracy"]["test"])
            mlflow.log_artifact(
                f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{Config.FEATURE_ENGINEERING_ARTIFACT}"
            )
            mlflow.log_artifact(
                f"{os.getcwd()}/{Config.LOCAL_ARTIFACTS_PATH}/{Config.CLASSIFIER_ARTIFACT}"
            )
            if production_ready:
                mlflow.set_tag(Config.LIVE_TAG, 1)
            else:
                mlflow.set_tag(Config.LIVE_TAG, 0)
                mlflow.set_tag(Config.CANDIDATE_TAG, 1)
        # When running in Github actions set EXPERIMENT_ID as env
        # for consumption by the subsequent step
        print(f"::set-output name=EXPERIMENT_ID::{Config.EXPERIMENT_ID}")
from sklearn.tree import export_graphviz
import pydot
from sklearn.preprocessing import normalize
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_selection import RFE

sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

# Load the preprocessed feature matrix, feature names and labels.
features, features_list, labels = utils.load_and_preprocess_data()

# Using scikit-learn to split data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=0
)

# Toggle class rebalancing of the training split via SMOTE oversampling.
isSMOTE = True
if isSMOTE:
    # BUG FIX: the original bound the sampler to the name `os`, shadowing the
    # `os` stdlib module, and called `fit_sample`, which was renamed to
    # `fit_resample` (deprecated in imbalanced-learn 0.4, removed in 0.8).
    smote = SMOTE(random_state=0)
    os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
    X_train = pd.DataFrame(data=os_data_X)
    y_train = pd.DataFrame(data=os_data_y)
    print("length of oversampled data is ", len(os_data_X))
    print(y_train[0].value_counts())