Example No. 1
class FlowSpec(MetaFlowSpec):
    auth_file = IncludeFile('auth_file',
                            is_text=True,
                            help='My input',
                            default=Configuration.auth["file"])
    auth_env = Parameter('auth_env',
                         type=str,
                         default=Configuration.auth["env"])

    def databases(self):
        for auth_type in [self.auth_file, self.auth_env]:
            try:
                return Databases(connections=json.loads(auth_type))
            except (TypeError, ValueError):  # missing or malformed credentials
                continue
        else:
            raise AttributeError("No authentication provided")

    @classmethod
    def hacky_run(cls):
        cmd = [
            'python',
            inspect.getfile(cls),  # The name of the file to be run
            '--no-pylint',
            'run',
        ]
        result = subprocess.run(cmd, capture_output=False)
        return result.returncode == 0
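A minimal usage sketch for the helper above, assuming a concrete subclass (the name MyFlow and its module are hypothetical); hacky_run() shells out to the Metaflow CLI and reports success via the return code:

if __name__ == '__main__':
    # MyFlow is a hypothetical subclass of the FlowSpec base class shown above.
    succeeded = MyFlow.hacky_run()
    print('Flow finished successfully' if succeeded else 'Flow failed')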
Example No. 2
class Download(GoraniFlowSpec):
    """
    Downloads the data.

    Attributes
    ----------------------
    logs: list[EventLog]
        Log data. EventLog is a shared type.
    users: Dict[str, dict]
        Users.
    books: List[Book]
        Books.
    vocab_skills: List[VocabSkill]
        Vocabulary set information.
    """

    data_file = IncludeFile('data',
                            is_text=False,
                            help='Raw Data File',
                            default='./data.msgpack')

    @step
    def start(self):
        data = msgpack.unpackb(self.data_file)
        self.logs = data['logs']
        self.users = data['users']

        self.books = [Book.from_dict(book) for book in data['books']]
        self.vocab_skills = [VocabSkill(**vc) for vc in data['vocab_skills']]

        self.next(self.end)

    @step
    def end(self):
        pass
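Because `data` is declared with `is_text=False`, Metaflow hands the included file to the step as raw bytes, which is exactly what `msgpack.unpackb` expects. A hedged sketch of producing a compatible data.msgpack file (the field contents are placeholders, not real data):

import msgpack

payload = {'logs': [], 'users': {}, 'books': [], 'vocab_skills': []}
with open('data.msgpack', 'wb') as fh:
    fh.write(msgpack.packb(payload))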
Example No. 3
class WineFlow(FlowSpec):
    """
    A flow to read in the wine dataset -- see src.data.download_raw.py
    https://archive.ics.uci.edu/ml/datasets/Wine
    """

    raw_data_path = PROJECT_DIR / "data" / "raw" / "wine.csv"
    raw_wine_data = IncludeFile(
        "wine_data",
        help="These data are the results of a chemical analysis of wines grown in the same region in Italy but derived from three different cultivars. The analysis determined the quantities of 13 constituents found in each of the three types of wines.",
        default=raw_data_path,
    )

    @step
    def start(self):
        """
        Read data/raw/wine.csv into a pandas df
        """
        import pandas as pd
        from io import StringIO

        self.dataframe = pd.read_csv(StringIO(self.raw_wine_data))
        self.next(self.rename_cols)

    @step
    def rename_cols(self):
        """
        Give the wine data some computer friendlier column names
        Save to "data/interim/wine_df_nice_cols.csv"
        """
        # could not find out what this means! All google hits were for this dataset
        self.dataframe = self.dataframe.rename(
            {"OD280/OD315 of diluted wines": "od280_over_od315"}, axis="columns"
        )
        self.dataframe.columns = [
            col.lower().replace(" ", "_") for col in self.dataframe.columns
        ]

        save_path = PROJECT_DIR / "data" / "interim" / "wine_df_nice_cols.csv"
        self.dataframe.to_csv(save_path, index=False)

        self.next(self.end)

    @step
    def end(self):
        """
        Save the final version to "data/processed/wine_df_final.csv"
        End the flow.
        """
        save_path = PROJECT_DIR / "data" / "processed" / "wine_df_final.csv"
        self.dataframe.to_csv(save_path, index=False)
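Because `self.dataframe` is assigned inside the steps, it is also persisted as a run artifact; a hedged sketch of reading it back later with the Metaflow client API:

from metaflow import Flow

run = Flow('WineFlow').latest_successful_run
print(run.data.dataframe.head())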
Example No. 4
class ProphetFlow(FlowSpec):
    """
    ProphetFlow uses Facebook Prophet to predict future values of a
    time series.
    """
    data_file = IncludeFile('datafile',
                            is_text=True,
                            help='Time series data file - csv file format',
                            default='data/daily-min-temperatures.txt')
    columns_mapping = Parameter(
        'columns',
        default={
            'Date': 'ds',
            'Temp': 'y'
        },
        help="Rename columns according to Prophet standards")

    @step
    def start(self):
        """
        Raw data is loaded and prepared
        """
        # Load csv in pandas dataframe
        self.df = pd.read_csv(StringIO(self.data_file))

        # Rename columns to meet Prophet input dataframe standards
        self.df.rename(columns=self.columns_mapping, inplace=True)

        # Convert Date column to datetime64 dtype
        self.df['ds'] = pd.to_datetime(self.df['ds'],
                                       infer_datetime_format=True)

        self.next(self.train)

    @step
    def train(self):
        """
        A new Prophet model is fitted.
        """
        # Fit a new model using defaults
        self.m = Prophet()
        self.m.fit(self.df)

        self.next(self.end)

    @step
    def end(self):
        """
        Last step, process is finished
        """
        print("ProphetFlow is all done.")
Example No. 5
class CSVFileFlow(FlowSpec):

    data = IncludeFile('csv', help="CSV file to be parsed", is_text=True)

    delimiter = Parameter('delimiter', help="delimiter", default=',')

    @step
    def start(self):
        fileobj = StringIO(self.data)
        for i, row in enumerate(csv.reader(fileobj, delimiter=self.delimiter)):
            print("row %d: %s" % (i, row))
        self.next(self.end)

    @step
    def end(self):
        print('done!')
Example No. 6
class IrisFlow(FlowSpec):
    """
    A Flow to train two Iris dataset models and combine them for inference with Tempo

    The flow performs the following steps:

    1) Load Iris Data
    2) Train SKLearn LR Model
    3) Train XGBoost LR Model
    4) Create and deploy Tempo artifacts
    """

    conda_env = IncludeFile(
        "conda_env",
        help="The path to conda environment for classifier",
        default=script_path("conda.yaml"))
    kubeconfig = IncludeFile("kubeconfig",
                             help="The path to kubeconfig",
                             default=script_path("kubeconfig.yaml"))
    gsa_key = IncludeFile("gsa_key",
                          help="The path to google service account json",
                          default=script_path("gsa-key.json"))
    k8s_provider = Parameter(
        "k8s_provider",
        help="kubernetes provider. Needed for non local run to deploy",
        default="gke")
    eks_cluster_name = Parameter("eks_cluster_name",
                                 help="AWS EKS cluster name (if using EKS)",
                                 default="")

    @conda(libraries={"scikit-learn": "0.24.1"})
    @step
    def start(self):
        """
        Download the Iris classification dataset
        """
        # pylint: disable=no-member
        from sklearn import datasets

        iris = datasets.load_iris()
        self.X = iris.data
        self.y = iris.target
        self.next(self.train_sklearn, self.train_xgboost)

    @conda(libraries={"scikit-learn": "0.24.1"})
    @step
    def train_sklearn(self):
        """
        Train an SKLearn logistic regression classifier on the dataset and save the model as an artifact
        """
        from joblib import dump
        from sklearn.linear_model import LogisticRegression

        lr = LogisticRegression(C=1e5)
        lr.fit(self.X, self.y)
        dump(lr, script_path("model.joblib"))
        with open(script_path("model.joblib"), "rb") as fh:
            self.buffered_lr_model = fh.read()

        self.next(self.join)

    @conda(libraries={"xgboost": "1.4.0"})
    @step
    def train_xgboost(self):
        """
        Train an XGBoost classifier on the dataset and save model as artifact
        """
        from xgboost import XGBClassifier

        xgb = XGBClassifier()
        xgb.fit(self.X, self.y)
        xgb.save_model(script_path("model.bst"))
        with open(script_path("model.bst"), "rb") as fh:
            self.buffered_xgb_model = fh.read()
        self.next(self.join)

    @step
    def join(self, inputs):
        """
        Merge two training runs.
        """
        self.merge_artifacts(inputs)

        self.next(self.tempo)

    def create_tempo_artifacts(self):
        import tempfile

        from deploy import get_tempo_artifacts

        from tempo.metaflow.utils import create_s3_folder, save_artifact, save_pipeline_with_conda, upload_s3_folder

        # Store models to local artifact locations
        local_sklearn_path = save_artifact(self.buffered_lr_model,
                                           "model.joblib")
        local_xgb_path = save_artifact(self.buffered_xgb_model, "model.bst")
        local_pipeline_path = tempfile.mkdtemp()
        # Create S3 folders for artifacts
        classifier_url = create_s3_folder(self, PIPELINE_FOLDER_NAME)
        sklearn_url = create_s3_folder(self, SKLEARN_FOLDER_NAME)
        xgboost_url = create_s3_folder(self, XGBOOST_FOLDER_NAME)

        classifier, sklearn_model, xgboost_model = get_tempo_artifacts(
            local_sklearn_path, local_xgb_path, local_pipeline_path,
            sklearn_url, xgboost_url, classifier_url)
        # Create pipeline artifacts
        save_pipeline_with_conda(classifier, local_pipeline_path,
                                 self.conda_env)
        if classifier_url:  # Check running with S3 access
            # Upload artifacts to S3
            upload_s3_folder(self, PIPELINE_FOLDER_NAME, local_pipeline_path)
            upload_s3_folder(self, SKLEARN_FOLDER_NAME, local_sklearn_path)
            upload_s3_folder(self, XGBOOST_FOLDER_NAME, local_xgb_path)
            return classifier, True
        else:
            return classifier, False

    def deploy_tempo_local(self, classifier):
        import time

        import numpy as np

        from tempo import deploy_local
        from tempo.serve.deploy import get_client

        remote_model = deploy_local(classifier)
        self.client_model = get_client(remote_model)
        time.sleep(10)
        print(self.client_model.predict(np.array([[1, 2, 3, 4]])))

    def deploy_tempo_remote(self, classifier):
        import time

        import numpy as np

        from tempo import deploy_remote
        from tempo.metaflow.utils import aws_authenticate, gke_authenticate
        from tempo.serve.deploy import get_client
        from tempo.serve.metadata import SeldonCoreOptions

        if self.k8s_provider == "gke":
            gke_authenticate(self.kubeconfig, self.gsa_key)
        elif self.k8s_provider == "aws":
            aws_authenticate(self.eks_cluster_name)
        else:
            raise Exception(f"Unknown Kubernetes Provider {self.k8s_provider}")

        runtime_options = SeldonCoreOptions(
            **{
                "remote_options": {
                    "namespace": "production",
                    "authSecretName": "s3-secret"
                }
            })

        remote_model = deploy_remote(classifier, options=runtime_options)
        self.client_model = get_client(remote_model)
        time.sleep(10)
        print(self.client_model.predict(np.array([[1, 2, 3, 4]])))

    @conda(libraries={"numpy": "1.19.5"})
    @pip(libraries={"mlops-tempo": "0.5.1", "conda_env": "2.4.2"})
    @step
    def tempo(self):
        """
        Create Tempo artifacts locally and save them to S3 within the workflow bucket.
        Then either deploy locally to Docker or to a remote Kubernetes cluster, depending on
        whether S3 access is available and the step is running on AWS Batch.
        """
        from tempo.metaflow.utils import running_aws_batch

        classifier, s3_active = self.create_tempo_artifacts()
        if s3_active and running_aws_batch(self.tempo):
            print("Deploying to remote k8s cluster")
            self.deploy_tempo_remote(classifier)
        else:
            print("Deploying to local Docker")
            self.deploy_tempo_local(classifier)

        self.next(self.end)

    @step
    def end(self):
        """
        End flow.
        """
        pass
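This flow (and several later examples) relies on a script_path helper that is not shown in the excerpts. A hedged sketch of the conventional definition from the Metaflow tutorials, which resolves IncludeFile defaults relative to the flow script rather than the current working directory:

import os

def script_path(filename):
    # Resolve `filename` relative to the directory that contains this script.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)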
Example No. 7
class TrainUniRep(FlowSpec):
    local_file = IncludeFile('local_file',
                             is_text=True,
                             help='Input data',
                             default=None)
    s3_file = Parameter('s3_file',
                        help='File on S3',
                        default='s3://' + BUCKET +
                        '/data/mhci binding affinity.csv')
    weights_path = Parameter('weights_path',
                             help='Location of weights on S3',
                             default='s3://' + BUCKET +
                             '/models/UniRep/base64/')
    batch_size = Parameter('batch_size', help='Batch size', default='256')
    end_to_end = Parameter('end_to_end',
                           help='Train end to end',
                           default='true')
    learning_rate = Parameter('learning_rate',
                              help='Learning rate',
                              default='0.001')

    @step
    def start(self):
        self.begin = datetime.now()

        import pandas as pd
        import unirep_tools as ut

        # Load data
        if self.local_file:
            self.file_name = ''
            df = pd.read_csv(StringIO(self.local_file), index_col=False)
        elif self.s3_file:
            self.file_name = self.s3_file.split('/')[-1]
            with S3() as s3:
                s3obj = s3.get(self.s3_file)
                df = pd.read_csv(s3obj.path, index_col=False)
        seqs = df.iloc[:, 0].values
        vals = df.iloc[:, 1].values

        # Load model metadata (determine model size)
        with S3() as s3:
            s3obj = s3.get(join(self.weights_path, 'metadata.json'))
            metadata = json.loads(s3obj.text)

        # Record to registry
        features = df.columns.tolist()[1:]
        if len(features) > 1: features = ','.join(features)
        else: features = features[0]
        self.meta = {
            "flow": current.flow_name,
            "id": current.run_id,
            "data_file": self.file_name,
            "features": features,
            "size": metadata['size'],
            "start": self.begin
        }
        write_row(self.meta, table="training")

        # Train model
        loss, save_path = ut.fit(
            seqs,
            vals,
            weights_path=self.weights_path,
            batch_size=int(self.batch_size),
            model_size=self.meta['size'],
            end_to_end=True if self.end_to_end.lower() == 'true' else False,
            learning_rate=float(self.learning_rate),
            save_path=SAVE_PATH)

        # Save metadata file
        print('Saving: ' + join(save_path, 'metadata.json'))
        self.meta['finish'] = datetime.now()
        self.meta['mse'] = loss[-1]
        with open(join(save_path, 'metadata.json'), 'w') as outfile:
            row = self.meta.copy()
            row['start'] = row['start'].strftime('%m/%d/%y %H:%M:%S')
            row['finish'] = row['finish'].strftime('%m/%d/%y %H:%M:%S')
            row['mse'] = str(row['mse'])
            json.dump(row, outfile)

        # Copy outputs to S3
        print('Copying output files to: ' + 's3://' + BUCKET + '/models/' +
              current.flow_name + '/' + current.run_id + '/')
        saved_files = [
            f for f in listdir(save_path) if isfile(join(save_path, f))
        ]
        file_paths = [join(save_path, f) for f in saved_files]
        put_files = tuple(zip(saved_files, file_paths))
        with S3(s3root='s3://' + BUCKET + '/models/' + current.flow_name +
                '/' + current.run_id + '/') as s3:
            s3.put_files(put_files)

        # Adjust registries
        print('Adjusting registry tables')
        delete_row({k: self.meta[k]
                    for k in ['flow', 'id'] if k in self.meta},
                   table="training")  # delete from training registry
        write_row({k: self.meta[k]
                   for k in model_columns if k in self.meta},
                  table="models")  # append to trained registry

        self.next(self.end)

    @step
    def end(self):
        print('TrainUniRep has finished.')
Example No. 8
class PlayListFlow(FlowSpec):
    """
    A flow to help you build your favorite movie playlist.

    The flow performs the following steps:
    1) Ingests a CSV file containing metadata about movies.
    2) Loads two of the columns from the CSV into python lists.
    3) In parallel branches:
       - A) Filters movies by the genre parameter.
       - B) Chooses a random movie from a different genre.
    4) Displays the top entries from the playlist.

    """

    movie_data = IncludeFile(
        "movie_data",
        help="The path to a movie metadata file.",
        default=script_path("movies.csv"),
    )

    genre = Parameter(
        "genre", help="Filter movies for a particular genre.", default="Sci-Fi"
    )

    recommendations = Parameter(
        "recommendations",
        help="The number of movies to recommend in " "the playlist.",
        default=5,
    )

    @step
    def start(self):
        """
        Parse the CSV file and load the values into a dictionary of lists.

        """
        # For this example, we only need the movie title and the genres.
        columns = ["movie_title", "genres"]

        # Create a simple data frame as a dictionary of lists.
        self.dataframe = dict((column, list()) for column in columns)

        # Parse the CSV header.
        lines = self.movie_data.split("\n")
        header = lines[0].split(",")
        idx = {column: header.index(column) for column in columns}

        # Populate our dataframe from the lines of the CSV file.
        for line in lines[1:]:
            if not line:
                continue

            fields = line.rsplit(",", 4)
            for column in columns:
                self.dataframe[column].append(fields[idx[column]])

        # Compute genre specific movies and a bonus movie in parallel.
        self.next(self.bonus_movie, self.genre_movies)

    @step
    def bonus_movie(self):
        """
        This step chooses a random movie from a different genre.

        """
        from random import choice

        # Find all the movies that are not in the provided genre.
        movies = [
            (movie, genres)
            for movie, genres in zip(
                self.dataframe["movie_title"], self.dataframe["genres"]
            )
            if self.genre.lower() not in genres.lower()
        ]

        # Choose one randomly.
        self.bonus = choice(movies)

        self.next(self.join)

    @step
    def genre_movies(self):
        """
        Filter the movies by genre.

        """
        from random import shuffle

        # Find all the movies titles in the specified genre.
        self.movies = [
            movie
            for movie, genres in zip(
                self.dataframe["movie_title"], self.dataframe["genres"]
            )
            if self.genre.lower() in genres.lower()
        ]

        # Randomize the title names.
        shuffle(self.movies)

        self.next(self.join)

    @step
    def join(self, inputs):
        """
        Join our parallel branches and merge results.

        """
        # Reassign relevant variables from our branches.
        self.playlist = inputs.genre_movies.movies
        self.bonus = inputs.bonus_movie.bonus

        self.next(self.end)

    @step
    def end(self):
        """
        Print out the playlist and bonus movie.

        """
        print("Playlist for movies in genre '%s'" % self.genre)
        for pick, movie in enumerate(self.playlist, start=1):
            print("Pick %d: '%s'" % (pick, movie))
            if pick >= self.recommendations:
                break

        print("Bonus Pick: '%s' from '%s'" % (self.bonus[0], self.bonus[1]))
Example No. 9
class GoraniFlowSpec(FlowSpec):
    config_file = IncludeFile('config',
                              is_text=False,
                              help='Config Key File',
                              default='./config.yaml')
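A hedged sketch of how a subclass might consume the included config; MyGoraniFlow is hypothetical, and since is_text=False the artifact arrives as bytes, which yaml.safe_load accepts:

import yaml
from metaflow import step

class MyGoraniFlow(GoraniFlowSpec):

    @step
    def start(self):
        # config_file is raw bytes because is_text=False above.
        self.config = yaml.safe_load(self.config_file)
        self.next(self.end)

    @step
    def end(self):
        pass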
Example No. 10
class RegressionModel(FlowSpec):

    # if a static file is part of the flow, it can be used in any downstream step, gets versioned, etc.
    # https://docs.metaflow.org/metaflow/data#data-in-local-files
    DATA_FILE = IncludeFile('dataset',
                            help='Text File With Regression Numbers',
                            is_text=True,
                            default='dataset.txt')

    # uri from: https://github.com/aws/deep-learning-containers/blob/master/available_images.md
    DOCKER_IMAGE_URI = Parameter(
        name='sagemaker_image',
        help='AWS Docker Image URI for SageMaker Inference',
        default=
        '763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-inference:2.3.0-gpu-py37-cu102-ubuntu18.04'
    )

    # NOTE: this is expensive! Remember to SHUT IT DOWN in your AWS after completing the tutorial!
    SAGEMAKER_INSTANCE = Parameter(
        name='sagemaker_instance',
        help='AWS Instance to Power SageMaker Inference',
        default='ml.p3.2xlarge')

    # this is the name of the IAM role with SageMaker permissions
    # make sure this role has access to the bucket containing the tar file!
    IAM_SAGEMAKER_ROLE = Parameter(name='sagemaker_role',
                                   help='AWS Role for SageMaker',
                                   default='MetaSageMakerRole')

    @step
    def start(self):
        """

        Read the data in, and parallelize model building over two parameter values (in this case, a dummy example varying the learning rate).

        """
        # debug printing - this is from https://docs.metaflow.org/metaflow/tagging
        # to show how information about the current run can be accessed programmatically
        print("flow name: %s" % current.flow_name)
        print("run id: %s" % current.run_id)
        print("username: %s" % current.username)
        # data is an array of lines from the text file containing the numbers
        raw_data = StringIO(self.DATA_FILE).readlines()
        print("Total of {} rows in the dataset!".format(len(raw_data)))
        # cast strings to float and prepare for training
        self.dataset = [[float(_) for _ in d.strip().split('\t')]
                        for d in raw_data]
        print("Raw data: {}, cleaned data: {}".format(raw_data[0].strip(),
                                                      self.dataset[0]))
        # store dataset as train and test set
        split_index = int(len(self.dataset) * 0.8)
        self.train_dataset = self.dataset[:split_index]
        self.test_dataset = self.dataset[split_index:]
        print("Training data: {}, test data: {}".format(
            len(self.train_dataset), len(self.test_dataset)))
        # this is the only MetaFlow-specific part: based on a list of options (here, learning rates)
        # spin up N parallel process, passing the given option to the child process
        self.learning_rates = [0.1, 0.2]
        self.next(self.train_model, foreach='learning_rates')

    # comment out @batch if you want to run the parallel steps locally and not on AWS
    @batch(gpu=1, memory=80000)
    @step
    def train_model(self):
        """

        Train a dummy regression model with Keras (https://www.tensorflow.org/tutorials/keras/regression)
        and use high-performance s3 client from metaflow to store the model tar file for further processing.

        """
        # this is the CURRENT learning rate in the fan-out
        # each copy of this step in the parallelization will have its own value
        self.learning_rate = self.input
        # do some specific import
        import tensorflow as tf
        from tensorflow.keras import layers
        import tarfile
        # build the model
        x_train = np.array([[_[0]] for _ in self.train_dataset])
        y_train = np.array([_[1] for _ in self.train_dataset])
        x_test = np.array([[_[0]] for _ in self.test_dataset])
        y_test = np.array([_[1] for _ in self.test_dataset])
        x_model = tf.keras.Sequential(
            [layers.Dense(input_shape=[
                1,
            ], units=1)])
        # print out models for debug
        print(x_model.summary())
        x_model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=self.learning_rate),
            loss='mean_absolute_error')
        history = x_model.fit(x_train,
                              y_train,
                              epochs=100,
                              validation_split=0.2)
        self.hist = history.history
        # store loss for downstream tasks
        self.results = x_model.evaluate(x_test, y_test)
        print("Test set results: {}".format(self.results))
        # save model: IMPORTANT: TF models need to have a version
        # see: https://github.com/aws/sagemaker-python-sdk/issues/1484
        model_name = "regression-model-{}/1".format(self.learning_rate)
        local_tar_name = 'model-{}.tar.gz'.format(self.learning_rate)
        x_model.save(filepath=model_name)
        # zip keras folder to a single tar file
        with tarfile.open(local_tar_name, mode="w:gz") as _tar:
            _tar.add(model_name, recursive=True)
        # metaflow nice s3 client needs a byte object for the put
        # IMPORTANT: if you're using the metaflow local setup,
        # you have to upload the model to S3 for
        # sagemaker using custom code - replace the metaflow client here with a standard
        # boto call and a target bucket over which you have writing permissions
        # remember to store in self.s3_path the final full path of the model tar file, to be used
        # downstream by sagemaker!
        with open(local_tar_name, "rb") as in_file:
            data = in_file.read()
            with S3(run=self) as s3:
                url = s3.put(local_tar_name, data)
                # print it out for debug purposes
                print("Model saved at: {}".format(url))
                # save this path for downstream reference!
                self.s3_path = url
        # finally join with the other runs
        self.next(self.join_runs)

    @step
    def join_runs(self, inputs):
        """
        Join the parallel runs and merge results into a dictionary.
        """
        # merge results (loss) from runs with different parameters
        self.results_from_runs = {
            inp.learning_rate: {
                'metrics': inp.results,
                'tar': inp.s3_path
            }
            for inp in inputs
        }
        print("Current results: {}".format(self.results_from_runs))
        # pick one according to some logic, e.g. smaller loss (here just pick a random one)
        self.best_learning_rate = choice(list(self.results_from_runs.keys()))
        self.best_s3_model_path = self.results_from_runs[
            self.best_learning_rate]['tar']
        # next, deploy
        self.next(self.deploy)

    @step
    def deploy(self):
        """
        Use SageMaker to deploy the model as a stand-alone, PaaS endpoint, with our choice of the underlying
        Docker image and hardware capabilities.

        Available images for inferences can be chosen from AWS official list:
        https://github.com/aws/deep-learning-containers/blob/master/available_images.md

        Once the endpoint is deployed, you can add a further step with for example behavioral testing, to
        ensure model robustness (e.g. see https://arxiv.org/pdf/2005.04118.pdf). Here, we just "prove" that
        the endpoint is up and running!

        """
        from sagemaker.tensorflow import TensorFlowModel
        # generate a signature for the endpoint, using learning rate and timestamp as a convention
        ENDPOINT_NAME = 'regression-{}-endpoint'.format(
            int(round(time.time() * 1000)))
        # print out the name, so that we can use it when deploying our lambda
        print("\n\n================\nEndpoint name is: {}\n\n".format(
            ENDPOINT_NAME))
        model = TensorFlowModel(model_data=self.best_s3_model_path,
                                image_uri=self.DOCKER_IMAGE_URI,
                                role=self.IAM_SAGEMAKER_ROLE)
        predictor = model.deploy(initial_instance_count=1,
                                 instance_type=self.SAGEMAKER_INSTANCE,
                                 endpoint_name=ENDPOINT_NAME)
        # run a small test against the endpoint
        # pick a number for X and check the predicted Y is sensible
        input = {'instances': np.array([[0.57457947234]])}
        # output is of the form {'predictions': [[10.879798]]}
        result = predictor.predict(input)
        print(input, result)
        assert result['predictions'][0][0] > 0
        self.next(self.end)

    @step
    def end(self):
        """
        The final step is empty here, but cleanup operations and/or hooks for downstream
        deployment tasks are a natural addition in machine learning DAGs.

        """
        print('Dag ended!')
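The comments in train_model note that, with a local-only Metaflow setup, the S3 client call should be replaced by a plain boto3 upload to a bucket SageMaker can read. A hedged sketch of that replacement (the bucket and key names are assumptions):

import boto3

local_tar_name = 'model-0.1.tar.gz'      # the tar file produced in train_model above
target_bucket = 'my-sagemaker-bucket'    # assumption: a bucket you have write access to
target_key = 'models/' + local_tar_name

boto3.client('s3').upload_file(local_tar_name, target_bucket, target_key)
s3_path = 's3://{}/{}'.format(target_bucket, target_key)  # store this as self.s3_path in the step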
Example No. 11
class NFLStatsFlow(FlowSpec):
    """
    A flow to collect some information about the San Francisco 49ers plays in 2019.

    The flow performs the following steps:
    1) Ingests a CSV into a Pandas Dataframe.
    2) Filter play by play data.
    3) Add additional metrics at the drive and game level.

    """
    nfl_data = IncludeFile(
        "nfl_data",
        help="The path to a nfl play by play metadata file.",
        default=('reg_pbp_2018.csv'))

    @step
    def start(self):
        """
        The start step:
            1) Loads the data into a pandas dataframe and filters to the relevant plays.
            2) Adds the full play type.
            3) Adds rushing/passing specific metrics.
        """
        from io import StringIO
        ########################################################
        ###1) Loads the data into pandas dataframe and filter ##
        ########################################################
        self.nfl_dataframe = pandas.read_csv(StringIO(self.nfl_data))

        # Filter data down to San Francisco 49'ers
        self.san_fran_df = self.nfl_dataframe[ \
            (self.nfl_dataframe.posteam=='SF') & \
            (self.nfl_dataframe.down.isin(range(1,5))) & \
            ((self.nfl_dataframe.play_type=='run') | (self.nfl_dataframe.play_type == 'pass')) & \
            (self.nfl_dataframe.qb_spike==0) & \
            (self.nfl_dataframe.qb_kneel==0)
        ]

        ########################################################
        ################ 2) Add full play type #################
        ########################################################

        from modules import get_full_play_type

        self.san_fran_df = self.san_fran_df.replace(np.nan,
                                                    'unknown',
                                                    regex=True)
        self.san_fran_df['full_play_type'] = self.san_fran_df[[
            'play_type', 'pass_location', 'pass_length', 'run_location'
        ]].apply(get_full_play_type, axis=1)
        self.san_fran_df = self.san_fran_df[(
            self.san_fran_df.full_play_type.isin([
                'pass_left_short', 'pass_left_deep', 'pass_middle_short',
                'pass_middle_deep', 'pass_right_short', 'pass_right_deep',
                'run_left', 'run_middle', 'run_right'
            ]))]

        ########################################################
        ####### 3) Add rushing/passing specific metrics. #######
        ########################################################

        self.san_fran_df['rushing_yards_gained'] = np.where(
            self.san_fran_df['play_type'] == 'run',
            self.san_fran_df['yards_gained'], 0)
        self.san_fran_df['passing_yards_gained'] = np.where(
            self.san_fran_df['play_type'] == 'pass',
            self.san_fran_df['yards_gained'], 0)

        self.next(self.drive_level_index)

    @step
    def drive_level_index(self):
        """
        Create a unique identifier for a drive + game id
        """
        self.san_fran_df['unique_drive'] = self.san_fran_df.apply(
            lambda row: str(row['game_id']) + '_' + str(row['drive']), axis=1)
        self.next(self.drive_level_metrics)

    @step
    def drive_level_metrics(self):
        """
        Adds metrics about the specific drive (i.e. rushing yards, penalties, sacks, etc.)
        """
        def get_cumulative_data(metric,
                                new_name,
                                fill_blanks=0,
                                granularity='drive'):
            index_name = 'unique_drive'
            if (granularity == 'game'):
                index_name = 'game_id'
            a = self.san_fran_df.groupby(index_name)[metric].cumsum()
            a.name = new_name + '_after'
            self.san_fran_df = pandas.concat([self.san_fran_df, a], axis=1)
            self.san_fran_df[new_name] = self.san_fran_df.groupby(
                [index_name])[a.name].shift(1)
            self.san_fran_df[new_name].fillna(fill_blanks, inplace=True)

        def get_rank_data(metric,
                          new_name,
                          fill_blanks=0,
                          granularity='drive'):
            index_name = 'unique_drive'
            if (granularity == 'game'):
                index_name = 'game_id'
            a = self.san_fran_df.groupby(index_name)[metric].rank(
                ascending=True, method='first')
            a.name = new_name
            self.san_fran_df = pandas.concat([self.san_fran_df, a], axis=1)

        from metric_list import metric_list

        for i in metric_list:
            if (i['type'] == 'cumulative_data'):
                get_cumulative_data(metric=i['metric'],
                                    new_name='drive' + '_' + i['metric'],
                                    fill_blanks=i['fill_blanks'],
                                    granularity='drive')
                get_cumulative_data(metric=i['metric'],
                                    new_name='game' + '_' + i['metric'],
                                    fill_blanks=i['fill_blanks'],
                                    granularity='game')
            else:
                get_rank_data(metric=i['metric'],
                              new_name='game' + '_' + i['new_name'],
                              fill_blanks=i['fill_blanks'],
                              granularity='game')

        self.san_fran_df['previous_play_in_drive'] = self.san_fran_df.groupby(
            ['unique_drive'])['full_play_type'].shift(1)
        self.san_fran_df['previous_play_in_drive'].fillna('first_play',
                                                          inplace=True)

        self.next(self.end)

    @step
    def end(self):
        """
        End the flow.

        """
        pass
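The get_full_play_type helper imported from `modules` is not shown in the excerpt. A hedged guess at its shape, inferred only from the full_play_type values filtered in the start step (not the author's actual implementation):

def get_full_play_type(row):
    # row carries play_type, pass_location, pass_length, run_location (in that order).
    play_type, pass_location, pass_length, run_location = row
    if play_type == 'pass':
        return 'pass_{}_{}'.format(pass_location, pass_length)
    return 'run_{}'.format(run_location)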
Example No. 12
class SentimentAnalysis(FlowSpec):

    edgar_data = IncludeFile("data", default='testlist.csv')

    @conda(libraries={'pandas': '1.0.1'})
    @step
    def start(self):
        import pandas
        from io import StringIO

        # Load the data set into a pandas dataframe.
        self.dataframe = pandas.read_csv(StringIO(self.edgar_data))

        self.links = list(self.dataframe['link'])
        self.next(self.scrapping)

    @conda(
        libraries={
            # re, pathlib, random and urllib.parse are standard-library modules,
            # not conda packages, so they are omitted from the conda spec.
            'pandas': '1.0.1',
            'bs4': '4.8.2',
            'furl': '2.1.0',
            'selenium': '3.141.1'
        })
    @step
    def scrapping(self):
        import re
        from pathlib import Path
        import urllib.request
        from random import random  # used for the jittered sleep in the crawl loop below
        from time import sleep
        import boto3
        from metaflow import S3
        from urllib.parse import urljoin
        import pandas as pd
        from bs4 import BeautifulSoup
        from furl import furl
        from selenium import webdriver

        # The original fragment below referenced `html`, `title` and `quarter_pattern`
        # without defining them; it is wrapped here as the parse_html() helper that the
        # crawl loop below calls. The quarter regex is an assumption (e.g. "Q3 2020").
        quarter_pattern = re.compile(r'Q[1-4]\s+20\d{2}')

        def parse_html(html):
            soup = BeautifulSoup(html, 'lxml')
            meta, participants, content = {}, [], []
            h1 = soup.find('h1', itemprop='headline')
            if h1 is None:
                return None
            title = h1.text
            meta['company'] = title[:title.find('(')].strip()
            meta['symbol'] = title[title.find('(') + 1:title.find(')')]
            match = quarter_pattern.search(title)
            if match:
                meta['quarter'] = match.group(0)
            return meta, participants, content

        SA_URL = 'https://seekingalpha.com/'
        TRANSCRIPT = re.compile('Earnings Call Transcript')

        scraped = []
        next_page = True
        page = 1
        driver = webdriver.Firefox()
        while next_page:
            print(f'Page: {page}')
            url = f'{SA_URL}/earnings/earnings-call-transcripts/{page}'
            driver.get(urljoin(SA_URL, url))
            response = driver.page_source

            soup = BeautifulSoup(response, 'lxml')
            links = soup.find_all(name='a', string=TRANSCRIPT)
            if len(links) == 0:
                next_page = False
            else:
                for link in links:
                    transcript_url = link.attrs.get('href')
                    article_url = furl(urljoin(SA_URL, transcript_url)).add(
                        {'part': 'single'})
                    driver.get(article_url.url)
                    html = driver.page_source
                    result = parse_html(html)
                    if result is not None:
                        meta, participants, content = result
                        meta['link'] = transcript_url
                        # store_result() was undefined in the original; collect the
                        # parsed results in memory instead.
                        scraped.append((meta, participants, content))
                    sleep(5 + (random() - .5) * 2)
        driver.close()
        with open("scrappeddata.txt", "a") as f:
            for meta, participants, content in scraped:
                f.write(str(participants) + '\n')

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('scrappeddata', 'scrappeddata.txt')])

        self.next(self.preprocessing)

    @conda(libraries={
        'pandas': '1.0.1',
        'nltk': '3.4.5',
        'smart_open': '1.9.0'
    })
    @step
    def preprocessing(self):
        import boto3
        from metaflow import S3
        import re
        import pandas as pd
        from nltk import tokenize
        import string
        import nltk
        from nltk.corpus import stopwords
        from smart_open import smart_open

        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('words')

        listed = []
        with smart_open('s3://inputbucket1221/input.txt', 'r') as s3_source:
            Line = s3_source.readline()

            while Line != '':
                Line1 = Line.split(".")
                for Sentence in Line1:
                    listed.append(Sentence)
                Line = s3_source.readline()

        L = []
        for x in listed:
            if len(x) > 5:
                L.append(x)

        df = pd.DataFrame()

        df['Text'] = L
        print(df['Text'])

        def remove_punct(text):
            text = "".join(
                [char for char in text if char not in string.punctuation])
            text = re.sub('[0-9]+', '', text)
            return text

        df['Textclean'] = df['Text'].apply(lambda x: remove_punct(x))
        df = df.dropna()

        f = open("processed.txt", "a")
        f.write(df['Textclean'].to_string())
        f.close()

        self.cleantext = df['Textclean']

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('processed', 'processed.txt')])

        self.next(self.labelling)

    @conda(libraries={
        'pandas': '1.0.1',
        'nltk': '3.4.5',
        'google-cloud-language': '1.3.0'
    })
    @step
    def labelling(self):
        from google.cloud import language_v1
        from google.cloud.language_v1 import enums
        import pandas as pd
        import urllib.request
        import boto3
        from metaflow import S3

        def sample_analyze_sentiment(text_content, scoreDataframe):
            """
            Analyzing Sentiment in a String

            Args:
              text_content The text content to analyze
            """
            #scoreDataframe = pd.DataFrame(columns = ['Sentence','Score'])

            client = language_v1.LanguageServiceClient()
            type_ = enums.Document.Type.PLAIN_TEXT
            language = "en"
            document = {
                "content": text_content,
                "type": type_,
                "language": language
            }

            encoding_type = enums.EncodingType.UTF8

            response = client.analyze_sentiment(document,
                                                encoding_type=encoding_type)

            for sentence in response.sentences:
                sent = sentence.text.content
                senti = sentence.sentiment.score

                scoreDataframe = scoreDataframe.append(
                    {
                        'Sentence': sent,
                        'Score': senti
                    }, ignore_index=True)

            # Return after all sentences have been processed (the original returned
            # inside the loop, so only the first sentence was kept).
            return scoreDataframe

        scoreDataframe = pd.DataFrame(columns=['Sentence', 'Score'])

        for x in self.cleantext:
            print(x)
            scoreDataframe = sample_analyze_sentiment(x, scoreDataframe)

        L_clean = []
        for row in scoreDataframe.itertuples():
            if (row.Score > 0):
                values = [row.Sentence, 1]
                L_clean.append(values)
            elif (row.Score < 0):
                values = [row.Sentence, -1]
                L_clean.append(values)
            else:
                values = [row.Sentence, 0]
                L_clean.append(values)

        df_label_clean = pd.DataFrame(L_clean, columns=['Sentence', 'Score'])
        df_label_clean = df_label_clean[(df_label_clean != 0).all(1)]

        df_label_clean.to_csv('labeldataset.csv', index=False)

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('labeldataset.csv', 'labeldataset.csv')])

        self.next(self.end)

    @step
    def end(self):
        """
        End the flow.
        """
        pass
Example No. 13
class MultiStepMNISTFlow(FlowSpec):
    """
    Train multiple iterations of machine learning models for MNIST handwritten digit prediction.
    Metaflow captures the experiments and helps compare the training efficiency and accuracy of each model.

    """

    mnist_dataset_train_x_raw = IncludeFile(
        "mnist_dataset_train_x_raw",
        help="The path to a mnist training images file.",
        default=script_path('data/mnist/train-images-idx3-ubyte'),
        is_text=False,
        encoding='UTF-8')

    mnist_dataset_train_y_raw = IncludeFile(
        "mnist_dataset_train_y_raw",
        help="The path to a  mnist training labels file.",
        default=script_path('data/mnist/train-labels-idx1-ubyte'),
        is_text=False,
        encoding='UTF-8')

    mnist_dataset_test_x_raw = IncludeFile(
        "mnist_dataset_test_x_raw",
        help="The path to a mnist test images file.",
        default=script_path('data/mnist/t10k-images-idx3-ubyte'),
        is_text=False,
        encoding='UTF-8')

    mnist_dataset_test_y_raw = IncludeFile(
        "mnist_dataset_test_y_raw",
        help="The path to a mnist test labels file.",
        default=script_path('data/mnist/t10k-labels-idx1-ubyte'),
        is_text=False,
        encoding='UTF-8')

    num_training_examples = Parameter('num_training_examples',
                                      help='Number of Training Examples',
                                      default=5000)

    number_of_epochs = Parameter(
        'number_of_epochs',
        help='Number of Epochs to Run for the Training Process',
        default=10)

    # batch_size = Parameter('batch_size',help='Batch Sizes for the Training Process',default=128)

    @step
    def start(self):
        """
        Parse the MNIST dataset into flattened and non-flattened data artifacts.
        Also set the hyperparameters to search over in the following steps.
        """
        import numpy as np
        # Collect and create the unflattened dataset according to the number of examples.
        self.train_unflattened, self.val_unflattened, self.test_unflattened = read_mnist(
            np,
            self.mnist_dataset_train_x_raw,
            self.mnist_dataset_train_y_raw,
            self.mnist_dataset_test_x_raw,
            self.mnist_dataset_test_y_raw,
            flatten=False,
            num_train=self.num_training_examples)

        # Collect and create the flattened dataset according to the number of examples.
        self.train_flattened, self.val_flattened, self.test_flattened = read_mnist(
            np,
            self.mnist_dataset_train_x_raw,
            self.mnist_dataset_train_y_raw,
            self.mnist_dataset_test_x_raw,
            self.mnist_dataset_test_y_raw,
            flatten=True,
            num_train=self.num_training_examples)

        self.hyper_params = list(
            map(lambda x: {'batch_size': x},
                [128, 32, 64]))  # e.g. {'batch_size': 128}, {'batch_size': 32}, ...

        self.history = {}
        # Train the three model variants in parallel branches.
        self.next(self.train_sequential_placeholder,
                  self.train_convolution_placeholder,
                  self.train_convolution_batch_norm_placeholder)

    @step
    def train_sequential_placeholder(self):
        """
        This is a placeholder step for the Sequential NN branch that runs a foreach over self.hyper_params.
        """
        self.next(self.train_sequential, foreach='hyper_params')

    @step
    def train_convolution_placeholder(self):
        """
        This is a placeholder step for the Convolution NN branch that runs a foreach over self.hyper_params.
        """
        self.next(self.train_convolution, foreach='hyper_params')

    @step
    def train_convolution_batch_norm_placeholder(self):
        """
        This is a placeholder step for the Convolution Batch Normalisation NN branch that runs a foreach over self.hyper_params.
        """
        self.next(self.train_convolution_batch_norm, foreach='hyper_params')

    @step
    def train_sequential(self):
        """
        Train a sequential neural network with the input hyperparameters.
        
        """
        from tensorflow.python.keras.layers import Conv2D, Input, MaxPool2D, Dense, Flatten, MaxPooling2D
        from tensorflow.python.keras.models import Sequential
        train, val, test = self.train_flattened, self.val_flattened, self.test_flattened
        train_X, train_Y = train
        test_X, test_Y = test
        model = Sequential()
        model.add(Dense(128, activation='relu', input_shape=[
            784
        ]))  # fully-connected layer with 128 units and ReLU activation
        model.add(Dense(128, activation='relu'))
        model.add(Dense(10, activation='softmax')
                  )  # output layer with 10 units and a softmax activation

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', 'accuracy'])
        history = model.fit(train_X,
                            train_Y,
                            validation_split=0.2,
                            epochs=self.number_of_epochs,
                            batch_size=self.input['batch_size'])
        self.history = dict(history.history)
        self.param = self.input
        self.next(self.train_sequential_join)

    @step
    def train_convolution(self):
        """
        Train a Convolutional Neural Network with the input Hyper params. 
        """
        from tensorflow.python.keras.layers import Conv2D, Input, MaxPool2D, Dense, Flatten, MaxPooling2D
        from tensorflow.python.keras.models import Sequential
        train, val, test = self.train_unflattened, self.val_unflattened, self.test_unflattened
        train_X, train_Y = train
        test_X, test_Y = test
        train_X = train_X.reshape(self.num_training_examples, 28, 28, 1)
        test_X = test_X.reshape(test_X.shape[0], 28, 28, 1)
        model = Sequential()
        model.add(
            Conv2D(32,
                   kernel_size=(1, 1),
                   activation='relu',
                   input_shape=(28, 28, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(10, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', 'accuracy'])
        history = model.fit(train_X,
                            train_Y,
                            validation_split=0.2,
                            epochs=self.number_of_epochs,
                            batch_size=self.input['batch_size'])
        self.history = dict(history.history)
        self.param = self.input
        self.next(self.train_convolution_join)

    @step
    def train_convolution_batch_norm(self):
        """
        Train a Convolutional Neural Network with Batch Norm and Dropout with the input Hyper params. 
        """
        from tensorflow.python.keras.layers import Conv2D, Input, MaxPool2D, Dense, Flatten, MaxPooling2D, BatchNormalization, Activation, Dropout
        from tensorflow.python.keras.models import Sequential
        train, val, test = self.train_unflattened, self.val_unflattened, self.test_unflattened
        train_X, train_Y = train
        test_X, test_Y = test
        train_X = train_X.reshape(self.num_training_examples, 28, 28, 1)
        test_X = test_X.reshape(test_X.shape[0], 28, 28, 1)
        model = Sequential()

        model.add(
            Conv2D(32,
                   kernel_size=(1, 1),
                   use_bias=False,
                   input_shape=(28, 28, 1)))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(Conv2D(64, kernel_size=(3, 3), use_bias=False))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Conv2D(32, kernel_size=(1, 1), use_bias=False))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(Conv2D(64, kernel_size=(3, 3), use_bias=False))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Dropout(0.2))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.4))
        model.add(Dense(10, activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', 'accuracy'])
        history = model.fit(train_X,
                            train_Y,
                            validation_split=0.2,
                            epochs=self.number_of_epochs,
                            batch_size=self.input['batch_size'])
        self.history = dict(history.history)
        self.param = self.input
        self.next(self.train_convolution_batch_norm_join)

    @step
    def train_convolution_batch_norm_join(self, inputs):
        """
        Results from the parallel training sessions over different hyperparameters for the Conv + BatchNorm NN are collated in this step.
        """
        self.history = [{
            'param': input_val.param,
            'history': input_val.history
        } for input_val in inputs]
        self.next(self.join)

    @step
    def train_convolution_join(self, inputs):
        """
        Results from the parallel training sessions over different hyperparameters for the Conv NN are collated in this step.
        """
        self.history = [{
            'param': input_val.param,
            'history': input_val.history
        } for input_val in inputs]
        self.next(self.join)

    @step
    def train_sequential_join(self, inputs):
        """
        Results from the parallel training sessions over different hyperparameters for the Sequential NN are collated in this step.
        """
        self.history = [{
            'param': input_val.param,
            'history': input_val.history
        } for input_val in inputs]
        self.next(self.join)

    @step
    def join(self, inputs):
        """
        All the training results are collated into a single object in this step of the flow.
        """
        self.history = {
            'convolution':
            inputs.train_convolution_join.history,
            'sequential':
            inputs.train_sequential_join.history,
            'convolution_batch_norm':
            inputs.train_convolution_batch_norm_join.history
        }

        self.next(self.end)

    @step
    def end(self):
        """
        This is the end step of the Computation 
        """
        print("Done Computation")
Example No. 14
class HousePriceFlow2(FlowSpec):
    """House Price Model"""
    train_data = IncludeFile('train_data',
                             help='Training File',
                             default=script_path('data/train.csv'))

    def __init__(self, *args, **kwargs):
        """Constructor"""
        super().__init__(*args, **kwargs)

    @step
    def start(self):
        """This workflow trains a model for visa transaction
        categorization.
        """
        self.trainDF = pd.read_csv(io.StringIO(self.train_data))
        self.next(self.train)

    @pip(libraries={
        'xgboost': '1.3.3',
        'sklearn-pandas': '2.0.4',
        'baikal': '0.4.2',
    })
    @step
    def train(self):
        import xgboost
        from baikal import make_step, Step, Input, Model
        from baikal.steps import Stack
        from sklearn_pandas import gen_features
        import custom_transformations as ct
        from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep

        # these are the categorical columns in the dataset
        CATEGORICAL_COLUMNS = [
            'KitchenQual',
            'MSSubClass',
            'MSZoning',
            'Street',
            'Alley',
            'LotShape',
            'LandContour',
            'Utilities',
            'LotConfig',
            'LandSlope',
            'Neighborhood',
            'Condition1',
            'Condition2',
            'BldgType',
            'HouseStyle',
            'RoofStyle',
            'RoofMatl',
            'Exterior1st',
            'Exterior2nd',
            'MasVnrType',
            'ExterQual',
            'ExterCond',
            'Foundation',
            'BsmtQual',
            'BsmtCond',
            'BsmtExposure',
            'BsmtFinType1',
            'BsmtFinType2',
            'Heating',
            'HeatingQC',
            'CentralAir',
            'Functional',
            'FireplaceQu',
            'GarageType',
            'GarageFinish',
            'GarageQual',
            'GarageCond',
            'PavedDrive',
            'PoolQC',
            'Fence',
            'MiscFeature',
            'SaleType',
            'SaleCondition',
            'OverallQual',
            'OverallCond',
        ]

        # these columns will be treated as numerical columns
        NUMERICAL_COLUMNS = [
            'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd',
            'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
            'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
            'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
            'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
            'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
            'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch',
            'PoolArea', 'MiscVal', 'MoSold', 'YrSold'
        ]

        # These columns have missing values and are the ones for which we will add a missing-indicator variable
        MISSING_INDICATOR = [
            'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
            'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
            'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
            'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
            'MiscFeature'
        ]

        ## Categorical Columns for which we want One Hot Encoding
        ONEHOT_COLUMNS = [
            'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
            'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
            'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType',
            'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
            'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
            'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
            'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish',
            'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
            'MiscFeature', 'SaleType', 'SaleCondition'
        ]

        ## Categorical Columns for which we want to have target encoding
        TARGET_COLUMNS = [
            'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
        ]

        ## Columns that require a log transformation
        LOG_COLUMNS = [
            'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
            'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
            'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
            'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
        ]

        # Define Steps
        ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
        ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
        XGBRegressorStep = make_step(xgboost.XGBRegressor,
                                     class_name='XGBRegressor')
        LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                         class_name='LinearRegression')

        # Define sklearn-pandas transformations. Here I am using gen_features utility to
        # define transformations for individual columns.
        baseProcessing = (
            gen_features(columns=[[x] for x in MISSING_INDICATOR],
                         classes=[{
                             'class': MissingIndicator,
                             'features': 'all',
                             'sparse': False,
                             'error_on_new': False
                         }],
                         prefix='na_') +
            gen_features(
                columns=LOG_COLUMNS,
                classes=[{
                    'class': FunctionTransformer,
                    'func': lambda x: x.astype(float).reshape((-1, 1))
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }, {
                    'class': FunctionTransformer,
                    'func': np.log1p
                }]) +
            gen_features(
                columns=list(set(NUMERICAL_COLUMNS) - set(LOG_COLUMNS)),
                classes=[{
                    'class': FunctionTransformer,
                    'func': lambda x: x.astype(float).reshape((-1, 1))
                }, {
                    'class': SimpleImputer,
                    'strategy': 'mean'
                }],
            ) + [
                # constructing new features -- age of the house
                (['YrSold', 'YearBuilt'], [
                    FunctionTransformer(
                        func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'age'
                }),

                # constructing new feature -- remodeling age
                (['YrSold', 'YearRemodAdd'], [
                    FunctionTransformer(
                        func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'remodel_age'
                }),

                # new feature -- total surface area
                (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                    FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                    FunctionTransformer(np.log1p)
                ], {
                    'alias': 'numerical_TotalArea'
                })
            ])

        # Since CatBoost model can handle categorical data, we don't need to encode categorical variables
        # we will simply impute missing values and let CatBoost model handle categorical data.
        catModelPreprocessing = gen_features(
            columns=CATEGORICAL_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(object).reshape(-1, 1)
            }, {
                'class': SimpleImputer,
                'strategy': 'most_frequent'
            }],
        )

        # For regression and XGBoost, we need to encode categorical variables ourselves.
        # Depending on the cardinality of the variable, we use either one-hot encoding or target encoding.
        regressionModelProcessing = (
            gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                         classes=[{
                             'class': OneHotEncoder,
                             'handle_unknown': 'ignore',
                             'sparse': False
                         }]) + gen_features(columns=[[x]
                                                     for x in TARGET_COLUMNS],
                                            classes=[
                                                {
                                                    'class': TargetEncoder
                                                },
                                                {
                                                    'class': SimpleImputer,
                                                    'strategy': 'mean'
                                                },
                                            ]))

        # Define DAG
        x = Input(name="x")
        y = Input(name='y')

        # Define feature transformations
        d0 = DataFrameMapperStep(baseProcessing,
                                 df_out=True,
                                 name='BasePreprocess')(x, y)
        d1 = DataFrameMapperStep(regressionModelProcessing,
                                 df_out=True,
                                 name='RegressionModelPreprocess')(x, y)
        d2 = DataFrameMapperStep(catModelPreprocessing,
                                 df_out=True,
                                 name='CatModelPreprocess')(x, y)

        # Consolidate features for catboost and elasticnet
        regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
        catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])

        # Generate predictions using three different algorithms.
        m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
        m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
        m3 = CatBoostRegressorStep(name='CatBoost',
                                   cat_features=CATEGORICAL_COLUMNS,
                                   iterations=10)(catFeatures, y)

        # stack predictions from the ElasticNet and CatBoost models (XGBoost is trained above but not included in the ensemble)
        combinedPredictions = Stack(name='CombinePredictions')([m1, m3])

        # construct an ensemble model
        ensembleModel = LinearRegressionStep()(combinedPredictions, y)
        model = Model(x, ensembleModel, y)
        model.fit(self.trainDF, self.trainDF['SalePrice'])
        self.artifact = {
            'model.pkl': cloudpickle.dumps(model),
            'environment': {
                'pip': {}
            }
        }
        self.next(self.end)

    @step
    def end(self):
        print("done")
Exemplo n.º 15
0
class TitanicPredict(FlowSpec):
    """
    Prediction of the test data set.
    The model is taken from the latest successful run
    or the designated run_id.
    """

    run_id = Parameter('id',
                       help="Run execution ID",
                       default='latest_successful')
    test_data = IncludeFile("test_data",
                            help="The path to Titanic test data file.",
                            default=script_path('test.csv'))

    @step
    def start(self):
        """
        Load test data set
        """
        from io import StringIO

        # Load the data set into a pandas dataframe.
        self.X = pd.read_csv(StringIO(self.test_data))

        print('run id: ', self.run_id)
        if self.run_id == 'latest_successful':
            self.train_run = Flow('TitanicModeling').latest_successful_run
        else:
            self.train_run = Run(f'TitanicModeling/{self.run_id}')

        # Run the categorical and numerical preprocessing branches in parallel.
        self.next(self.categorical_prep, self.numerical_prep)

    @step
    def categorical_prep(self):
        """
        Preprocessing categorical features
        - Impute missing values
        - One-Hot encoding
        """
        categorical_columns = self.train_run.data.categorical_columns

        cat_imputer = self.train_run.data.cat_imputer
        ohe = self.train_run.data.ohe

        X_imp = cat_imputer.transform(self.X[categorical_columns])
        X_ohe = ohe.transform(X_imp)
        col_name = ohe.get_feature_names(input_features=categorical_columns)

        self.X_cat = pd.DataFrame(X_ohe, columns=col_name)

        self.next(self.join)

    @step
    def numerical_prep(self):
        """
        Preprocessing numerical features
        - Impute missing values with mean
        """
        numerical_columns = self.train_run.data.numerical_columns

        num_imputer = self.train_run.data.num_imputer
        X_imp = num_imputer.transform(self.X[numerical_columns])

        self.X_num = pd.DataFrame(X_imp, columns=numerical_columns)

        self.next(self.join)

    @step
    def join(self, inputs):
        '''
        Concatenate the categorical and numerical columns.
        '''

        X_cat = inputs.categorical_prep.X_cat
        X_num = inputs.numerical_prep.X_num
        self.train_run = inputs.categorical_prep.train_run
        self.merge_artifacts(inputs)

        self.X_prep = pd.concat([X_cat, X_num], axis=1)

        self.next(self.prediction)

    @step
    def prediction(self):
        """
        Predict survived / not survived
        from the test data set.
        """

        rf = self.train_run.data.rf
        self.y_pred = rf.predict(self.X_prep)

        self.next(self.end)

    @step
    def end(self):
        """
        Save the result.
        """
        df_result = self.X.copy()
        df_result['y_pred'] = self.y_pred

        df_result.to_csv('result.csv', index=None)
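A short sketch, assuming the flow above has run at least once, of how the stored predictions could be inspected afterwards with the Client API; the artifact names come from the flow, the rest is illustrative:

# Sketch: pull the predictions from the latest TitanicPredict run.
from metaflow import Flow

run = Flow('TitanicPredict').latest_successful_run
print(run.data.run_id)       # which TitanicModeling run supplied the model
print(run.data.y_pred[:10])  # first ten survival predictions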
Exemplo n.º 16
0
class TitanicModeling(FlowSpec):
    """
    Construct a ML model from the Titanic data set.
    - preprocess
    - modeling
    """
    train_data = IncludeFile("train_data",
                             help="The path to Titanic train data file.",
                             default=script_path('train.csv'))

    @step
    def start(self):
        """
        Load train data set
        """
        from io import StringIO

        # Load the data set into a pandas dataframe.
        df_train = pd.read_csv(StringIO(self.train_data))
        self.X = df_train.drop('Survived', axis=1)
        self.y = df_train['Survived']

        # Run the categorical and numerical preprocessing branches in parallel.
        self.next(self.categorical_prep, self.numerical_prep)

    @step
    def categorical_prep(self):
        """
        Preprocessing categorical features
        - Impute missing values
        - One-Hot encoding
        """
        self.categorical_columns = ['Pclass', 'Sex', 'Embarked']

        from sklearn.impute import SimpleImputer
        from sklearn.preprocessing import OneHotEncoder

        self.cat_imputer = SimpleImputer(strategy='constant',
                                         fill_value='missing')
        self.ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

        X_imp = self.cat_imputer.fit_transform(
            self.X[self.categorical_columns])
        X_ohe = self.ohe.fit_transform(X_imp)
        col_name = self.ohe.get_feature_names(
            input_features=self.categorical_columns)

        self.X_cat = pd.DataFrame(X_ohe, columns=col_name)

        self.next(self.join)

    @step
    def numerical_prep(self):
        """
        Preprocessing numerical features
        - Impute missing values with mean
        """
        self.numerical_columns = ['Age', 'SibSp', 'Parch', 'Fare']

        from sklearn.impute import SimpleImputer

        self.num_imputer = SimpleImputer(strategy='mean')
        X_imp = self.num_imputer.fit_transform(self.X[self.numerical_columns])

        self.X_num = pd.DataFrame(X_imp, columns=self.numerical_columns)

        self.next(self.join)

    @step
    def join(self, inputs):
        '''
        Concatenate the categorical and numerical columns.
        '''
        X_cat = inputs.categorical_prep.X_cat
        X_num = inputs.numerical_prep.X_num
        self.merge_artifacts(inputs)

        self.X_prep = pd.concat([X_cat, X_num], axis=1)

        self.next(self.model_construction)

    @step
    def model_construction(self):
        """
        Construct a Random Forest model.
        """
        from sklearn.ensemble import RandomForestClassifier

        self.rf = RandomForestClassifier(random_state=17)
        self.rf.fit(self.X_prep, self.y)

        self.next(self.end)

    @step
    def end(self):
        """
        Print out the training score.
        """
        print("RF train accuracy: %0.3f" % self.rf.score(self.X_prep, self.y))
Exemplo n.º 17
0
class MushroomFlow(FlowSpec):
    """
    A flow to read in the mushroom dataset -- see src.data.download_raw.py
    https://archive.ics.uci.edu/ml/datasets/mushrooms
    """

    raw_data_path = PROJECT_DIR / "data" / "raw" / "mushrooms.csv"
    raw_mushroom_data = IncludeFile(
        "mushroom_data",
        help=
        "Mushroom records drawn from The Audubon Society Field Guide to North American Mushrooms (1981).",
        default=raw_data_path,
    )

    @step
    def start(self):
        """
        Read data/raw/mushrooms.csv into a pandas df
        """
        import pandas as pd
        from io import StringIO

        self.dataframe = pd.read_csv(StringIO(self.raw_mushroom_data))
        self.next(self.rename_cols)

    @step
    def rename_cols(self):
        """
        Give the mushroom data some computer friendlier column names
        Save to "data/interim/mushroom_nice_cols.csv"
        """
        self.dataframe.columns = [
            col.lower().replace(" ", "_") for col in self.dataframe.columns
        ]

        save_path = PROJECT_DIR / "data" / "interim" / "mushroom_nice_cols.csv"
        self.dataframe.to_csv(save_path, index=False)

        self.next(self.one_hot_encode)

    @step
    def one_hot_encode(self):
        """
        Convert the categorical columns to one hot encoding.
        We'll do it by hand this time.
        """
        # the first column is the class label; one-hot encode the rest
        for col in self.dataframe.columns[1:]:
            possible_vals = self.dataframe[col].unique()
            for v in possible_vals:
                self.dataframe[f"{col}__{v}"] = self.dataframe[col].apply(
                    lambda x: 1 if x == v else 0)
            self.dataframe = self.dataframe.drop(col, axis="columns")

        save_path = PROJECT_DIR / "data" / "interim" / "mushroom_one_hot.csv"
        self.dataframe.to_csv(save_path, index=False)
        self.next(self.end)

    @step
    def end(self):
        """
        Save the final version to "data/processed/mushroom_final.csv"
        End the flow.
        """
        save_path = PROJECT_DIR / "data" / "processed" / "mushroom_final.csv"
        self.dataframe.to_csv(save_path, index=False)
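The hand-rolled one-hot encoding above can also be expressed with pandas built-ins; a minimal sketch of an equivalent transformation (same "col__value" column naming, class column left untouched):

# Sketch: one-hot encode every column except the class label with pandas.
import pandas as pd

def one_hot(df: pd.DataFrame) -> pd.DataFrame:
    # get_dummies uses the column name as the prefix, so the resulting
    # columns follow the same "col__value" pattern as the manual version
    # (pass dummy_na=True if missing values should get their own column)
    return pd.get_dummies(df, columns=list(df.columns[1:]), prefix_sep="__", dtype=int)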
Exemplo n.º 18
0
class MovieStatsFlow(FlowSpec):
    """
    A flow to generate some statistics about the movie genres.

    The flow performs the following steps:
    1) Ingests a CSV into a Pandas Dataframe.
    2) Fan-out over genre using Metaflow foreach.
    3) Compute quartiles for each genre.
    4) Save a dictionary of genre specific statistics.

    """
    movie_data = IncludeFile("movie_data",
                             help="The path to a movie metadata file.",
                             default=script_path('movies.csv'))

    @step
    def start(self):
        """
        The start step:
        1) Loads the movie metadata into pandas dataframe.
        2) Finds all the unique genres.
        3) Launches parallel statistics computation for each genre.

        """
        import pandas
        from io import StringIO

        # Load the data set into a pandas dataframe.
        self.dataframe = pandas.read_csv(StringIO(self.movie_data))

        # The column 'genres' has a list of genres for each movie. Let's get
        # all the unique genres.
        self.genres = {genre for genres \
                       in self.dataframe['genres'] \
                       for genre in genres.split('|')}
        self.genres = list(self.genres)

        # We want to compute some statistics for each genre. The 'foreach'
        # keyword argument allows us to compute the statistics for each genre in
        # parallel (i.e. a fan-out).
        self.next(self.compute_statistics, foreach='genres')

    @step
    def compute_statistics(self):
        """
        Compute statistics for a single genre.

        """
        # The genre currently being processed is a class property called
        # 'input'.
        self.genre = self.input
        print("Computing statistics for %s" % self.genre)

        # Find all the movies that have this genre and build a dataframe with
        # just those movies and just the columns of interest.
        selector = self.dataframe['genres'].\
                   apply(lambda row: self.genre in row)
        self.dataframe = self.dataframe[selector]
        self.dataframe = self.dataframe[['movie_title', 'genres', 'gross']]

        # Get some statistics on the gross box office for these titles.
        points = [.25, .5, .75]
        self.quartiles = self.dataframe['gross'].quantile(points).values

        # Join the results from other genres.
        self.next(self.join)

    @step
    def join(self, inputs):
        """
        Join our parallel branches and merge results into a dictionary.

        """
        # Merge results from the genre specific computations.
        self.genre_stats = {inp.genre.lower(): \
                            {'quartiles': inp.quartiles,
                             'dataframe': inp.dataframe} \
                            for inp in inputs}

        self.next(self.end)

    @step
    def end(self):
        """
        End the flow.

        """
        pass
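A short sketch, assuming a completed run, of how the per-genre statistics computed above can be read back with the Client API:

# Sketch: print the gross box-office quartiles stored for each genre.
from metaflow import Flow

run = Flow('MovieStatsFlow').latest_successful_run
for genre, stats in run.data.genre_stats.items():
    print(genre, stats['quartiles'])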
Exemplo n.º 19
0
class ProphetFlow(FlowSpec):
    """
    ProphetFlow uses Facebook Prophet to predict future values of a
    time series.
    """
    data_file = IncludeFile('datafile',
                            is_text=True,
                            help='Time series data file - csv file format',
                            default='data/daily-min-temperatures.txt')
    columns_mapping = Parameter(
        'columns',
        default={
            'Date': 'ds',
            'Temp': 'y'
        },
        help="Rename columns according to Prophet standards")

    @step
    def start(self):
        """
        Raw data is loaded and prepared
        """
        # Load csv in pandas dataframe
        self.df = pd.read_csv(StringIO(self.data_file))

        # Rename columns to meet Prophet input dataframe standards
        self.df.rename(columns=self.columns_mapping, inplace=True)

        # Convert Date column to datetime64 dtype
        self.df['ds'] = pd.to_datetime(self.df['ds'],
                                       infer_datetime_format=True)

        self.next(self.hyper_tuning)

    @step
    def hyper_tuning(self):
        """
        Hyperparameters tuning
        """
        # Tune hyperparameters of the model
        param_grid = {
            'changepoint_prior_scale': [0.001, 0.01, 0.1, 0.5],
            'seasonality_prior_scale': [0.01, 0.1, 1.0, 10.0],
        }

        # Generate all combinations of parameters
        self.all_params = [
            dict(zip(param_grid.keys(), v))
            for v in itertools.product(*param_grid.values())
        ]

        # Use cross validation to evaluate all parameters
        self.next(self.cross_validation, foreach='all_params')

    @batch(image='vnardone/prophet-metaflow')
    @step
    def cross_validation(self):
        """
        Perform cross-validation on given hyperparameters
        """
        # Fit model with given params
        m = Prophet(**self.input).fit(self.df)
        # Perform cross-validation
        df_cv = cross_validation(m,
                                 initial='730 days',
                                 period='180 days',
                                 horizon='365 days',
                                 parallel="processes")
        df_p = performance_metrics(df_cv, rolling_window=1)

        # Store the RMSE
        self.rmses = df_p['rmse'].values[0]

        self.next(self.train)

    @step
    def train(self, inputs):
        """
        Check cross-validation results and find best parameters.
        A new Prophet model is fitted.
        """
        # Merge artifacts
        self.merge_artifacts(inputs, exclude=['rmses'])

        # Get RMSEs from previous steps
        rmses = [input.rmses for input in inputs]

        # Find the best parameters
        self.hyperparameters = self.all_params[np.argmin(rmses)]

        # Fit a new model using best params
        self.m = Prophet(**self.hyperparameters)
        self.m.fit(self.df)

        self.next(self.end)

    @step
    def end(self):
        """
        Last step, process is finished
        """
        print("ProphetFlow is all done.")
Exemplo n.º 20
0
class GPUMINSTTensorFlow(FlowSpec):
    """
    Train several machine learning models for MNIST handwritten digit prediction.
    Metaflow captures each experiment so that training efficiency and accuracy can be compared across the models.

    """

    mnist_dataset_train_x_raw = IncludeFile(
        "mnist_dataset_train_x_raw",
        help="The path to a mnist training images file.",
        default=script_path('data/mnist/train-images-idx3-ubyte'),
        is_text=False,
        encoding='UTF-8')

    mnist_dataset_train_y_raw = IncludeFile(
        "mnist_dataset_train_y_raw",
        help="The path to a  mnist training labels file.",
        default=script_path('data/mnist/train-labels-idx1-ubyte'),
        is_text=False,
        encoding='UTF-8')

    mnist_dataset_test_x_raw = IncludeFile(
        "mnist_dataset_test_x_raw",
        help="The path to a mnist test images file.",
        default=script_path('data/mnist/t10k-images-idx3-ubyte'),
        is_text=False,
        encoding='UTF-8')

    mnist_dataset_test_y_raw = IncludeFile(
        "mnist_dataset_test_y_raw",
        help="The path to a mnist test labels file.",
        default=script_path('data/mnist/t10k-labels-idx1-ubyte'),
        is_text=False,
        encoding='UTF-8')

    num_training_examples = Parameter('num_training_examples',
                                      help='Number of Training Examples',
                                      default=5000)

    number_of_epochs = Parameter(
        'number_of_epochs',
        help='Number of Epochs to Run for the Training Process',
        default=10)

    batch_size = Parameter('batch_size',
                           help='Batch Sizes for the Training Process',
                           default=128)

    @step
    def start(self):
        """
        Parse the MNIST dataset into flattened and non-flattened data artifacts.

        """
        import numpy as np
        # Collect and create the unflattened dataset according to the number of examples.
        self.train_unflattened, self.val_unflattened, self.test_unflattened = read_mnist(
            np,
            self.mnist_dataset_train_x_raw,
            self.mnist_dataset_train_y_raw,
            self.mnist_dataset_test_x_raw,
            self.mnist_dataset_test_y_raw,
            flatten=False,
            num_train=self.num_training_examples)

        # Collect and create the flattened dataset according to the number of examples.
        self.train_flattened, self.val_flattened, self.test_flattened = read_mnist(
            np,
            self.mnist_dataset_train_x_raw,
            self.mnist_dataset_train_y_raw,
            self.mnist_dataset_test_x_raw,
            self.mnist_dataset_test_y_raw,
            flatten=True,
            num_train=self.num_training_examples)

        # Train the three model variants in parallel.
        self.next(self.train_sequential, self.train_convolution,
                  self.train_convolution_batch_norm)

    @environment(vars={"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"})
    @kube(cpu=3,
          memory=8000,
          image='tensorflow/tensorflow:1.9.0-gpu-py3',
          gpu=1)
    @step
    def train_sequential(self):
        """
        Train a sequential neural network with the given set of parameters.
        
        """
        from tensorflow.python.keras.layers import Conv2D, Input, MaxPool2D, Dense, Flatten, MaxPooling2D
        from tensorflow.python.keras.models import Sequential
        print("NUMBER OF GPUS BEING USED ::: ")
        from tensorflow.python.client import device_lib
        print(device_lib.list_local_devices())
        train, val, test = self.train_flattened, self.val_flattened, self.test_flattened
        train_X, train_Y = train
        test_X, test_Y = test
        model = Sequential()
        model.add(Dense(128, activation='relu', input_shape=[
            784
        ]))  # fully-connected layer with 128 units and ReLU activation
        model.add(Dense(128, activation='relu'))
        model.add(Dense(10, activation='softmax')
                  )  # output layer with 10 units and a softmax activation

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', 'accuracy'])
        history = model.fit(train_X,
                            train_Y,
                            validation_split=0.2,
                            epochs=self.number_of_epochs,
                            batch_size=self.batch_size)
        self.history = dict(history.history)
        self.next(self.join)

    @environment(vars={"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"})
    @kube(cpu=3,
          memory=8000,
          image='tensorflow/tensorflow:1.9.0-gpu-py3',
          gpu=1)
    @step
    def train_convolution(self):
        """
        Train a convolutional neural network with the given set of parameters.
        """
        from tensorflow.python.keras.layers import Conv2D, Input, MaxPool2D, Dense, Flatten, MaxPooling2D
        from tensorflow.python.keras.models import Sequential
        print("NUMBER OF GPUS BEING USED ::: ")
        from tensorflow.python.client import device_lib
        print(device_lib.list_local_devices())
        train, val, test = self.train_unflattened, self.val_unflattened, self.test_unflattened
        train_X, train_Y = train
        test_X, test_Y = test
        train_X = train_X.reshape(self.num_training_examples, 28, 28, 1)
        test_X = test_X.reshape(test_X.shape[0], 28, 28, 1)
        model = Sequential()
        model.add(
            Conv2D(32,
                   kernel_size=(1, 1),
                   activation='relu',
                   input_shape=(28, 28, 1)))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(64, kernel_size=(3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(128, kernel_size=(3, 3), activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dense(10, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', 'accuracy'])
        history = model.fit(train_X,
                            train_Y,
                            validation_split=0.2,
                            epochs=self.number_of_epochs,
                            batch_size=self.batch_size)
        self.history = dict(history.history)
        self.next(self.join)

    @environment(vars={"LC_ALL": "C.UTF-8", "LANG": "C.UTF-8"})
    @kube(cpu=3,
          memory=8000,
          image='tensorflow/tensorflow:1.9.0-gpu-py3',
          gpu=1)
    @step
    def train_convolution_batch_norm(self):
        """
        Train a convolutional neural network with batch normalization and dropout, using the given set of parameters.
        """
        from tensorflow.python.keras.layers import Conv2D, Input, MaxPool2D, Dense, Flatten, MaxPooling2D, BatchNormalization, Activation, Dropout
        from tensorflow.python.keras.models import Sequential
        from tensorflow.python.client import device_lib
        print("NUMBER OF GPUS BEING USED ::: ")
        print(device_lib.list_local_devices())

        train, val, test = self.train_unflattened, self.val_unflattened, self.test_unflattened
        train_X, train_Y = train
        test_X, test_Y = test
        train_X = train_X.reshape(self.num_training_examples, 28, 28, 1)
        test_X = test_X.reshape(test_X.shape[0], 28, 28, 1)
        model = Sequential()
        model.add(
            Conv2D(32,
                   kernel_size=(1, 1),
                   use_bias=False,
                   input_shape=(28, 28, 1)))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(Conv2D(64, kernel_size=(3, 3), use_bias=False))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Conv2D(32, kernel_size=(1, 1), use_bias=False))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(Conv2D(64, kernel_size=(3, 3), use_bias=False))
        model.add(BatchNormalization(axis=3))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))

        model.add(Dropout(0.2))
        model.add(Flatten())
        model.add(Dense(128, activation='relu'))
        model.add(Dropout(0.4))
        model.add(Dense(10, activation='softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['categorical_accuracy', 'accuracy'])
        history = model.fit(train_X,
                            train_Y,
                            validation_split=0.2,
                            epochs=self.number_of_epochs,
                            batch_size=self.batch_size)
        self.history = dict(history.history)
        self.next(self.join)

    @step
    def join(self, inputs):
        """
        Join our parallel branches and merge the per-model training histories.

        """
        self.history = {
            'convolution': inputs.train_convolution.history,
            'sequential': inputs.train_sequential.history,
            'convolution_batch_norm':
            inputs.train_convolution_batch_norm.history
        }

        self.next(self.end)

    @step
    def end(self):
        """
        This is the end step of the Computation 

        """
        print("Done Computation")
Exemplo n.º 21
0
class RegressionModel(FlowSpec):
    """
    RegressionModel is a DAG that produces a regression model over product prices. Given as input a set of features
    and a list of prices per product, the output is a Keras model able to predict the price of unseen items.
    """
    DATA_FILE = IncludeFile(
        'dataset',
        help='Text File With Regression Numbers',
        is_text=True,
        default='dataset.txt')

    LEARNING_RATES = Parameter(
        name='learning_rates',
        help='Learning rates to test, comma separated',
        default='0.1,0.2'
    )

    @step
    def start(self):
        """
        Read data in, and parallelize model building with two params.
        """
        raw_data = StringIO(self.DATA_FILE).readlines()
        self.dataset = [[float(_) for _ in d.strip().split('\t')] for d in raw_data]
        split_index = int(len(self.dataset) * 0.8)
        self.train_dataset = self.dataset[:split_index]
        self.test_dataset = self.dataset[split_index:]
        print("Training data: {}, test data: {}".format(len(self.train_dataset), len(self.test_dataset)))
        self.learning_rates = [float(_) for _ in self.LEARNING_RATES.split(',')]
        self.next(self.train_model, foreach='learning_rates')

    @step
    def train_model(self):
        """
        Train a regression model and use s3 client from metaflow to store the model tar file.
        """
        # this is the current learning rate in the fan-out
        self.learning_rate = self.input
        import numpy as np
        import tensorflow as tf
        from tensorflow.keras import layers
        import tarfile
        import wandb
        from wandb.keras import WandbCallback
        # this name comes in handy later, as a naming convention for building the card
        wandb_run_name = '{}:{}-{}'.format(current.flow_name, current.run_id, self.learning_rate)
        wandb.init(project=current.flow_name, name=wandb_run_name)
        # build the model
        x_train = np.array([[_[0]] for _ in self.train_dataset])
        y_train = np.array([_[1] for _ in self.train_dataset])
        x_test = np.array([[_[0]] for _ in self.test_dataset])
        y_test = np.array([_[1] for _ in self.test_dataset])
        x_model = tf.keras.Sequential([layers.Dense(input_shape=[1,], units=1)])
        # print model summary to a string
        stringlist = []
        x_model.summary(print_fn=lambda x: stringlist.append(x))
        self.model_summary = "\n".join(stringlist)
        x_model.compile(
            optimizer=tf.optimizers.Adam(learning_rate=self.learning_rate),
            loss='mean_absolute_error', metrics=[tf.keras.metrics.MeanSquaredError()])
        history = x_model.fit(
            x_train,
            y_train,
            epochs=50,
            validation_split=0.2,
            callbacks=[WandbCallback()])
        self.hist = history.history
        self.results = x_model.evaluate(x_test, y_test)
        model_name = "regression-model-{}/1".format(self.learning_rate)
        local_tar_name = 'model-{}.tar.gz'.format(self.learning_rate)
        x_model.save(filepath=model_name)
        # zip keras folder to a single tar file
        with tarfile.open(local_tar_name, mode="w:gz") as _tar:
            _tar.add(model_name, recursive=True)
        with open(local_tar_name, "rb") as in_file:
            data = in_file.read()
            with S3(run=self) as s3:
                url = s3.put(local_tar_name, data)
                # save this path for downstream reference!
                self.s3_path = url
        # finally join with the other runs
        self.next(self.join_runs)

    @step
    def join_runs(self, inputs):
        """
        Join the parallel runs, merge results into a dictionary, and finally pick the best model according
        to some custom logic.
        """
        # merge results (loss) from runs with different parameters
        self.results_from_runs = {
            inp.learning_rate:
                {
                    'metrics': inp.results,
                    'tar': inp.s3_path,
                    'summary': inp.model_summary
                }
            for inp in inputs}
        print("Current results: {}".format(self.results_from_runs))
        # pick one according to some logic: here just pick a random one
        self.best_learning_rate = choice(list(self.results_from_runs.keys()))
        # we anonymize the actual s3 path for demo purposes - remove this for your own DAG
        model_s3_path = self.results_from_runs[self.best_learning_rate]['tar']
        self.best_s3_model_path = '/'.join(model_s3_path.split('/')[-3:])
        self.best_model_metrics = self.results_from_runs[self.best_learning_rate]['metrics']
        self.best_model_summary = self.results_from_runs[self.best_learning_rate]['summary']
        # note that we use the local Keras model folder to reload the model in the next steps and run tests
        # to make it batch-proof, we should use the s3 version instead
        self.best_model_local_path = "regression-model-{}/1".format(self.best_learning_rate)
        self.next(self.behavioral_tests)

    @step
    def behavioral_tests(self):
        """
        Run behavioral tests on the best model. Behavioral tests are on top of formal checks (e.g. assert that
        the model has such and such a size) and quantitative performance (e.g. accuracy on the test set), and are
        designed to provide "sanity checks" on model behavior, and highlight predictions for cases that are
        deemed qualitatively important for the overall DAG success. Please note that in this sample DAG all
        tests are of course made up ;-)

        For an overview of the general philosophy behind it, see: https://arxiv.org/abs/2005.04118
        """
        import json
        from tensorflow import keras
        import numpy as np
        # open test file
        with open('behavioral_tests.json') as f:
            behavioral_tests = json.load(f)
        # load model from local folder, previously stored. If you run with batch config,
        # the model needs to be loaded from the s3 tar!
        best_model = keras.models.load_model(self.best_model_local_path)
        print("Model loaded. Found a total of # {} behavioral tests".format(len(behavioral_tests['tests'])))
        tests = []
        for t in behavioral_tests['tests']:
            new_test_result = dict(t)
            new_test_result['prediction'] = best_model.predict(np.array(t['input']))[0][0]
            # quickly parse the expected formula and decide if test is successful or not
            expected_is_greater = t['expected'].split()[0].strip() == '>' # boolean, True if expression is '> X'
            expected_value = float(t['expected'].split()[1].strip())
            test_result = (new_test_result['prediction'] > expected_value) == expected_is_greater
            new_test_result['is_successful'] = test_result
            print("Running test {}:\nPredicted value: {}, expected: {}, success: {}".format(
                t['name'],
                new_test_result['prediction'],
                t['expected'],
                new_test_result['is_successful']
            ))
            tests.append(new_test_result)
        # store and version test results in Metaflow
        self.behavioral_test_results = tests
        # go to the final step
        self.next(self.end)

    @step
    def end(self):
        """
        DAG is done: it is a good time to clean up local files like tar archives etc.
        """
        print('Dag ended!')
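A sketch, assuming a completed run, of how downstream code might pick up the winning model's metadata and the behavioral-test results versioned by this DAG:

# Sketch: inspect the artifacts left behind by the latest RegressionModel run.
from metaflow import Flow

run = Flow('RegressionModel').latest_successful_run
print('best learning rate:', run.data.best_learning_rate)
print('test metrics:', run.data.best_model_metrics)
failed = [t for t in run.data.behavioral_test_results if not t['is_successful']]
print('failed behavioral tests:', [t['name'] for t in failed])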
Exemplo n.º 22
0
class ArchetypeEstimator(FlowSpec):
    """
    A Flow to estimate, for a deck with an unknown archetype, the closest archetype it can be associated with
    """
    
    # Define an input parameter for the Flow (number of top cards used to characterize each archetype)
    nbrcards =  Parameter("topcards", help = "Top cards to choose", default = random.choice(list(range(1,40))))
    
    decks_data = IncludeFile("decks_data",
                                help="The path to a decks file.",
                                default=ext.script_path("../data/hearthstone_deckarchetypeestimator/decks_sample.csv"))

    @step
    def start(self):
        """
        Launch the Flow
        """
        self.tags_script = "nolayer"
        self.limittopcards = self.nbrcards
        print("Let's go !!\n I know I could have done something here like loading the decks but I wanted to have a dedicated step for that :-)")
        self.next(self.collect_decks)
    
    @step
    def collect_decks(self):
        """
        Step to collect and process the decks used for the Flow
        """
        import pandas as pd
        
        # Collect the decks

        df_decks = pd.read_csv(StringIO(self.decks_data))
        
        # Do some operations (cleaning, formatting) on the dataframe
        df_decks = df_decks[df_decks["is_gooddeck"] == 1]
        df_decks["cards"] = df_decks["cards"].apply(lambda elt: ast.literal_eval(elt))
        df_decks["createddate"] = pd.to_datetime(df_decks["createddate"])
        df_decks["individual_cards"] = df_decks["cards"].apply(lambda elt: list(dict.fromkeys(elt)))
        df_decks["deckid"] = pd.to_numeric(df_decks["deckid"], downcast = "integer")
        
        # Save the decks
        self.df_decks = df_decks
        self.next(self.segment_decks)
        
    @step 
    def segment_decks(self):
        """
        Step to do the segmentation of the data between the different datasets (train, test and score)
        """
        from sklearn.model_selection import train_test_split

        # Select the data to score
        self.df_decks_toscore = self.df_decks[self.df_decks["archetype"] == "Unknown"]
        
        # Build the training and testing set
        df_decks_training = self.df_decks[self.df_decks["archetype"] != "Unknown"]
        self.df_decks_totrain, self.df_decks_totest = train_test_split(df_decks_training, train_size = 0.8)

        self.next(self.collect_archetypes)
        
    @step
    def collect_archetypes(self):
        """
        Step to estimate the archetypes that will be predicted
        """
        # Rank the archetype by their presence in the training data
        stats_deckarchetype = self.df_decks_totrain.groupby(["archetype"]).size().sort_values(ascending = False)
        
        # Store the archetypes in a list
        all_archetypes = list(stats_deckarchetype.index)
        print("Collect the archetypes possible",all_archetypes)
        
        # Save the archetypes
        self.archetypes = all_archetypes[:5] # It's just the top5 decks for quick execution but you can drop this limit 
        #(you will need to add the attribute --max-num-splits to your run command with a value > 100 like 150 for example)
        print("Collect the archetypes for the mlmagic", self.archetypes)

        self.next(self.collect_topcards, foreach = "archetypes")
    
    @step
    def collect_topcards(self):
        """
        Step to estimate the top cards for each archetype
        """
        # Select the right decks (the ones associated with this archetype)
        df_decks_archetype = self.df_decks_totrain[self.df_decks_totrain["archetype"] == self.input]

        # Collect all the cards played with this archetype
        all_cards_archetype = df_decks_archetype["individual_cards"].explode()
        
        # Count and sort how often each card occurs across these decks
        df_countcards_archetype = all_cards_archetype.to_frame(name = "cardid").groupby(["cardid"]).size().reset_index()
        df_countcards_archetype.columns = ["cardid","occurency"]
        df_countcards_archetype.sort_values(["occurency"], ascending = False, inplace = True)
        
        # Select the top cards for the archetype based on how often each card is used in its decks
        self.archetype = self.input
        self.topcards = df_countcards_archetype["cardid"].head(self.nbrcards).to_list()
        
        self.next(self.build_features)
    
    #@resources(memory=60000, cpu=1)
    @batch
    @step
    def build_features(self, inputs):
        """
        Step to compute the features for the model (based on the top cards of each archetype)
        """
        # Get all the top cards (one list per archetype)
        features_cardid = []
        informations_topcards = {}
        for input in inputs:
            features_cardid.append(input.topcards)
            informations_topcards[input.archetype] = input.topcards

        self.informations_topcards = informations_topcards

        # Join all the top cards
        features_cardid = list(itertools.chain.from_iterable(features_cardid))
        # Drop the duplicates (some cards can be on the top cards of multiple archetypes)
        features_cardid = list(dict.fromkeys(features_cardid))
        self.features = features_cardid
        
        # Get the artifacts from the previous steps (exclude just the top cards and archetypes artifacts)
        self.merge_artifacts(inputs, exclude = ["topcards","archetype"])
        
        self.next(self.trigger_prepare_datas)
        
    @step
    def trigger_prepare_datas(self):
        """
        Step to trigger the preparation of the data with a branch
        """
        self.next(self.prepare_datatraining, self.prepare_datatesting, self.prepare_datascoring)

    @step
    def prepare_datatraining(self):
        """
        Step to build the training set
        """
        self.array_features_totrain, self.array_labels_totrain , self.df_totrain = ext.prepare_data(self.df_decks_totrain, self.features)

        self.next(self.prepare_mlmagic)
        
    @step
    def prepare_datatesting(self):
        """
        Step to build the testing set
        """
        self.array_features_totest, self.array_labels_totest , self.df_totest = ext.prepare_data(self.df_decks_totest, self.features)
        
        self.next(self.prepare_mlmagic)
    
    @step
    def prepare_datascoring(self):
        """
        Step to build the scoring set
        """
        self.array_features_toscore, self.array_labels_toscore, self.df_toscore = ext.prepare_data(self.df_decks_toscore, self.features)
        
        self.next(self.prepare_mlmagic)

    @step  
    def prepare_mlmagic(self, inputs):
        """
        Step to prepare the data for the ML part and the parameters to test for the HPO
        """
        print("Data are ready")
        self.parameters_model = random.choices(combinations_parameters_randomforest, k = 5)
        self.merge_artifacts(inputs, exclude = ["df_decks_toscore","df_decks_totest", "df_decks_totrain"])

        self.next(self.trigger_build_model)
        
    @step
    def trigger_build_model(self):
        """
        Step to trigger the HPO that will build a model for each parameters_model selected in the previous step
        """
        self.next(self.build_model, foreach = "parameters_model")
    
    @step
    def build_model(self):
        """
        Step to compute the model with specific parameters
        """
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.metrics import accuracy_score

        tic = time.time()

        # Prepare the model for the training
        parameters = self.input
        model = RandomForestClassifier(n_estimators = parameters["n_estimators"],
                                       criterion = parameters["criterion"],
                                       max_depth = parameters["max_depth"],
                                       random_state=0)
        
        # Fit the model
        model.fit(self.array_features_totrain, self.array_labels_totrain)
        time_training = time.time() - tic
        
        # Make the predictions on the testing set
        tic = time.time()
        array_labels_predictions =  model.predict(self.array_features_totest)
        time_testing = time.time() - tic
        
        # Storing time
        # Store the model details
        self.model_parameters = parameters
        self.model_object = model
        self.classes = model.classes_
        
        # Store the metrics
        self.accuracy = accuracy_score(self.array_labels_totest, array_labels_predictions)
        self.time_training = time_training
        self.time_testing = time_testing   

        self.next(self.select_and_score)
    
    @step
    def select_and_score(self, inputs):
        """
        Step to select the right model and do the scoring 
        """
        # Get the best model based on the accuracy metric
        accuracy_reference = 0
        for input in inputs:
            if input.accuracy > accuracy_reference :
                self.model = input.model_object
                self.parameters = input.model_parameters
                self.classes = input.classes
                self.accuracy = input.accuracy
                self.time_training = input.time_training
                
                accuracy_reference = input.accuracy

        # Time to brag about the best model
        print(f"The best RF has the following : {self.parameters}")
        print(f"With an accuracy of {round(self.accuracy,2)} for a training time of {round(self.time_training,1)} seconds")
                
        # Get the artifacts from the previous steps (and exclude all the model thingy from the hpo)
        self.merge_artifacts(inputs, exclude = ["model_object","model_parameters","accuracy","classes","time_training","time_testing"])
        
        # Build the final version for the testing and scoring dataframe (computing the predictions)
        df_tested = ext.update_df_withpredictions(self.model, self.classes, self.array_features_totest, self.df_totest)
        df_scored = ext.update_df_withpredictions(self.model, self.classes, self.array_features_toscore, self.df_toscore)

        # Get a flag on the testing set if it was a good prediction
        df_tested["is_goodprediction"] = df_tested.apply(lambda row: row["prediction"] == row["archetype"], axis = 1)

        # Store the dataframe
        self.df_tested = df_tested
        self.df_scored = df_scored
        self.next(self.end)
        
    @step 
    def end(self):
        """
        Step to conclude the Flow
        """
        print("Done")
        pass
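A sketch, assuming a completed run, of how the scored decks could be pulled out of the flow afterwards to see which archetype was assigned to the previously unknown decks (artifact and column names come from the flow; exact column availability depends on ext.prepare_data):

# Sketch: look at the archetype predictions for the unknown decks.
from metaflow import Flow

run = Flow('ArchetypeEstimator').latest_successful_run
scored = run.data.df_scored
print(scored['prediction'].value_counts())
print('top cards per archetype:', run.data.informations_topcards)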
Exemplo n.º 23
0
class SentimentAnalysis(FlowSpec):

    edgar_data = IncludeFile("data", default='testlist.csv')

    @conda(libraries={'pandas': '1.0.1'})
    @step
    def start(self):
        import pandas
        from io import StringIO

        # Load the data set into a pandas dataframe.
        self.dataframe = pandas.read_csv(StringIO(self.edgar_data))

        self.links = list(self.dataframe['link'])
        self.next(self.scrapping)

    # re, pathlib, random and urllib.parse are part of the standard library
    # and do not need to be (and cannot be) declared as conda packages.
    @conda(
        libraries={
            'pandas': '1.0.1',
            'bs4': '4.8.2',
            'furl': '2.1.0',
            'selenium': '3.141.1'
        })
    @step
    def scrapping(self):
        import re
        from pathlib import Path
        import urllib.request
        import boto3
        from metaflow import S3
        from urllib.parse import urljoin
        from random import random
        from time import sleep
        import pandas as pd
        from bs4 import BeautifulSoup
        from furl import furl
        from selenium import webdriver

        # assumed pattern for quarters such as "Q3 2019" in the headline
        quarter_pattern = re.compile(r'Q\d\s+20\d{2}')

        def parse_html(html):
            # Parse a single transcript page. Participant and content
            # extraction is elided in this example; only the headline
            # metadata is recovered here.
            soup = BeautifulSoup(html, 'lxml')
            meta, participants, content = {}, [], []
            h1 = soup.find('h1', itemprop='headline')
            if h1 is None:
                return None
            title = h1.text
            meta['company'] = title[:title.find('(')].strip()
            meta['symbol'] = title[title.find('(') + 1:title.find(')')]
            match = quarter_pattern.search(title)
            if match:
                meta['quarter'] = match.group(0)
            return meta, participants, content

        SA_URL = 'https://seekingalpha.com/'
        TRANSCRIPT = re.compile('Earnings Call Transcript')

        next_page = True
        page = 1
        driver = webdriver.Firefox()
        all_participants = []
        while next_page:
            print(f'Page: {page}')
            url = f'{SA_URL}/earnings/earnings-call-transcripts/{page}'
            driver.get(urljoin(SA_URL, url))
            response = driver.page_source

            soup = BeautifulSoup(response, 'lxml')
            links = soup.find_all(name='a', string=TRANSCRIPT)
            if len(links) == 0:
                next_page = False
            else:
                for link in links:
                    transcript_url = link.attrs.get('href')
                    article_url = furl(urljoin(SA_URL, transcript_url)).add(
                        {'part': 'single'})
                    driver.get(article_url.url)
                    html = driver.page_source
                    result = parse_html(html)
                    if result is not None:
                        meta, participants, content = result
                        meta['link'] = transcript_url
                        all_participants.extend(participants)
                        # store_result is assumed to be defined elsewhere and
                        # to persist the parsed transcript.
                        store_result(meta, participants, content)
                    sleep(5 + (random() - .5) * 2)
                page += 1
        driver.close()

        with open("testdata.txt", "a") as f:
            for participant in all_participants:
                f.write(str(participant) + "\n")

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('testdata', 'testdata.txt')])

        self.next(self.predicting)

    @conda(libraries={
        'pandas': '1.0.1',
        'nltk': '3.4.5',
        'smart_open': '1.9.0'
    })
    @step
    def predicting(self):
        import boto3
        from metaflow import S3
        import re
        import pandas as pd
        from nltk import tokenize
        import string
        import nltk
        from nltk.corpus import stopwords
        from smart_open import smart_open
        import requests

        nltk.download('punkt')
        nltk.download('stopwords')
        nltk.download('words')

        listed = []
        with smart_open('s3://inputbucket1221/testdata.txt', 'r') as s3_source:
            Line = s3_source.readline()

            while Line != '':
                Line1 = Line.split(".")
                for Sentence in Line1:
                    listed.append(Sentence)
                Line = s3_source.readline()

        L = []
        for x in listed:
            if len(x) > 5:
                L.append(x)

        df = pd.DataFrame()

        df['Text'] = L
        print(df['Text'])

        def remove_punct(text):
            text = "".join(
                [char for char in text if char not in string.punctuation])
            text = re.sub('[0-9]+', '', text)
            return text

        df['Textclean'] = df['Text'].apply(lambda x: remove_punct(x))
        df = df.dropna()

        url = 'http://localhost:5000/model/predict'
        #myobj = {"The Model Asset Exchange is a crucial element of a developer's toolkit."}
        #data = {'text':['We did it!']}

        #x = requests.post(url, json = data)

        #print(x.text)
        senti_list = []
        for x in df['Textclean']:
            data = {'text': [x]}
            res = (requests.post(url, json=data)).text.split()
            if float(res[4][:-1]) > float(res[6][:-3]):
                temp = (x, "Positive")
                senti_list.append(temp)
            elif float(res[4][:-1]) < float(res[6][:-3]):
                temp = (x, "Negative")
                senti_list.append(temp)
            else:
                temp = (x, "Neutral")
                senti_list.append(temp)
        sentiment = pd.DataFrame(senti_list, columns=['Sentence', 'Sentiment'])

        #sentiment.to_csv('sample.csv',index=False)

        sentiment.to_csv('final_sentiment.csv', index=False)

        with S3(s3root='s3://outputbucket1221/') as s3:
            s3.put_files([('final_sentiment.csv', 'final_sentiment.csv')])

        self.next(self.end)

    @step
    def end(self):
        """
        End the flow.
        """
        pass
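A sketch of how the sentiment CSV written by the predicting step could be read back from S3 with Metaflow's S3 client (the bucket name is taken from the flow above; the rest is illustrative):

# Sketch: load the stored sentiment results back into pandas.
import pandas as pd
from io import StringIO
from metaflow import S3

with S3(s3root='s3://outputbucket1221/') as s3:
    obj = s3.get('final_sentiment.csv')
    sentiment = pd.read_csv(StringIO(obj.text))
print(sentiment['Sentiment'].value_counts())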
Exemplo n.º 24
0
class CEMExperiment(FlowSpec):
    """
    Train a logit agent on a user-specified task via CEM
    """

    config = IncludeFile(
        "config",
        required=True,
        help="Configuration file contains model and experiment hyperparameters",
    )

    @step
    def start(self):
        """Initialize experiment parameters"""

        # Load experiment configuration parameters into memory
        self.params = yaml.safe_load(self.config)

        # Start timer
        self.start_time = time.time()

        self.next(self.run)

    @step
    def run(self):
        """Execute CMA Experiment"""

        # Master dictionary that tracks all run information with the following
        # key structure:
        #   'epoch' -> 'agent' ->
        #      {'parameters', 'run' -> [rewards, actions, states]}
        self.run_dict = NestedDefaultDictionary()

        # Write out logs for Tensorboard
        logdir = os.path.join(
            "logs",
            "cma",
            self.params["run"],
            datetime.now().strftime("%Y%m%d-%H%M%S"),
            "metrics",
        )
        os.makedirs(logdir, exist_ok=True)

        file_writer = tf.summary.create_file_writer(logdir)
        file_writer.set_as_default()

        # Experiment/ES parameters
        gamma = self.params["gamma"]
        n_episodes = self.params["n-episodes"]
        pop_size = self.params["pop-size"]
        iterations = self.params["iterations"]
        env = gym.make(self.params["env"])

        # Initializing experiment
        param_size = env.observation_space.shape[0] + 1
        pop_model = CMA_ES(param_size)
        thetas = pop_model.generate_population(pop_size)
        (
            utilities,
            self.pop_mean_util,
            self.pop_median_util,
            self.pop_var_util,
        ) = empty_lists(4)

        # Create metric iterables
        util_metrics = (
            self.pop_mean_util,
            self.pop_median_util,
            self.pop_var_util,
        )
        metric_names = ("Mean", "Median", "Variance")
        metric_funcs = (np.mean, np.median, np.var)

        # Define closure for static parameters
        def execute_training_runs(theta, epoch):
            agent = CEMLogitAgent(theta)
            return train_agent(agent, env, n_episodes, gamma,
                               self._log_run(epoch, agent))

        for k in range(iterations):
            tf.summary.scalar(
                "CEM Population Covariance Trace",
                data=np.trace(pop_model.cov),
                step=k,
            )
            # For each agent run X many episodes - calculate mean utility
            # across the X runs for each agent.
            utilities = [execute_training_runs(theta, k) for theta in thetas]

            # Keep "best" agents in population sample and refit the
            # population generator using utility weighted parameters of
            # agents in sample.
            fittest, fittest_utils = get_fittest(thetas, np.array(utilities))
            pop_model.update_generator(fittest, fittest_utils)

            # Record population returns metrics
            for collection, metric_name, func in zip(util_metrics,
                                                     metric_names,
                                                     metric_funcs):
                collection.append(func(utilities))
                tf.summary.scalar(
                    f"Population Returns {metric_name}",
                    data=collection[-1],
                    step=k,
                )

            print(
                "Iteration {k} complete -- mean utility: {u}".format(
                    k=k, u=self.pop_mean_util[-1]),
                flush=True,
            )
            # Sample new population from new population parameter generator
            thetas = pop_model.generate_population(pop_size)
            utilities = []

        self.next(self.end)

    @step
    def end(self):
        # Log experiment run time
        self.end_time = time.time()
        run_td = timedelta(seconds=self.end_time - self.start_time)
        self.run_time = str(run_td)

    def _log_run(self, epoch, agent):
        def callback(rewards, actions, states):
            # TODO: determine a better strategy for referencing
            # the agents that can work in a distributed environment.
            if not self.run_dict[epoch][hash(agent)]:
                self.run_dict[epoch][hash(agent)] = {
                    "parameters": agent.thetas,
                    "runs": [{
                        "rewards": rewards,
                        "actions": actions,
                        "states": states,
                    }],
                }
            else:
                self.run_dict[epoch][hash(agent)]["runs"].append({
                    "rewards": rewards,
                    "actions": actions,
                    "states": states,
                })

        return callback
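
For reference, start() loads the included file with yaml.safe_load and run() reads the keys run, gamma, n-episodes, pop-size, iterations, and env from it. The sketch below shows a minimal, hypothetical configuration; every value is an illustrative placeholder rather than a setting taken from the original experiment.

# A minimal, hypothetical configuration for CEMExperiment. The keys mirror
# what start()/run() read via yaml.safe_load(self.config); every value below
# is an illustrative placeholder.
import yaml

example_config = """
run: cartpole-cem        # names the TensorBoard log directory under logs/cma/
gamma: 0.99              # discount factor used by train_agent
n-episodes: 10           # episodes run per sampled agent
pop-size: 64             # agents sampled from CMA_ES per iteration
iterations: 100          # number of CMA-ES update steps
env: CartPole-v1         # Gym environment id passed to gym.make
"""

params = yaml.safe_load(example_config)
assert params["pop-size"] == 64

With Metaflow's IncludeFile the file contents are pulled in at launch time, so the config would typically be supplied on the command line, for example: python cem_experiment.py run --config config.yaml (the script name here is assumed).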