Example No. 1
def test():
    import os
    import shutil
    from kale.utils import pod_utils as _kale_pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = ""
    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]
    if "v1" not in _kale_directory_file_names:
        raise ValueError("v1" + " does not exists in directory")
    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if (os.path.isfile(os.path.join(_kale_data_directory, f))
            and os.path.splitext(f)[0] == "v1")
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name %s: %s" %
                         ("v1", str(_kale_load_file_name)))
    _kale_load_file_name = _kale_load_file_name[0]
    v1 = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
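The generated loading block above follows a fixed pattern for each variable it restores: check that a file with the variable's base name exists in the marshal directory, make sure there is exactly one match, then deserialize it. A minimal sketch of that pattern as a standalone helper, reusing only kale.marshal.resource_load from the snippet above (the helper name load_from_marshal_dir is illustrative, not part of Kale's API):

def load_from_marshal_dir(data_directory, name):
    import os
    from kale.marshal import resource_load

    # Regular files in the marshal directory whose base name matches `name`.
    matches = [
        f for f in os.listdir(data_directory)
        if os.path.isfile(os.path.join(data_directory, f))
        and os.path.splitext(f)[0] == name
    ]
    if not matches:
        raise ValueError("%s does not exist in directory" % name)
    if len(matches) > 1:
        raise ValueError("Found multiple files with name %s: %s" % (name, matches))
    # Deserialize the single matching file.
    return resource_load(os.path.join(data_directory, matches[0]))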
Example No. 2
def kid2(CANDIES: int, vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/kale-base-example/.candies_sharing.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "kid1" not in _kale_directory_file_names:
        raise ValueError("kid1" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "kid1"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "kid1" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    kid1 = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import random

    def get_handful(left):
        if left == 0:
            print("There are no candies left! I want to cry :(")
            return 0
        c = random.randint(1, left)
        print("I got %s candies!" % c)
        return c

    kid2 = get_handful(CANDIES - kid1)

    # -----------------------DATA SAVING START---------------------------------
    if "kid2" in locals():
        _kale_resource_save(kid2, os.path.join(_kale_data_directory, "kid2"))
    else:
        print("_kale_resource_save: `kid2` not found.")
    if "kid1" in locals():
        _kale_resource_save(kid1, os.path.join(_kale_data_directory, "kid1"))
    else:
        print("_kale_resource_save: `kid1` not found.")
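Because the generated step is a plain function, it can be smoke-tested outside the pipeline. A hypothetical local run, assuming the shared-volume path exists, that first seeds the marshal directory so the loading block above finds "kid1" (the value 3 is illustrative):

import os
from kale.marshal import resource_save

_marshal_dir = ("/shared_volume/notebooks/kale-base-example/"
                ".candies_sharing.ipynb.kale.marshal.dir")
os.makedirs(_marshal_dir, exist_ok=True)
resource_save(3, os.path.join(_marshal_dir, "kid1"))  # pretend kid1 took 3 candies

kid2(CANDIES=10, vol_shared_volume="/shared_volume")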
def eval(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "trns_output" not in _kale_directory_file_names:
        raise ValueError("trns_output" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "trns_output"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "trns_output" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    trns_output = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "training_input_fn" not in _kale_directory_file_names:
        raise ValueError("training_input_fn" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "training_input_fn"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "training_input_fn" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    training_input_fn = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "estimator" not in _kale_directory_file_names:
        raise ValueError("estimator" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "estimator"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "estimator" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    estimator = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "tf_transform_output" not in _kale_directory_file_names:
        raise ValueError("tf_transform_output" +
                         " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "tf_transform_output"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "tf_transform_output" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    tf_transform_output = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    eval_result = estimator.evaluate(input_fn=lambda: training_input_fn(
        tf_transform_output,
        os.path.join(trns_output, 'eval' + '*'),
        BATCH_SIZE,
        "tips"),
        steps=50)

    print(eval_result)
def train(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "trns_output" not in _kale_directory_file_names:
        raise ValueError("trns_output" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "trns_output"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "trns_output" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    trns_output = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def training_input_fn(transformed_output, transformed_examples, batch_size, target_name):
        """
        Args:
          transformed_output: tft.TFTransformOutput
          transformed_examples: Base filename of examples
          batch_size: Batch size.
          target_name: name of the target column.
        Returns:
          The input function for training or eval.
        """
        dataset = tf.data.experimental.make_batched_features_dataset(
            file_pattern=transformed_examples,
            batch_size=batch_size,
            features=transformed_output.transformed_feature_spec(),
            reader=tf.data.TFRecordDataset,
            shuffle=True)
        transformed_features = dataset.make_one_shot_iterator().get_next()
        transformed_labels = transformed_features.pop(target_name)
        return transformed_features, transformed_labels

    def get_feature_columns():
        """Callback that returns a list of feature columns for building a tf.estimator.
        Returns:
          A list of tf.feature_column.
        """
        return (
            [tf.feature_column.numeric_column(key, shape=()) for key in DENSE_FLOAT_FEATURE_KEYS] +
            [tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_identity(key, num_buckets=VOCAB_SIZE + OOV_SIZE)) for key in VOCAB_FEATURE_KEYS] +
            [tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_identity(key, num_buckets=FEATURE_BUCKET_COUNT, default_value=0)) for key in BUCKET_FEATURE_KEYS] +
            [tf.feature_column.indicator_column(tf.feature_column.categorical_column_with_identity(
                key, num_buckets=num_buckets, default_value=0)) for key, num_buckets in zip(CATEGORICAL_FEATURE_KEYS, MAX_CATEGORICAL_FEATURE_VALUES)]
        )
    training_output = os.path.join(DATA_DIR, "training")
    if os.path.exists(training_output):
        shutil.rmtree(training_output)

    hidden_layer_size = [int(x.strip()) for x in HIDDEN_LAYER_SIZE.split(',')]

    tf_transform_output = tft.TFTransformOutput(trns_output)

    # Set how often to run checkpointing in terms of steps.
    config = tf.estimator.RunConfig(save_checkpoints_steps=1000)
    n_classes = tf_transform_output.vocabulary_size_by_name(
        "vocab_" + LABEL_KEY)
    # Create estimator
    estimator = tf.estimator.DNNClassifier(
        feature_columns=get_feature_columns(),
        hidden_units=hidden_layer_size,
        n_classes=n_classes,
        config=config,
        model_dir=training_output)

    # TODO: Simplify all this: https://www.tensorflow.org/guide/premade_estimators
    estimator.train(input_fn=lambda: training_input_fn(
        tf_transform_output,
        os.path.join(trns_output, 'train' + '*'),
        BATCH_SIZE,
        "tips"),
        steps=STEPS)

    # -----------------------DATA SAVING START---------------------------------
    if "tf_transform_output" in locals():
        _kale_resource_save(tf_transform_output, os.path.join(
            _kale_data_directory, "tf_transform_output"))
    else:
        print("_kale_resource_save: `tf_transform_output` not found.")
    if "training_input_fn" in locals():
        _kale_resource_save(training_input_fn, os.path.join(
            _kale_data_directory, "training_input_fn"))
    else:
        print("_kale_resource_save: `training_input_fn` not found.")
    if "estimator" in locals():
        _kale_resource_save(estimator, os.path.join(
            _kale_data_directory, "estimator"))
    else:
        print("_kale_resource_save: `estimator` not found.")
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
def data_transformation(EPOCHS: int, STEPS: int, BATCH_SIZE: int, HIDDEN_LAYER_SIZE: str, LEARNING_RATE: float):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/marshal"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0]
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "column_names" not in _kale_directory_file_names:
        raise ValueError("column_names" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "column_names"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "column_names" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    column_names = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))

    if "schema" not in _kale_directory_file_names:
        raise ValueError("schema" + " does not exists in directory")

    _kale_load_file_name = [
        f
        for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f)) and
        os.path.splitext(f)[0] == "schema"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "schema" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    schema = _kale_resource_load(os.path.join(
        _kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import os
    import shutil
    import logging
    import apache_beam as beam
    import tensorflow as tf
    import tensorflow_transform as tft
    import tensorflow_model_analysis as tfma
    import tensorflow_data_validation as tfdv

    from apache_beam.io import textio
    from apache_beam.io import tfrecordio

    from tensorflow_transform.beam import impl as beam_impl
    from tensorflow_transform.beam.tft_beam_io import transform_fn_io
    from tensorflow_transform.coders.csv_coder import CsvCoder
    from tensorflow_transform.coders.example_proto_coder import ExampleProtoCoder
    from tensorflow_transform.tf_metadata import dataset_metadata
    from tensorflow_transform.tf_metadata import metadata_io
    DATA_DIR = 'data/'
    TRAIN_DATA = os.path.join(DATA_DIR, 'taxi-cab-classification/train.csv')
    EVALUATION_DATA = os.path.join(
        DATA_DIR, 'taxi-cab-classification/eval.csv')

    # Categorical features are assumed to each have a maximum value in the dataset.
    MAX_CATEGORICAL_FEATURE_VALUES = [24, 31, 12]
    CATEGORICAL_FEATURE_KEYS = ['trip_start_hour',
                                'trip_start_day', 'trip_start_month']

    DENSE_FLOAT_FEATURE_KEYS = ['trip_miles', 'fare', 'trip_seconds']

    # Number of buckets used by tf.transform for encoding each feature.
    FEATURE_BUCKET_COUNT = 10

    BUCKET_FEATURE_KEYS = [
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']

    # Number of vocabulary terms used for encoding VOCAB_FEATURES by tf.transform
    VOCAB_SIZE = 1000

    # Count of out-of-vocab buckets in which unrecognized VOCAB_FEATURES are hashed.
    OOV_SIZE = 10

    VOCAB_FEATURE_KEYS = ['pickup_census_tract', 'dropoff_census_tract', 'payment_type', 'company',
                          'pickup_community_area', 'dropoff_community_area']

    # allow nan values in these features.
    OPTIONAL_FEATURES = ['dropoff_latitude', 'dropoff_longitude', 'pickup_census_tract', 'dropoff_census_tract',
                         'company', 'trip_seconds', 'dropoff_community_area']

    LABEL_KEY = 'tips'
    FARE_KEY = 'fare'
    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO)
    # tf.get_logger().setLevel(logging.ERROR)

    def to_dense(tensor):
        """Takes as input a SparseTensor and return a Tensor with correct default value
        Args:
          tensor: tf.SparseTensor
        Returns:
          tf.Tensor with default value
        """
        if not isinstance(tensor, tf.sparse.SparseTensor):
            return tensor
        if tensor.dtype == tf.string:
            default_value = ''
        elif tensor.dtype == tf.float32:
            default_value = 0.0
        elif tensor.dtype == tf.int32:
            default_value = 0
        else:
            raise ValueError(f"Tensor type not recognized: {tensor.dtype}")

        return tf.squeeze(tf.sparse_to_dense(tensor.indices,
                                             [tensor.dense_shape[0], 1],
                                             tensor.values, default_value=default_value), axis=1)
        # TODO: Update to below version
        # return tf.squeeze(tf.sparse.to_dense(tensor, default_value=default_value), axis=1)

    def preprocess_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.
        Args:
          inputs: map from feature keys to raw not-yet-transformed features.
        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[key] = tft.scale_to_z_score(to_dense(inputs[key]))

        for key in VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            if inputs[key].dtype == tf.string:
                vocab_tensor = to_dense(inputs[key])
            else:
                vocab_tensor = tf.as_string(to_dense(inputs[key]))
            outputs[key] = tft.compute_and_apply_vocabulary(
                vocab_tensor, vocab_filename='vocab_' + key,
                top_k=VOCAB_SIZE, num_oov_buckets=OOV_SIZE)

        for key in BUCKET_FEATURE_KEYS:
            outputs[key] = tft.bucketize(
                to_dense(inputs[key]), FEATURE_BUCKET_COUNT)

        for key in CATEGORICAL_FEATURE_KEYS:
            outputs[key] = tf.cast(to_dense(inputs[key]), tf.int64)

        taxi_fare = to_dense(inputs[FARE_KEY])
        taxi_tip = to_dense(inputs[LABEL_KEY])
        # Test if the tip was > 20% of the fare.
        tip_threshold = tf.multiply(taxi_fare, tf.constant(0.2))
        outputs[LABEL_KEY] = tf.logical_and(
            tf.logical_not(tf.math.is_nan(taxi_fare)),
            tf.greater(taxi_tip, tip_threshold))

        for key in outputs:
            if outputs[key].dtype == tf.bool:
                outputs[key] = tft.compute_and_apply_vocabulary(tf.as_string(outputs[key]),
                                                                vocab_filename='vocab_' + key)

        return outputs
    trns_output = os.path.join(DATA_DIR, "transformed")
    if os.path.exists(trns_output):
        shutil.rmtree(trns_output)

    tft_input_metadata = dataset_metadata.DatasetMetadata(schema)

    runner = 'DirectRunner'
    with beam.Pipeline(runner, options=None) as p:
        with beam_impl.Context(temp_dir=os.path.join(trns_output, 'tmp')):
            converter = CsvCoder(column_names, tft_input_metadata.schema)

            # READ TRAIN DATA
            train_data = (
                p
                | 'ReadTrainData' >> textio.ReadFromText(TRAIN_DATA, skip_header_lines=1)
                | 'DecodeTrainData' >> beam.Map(converter.decode))

            # TRANSFORM TRAIN DATA (and get transform_fn function)
            transformed_dataset, transform_fn = (
                (train_data, tft_input_metadata) | beam_impl.AnalyzeAndTransformDataset(preprocess_fn))
            transformed_data, transformed_metadata = transformed_dataset

            # SAVE TRANSFORMED TRAIN DATA
            _ = transformed_data | 'WriteTrainData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'train'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # READ EVAL DATA
            eval_data = (
                p
                | 'ReadEvalData' >> textio.ReadFromText(EVALUATION_DATA, skip_header_lines=1)
                | 'DecodeEvalData' >> beam.Map(converter.decode))

            # TRANSFORM EVAL DATA (using previously created transform_fn function)
            eval_dataset = (eval_data, tft_input_metadata)
            transformed_eval_data, transformed_metadata = (
                (eval_dataset, transform_fn) | beam_impl.TransformDataset())

            # SAVE EVAL DATA
            _ = transformed_eval_data | 'WriteEvalData' >> tfrecordio.WriteToTFRecord(
                os.path.join(trns_output, 'eval'),
                coder=ExampleProtoCoder(transformed_metadata.schema))

            # SAVE transform_fn FUNCTION FOR LATER USE
            # TODO: check out what is the transform function (transform_fn) that came from previous step
            _ = (transform_fn | 'WriteTransformFn' >>
                 transform_fn_io.WriteTransformFn(trns_output))

            # SAVE TRANSFORMED METADATA
            metadata_io.write_metadata(
                metadata=tft_input_metadata,
                path=os.path.join(trns_output, 'metadata'))

    # -----------------------DATA SAVING START---------------------------------
    if "trns_output" in locals():
        _kale_resource_save(trns_output, os.path.join(
            _kale_data_directory, "trns_output"))
    else:
        print("_kale_resource_save: `trns_output` not found.")
Example No. 6
def train(TRAIN_STEPS: int, vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/pytorch-classif/.cifar10_classification.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "trainloader" not in _kale_directory_file_names:
        raise ValueError("trainloader" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "trainloader"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "trainloader" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    trainloader = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import torch
    import torchvision
    import torchvision.transforms as transforms
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')
    import matplotlib.pyplot as plt
    import numpy as np

    # functions to show an image

    def imshow(img):
        img = img / 2 + 0.5  # unnormalize
        npimg = img.numpy()
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    import torch.nn as nn
    import torch.nn.functional as F

    class Net(nn.Module):
        def __init__(self):
            nn.Module.__init__(self)
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x
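    # Shape check for the x.view(-1, 16 * 5 * 5) flatten above (a sketch of the
    # arithmetic, assuming the 32x32 CIFAR-10 images used in this example):
    # 32 -> conv1 (5x5, no padding) -> 28 -> 2x2 max-pool -> 14
    #    -> conv2 (5x5, no padding) -> 10 -> 2x2 max-pool -> 5,
    # so the last stage yields 16 feature maps of size 5x5.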

    import torch.optim as optim
    device = torch.device("cuda:0")
    net = Net()
    net.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
    device = torch.device("cuda:0")
    for epoch in range(TRAIN_STEPS):  # loop over the dataset multiple times

        running_loss = 0.0
        for i, data in enumerate(trainloader, 0):
            # get the inputs
            inputs, labels = data[0].to(device), data[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0

    print('Finished Training')

    # -----------------------DATA SAVING START---------------------------------
    if "net" in locals():
        _kale_resource_save(net, os.path.join(_kale_data_directory, "net"))
    else:
        print("_kale_resource_save: `net` not found.")
Example No. 7
def testwhole(TRAIN_STEPS: int, vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/pytorch-classif/.cifar10_classification.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "testloader" not in _kale_directory_file_names:
        raise ValueError("testloader" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "testloader"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "testloader" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    testloader = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "net" not in _kale_directory_file_names:
        raise ValueError("net" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "net"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "net" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    net = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import torch
    import torchvision
    import torchvision.transforms as transforms
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')
    import matplotlib.pyplot as plt
    import numpy as np

    # functions to show an image

    def imshow(img):
        img = img / 2 + 0.5  # unnormalize
        npimg = img.numpy()
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    import torch.nn as nn
    import torch.nn.functional as F

    class Net(nn.Module):
        def __init__(self):
            nn.Module.__init__(self)
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    device = torch.device("cuda:0")
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    print('Accuracy of the network on the 10000 test images: %d %%' %
          (100 * correct / total))
    device = torch.device("cuda:0")
    class_correct = list(0. for i in range(10))
    class_total = list(0. for i in range(10))

    with torch.no_grad():
        for data in testloader:
            images, labels = data[0].to(device), data[1].to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs, 1)
            c = (predicted == labels).squeeze()
            for i in range(4):
                label = labels[i]
                class_correct[label] += c[i].item()
                class_total[label] += 1

    for i in range(10):
        print('Accuracy of %5s : %2d %%' %
              (classes[i], 100 * class_correct[i] / class_total[i]))
Example No. 8
def testontest(TRAIN_STEPS: int, vol_shared_volume: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/notebooks/pytorch-classif/.cifar10_classification.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "testloader" not in _kale_directory_file_names:
        raise ValueError("testloader" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "testloader"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "testloader" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    testloader = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "net" not in _kale_directory_file_names:
        raise ValueError("net" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "net"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "net" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    net = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import torch
    import torchvision
    import torchvision.transforms as transforms
    classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse',
               'ship', 'truck')
    import matplotlib.pyplot as plt
    import numpy as np

    # functions to show an image

    def imshow(img):
        img = img / 2 + 0.5  # unnormalize
        npimg = img.numpy()
        plt.imshow(np.transpose(npimg, (1, 2, 0)))
        plt.show()

    import torch.nn as nn
    import torch.nn.functional as F

    class Net(nn.Module):
        def __init__(self):
            nn.Module.__init__(self)
            self.conv1 = nn.Conv2d(3, 6, 5)
            self.pool = nn.MaxPool2d(2, 2)
            self.conv2 = nn.Conv2d(6, 16, 5)
            self.fc1 = nn.Linear(16 * 5 * 5, 120)
            self.fc2 = nn.Linear(120, 84)
            self.fc3 = nn.Linear(84, 10)

        def forward(self, x):
            x = self.pool(F.relu(self.conv1(x)))
            x = self.pool(F.relu(self.conv2(x)))
            x = x.view(-1, 16 * 5 * 5)
            x = F.relu(self.fc1(x))
            x = F.relu(self.fc2(x))
            x = self.fc3(x)
            return x

    device = torch.device("cuda:0")
    dataiter = iter(testloader)
    n = next(dataiter)  # fetch one batch from the test loader
    images, labels = n[0].to(device), n[1].to(device)

    # print images
    # imshow(torchvision.utils.make_grid(images))
    print('GroundTruth: ',
          ' '.join('%5s' % classes[labels[j]] for j in range(4)))
    outputs = net(images)
    _, predicted = torch.max(outputs, 1)

    print('Predicted: ',
          ' '.join('%5s' % classes[predicted[j]] for j in range(4)))

    # -----------------------DATA SAVING START---------------------------------
    if "testloader" in locals():
        _kale_resource_save(testloader,
                            os.path.join(_kale_data_directory, "testloader"))
    else:
        print("_kale_resource_save: `testloader` not found.")
    if "net" in locals():
        _kale_resource_save(net, os.path.join(_kale_data_directory, "net"))
    else:
        print("_kale_resource_save: `net` not found.")
Example No. 9
def results(vol_shared_volume_kale_volumes: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/datasets_for_notebooks/titanic/"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "acc_log" not in _kale_directory_file_names:
        raise ValueError("acc_log" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "acc_log"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "acc_log" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_log = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_gaussian" not in _kale_directory_file_names:
        raise ValueError("acc_gaussian" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "acc_gaussian"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "acc_gaussian" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_gaussian = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_random_forest" not in _kale_directory_file_names:
        raise ValueError("acc_random_forest" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "acc_random_forest"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_random_forest" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_random_forest = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_decision_tree" not in _kale_directory_file_names:
        raise ValueError("acc_decision_tree" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "acc_decision_tree"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "acc_decision_tree" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_decision_tree = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "acc_linear_svc" not in _kale_directory_file_names:
        raise ValueError("acc_linear_svc" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "acc_linear_svc"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "acc_linear_svc" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    acc_linear_svc = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    results = pd.DataFrame({
        'Model': [
            'Support Vector Machines', 'logistic Regression', 'Random Forest',
            'Naive Bayes', 'Decision Tree'
        ],
        'Score': [
            acc_linear_svc, acc_log, acc_random_forest, acc_gaussian,
            acc_decision_tree
        ]
    })
    result_df = results.sort_values(by='Score', ascending=False)
    result_df = result_df.set_index('Score')
    print(result_df)
Example No. 10
def randomforest(vol_shared_volume_kale_volumes: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/datasets_for_notebooks/titanic/"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_labels" not in _kale_directory_file_names:
        raise ValueError("train_labels" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_labels"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_labels" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_labels = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    random_forest = RandomForestClassifier(n_estimators=100)
    random_forest.fit(train_df, train_labels)
    acc_random_forest = round(
        random_forest.score(train_df, train_labels) * 100, 2)

    # -----------------------DATA SAVING START---------------------------------
    if "acc_random_forest" in locals():
        _kale_resource_save(
            acc_random_forest,
            os.path.join(_kale_data_directory, "acc_random_forest"))
    else:
        print("_kale_resource_save: `acc_random_forest` not found.")
Example No. 11
def datapreprocessing(vol_shared_volume_kale_volumes: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/datasets_for_notebooks/titanic/"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "test_df" not in _kale_directory_file_names:
        raise ValueError("test_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "test_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "test_df" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    test_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    data = [train_df, test_df]
    for dataset in data:
        dataset['relatives'] = dataset['SibSp'] + dataset['Parch']
        dataset.loc[dataset['relatives'] > 0, 'not_alone'] = 0
        dataset.loc[dataset['relatives'] == 0, 'not_alone'] = 1
        dataset['not_alone'] = dataset['not_alone'].astype(int)
    train_df['not_alone'].value_counts()
    # PassengerId does not contribute to a person's survival probability
    train_df = train_df.drop(['PassengerId'], axis=1)
    import re
    deck = {"A": 1, "B": 2, "C": 3, "D": 4, "E": 5, "F": 6, "G": 7, "U": 8}
    data = [train_df, test_df]

    for dataset in data:
        dataset['Cabin'] = dataset['Cabin'].fillna("U0")
        dataset['Deck'] = dataset['Cabin'].map(
            lambda x: re.compile("([a-zA-Z]+)").search(x).group())
        dataset['Deck'] = dataset['Deck'].map(deck)
        dataset['Deck'] = dataset['Deck'].fillna(0)
        dataset['Deck'] = dataset['Deck'].astype(int)
    # we can now drop the cabin feature
    train_df = train_df.drop(['Cabin'], axis=1)
    test_df = test_df.drop(['Cabin'], axis=1)
    data = [train_df, test_df]

    for dataset in data:
        mean = train_df["Age"].mean()
        std = test_df["Age"].std()
        is_null = dataset["Age"].isnull().sum()
        # draw `is_null` random integer ages in [mean - std, mean + std)
        rand_age = np.random.randint(mean - std, mean + std, size=is_null)
        # fill NaN values in Age column with random values generated
        age_slice = dataset["Age"].copy()
        age_slice[np.isnan(age_slice)] = rand_age
        dataset["Age"] = age_slice
        dataset["Age"] = dataset["Age"].astype(int)
    train_df["Age"].isnull().sum()
    train_df['Embarked'].describe()
    # fill with most common value
    common_value = 'S'
    data = [train_df, test_df]

    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].fillna(common_value)
    train_df.info()

    # -----------------------DATA SAVING START---------------------------------
    if "train_df" in locals():
        _kale_resource_save(train_df,
                            os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
    if "test_df" in locals():
        _kale_resource_save(test_df,
                            os.path.join(_kale_data_directory, "test_df"))
    else:
        print("_kale_resource_save: `test_df` not found.")
Example No. 12
def featureengineering(vol_shared_volume_kale_volumes: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/shared_volume/datasets_for_notebooks/titanic/"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "PREDICTION_LABEL" not in _kale_directory_file_names:
        raise ValueError("PREDICTION_LABEL" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "PREDICTION_LABEL"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " +
                         "PREDICTION_LABEL" + ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    PREDICTION_LABEL = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "test_df" not in _kale_directory_file_names:
        raise ValueError("test_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "test_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "test_df" + ": " +
                         str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    test_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

    data = [train_df, test_df]

    for dataset in data:
        dataset['Fare'] = dataset['Fare'].fillna(0)
        dataset['Fare'] = dataset['Fare'].astype(int)
    data = [train_df, test_df]
    titles = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}

    for dataset in data:
        # extract titles
        dataset['Title'] = dataset.Name.str.extract(r' ([A-Za-z]+)\.',
                                                    expand=False)
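        # e.g. the Name "Braund, Mr. Owen Harris" yields the Title "Mr"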
        # replace each uncommon title with a more common one or with 'Rare'
        dataset['Title'] = dataset['Title'].replace([
            'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev',
            'Sir', 'Jonkheer', 'Dona'
        ], 'Rare')
        dataset['Title'] = dataset['Title'].replace('Mlle', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Ms', 'Miss')
        dataset['Title'] = dataset['Title'].replace('Mme', 'Mrs')
        # convert titles into numbers
        dataset['Title'] = dataset['Title'].map(titles)
        # fill NaN with 0 to be safe
        dataset['Title'] = dataset['Title'].fillna(0)
    train_df = train_df.drop(['Name'], axis=1)
    test_df = test_df.drop(['Name'], axis=1)
    genders = {"male": 0, "female": 1}
    data = [train_df, test_df]

    for dataset in data:
        dataset['Sex'] = dataset['Sex'].map(genders)
    train_df = train_df.drop(['Ticket'], axis=1)
    test_df = test_df.drop(['Ticket'], axis=1)
    ports = {"S": 0, "C": 1, "Q": 2}
    data = [train_df, test_df]

    for dataset in data:
        dataset['Embarked'] = dataset['Embarked'].map(ports)
    data = [train_df, test_df]
    for dataset in data:
        dataset['Age'] = dataset['Age'].astype(int)
        dataset.loc[dataset['Age'] <= 11, 'Age'] = 0
        dataset.loc[(dataset['Age'] > 11) & (dataset['Age'] <= 18), 'Age'] = 1
        dataset.loc[(dataset['Age'] > 18) & (dataset['Age'] <= 22), 'Age'] = 2
        dataset.loc[(dataset['Age'] > 22) & (dataset['Age'] <= 27), 'Age'] = 3
        dataset.loc[(dataset['Age'] > 27) & (dataset['Age'] <= 33), 'Age'] = 4
        dataset.loc[(dataset['Age'] > 33) & (dataset['Age'] <= 40), 'Age'] = 5
        dataset.loc[(dataset['Age'] > 40) & (dataset['Age'] <= 66), 'Age'] = 6
        dataset.loc[dataset['Age'] > 66, 'Age'] = 6
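        # note: ages above 66 end up in the same bin (6) as the 40-66 group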

    # check how Age is distributed with train_df['Age'].value_counts()
    data = [train_df, test_df]

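    # bucket fares into six ordinal ranges, then cast to int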
    for dataset in data:
        dataset.loc[dataset['Fare'] <= 7.91, 'Fare'] = 0
        dataset.loc[(dataset['Fare'] > 7.91) & (dataset['Fare'] <= 14.454),
                    'Fare'] = 1
        dataset.loc[(dataset['Fare'] > 14.454) & (dataset['Fare'] <= 31),
                    'Fare'] = 2
        dataset.loc[(dataset['Fare'] > 31) & (dataset['Fare'] <= 99),
                    'Fare'] = 3
        dataset.loc[(dataset['Fare'] > 99) & (dataset['Fare'] <= 250),
                    'Fare'] = 4
        dataset.loc[dataset['Fare'] > 250, 'Fare'] = 5
        dataset['Fare'] = dataset['Fare'].astype(int)
    data = [train_df, test_df]
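    # engineered interaction feature: age bucket multiplied by passenger class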
    for dataset in data:
        dataset['Age_Class'] = dataset['Age'] * dataset['Pclass']
    for dataset in data:
        dataset['Fare_Per_Person'] = dataset['Fare'] / (dataset['relatives'] +
                                                        1)
        dataset['Fare_Per_Person'] = dataset['Fare_Per_Person'].astype(int)
    # Take a last look at the training set before we start training the models.
    train_df.head(10)
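    # separate the prediction target from the feature matrix before saving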
    train_labels = train_df[PREDICTION_LABEL]
    train_df = train_df.drop(PREDICTION_LABEL, axis=1)

    # -----------------------DATA SAVING START---------------------------------
    if "train_df" in locals():
        _kale_resource_save(train_df,
                            os.path.join(_kale_data_directory, "train_df"))
    else:
        print("_kale_resource_save: `train_df` not found.")
    if "train_labels" in locals():
        _kale_resource_save(train_labels,
                            os.path.join(_kale_data_directory, "train_labels"))
    else:
        print("_kale_resource_save: `train_labels` not found.")
def logisticregression(rok_workspace_aidays01_2rlcyd0k8_url: str):

    import os
    import shutil
    from kale.utils import pod_utils
    from kale.marshal import resource_save as _kale_resource_save
    from kale.marshal import resource_load as _kale_resource_load

    _kale_data_directory = "/home/jovyan/examples/titanic-ml-dataset/.titanic_dataset_ml.ipynb.kale.marshal.dir"

    if not os.path.isdir(_kale_data_directory):
        os.makedirs(_kale_data_directory, exist_ok=True)

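    # snapshot this step's volumes (presumably via Rok) before loading data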
    pod_utils.snapshot_pipeline_step(
        "titanic-ml-fylgn", "logisticregression",
        "/home/jovyan/examples/titanic-ml-dataset/titanic_dataset_ml.ipynb")

    # -----------------------DATA LOADING START--------------------------------
    _kale_directory_file_names = [
        os.path.splitext(f)[0] for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
    ]

    if "train_labels" not in _kale_directory_file_names:
        raise ValueError("train_labels" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_labels"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_labels" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_labels = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))

    if "train_df" not in _kale_directory_file_names:
        raise ValueError("train_df" + " does not exists in directory")

    _kale_load_file_name = [
        f for f in os.listdir(_kale_data_directory)
        if os.path.isfile(os.path.join(_kale_data_directory, f))
        and os.path.splitext(f)[0] == "train_df"
    ]
    if len(_kale_load_file_name) > 1:
        raise ValueError("Found multiple files with name " + "train_df" +
                         ": " + str(_kale_load_file_name))
    _kale_load_file_name = _kale_load_file_name[0]
    train_df = _kale_resource_load(
        os.path.join(_kale_data_directory, _kale_load_file_name))
    # -----------------------DATA LOADING END----------------------------------

    import numpy as np
    import pandas as pd
    import seaborn as sns
    from matplotlib import pyplot as plt
    from matplotlib import style

    from sklearn import linear_model
    from sklearn.linear_model import LogisticRegression
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.linear_model import Perceptron
    from sklearn.linear_model import SGDClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.svm import SVC
    from sklearn.naive_bayes import GaussianNB

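    # fit a logistic regression on the engineered features and record the
    # training-set accuracy as a percentage (rounded to two decimals)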
    logreg = LogisticRegression(solver='lbfgs', max_iter=110)
    logreg.fit(train_df, train_labels)
    acc_log = round(logreg.score(train_df, train_labels) * 100, 2)

    # -----------------------DATA SAVING START---------------------------------
    if "acc_log" in locals():
        _kale_resource_save(acc_log,
                            os.path.join(_kale_data_directory, "acc_log"))
    else:
        print("_kale_resource_save: `acc_log` not found.")