def prepare_water_treatment_data(data):
    """
    Build standardized training, test and anomalous feature sets together with
    an imputer fitted on the standardized training features.

    The imputer is only fitted here; the returned feature sets are not
    transformed by it.
    :param data: Object exposing ``normal_data`` and ``anomalous_data``
    :return: (fitted imputer, standardized training features,
              standardized test features, standardized anomalous features)
    """
    # Randomize the normal samples, then carve them into training / test parts
    shuffled = util.randomize_data(data.normal_data)
    train_part, test_part = util.split_training_data(shuffled)

    # Only the feature columns are used below; the targets are discarded
    train_x, _ = util.split_features_target(train_part)
    test_x, _ = util.split_features_target(test_part)
    anomalous_x, _ = util.split_features_target(data.anomalous_data)

    # The mean / std obtained from the training features are reused for the
    # test and anomalous sets
    std_train_x, mean, std = util.standardize_data(train_x)
    std_test_x, _, _ = util.standardize_data(test_x, mean, std)
    std_anomalous_x, _, _ = util.standardize_data(anomalous_x, mean, std)

    # Column-wise imputer that fills missing values with the most frequent one
    imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imputer.fit(std_train_x)

    return imputer, std_train_x, std_test_x, std_anomalous_x
示例#2
0
def execute(data, training_data_ratio=2.0 / 3.0, k=1):
    """
    Execute the "Locally-Weighted" Linear Regression (using Closed-Form Linear Regression)
    :param data: Raw Data frame parsed from CSV
    :param training_data_ratio: The percent (0.0 to 1.0) of input data to use in training.
    :param k: Smoothing parameter for local weight computation
    :return: The root mean squared error (RMSE) over all testing samples
    """
    # 2. Randomize the data
    randomized_data = util.randomize_data(data)

    # 3. Select the first part of the data for training and the remaining for testing
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)
    training_outputs = util.get_output(training_data)

    # 4. Standardize the data(except for the last column of course) using the training data
    standardized_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))

    # Add offset column at the front
    standardized_training_data.insert(0, "Bias", 1)

    # The test features are standardized with the training mean / std
    std_test_data, _, _ = util.standardize_data(util.get_features(test_data),
                                                mean, std)
    std_test_data.insert(0, "Bias", 1)

    squared_errors = []
    # 5. Then for each testing sample
    for i in xrange(0, len(std_test_data)):

        testing_sample = std_test_data.iloc[i]
        # The expected output is the last entry of the matching raw test row.
        # .iloc makes the lookup explicitly positional instead of relying on
        # pandas' deprecated positional fallback of Series[-1].
        expected_output = test_data.loc[testing_sample.name].iloc[-1]

        # (a) Compute the local model for this query point
        theta_query = compute_theta_query(testing_sample,
                                          standardized_training_data,
                                          training_outputs, k)

        # (b) Evaluate the testing sample using the local model.
        actual_output = np.dot(testing_sample, theta_query)

        # (c) Compute the squared error of the testing sample.
        squared_errors.append(util.compute_se(expected_output, actual_output))

    # 6. Compute the root mean squared error (RMSE)
    # float() guards against integer division under Python 2
    mean_squared_error = sum(squared_errors) / float(len(squared_errors))

    return math.sqrt(mean_squared_error)
示例#3
0
def execute(dataframe, training_data_ratio=2.0 / 3):
    """
    Run the multi-class SVM experiment on the given dataset.

    :param dataframe: The input dataset containing the classifier as the last column
    :param training_data_ratio: Fraction of the data to use for training (default: 2/3)
    :return: Tuple of (one-vs-many performance metrics, one-vs-one accuracy)
    """

    # A fixed seed keeps the results repeatable between runs
    random.seed(0)

    # Randomize, then split into a training part and a testing part
    shuffled = util.randomize_data(dataframe)
    train_set, test_set = util.split_data(shuffled, training_data_ratio)

    # Standardize the training features; mean / std are reused for the test set
    train_x, train_y = util.split_features_target(train_set)
    std_train_x, mean, std = util.standardize_data(train_x)

    # A zero standard deviation produces NaN entries, reset those to zero
    std_train_x.fillna(0, inplace=True)

    test_x, test_y = util.split_features_target(test_set)
    std_test_x, _, _ = util.standardize_data(test_x, mean, std)

    # Same NaN clean-up for the test features
    std_test_x.fillna(0, inplace=True)

    class_labels = train_y.unique()

    # One vs All: train and evaluate
    one_vs_many_metrics = execute_one_vs_many(std_test_x, std_train_x,
                                              class_labels, test_y, train_y)

    # One vs One: train and evaluate, yields the number of misclassified samples
    misclassified = execute_one_vs_one(std_test_x, std_train_x, class_labels,
                                       test_y, train_y)

    total_tested = len(test_x)
    correctly_classified = total_tested - misclassified
    one_vs_one_accuracy = correctly_classified / float(total_tested)

    return one_vs_many_metrics, one_vs_one_accuracy
    def execute(self, dataframe):
        """
        Execute the Binary-Artificial Neural Network problem
        :param dataframe: Input raw data
        :return: (final test error, list of training errors for each training iteration)
        """

        # 2. Randomizes the data.
        print "Randomizing Data"
        random_data = util.randomize_data(dataframe)

        # 3. Selects the first 2/3 (round up) of the data for training and the remaining for testing
        print "Splitting Test and Training Data"
        training_data, test_data = util.split_data(random_data,
                                                   self._training_data_ratio)

        # 4. Standardizes the data (except for the last column of course as well as the bias feature)
        #    using the training data
        print "Standardizing Training Data"
        standardized_training_data, mean, std = util.standardize_data(
            self.__select_features(training_data))

        # 5. Trains an artificial neural network using the training data
        #    Our last column is the label column
        # 6. During the training process, compute the training error after each iteration.
        #    You will use this to plot the training error vs. iteration number.
        expected_training_outputs = self.__select_target_labels(
            training_data).values.reshape(-1, 1)
        print "Training Neural Network"
        training_errors = self._network.train_binary(
            standardized_training_data, expected_training_outputs,
            self._iterations)

        # 7. Classifies the testing data using the trained neural network.
        print "Classifying Testing Data"
        expected_test_output = self.__select_target_labels(test_data)
        std_test_data, _, _ = util.standardize_data(
            self.__select_features(test_data), mean, std)

        actual_test_output = self._network.evaluate(std_test_data.values)

        # 8. Compute the testing error.
        print "Computing Metrics"
        self.__update_metrics(expected_test_output, actual_test_output)
        test_error = self._metrics.calculate_error()
        print "Test Error: ", test_error

        return test_error, training_errors
示例#5
0
def apply_solution(test_input, training_mean, training_std, weights):
    """
    Evaluate an already-learned closed form linear regression on new inputs.

    The inputs are standardized with the supplied training statistics, a
    constant "Bias" column of ones is placed in front, and the result is
    multiplied by the weights.
    :param test_input: Non-Standardized Dataframe holding only the input columns (no output column)
    :param training_mean: The mean value used in standardizing the training set
    :param training_std: The standard deviation value used in standardizing the training set
    :param weights: The weights produced by learning
    :return: The dot product of the bias-augmented standardized inputs and the weights
    """
    scaled_inputs, _, _ = util.standardize_data(test_input, training_mean,
                                                training_std)

    # The offset term goes in the first column
    scaled_inputs.insert(0, "Bias", 1)

    predictions = scaled_inputs.dot(weights)
    return predictions
def execute(data, num_folds=5):
    """
    Cross-validate a closed-form linear regression over num_folds folds and
    report the Root Mean Squared Error.

    :param data: Raw Data frame parsed from CSV
    :param num_folds: The number of folds to use, must be greater than one
    :return: Root Mean Squared Error computed from the squared errors of every fold
    """
    assert data is not None, "data must be a valid DataFrame"
    assert num_folds > 1, "num_folds must be greater than one."

    # Randomize the samples, then divide them into the requested number of folds
    folds = divide_data(util.randomize_data(data), num_folds)

    per_fold_squared_errors = []
    for fold_index in xrange(num_folds):
        # Fold fold_index is held out for testing, the others are used for training
        held_out = folds[fold_index]
        training_set = select_training_data(folds, fold_index)

        # Standardization statistics are taken from the training folds only
        train_inputs, mean, std = util.standardize_data(
            util.get_features(training_set))

        # Offset column goes in front
        train_inputs.insert(0, "Bias", 1)

        # Train the closed-form linear regression model
        weights = cflr.find_weights(train_inputs,
                                    util.get_output(training_set))

        # Squared error of every sample in the held-out fold
        expected = util.get_output(held_out)
        predicted = cflr.apply_solution(util.get_features(held_out), mean,
                                        std, weights)
        per_fold_squared_errors.append((expected - predicted)**2)

    # All collected errors are combined into a single RMSE
    return compute_rmse(len(data), per_fold_squared_errors)
示例#7
0
    def __init__(self, df_m15, df_h1, serial=False):
        """
        Set up the account state, episode indices, gym spaces and history buffers.

        :param df_m15: M15 data frame; stored after the "log_and_diff"
                       standardization with NaN rows dropped and the index reset
        :param df_h1: H1 data frame, stored as given
        :param serial: Flag stored on the instance as ``self.serial``
        """
        transformed_m15 = standardize_data(df_m15, method="log_and_diff")
        self.df_m15 = transformed_m15.dropna().reset_index(drop=True)
        self.df_h1 = df_h1

        # Account state: everything starts out as USD
        self.net_worth = INITIAL_BALANCE
        self.prev_net_worth = INITIAL_BALANCE
        self.usd_held = INITIAL_BALANCE
        self.eur_held = 0

        self.current_step = 0
        self.reward = 0
        self.serial = serial

        # trade history
        self.trades = []
        # profit of recent trades
        # NOTE(review): the previous comment said "last 5 trades" but the
        # buffer holds 10 entries - confirm which one is intended
        self.returns = np.zeros(10)

        # index of episodes (1 episode equivalent to 1 week of trading)
        episode_info = get_episode(self.df_m15, self.df_h1)
        self.episode_indices_m15, self.h1_indices = episode_info

        self.action_space = spaces.Discrete(6)
        # observation space, includes: OLHC prices (normalized), close price (unnormalized),
        # time in minutes(encoded), day of week(encoded), action history, net worth changes history
        # both minutes, days feature are encoded using sin and cos function to retain circularity
        self.observation_space = spaces.Box(low=-10,
                                            high=10,
                                            shape=(12, WINDOW_SIZE + 1),
                                            dtype=np.float16)
        self.metrics = Metric(INITIAL_BALANCE)

        # presumably populates self.active_df, which is read right below
        self.setup_active_df()

        padded_length = len(self.active_df) + WINDOW_SIZE
        self.agent_history = {
            "actions": np.zeros(padded_length),
            "net_worth": np.zeros(padded_length),
            "eur_held": np.zeros(padded_length),
            # NOTE(review): unlike the other buffers this one is not padded
            # by WINDOW_SIZE - confirm this is intended
            "usd_held": np.full(len(self.active_df),
                                self.usd_held / BALANCE_NORM_FACTOR)
        }
示例#8
0
def execute(data):
    """
    Fit a closed-form linear regression on 2/3 of the randomized data and
    score it on the remaining samples.

    :param data: Raw Data frame parsed from CSV
    :return: Tuple of (learned weights, RMSE on the testing samples)
    """

    # Randomize, then keep 2/3 for training and the rest for testing
    train_fraction = 2.0 / 3.0
    shuffled = util.randomize_data(data)
    train_set, test_set = util.split_data(shuffled, train_fraction)

    # The expected outputs are the last column of the training data
    output_column = train_set.columns[-1]
    train_targets = train_set[output_column]

    # Standardize the training features; mean / std are reused for the test set
    train_inputs, train_mean, train_std = util.standardize_data(
        util.get_features(train_set))

    # Offset column goes in front
    train_inputs.insert(0, "Bias", 1)

    # Closed-form solution of the linear regression
    weights = find_weights(train_inputs, train_targets)

    # Apply the learned weights to the testing samples
    test_inputs = util.get_features(test_set)
    expected = util.get_output(test_set)
    predicted = apply_solution(test_inputs, train_mean, train_std, weights)

    # Root mean squared error between expected and predicted outputs
    rmse = util.compute_rmse(expected, predicted)

    return weights, rmse
示例#9
0
def execute(data, training_data_ratio=2.0 / 3):
    """
    Execute the Naive Bayes classification
    :param data: Dataframe containing training and test data
    :param training_data_ratio:
    :return:
    """

    spam_class_name = 1
    not_spam_class_name = 0

    # 2. Randomize the data.
    print "Randomizing Data"
    randomized_data = util.randomize_data(data)

    # 3. Split the data in for training and testing
    print "Splitting Data for Test and Training"
    training_data, test_data = util.split_data(randomized_data, training_data_ratio)

    # 4. Standardize Training Data (except for class labels)
    print "Standardizing Training Data"
    training_features, training_data_target = util.split_features_target(training_data)
    std_training_features, mean, std = util.standardize_data(training_features)

    # 5. Divides the training data into two groups: Spam samples, Non-Spam samples.
    target_groups = training_data_target.groupby(training_data_target)

    total_training_size = float(len(training_data))

    print "Computing probability of priors"
    data_class_probability = {class_name: len(target_group) / total_training_size
                              for (class_name, target_group) in target_groups}

    # 6. Creates Normal models for each feature for each class.
    print "Creating normal models for each feature, for each class"
    models = {}
    for class_name, target_group in target_groups:
        models[class_name] = {}
        for feature_name in training_features.columns:
            dataset = std_training_features.loc[target_group.index][feature_name]
            feature_mean = dataset.mean()
            feature_std = dataset.std()
            models[class_name][feature_name] = {"mean":feature_mean, "standard_deviation": feature_std}

    # 7. Classify each testing sample using these models and choosing the class label based
    #    on which class probability is higher.
    print "Evaluating models for each test data point"
    test_features, test_targets = util.split_features_target(test_data)
    std_test_features, _, _ = util.standardize_data(test_features, mean, std)

    true_positives = 0
    true_negatives = 0
    false_positives = 0
    false_negatives = 0
    for i in xrange(len(std_test_features)):
        probability_per_class = compute_posterior(models, data_class_probability, std_test_features.iloc[i])

        # Select the class label of the class with highest probability
        assigned_class = max(probability_per_class.iteritems(), key=operator.itemgetter(1))[0]
        expected_class = test_targets.iloc[i]

        # Tally up each of our counters for performance measurements
        if expected_class == spam_class_name:
            if assigned_class == spam_class_name:
                true_positives += 1
            else: # assigned_class == not_spam_class_name
                false_negatives += 1
        else: # expected_class == not_spam_class_name
            if assigned_class == not_spam_class_name:
                true_negatives += 1
            else: # assigned_class == spam_class_name
                false_positives += 1

    # 8. Computes the statistics using the testing data results
    metrics = BinaryClassifierMetric(true_positives, false_positives, true_negatives, false_negatives)

    return metrics
def execute(data,
            learning_rate=0.001,
            training_data_ratio=2.0 / 3,
            max_iterations=1000000):
    """
    Perform Batch Gradient Descent

    :param data: Raw Data frame parsed from CSV
    :param learning_rate: The rate at which to advance along the gradient
    :param training_data_ratio: The percent of given data to use for training (remaining percent is used for testing)
    :param max_iterations: The maximum number of iterations to execute before exiting
    :return: Nothing
    """

    # 2. Randomizes the data
    print "Randomizing Data"
    randomized_data = util.randomize_data(data)

    # 3. Selects the first 2 / 3 (round up) of the data for training and the remaining for testing
    print "Selecting Training Data"
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)

    # 4. Standardizes the data(except for the last column of course) base on the training data
    print "Standardizing Data"
    std_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))
    std_training_data.insert(0, "Bias", 1)

    std_test_data, _, _ = util.standardize_data(util.get_features(test_data),
                                                mean, std)
    std_test_data.insert(0, "Bias", 1)

    iteration = 0
    prior_rmse = 0
    current_rmse = 100  # Doesn't matter what this value is, so long as it doesn't equal prior rmse
    eps = np.spacing(1)
    N = len(std_training_data)

    # Start with randomized values for theta
    theta = np.array([random.uniform(-1, 1) for _ in xrange(0, 3)])

    # Capture our expected values for the training data
    expected = util.get_output(training_data)
    test_data_expected = util.get_output(test_data)

    # Capture the RMSE for test and training over all iterations
    test_rmse_values = []
    training_rmse_values = []

    print "Performing Gradient Descent Linear Regression"
    # 5. While the termination criteria (mentioned above in the implementation details) hasn't been met
    while iteration <= max_iterations and abs(current_rmse -
                                              prior_rmse) >= eps:
        prior_rmse = current_rmse

        #   (a) Compute the RMSE of the training data
        #       By applying the current theta values to the training set & comparing results
        actual = std_training_data.dot(theta)
        current_rmse = util.compute_rmse(expected, actual)

        #   (b) While we can't let the testing set affect our training process, also compute the RMSE of
        #       the testing error at each iteration of the algorithm (it'll be interesting to see).
        #       Same thing as (a), but use test inputs / outputs
        test_data_actual = std_test_data.dot(theta)
        test_data_rmse = util.compute_rmse(test_data_expected,
                                           test_data_actual)

        #   (c) Update each parameter using batch gradient descent
        #       By use of the learning rate
        for i in xrange(len(theta)):
            # We know the length of theta is the same as the num columns in std_training_data
            errors = (actual - expected
                      ) * std_training_data[std_training_data.columns[i]]
            cumulative_error = errors.sum()
            theta[i] -= learning_rate / N * cumulative_error

        iteration += 1
        test_rmse_values.append(test_data_rmse)
        training_rmse_values.append(current_rmse)

    print "Completed in {0} iterations".format(iteration)

    print "Plotting Errors"
    image_path = plot_rmse_values(test_rmse_values, training_rmse_values,
                                  learning_rate)
    print "Saved Image to '{0}'".format(image_path)

    # 6. Compute the RMSE of the testing data.
    print "Computing RMSE of Test Data"
    test_data_actual = std_test_data.dot(theta)
    test_data_rmse = util.compute_rmse(test_data_expected, test_data_actual)
    return theta, test_data_rmse
def create_cross_validation_data(data,
                                 num_folds=4,
                                 normal_sample_size=None,
                                 anomalous_sample_size=None):
    """
    Build num_folds cross-validation datasets from the normal and anomalous data.

    For every split the training features are standardized, an imputer is
    fitted on them, and the same statistics and imputer are then applied to
    the test and anomalous features.
    :param data: Object exposing ``normal_data`` and ``anomalous_data``
    :param num_folds: Number of splits to generate; each split trains on
                      1 / num_folds of the normal samples
    :param normal_sample_size: The number of samples to take from the normal data
                               (None uses all of it)
    :param anomalous_sample_size: The number of samples to take from the anomalous data
                                  (None uses all of it)
    :return: List with one CrossValidationDataSet per split
    """

    def _take_sample(frame, size):
        # No size requested: use the frame as is. Otherwise the request is
        # capped at the number of available rows before sampling.
        if size is None:
            return frame
        return frame.sample(min(size, len(frame)))

    def _standardize_then_impute(features, mean, std, fitted_imputer):
        # Apply the training statistics first, then fill in missing values
        scaled, _, _ = util.standardize_data(features, mean, std)
        return fitted_imputer.transform(scaled)

    normal_data = _take_sample(data.normal_data, normal_sample_size)
    anomalous_data = _take_sample(data.anomalous_data, anomalous_sample_size)

    # Only the feature columns are needed, the targets are discarded
    normal_features, _ = util.split_features_target(
        util.randomize_data(normal_data))
    anomaly_features, _ = util.split_features_target(anomalous_data)

    # With test_size=None the test part is the complement of the training part
    splitter = ShuffleSplit(n_splits=num_folds,
                            train_size=1 / float(num_folds),
                            test_size=None,
                            random_state=0)

    datasets = []
    for train_rows, test_rows in splitter.split(normal_features):
        fold_train = normal_features.iloc[train_rows]
        fold_test = normal_features.iloc[test_rows]

        # A fresh imputer per split, fitted on that split's training features
        imputer = Imputer(missing_values='NaN',
                          strategy='most_frequent',
                          axis=0)
        std_train, mean, std = util.standardize_data(fold_train)

        imputer.fit(std_train)
        std_train = imputer.transform(std_train)

        std_test = _standardize_then_impute(fold_test, mean, std, imputer)
        std_anomaly = _standardize_then_impute(anomaly_features, mean, std,
                                               imputer)

        datasets.append(
            CrossValidationDataSet(imputer, std_train, std_test, std_anomaly))

    return datasets