def prepare_water_treatment_data(data):
    """
    Shuffle, split and standardize the water treatment data, and fit an
    imputer on the standardized training features.

    :param data: Object exposing ``normal_data`` and ``anomalous_data``
    :return: (fitted imputer, standardized training features,
        standardized test features, standardized anomalous features)
    """
    # Shuffle the normal samples, then carve out the training / test portions
    shuffled = util.randomize_data(data.normal_data)
    train_part, test_part = util.split_training_data(shuffled)

    # Only the feature half of each split is used below; targets are dropped
    train_features, _ = util.split_features_target(train_part)
    test_features, _ = util.split_features_target(test_part)
    anomalous_features, _ = util.split_features_target(data.anomalous_data)

    # The mean / std come from the training features and are reused for the
    # test and anomalous features
    std_train, mean, std = util.standardize_data(train_features)
    std_test, _, _ = util.standardize_data(test_features, mean, std)
    std_anomalous, _, _ = util.standardize_data(anomalous_features, mean, std)

    # The imputer is only fitted here; the returned features are NOT
    # transformed by it, so the caller receives the imputer to apply itself
    imputer = Imputer(missing_values='NaN', strategy='most_frequent', axis=0)
    imputer.fit(std_train)

    return imputer, std_train, std_test, std_anomalous
def execute(data, training_data_ratio=2.0 / 3.0, k=1):
    """
    Run "Locally-Weighted" Linear Regression (using Closed-Form Linear
    Regression) and measure its error on the held-out samples.

    :param data: Raw Data frame parsed from CSV
    :param training_data_ratio: The percent (0.0 to 1.0) of input data to use
        in training.
    :param k: Smoothing parameter for local weight computation (forwarded to
        compute_theta_query)
    :return: Root mean squared error over the test samples
    """
    # Shuffle, then split into training / test portions
    shuffled = util.randomize_data(data)
    training_data, test_data = util.split_data(shuffled, training_data_ratio)
    training_outputs = util.get_output(training_data)

    # Standardize the features with the training statistics and put the
    # constant "Bias" column in front of both design matrices
    train_inputs, mean, std = util.standardize_data(
        util.get_features(training_data))
    train_inputs.insert(0, "Bias", 1)

    test_inputs, _, _ = util.standardize_data(
        util.get_features(test_data), mean, std)
    test_inputs.insert(0, "Bias", 1)

    squared_errors = []
    for row in range(len(test_inputs)):
        sample = test_inputs.iloc[row]

        # The sample keeps its row label, which is used to look up the
        # matching raw test row; its last entry is the expected output
        expected = test_data.loc[sample.name][-1]

        # Fit a model local to this sample, then evaluate the sample with it
        theta = compute_theta_query(sample, train_inputs, training_outputs, k)
        predicted = np.dot(sample, theta)

        squared_errors.append(util.compute_se(expected, predicted))

    # Root of the mean of the per-sample squared errors
    return math.sqrt(sum(squared_errors) / len(squared_errors))
def execute(dataframe, training_data_ratio=2.0 / 3):
    """
    Run the Multi-class SVM experiment (One vs All, then One vs One).

    :param dataframe: The input dataset containing the classifier as the last
        column
    :param training_data_ratio: The percentage of data to use for training
        (default: 2/3)
    :return: (result of execute_one_vs_many, one-vs-one accuracy as a
        fraction of the test samples)
    """
    # Fixed seed so repeated runs give repeatable results
    random.seed(0)

    shuffled = util.randomize_data(dataframe)
    training_data, test_data = util.split_data(shuffled, training_data_ratio)

    # Standardize with the training statistics. A zero standard deviation
    # leaves NaN entries behind, so those are reset to zero.
    training_features, training_targets = util.split_features_target(
        training_data)
    std_training_features, mean, std = util.standardize_data(training_features)
    std_training_features.fillna(0, inplace=True)

    test_features, test_targets = util.split_features_target(test_data)
    std_test_features, _, _ = util.standardize_data(test_features, mean, std)
    std_test_features.fillna(0, inplace=True)

    target_classes = training_targets.unique()

    # Both evaluation strategies take the same five arguments
    svm_inputs = (std_test_features, std_training_features, target_classes,
                  test_targets, training_targets)

    # One vs All
    one_vs_many_metrics = execute_one_vs_many(*svm_inputs)

    # One vs One: the result is treated as the number of misclassified
    # test samples
    misclassified = execute_one_vs_one(*svm_inputs)
    total = len(test_features)
    one_vs_one_accuracy = (total - misclassified) / float(total)

    return one_vs_many_metrics, one_vs_one_accuracy
def execute(self, dataframe):
    """
    Run the Binary-Artificial Neural Network problem end to end.

    :param dataframe: Input raw data
    :return: (final test error, training errors returned by the network's
        train_binary call)
    """
    print("Randomizing Data")
    shuffled = util.randomize_data(dataframe)

    print("Splitting Test and Training Data")
    training_data, test_data = util.split_data(shuffled,
                                               self._training_data_ratio)

    # The mean / std found here are reused for the test features below
    print("Standardizing Training Data")
    train_inputs, mean, std = util.standardize_data(
        self.__select_features(training_data))

    # Labels are reshaped into a single column before training
    train_labels = self.__select_target_labels(
        training_data).values.reshape(-1, 1)

    print("Training Neural Network")
    training_errors = self._network.train_binary(train_inputs, train_labels,
                                                 self._iterations)

    print("Classifying Testing Data")
    test_labels = self.__select_target_labels(test_data)
    test_inputs, _, _ = util.standardize_data(
        self.__select_features(test_data), mean, std)
    predictions = self._network.evaluate(test_inputs.values)

    print("Computing Metrics")
    self.__update_metrics(test_labels, predictions)
    test_error = self._metrics.calculate_error()

    # Label and value are joined by one space, matching what the two-item
    # print statement emitted
    print(" ".join(("Test Error: ", str(test_error))))

    return test_error, training_errors
def apply_solution(test_input, training_mean, training_std, weights):
    """
    Evaluate a closed form linear regression model on the given inputs.

    The inputs are standardized with the training statistics, a constant
    "Bias" column is placed in front, and the result is multiplied with the
    weights.

    :param test_input: Non-Standardized Dataframe holding only the input
        columns (the expected output column must be excluded)
    :param training_mean: The mean value used in standardizing the training
        set
    :param training_std: The standard deviation value used in standardizing
        the training set
    :param weights: The weights produced by learning
    :return: The dot product of the bias-augmented, standardized inputs with
        the weights
    """
    scaled_inputs, _, _ = util.standardize_data(test_input, training_mean,
                                                training_std)

    # Offset column goes first so it lines up with the first weight
    scaled_inputs.insert(0, "Bias", 1)

    predictions = scaled_inputs.dot(weights)
    return predictions
def execute(data, num_folds=5):
    """
    Cross-validate a closed-form linear regression model over num_folds folds
    and compute the Root Mean Squared Error.

    :param data: Raw Data frame parsed from CSV
    :param num_folds: The number of folds to use (must be greater than one)
    :return: Root Mean Squared Error
    """
    assert data is not None, "data must be a valid DataFrame"
    assert num_folds > 1, "num_folds must be greater than one."

    # Shuffle once, then cut the shuffled data into the folds
    folds = divide_data(util.randomize_data(data), num_folds)

    squared_errors = []
    for fold_index in range(num_folds):
        # Fold fold_index is held out; the other folds form the training set
        held_out = folds[fold_index]
        training_data = select_training_data(folds, fold_index)

        # Standardize on the training data and add the offset column in front
        train_inputs, mean, std = util.standardize_data(
            util.get_features(training_data))
        train_inputs.insert(0, "Bias", 1)

        # Train the closed-form model
        weights = cflr.find_weights(train_inputs,
                                    util.get_output(training_data))

        # Squared error of every sample in the held-out fold
        expected = util.get_output(held_out)
        predicted = cflr.apply_solution(util.get_features(held_out), mean, std,
                                        weights)
        squared_errors.append((expected - predicted) ** 2)

    # RMSE over the errors collected from all folds
    return compute_rmse(len(data), squared_errors)
def __init__(self, df_m15, df_h1, serial=False):
    """
    Initialise balances, bookkeeping, spaces and history buffers.

    :param df_m15: Frame that is standardized with the "log_and_diff" method,
        stripped of NaN rows and re-indexed before being stored
        (presumably 15-minute data, going by the name)
    :param df_h1: Frame stored unchanged (presumably hourly data, going by
        the name)
    :param serial: Flag stored on the instance as ``self.serial``
    """
    standardized = standardize_data(df_m15, method="log_and_diff")
    self.df_m15 = standardized.dropna().reset_index(drop=True)
    self.df_h1 = df_h1

    # Everything starts out held as USD
    self.net_worth = self.prev_net_worth = self.usd_held = INITIAL_BALANCE
    self.eur_held = 0

    self.current_step = 0
    self.reward = 0
    self.serial = serial

    # trade history
    self.trades = []

    # buffer of recent returns
    # NOTE(review): the previous comment said "last 5 trades" but the buffer
    # holds 10 entries - confirm which is intended
    self.returns = np.zeros(10)

    # index of episodes (1 episode equivalent to 1 week of trading)
    self.episode_indices_m15, self.h1_indices = get_episode(self.df_m15,
                                                            self.df_h1)

    self.action_space = spaces.Discrete(6)

    # observation space, includes: OLHC prices (normalized), close price
    # (unnormalized), time in minutes (encoded), day of week (encoded),
    # action history, net worth changes history.
    # Both the minutes and days features are encoded using sin and cos
    # functions to retain circularity.
    self.observation_space = spaces.Box(low=-10,
                                        high=10,
                                        shape=(12, WINDOW_SIZE + 1),
                                        dtype=np.float16)

    self.metrics = Metric(INITIAL_BALANCE)

    # self.active_df is read right below, so this call has to come first
    self.setup_active_df()

    active_len = len(self.active_df)
    padded_len = active_len + WINDOW_SIZE

    # NOTE(review): "usd_held" is sized without the WINDOW_SIZE padding that
    # the other three buffers get - confirm this is intentional
    self.agent_history = {
        "actions": np.zeros(padded_len),
        "net_worth": np.zeros(padded_len),
        "eur_held": np.zeros(padded_len),
        "usd_held": np.full(active_len,
                            self.usd_held / BALANCE_NORM_FACTOR),
    }
def execute(data):
    """
    Fit a closed-form linear regression on 2/3 of the data and evaluate it on
    the remainder.

    :param data: Raw Data frame parsed from CSV
    :return: (weights, root mean squared error on the test samples)
    """
    # Shuffle, then split into training / test portions
    shuffled = util.randomize_data(data)
    training_data, test_data = util.split_data(shuffled, 2.0 / 3.0)

    # The last column holds the outputs
    training_outputs = training_data[training_data.columns[-1]]

    # Standardize the training features and keep the statistics so the test
    # samples can be standardized the same way
    training_inputs, training_mean, training_std = util.standardize_data(
        util.get_features(training_data))

    # Offset column at the front
    training_inputs.insert(0, "Bias", 1)

    # Closed-form solution of linear regression
    weights = find_weights(training_inputs, training_outputs)

    # Apply the solution to the testing samples
    test_input = util.get_features(test_data)
    expected = util.get_output(test_data)
    predicted = apply_solution(test_input, training_mean, training_std,
                               weights)

    return weights, util.compute_rmse(expected, predicted)
def execute(data, training_data_ratio=2.0 / 3):
    """
    Run the Naive Bayes spam classification and tally its performance.

    :param data: Dataframe containing training and test data
    :param training_data_ratio: Ratio handed to util.split_data when
        separating training from test data
    :return: BinaryClassifierMetric built from the true / false positive and
        negative counts on the test data
    """
    spam_class_name = 1
    not_spam_class_name = 0

    print("Randomizing Data")
    shuffled = util.randomize_data(data)

    print("Splitting Data for Test and Training")
    training_data, test_data = util.split_data(shuffled, training_data_ratio)

    print("Standardizing Training Data")
    training_features, training_targets = util.split_features_target(
        training_data)
    std_training_features, mean, std = util.standardize_data(training_features)

    # Group the training targets by their own value: one group per class
    target_groups = training_targets.groupby(training_targets)
    total_training_size = float(len(training_data))

    print("Computing probability of priors")
    priors = {}
    for class_name, members in target_groups:
        priors[class_name] = len(members) / total_training_size

    # One normal model (mean, standard deviation) per feature, per class
    print("Creating normal models for each feature, for each class")
    models = {}
    for class_name, members in target_groups:
        class_rows = std_training_features.loc[members.index]
        models[class_name] = {
            feature_name: {
                "mean": class_rows[feature_name].mean(),
                "standard_deviation": class_rows[feature_name].std(),
            }
            for feature_name in training_features.columns
        }

    print("Evaluating models for each test data point")
    test_features, test_targets = util.split_features_target(test_data)
    std_test_features, _, _ = util.standardize_data(test_features, mean, std)

    true_positives = true_negatives = 0
    false_positives = false_negatives = 0

    for row in range(len(std_test_features)):
        posteriors = compute_posterior(models, priors,
                                       std_test_features.iloc[row])

        # The class with the highest probability wins
        assigned_class = max(posteriors.items(),
                             key=operator.itemgetter(1))[0]
        expected_class = test_targets.iloc[row]

        # Tally the confusion-matrix counters
        if expected_class == spam_class_name:
            if assigned_class == spam_class_name:
                true_positives += 1
            else:
                false_negatives += 1
        elif assigned_class == not_spam_class_name:
            true_negatives += 1
        else:
            false_positives += 1

    return BinaryClassifierMetric(true_positives, false_positives,
                                  true_negatives, false_negatives)
def execute(data,
            learning_rate=0.001,
            training_data_ratio=2.0 / 3,
            max_iterations=1000000):
    """
    Perform Batch Gradient Descent linear regression.

    The loop stops when max_iterations passes have been made, or when the
    training RMSE changes by less than np.spacing(1) between two passes.

    :param data: Raw Data frame parsed from CSV
    :param learning_rate: The rate at which to advance along the gradient
    :param training_data_ratio: The percent of given data to use for training
        (remaining percent is used for testing)
    :param max_iterations: The maximum number of iterations to execute before
        exiting
    :return: (theta, RMSE of the test data under the final theta); theta
        holds one weight per column of the bias-augmented training inputs
    """
    print("Randomizing Data")
    randomized_data = util.randomize_data(data)

    print("Selecting Training Data")
    training_data, test_data = util.split_data(randomized_data,
                                               training_data_ratio)

    # Standardize with the training statistics and put the constant "Bias"
    # column in front of both design matrices
    print("Standardizing Data")
    std_training_data, mean, std = util.standardize_data(
        util.get_features(training_data))
    std_training_data.insert(0, "Bias", 1)

    std_test_data, _, _ = util.standardize_data(util.get_features(test_data),
                                                mean, std)
    std_test_data.insert(0, "Bias", 1)

    iteration = 0
    prior_rmse = 0
    # Doesn't matter what this value is, so long as it doesn't equal prior rmse
    current_rmse = 100
    eps = np.spacing(1)

    # Float so that an integer learning_rate is not truncated by Python 2
    # integer division in the update step
    sample_count = float(len(std_training_data))

    # One randomly initialised weight per design-matrix column (bias
    # included), rather than a fixed count of three
    columns = list(std_training_data.columns)
    theta = np.array([random.uniform(-1, 1) for _ in range(len(columns))])

    # Expected values for the training and test data
    expected = util.get_output(training_data)
    test_data_expected = util.get_output(test_data)

    # RMSE of test and training data over all iterations
    test_rmse_values = []
    training_rmse_values = []

    print("Performing Gradient Descent Linear Regression")
    # Strict "<" so that at most max_iterations passes are executed
    while (iteration < max_iterations
           and abs(current_rmse - prior_rmse) >= eps):
        prior_rmse = current_rmse

        # (a) RMSE of the training data under the current theta
        actual = std_training_data.dot(theta)
        current_rmse = util.compute_rmse(expected, actual)

        # (b) RMSE of the testing data under the same theta. This is only
        # recorded for plotting and never feeds back into the training.
        test_data_actual = std_test_data.dot(theta)
        test_data_rmse = util.compute_rmse(test_data_expected,
                                           test_data_actual)

        # (c) Batch update: the residuals are computed once from the theta
        # used above, so every parameter is updated from the same predictions
        residuals = actual - expected
        for i, column in enumerate(columns):
            cumulative_error = (residuals * std_training_data[column]).sum()
            theta[i] -= learning_rate / sample_count * cumulative_error

        iteration += 1
        test_rmse_values.append(test_data_rmse)
        training_rmse_values.append(current_rmse)

    print("Completed in {0} iterations".format(iteration))

    print("Plotting Errors")
    image_path = plot_rmse_values(test_rmse_values, training_rmse_values,
                                  learning_rate)
    print("Saved Image to '{0}'".format(image_path))

    # RMSE of the testing data under the final theta
    print("Computing RMSE of Test Data")
    test_data_actual = std_test_data.dot(theta)
    test_data_rmse = util.compute_rmse(test_data_expected, test_data_actual)

    return theta, test_data_rmse
def create_cross_validation_data(data,
                                 num_folds=4,
                                 normal_sample_size=None,
                                 anomalous_sample_size=None):
    """
    Build num_folds cross-validation datasets from the normal data, each
    paired with the (standardized, imputed) anomalous features.

    :param data: Object exposing ``normal_data`` and ``anomalous_data``
    :param num_folds: Number of shuffle splits to generate; each split trains
        on a 1 / num_folds share of the normal samples
    :param normal_sample_size: The number of samples to take from the normal
        data (capped at the number available); None uses all of it
    :param anomalous_sample_size: The number of samples to take from the
        anomalous data (capped at the number available); None uses all of it
    :return: List of CrossValidationDataSet, one per split
    """
    # Optionally down-sample; a requested size is capped at what is available
    normal_data = data.normal_data
    if normal_sample_size is not None:
        normal_data = normal_data.sample(
            min(normal_sample_size, len(normal_data)))

    anomalous_data = data.anomalous_data
    if anomalous_sample_size is not None:
        anomalous_data = anomalous_data.sample(
            min(anomalous_sample_size, len(anomalous_data)))

    # Only the feature columns are used from here on
    normal_features, _ = util.split_features_target(
        util.randomize_data(normal_data))
    anomaly_features, _ = util.split_features_target(anomalous_data)

    splitter = ShuffleSplit(n_splits=num_folds,
                            train_size=1 / float(num_folds),
                            test_size=None,
                            random_state=0)

    datasets = []
    for train_index, test_index in splitter.split(normal_features):
        train_part = normal_features.iloc[train_index]
        test_part = normal_features.iloc[test_index]

        imputer = Imputer(missing_values='NaN',
                          strategy='most_frequent',
                          axis=0)

        # Statistics and imputer are both derived from the training part only
        std_train, mean, std = util.standardize_data(train_part)
        imputer.fit(std_train)
        prepared = [imputer.transform(std_train)]

        # Test and anomalous features reuse the training mean / std and the
        # imputer fitted above
        for features in (test_part, anomaly_features):
            std_features, _, _ = util.standardize_data(features, mean, std)
            prepared.append(imputer.transform(std_features))

        datasets.append(CrossValidationDataSet(imputer, *prepared))

    return datasets