async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Evaluate the trained regression line against test data.

    Re-computes the coefficient of determination over the records in
    ``sources`` and stores it back alongside the line parameters.

    Raises
    ------
    ModelNotTrained
        If ``train()`` has not been run (no saved regression line).
    """
    # Load saved regression line
    regression_line = self.storage.get("regression_line", None)
    # Ensure the model has been trained before we try to make a prediction
    if regression_line is None:
        raise ModelNotTrained("Train model before assessing for accuracy")
    # Split regression line tuple into variables, ignore accuracy from
    # training data since we'll be re-calculating it for the test data
    m, b, _accuracy = regression_line
    # X and Y data
    x = []
    y = []
    # Go through all records that have the feature we're testing on and the
    # feature we want to predict.
    async for record in sources.with_features(
        [self.config.feature.name, self.config.predict.name]
    ):
        x.append(record.feature(self.config.feature.name))
        y.append(record.feature(self.config.predict.name))
    # Use self.logger to report how many records are being used for testing
    self.logger.debug("Number of test records: %d", len(x))
    # Predicted values along the fitted line for each test input. Use a
    # distinct loop variable instead of the original's shadowing
    # ``for x in x``, and a distinct name instead of rebinding
    # ``regression_line`` (a tuple above) to a list here.
    predicted = [m * x_val + b for x_val in x]
    accuracy = coeff_of_deter(y, predicted)
    # Update the accuracy to be the accuracy when assessed on the test data
    self.storage["regression_line"] = m, b, accuracy
    return Accuracy(accuracy)
async def train(self, sources: Sources):
    """
    Accumulate feature/target values from every applicable record, then
    recompute the separating line from the accumulated data.
    """
    target = self.config.predict.NAME
    # Columns required of each record: all input features plus the target.
    wanted = self.features + [target]
    async for record in sources.with_features(wanted):
        feature_data = record.features(wanted)
        # Append this record's input value and target value to the
        # running numpy arrays held on the instance.
        self.xData = np.append(self.xData, feature_data[self.features[0]])
        self.yData = np.append(self.yData, feature_data[target])
    self.separating_line = self.best_separating_line()
async def train(self, sources: Sources) -> None:
    """
    Stream each record through the incremental linear-model trainer,
    then finalize and persist the trained model to ``self.path``.
    """
    async for record in sources.with_features(
        self.features + [self.parent.config.predict.name]
    ):
        feature_data = record.features(
            self.features + [self.parent.config.predict.name]
        )
        # One-row frame for this record; trainer consumes rows one at a time.
        df = self.pd.DataFrame(feature_data, index=[0])
        # Use the keyword form: the positional ``axis`` argument of
        # DataFrame.drop was deprecated in pandas 1.0 and removed in 2.0.
        xdata = df.drop(columns=[self.parent.config.predict.name])
        ydata = df[self.parent.config.predict.name]
        self.lm.compute(xdata, ydata)
    self.lm_trained = self.lm.finalize().model
    self.joblib.dump(self.lm_trained, self.path)
async def train(self, sources: Sources):
    """
    Collect every training record into a single DataFrame, fit the model
    and its ensemble, then persist the fitted model to ``self.path``.
    """
    target = self.parent.config.predict.name
    # Materialize all records' feature dicts up front.
    rows = [
        record.features()
        async for record in sources.with_features(self.features + [target])
    ]
    df = pd.DataFrame(rows)
    y_train = df[[target]]
    x_train = df.drop(columns=[target])
    self.model.fit(x_train, y_train)
    self.model.fit_ensemble(
        y_train, ensemble_size=self.parent.config.ensemble_size
    )
    joblib.dump(self.model, self.path)
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the trained model against the records in ``sources``.

    Raises
    ------
    ModelNotTrained
        If no model has been fit yet.
    """
    if not self.model:
        raise ModelNotTrained("Train the model before assessing accuracy")
    target = self.parent.config.predict.name
    # Materialize all test records' feature dicts up front.
    rows = [
        record.features()
        async for record in sources.with_features(self.features + [target])
    ]
    df = pd.DataFrame(rows)
    y_test = df[[target]]
    x_test = df.drop(columns=[target])
    predictions = await self.get_predictions(x_test)
    score = await self.accuracy_score(y_test, predictions)
    return Accuracy(score)
async def train(self, sources: Sources) -> None:
    """
    Stream each record through the incremental linear-model trainer,
    then finalize and persist the trained model to ``self.path``.
    """
    async for record in sources.with_features(
        self.features + [self.parent.config.predict.name]
    ):
        feature_data = record.features(
            self.features + [self.parent.config.predict.name]
        )
        # NOTE Duplicate feature data due to regression in oneDAL
        # See https://github.com/intel/dffml/issues/801
        df = self.pd.DataFrame([feature_data] * 2, index=[0, 1])
        # Use the keyword form: the positional ``axis`` argument of
        # DataFrame.drop was deprecated in pandas 1.0 and removed in 2.0.
        xdata = df.drop(columns=[self.parent.config.predict.name])
        ydata = df[self.parent.config.predict.name]
        self.lm.compute(xdata, ydata)
    self.lm_trained = self.lm.finalize().model
    self.joblib.dump(self.lm_trained, self.path)
async def train(self, sources: Sources) -> None:
    """
    Fit the regression line to all records that carry both the input
    feature and the target feature, storing (m, b, accuracy) in storage.
    """
    feature_name = self.config.feature.name
    target_name = self.config.predict.name
    # Collected input and target values, one entry per usable record.
    xs = []
    ys = []
    # Go through all records that have the feature we're training on and
    # the feature we want to predict.
    async for record in sources.with_features([feature_name, target_name]):
        xs.append(record.feature(feature_name))
        ys.append(record.feature(target_name))
    # Use self.logger to report how many records are being used for training
    self.logger.debug("Number of training records: %d", len(xs))
    # Save m, b, and accuracy
    self.storage["regression_line"] = best_fit_line(xs, ys)
async def train(self, sources: Sources) -> None:
    """
    Fit the regression line on the single supported input feature,
    storing (m, b, accuracy) in storage.
    """
    # This model supports exactly one input feature, so the feature list
    # only has one element at index 0.
    feature_name = self.features[0]
    target_name = self.config.predict.NAME
    xs = []
    ys = []
    # Go through all records that have the feature we're training on and
    # the feature we want to predict.
    async for record in sources.with_features(self.features + [target_name]):
        xs.append(record.feature(feature_name))
        ys.append(record.feature(target_name))
    # Use self.logger to report how many records are being used for training
    self.logger.debug("Number of input records: %d", len(xs))
    # Save m, b, and accuracy
    self.storage["regression_line"] = best_fit_line(xs, ys)
async def accuracy(self, sources: Sources) -> Accuracy:
    """
    Score the trained model on the records in ``sources``.

    Accuracy is the fraction of predictions whose absolute error is
    within the 0.1 margin accepted by ``self.compare``.

    Raises
    ------
    ModelNotTrained
        If ``train()`` has not been run yet.
    """
    if self.lm_trained is None:
        raise ModelNotTrained("Train model before assessing for accuracy.")
    feature_data = []
    async for record in sources.with_features(
        self.features + [self.parent.config.predict.name]
    ):
        feature_data.append(
            record.features(self.features + [self.parent.config.predict.name])
        )
    df = self.pd.DataFrame(feature_data)
    # Use the keyword form: the positional ``axis`` argument of
    # DataFrame.drop was deprecated in pandas 1.0 and removed in 2.0.
    xdata = df.drop(columns=[self.parent.config.predict.name])
    ydata = df[self.parent.config.predict.name]
    preds = self.ac_predictor.compute(xdata, self.lm_trained)
    # Calculate accuracy with an error margin of 0.1
    accuracy_val = sum(
        self.compare(list(map(abs, map(sub, ydata, preds.prediction))), 0.1)
    ) / len(ydata)
    return Accuracy(accuracy_val)