def test_weighted_avg(self):
    """Check `ensemble.weighted_avg` against hand-computed means.

    Covers both the default (unweighted) call and a call with an explicit
    ``weights`` argument, using three small prediction vectors.
    """
    y1, y2, y3 = [1, 2, 3], [4, 5, 6], [7, 8, 9]
    df = pd.DataFrame({'y1': y1, 'y2': y2, 'y3': y3})

    # Unweighted: must agree with the row-wise mean computed by pandas.
    # assert_allclose tolerates floating-point rounding and reports the
    # mismatching values on failure, unlike assertTrue(np.array_equal(...)).
    unweighted = ensemble.weighted_avg([y1, y2, y3])
    expected_unweighted = df.mean(axis=1).values
    # assert_allclose broadcasts, so pin the shape explicitly as well.
    self.assertEqual(np.shape(unweighted), np.shape(expected_unweighted))
    np.testing.assert_allclose(unweighted, expected_unweighted)

    # Weighted: weights [1, 1, 2] give (y1 + y2 + 2 * y3) / 4 element-wise.
    # Float literals keep the expected values exact even under Python 2
    # integer division.
    weighted = ensemble.weighted_avg([y1, y2, y3], weights=[1, 1, 2])
    expected_weighted = np.array([19.0 / 4, 23.0 / 4, 27.0 / 4])
    self.assertEqual(np.shape(weighted), np.shape(expected_weighted))
    np.testing.assert_allclose(weighted, expected_weighted)
def weighted_avg_from_files(
        fnames, outfile, weights=None,
        sample_submission_file=DEFAULT_SAMPLE_SUBMISSION_FILE,
        sample_submission_idx=DEFAULT_SAMPLE_SUBMISSION_IDX):
    """Compute weighted avg from submission files, and save results to a new
    file

    Parameters
    ----------
    fnames : Iterable of str's
        Submission file names of y_hats to be averaged
    outfile : str
        Output file name, including path
    weights : Iterable, optional
        Weights corresponding to y_hats in the same order. If weights is
        None or empty, an empty list is forwarded to
        `ensemble.weighted_avg`, which is expected to return the unweighted
        mean in that case.
    sample_submission_file : str
        Path to example submission file provided by Kaggle
    sample_submission_idx : str
        Index column name used when reading each file in `fnames`

    Returns
    -------
    y_hat_avg : numpy.ndarray
        Weighted averages, as returned by `ensemble.weighted_avg`
    """
    # Use a None sentinel instead of a mutable `[]` default so the default
    # object is never shared between calls; downstream still receives [].
    if weights is None:
        weights = []

    # `read_csv(..., squeeze=True)` was removed in pandas 2.0; squeezing the
    # columns axis after parsing yields the same Series for one-column files.
    y_hats = [
        pd.read_csv(f, index_col=sample_submission_idx)
        .squeeze("columns")
        .values
        for f in fnames
    ]

    y_hat_avg = ensemble.weighted_avg(y_hats, weights=weights)
    save_submission(y_hat_avg, outfile,
                    sample_submission_file=sample_submission_file)
    return y_hat_avg