def group_by_columns(data, group_by_columns, column_to_agg):
    """Group rows by the given columns, ready for aggregating one column.

    Only the grouping columns and ``column_to_agg`` are kept. Nulls in
    ``column_to_agg`` are replaced with 0 before grouping. The caller's
    DataFrame is left untouched.

    :param data: pandas DataFrame containing all the referenced columns
    :param group_by_columns: list of column names to group by
    :param column_to_agg: name of the column to be aggregated afterwards
    :return: the pandas groupby object (not yet aggregated)
    """
    all_columns = group_by_columns + [column_to_agg]
    # Select first, then fill: fillna returns a new frame, so the null
    # replacement never leaks back into the DataFrame the caller passed in.
    subset = data.loc[:, all_columns].fillna({column_to_agg: 0})
    return subset.groupby(group_by_columns)
def fit_polynomial(data, title, file, alpha=0, random_state=None):
    """
    Fits data to linear models of polynomial degrees 1-5 to compare

    For each degree a PolynomialFeatures + Ridge pipeline is fitted on a
    random train split of 'Trial' against 'Water Maze CIPL', scored by mean
    squared error on the held-out split, and drawn on one shared plot along
    with the per-trial mean. The figure is saved under
    'Results/Regression/Learning/' and then shown.

    :param data: water maze cipl data; must contain the columns 'Trial' and
        'Water Maze CIPL'
    :param title: title of plot (string)
    :param file: filename, appended to 'Results/Regression/Learning/'
    :param alpha: alpha value passed to Ridge
    :param random_state: forwarded to train_test_split; the default None
        keeps the split random on every call
    :return: predictions of the degree-1 model on 100 evenly spaced points
        in [0, 30], its y intercept, its coefficients, and the list of test
        errors (one per degree, in order 1-5)
    """
    X_train, X_test, y_train, y_test = train_test_split(
        data['Trial'], data['Water Maze CIPL'], random_state=random_state)
    # The estimators are fed a single feature as an (n, 1) column; reshape
    # once instead of on every loop iteration.
    train_features = X_train.values.reshape(-1, 1)
    test_features = X_test.values.reshape(-1, 1)

    colors = ['teal', 'yellowgreen', 'gold', 'purple', 'pink', 'brown']
    lw = 2
    # Shared x grid used for every fitted curve.
    grid = np.linspace(0, 30, 100)
    grid_features = grid.reshape(-1, 1)

    mses = []
    for degree in [1, 2, 3, 4, 5]:
        model = make_pipeline(PolynomialFeatures(degree), Ridge(alpha=alpha))
        model.fit(train_features, y_train)
        y_plot = model.predict(grid_features)
        if degree == 1:
            # Only the plain linear fit is handed back to the caller.
            line = y_plot
            # Last pipeline step is the Ridge estimator; read it through the
            # public ``steps`` list rather than a private attribute.
            ridge = model.steps[-1][1]
            intercept = ridge.intercept_
            coef = ridge.coef_
        mse = mean_squared_error(y_test, model.predict(test_features))
        mses.append(mse)
        # Built-in round() works whether mse is a numpy scalar or a float.
        plt.plot(grid, y_plot, color=colors[degree - 1], linewidth=lw,
                 label="degree {0} test error:{1}".format(
                     degree, round(mse, 2)))

    # Select the column before averaging so unrelated (possibly non-numeric)
    # columns are never aggregated.
    ys = data.groupby('Trial')['Water Maze CIPL'].mean()
    # Plot each mean at its actual trial value instead of assuming exactly
    # 24 trials.
    plt.scatter(ys.index, ys, color='black', edgecolors='black', s=30,
                marker='o', label="Average trial performance")
    plt.legend(loc=2, prop={'size': 8})
    plt.title(title)
    plt.xlim(0, 30)
    plt.ylim(0, 60)
    plt.xlabel('Trial')
    plt.ylabel('CIPL')
    plt.savefig('Results/Regression/Learning/' + file)
    plt.show()
    return line, intercept, coef, mses
def create_counts(data, group_by_columns, column_to_count):
    """Count rows of ``column_to_count`` per combination of grouping columns.

    Nulls in every selected column (grouping keys included) are replaced
    with 1 before grouping, so rows holding nulls still contribute to the
    counts.

    :param data: pandas DataFrame containing all the referenced columns
    :param group_by_columns: list of column names to group by
    :param column_to_count: name of the column whose rows are counted
    :return: DataFrame of counts indexed by the grouping columns
    """
    selected_columns = group_by_columns + [column_to_count]
    return (
        data.loc[:, selected_columns]
        .fillna(value=1)
        .groupby(group_by_columns)
        .count()
    )