def modeled_indices(X, y, df, model_type=None):
    '''
    Fit a price model on (X, y), annotate df with its predictions and
    residuals, and rank the observations with find_indices.

    INPUT
        X: feature matrix handed to the estimator's fit / predict
        y: labels (prices) handed to fit and to find_indices
        df: DataFrame holding a 'px' column. It is modified in place: the
            columns 'predicted_price', 'residual' and 'price_distance_craig'
            are written.
        model_type: optional manual override, one of 'L1', 'SVR_lin', 'RF'.
            When falsy, the estimator comes from search_best_params(X, y, df).
    OUTPUT
        (df, indices, model)
        indices: result of find_indices(y_hat, y) -- per the original notes,
            the residuals sorted with the largest negatives first
        model: the single estimator that was fitted (never the lookup table
            of candidates), so it can be passed straight on to routine().
    RAISES
        KeyError if model_type is truthy but not one of the known keys.
    '''
    # Manual override option
    if model_type:
        candidates = {
            'L1': Lasso(alpha=1, tol=.01, warm_start=False, positive=False),
            # kernel is given by keyword: recent scikit-learn releases reject
            # positional constructor arguments for SVR.
            'SVR_lin': SVR(kernel='linear', C=6.3095734448019298, gamma=0.1,
                           degree=1),
            'RF': RandomForestRegressor(min_samples_split=2, n_estimators=10),
        }
        # Keep only the chosen estimator so both branches return the same
        # kind of object.
        model = candidates[model_type]
    else:
        model = search_best_params(X, y, df)

    y_hat = model.fit(X, y).predict(X)

    # Used later for identifying observations 2 standard deviations out
    df['predicted_price'] = y_hat
    df['residual'] = df['px'] - df['predicted_price']

    # Distance is expressed in standard deviations of the residual rather
    # than as a percentage of the predicted price (the earlier formulation).
    df['price_distance_craig'] = df['residual'] / df['residual'].std()

    return df, find_indices(y_hat, y), model
def routine(X, y, model, df, pxs):
    '''
    Summarise the price spread of the top-ranked recommendations.

    INPUT
        X: feature matrix
        y: labels
        model: estimator exposing fit(X, y) returning an object with predict(X)
        df: original DataFrame (needs 'heading', 'year' and 'px' columns)
        pxs: eBay prices, forwarded to make_pricing_panel
    OUTPUT
        (max_price, average_spread): the maximum and the mean of the 'spread'
        column of the pricing panel built from the top recommendations.
    RAISES
        ValueError ("too few observations") when 15% of the sample is fewer
        than 10 rows.
    '''
    # The number of recommendations is kept proportional to the sample size.
    top_n_recs = int(len(df) * .15)
    if top_n_recs < 10:
        raise ValueError("too few observations")

    # Refit on the full data and rank the rows by find_indices.
    predictions = model.fit(X, y).predict(X)
    top_indices = find_indices(predictions, y)

    recommendations = df.iloc[top_indices][['heading', 'year', 'px']][:top_n_recs]

    # Build the pricing panel once and reuse it for both statistics.
    spread = make_pricing_panel(recommendations, pxs)['spread']
    max_price = spread.max()
    average_spread = spread.mean()
    return max_price, average_spread