def main():
    """Run the simulated-data demo from start to finish.

    Simulates a data set, prints a few exploratory views of the training
    split, fits an ``SBGSurvival`` model on it, and finally prints how the
    predicted ``alpha``/``beta`` compare with the simulated true values on
    the test split. Everything is reported via ``print``; nothing is
    returned.
    """
    # Simulate the data and pull out the pieces used below. The 'params'
    # entry is looked up but not used afterwards.
    simulated = simulate_data(50000)
    train = simulated['train']
    test = simulated['test']
    _params = simulated['params']

    # Quick look at the raw training rows.
    print("This is what the data looks like.")
    print(train.head())
    print()

    # Per-group means of every column (minus 'id') for each grouping column.
    group_reports = (
        ('category',
         "This is how the true parameters differ among categories"),
        ('counts',
         "This is how the true parameters differ among counts"),
    )
    for group_col, heading in group_reports:
        print(heading)
        group_means = train.groupby(group_col).mean()
        print(group_means.drop('id', axis=1))
        print()

    # Pairwise correlations between the numerical feature, the true
    # parameters and the survival columns.
    print('There is some correlation with the numerical variable too.')
    corr_columns = ['numerical', 'alpha_true', 'beta_true', 'age', 'alive']
    print(train[corr_columns].corr())
    print()

    # Modelling: every feature is used. gamma is deliberately tiny so the
    # model is allowed to "overfit" if it needs to -- there is plenty of
    # data. (NOTE(review): gamma is presumably the regularisation strength;
    # confirm against SBGSurvival.)
    model = SBGSurvival(
        age='age',
        alive='alive',
        features=['category', 'counts', 'numerical'],
        gamma=1e-6,
        verbose=True,
    )
    model.fit(train)

    # Report the fitted model.
    print(model.summary())
    print()

    # Put the predicted parameters side by side with the test rows.
    predicted_params = model.predict_params(test)
    pred = pd.concat([test, predicted_params], axis=1)

    alpha_mae = (pred['alpha_true'] - pred['alpha']).abs().mean()
    print("Mean Absolute Error for Alpha: {}".format(alpha_mae))
    beta_mae = (pred['beta_true'] - pred['beta']).abs().mean()
    print("Mean Absolute Error for Beta: {}".format(beta_mae))
    print()

    # Correlation between each true parameter and its prediction.
    print("Predictons better be correlated with true values.")
    for true_and_predicted in (['alpha_true', 'alpha'],
                               ['beta_true', 'beta']):
        print(pred[true_and_predicted].corr())

    # Done.
    print("Not bad.")
def add_random():
    """Fit ``SBGSurvival`` on the raw article data and print the results.

    The model is fitted with the ``'is_high_end'`` and ``'random'`` features
    and ``gamma=1e-1``. The model summary, the first rows of the data joined
    with the predicted parameters, and the per-``is_high_end`` column means
    (without ``'id'``) are printed. Nothing is returned.
    """
    articles = make_raw_article_data()

    # Two features only, with gamma set to 1e-1.
    model = SBGSurvival(
        age='age',
        alive='alive',
        features=['is_high_end', 'random'],
        gamma=1e-1,
        verbose=True,
    )
    model.fit(articles)
    print(model.summary())

    # Predict on the same rows the model was fitted on and attach the
    # predicted parameters column-wise.
    predicted_params = model.predict_params(articles)
    pred = pd.concat([articles, predicted_params], axis=1)
    print(pred.head())

    # Average of every remaining column within each 'is_high_end' group.
    means_by_segment = pred.groupby('is_high_end').mean()
    print(means_by_segment.drop('id', axis=1))