Пример #1
0
 def __init__(self, fs_tuition, sigmasq, opts):
     """ Keep the constructor inputs and build the two regressor lists

     fs_tuition, sigmasq and opts are stored on the instance unchanged.
     fs_rhs is the list returned by lu.student_problem_vars(); lsn_rhs is
     an independent copy of it without 'OverallRank' and 'Tuition'.
     lsn_models holds whatever self.gen_matric_ev() returns, and data
     starts out as None.
     """
     self.fs_tuition, self.sigmasq, self.opts = fs_tuition, sigmasq, opts
     self.fs_rhs = lu.student_problem_vars()
     # Copy before pruning so fs_rhs keeps the complete variable list
     self.lsn_rhs = deepcopy(self.fs_rhs)
     for excluded in ('OverallRank', 'Tuition'):
         self.lsn_rhs.remove(excluded)
     self.lsn_models = self.gen_matric_ev()
     self.data = None
Пример #2
0
 def lsn_long_est(self, data):
     """ Fit the application, admission and matriculation classifiers

     data: DataFrame with the columns from lu.student_problem_vars() plus
     the 'app', 'admit' and 'matric' outcome columns (the original
     docstring names lsnLong.csv as its source).

     Returns a dict with keys 'app', 'admit' and 'matric', each mapped to
     a fitted GradientBoostingClassifier.  The 'admit' model only sees
     rows with app == 1 and the 'matric' model only rows with admit == 1.
     When opts['verbose'] is set, the diagnostics routine is run too.
     """
     rhs = lu.student_problem_vars()

     def new_classifier():
         # Every stage uses the same boosted-stump configuration
         return ensemble.GradientBoostingClassifier(
             n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
         )

     stages = {'app': new_classifier().fit(data[rhs], data['app'])}
     # Each later stage is estimated on the rows that passed the gate
     for outcome, gate in (('admit', 'app'), ('matric', 'admit')):
         stages[outcome] = new_classifier().fit(
             data.loc[data[gate] == 1, rhs],
             data.loc[data[gate] == 1, outcome]
         )
     if self.opts['verbose']:
         self._lsn_long_diagnostics(stages, data, rhs)
     return stages
 def estimator_plot(self, stage, key):
     """ Bar chart of one fitted stage's relative feature importances

     stage: fitted estimator exposing feature_importances_.
     key: text appended to the x-axis label.

     The chart goes into slot self.plotnum of a 2x2 grid on self.fig, and
     self.plotnum is advanced by one afterwards.
     """
     raw = stage.feature_importances_
     # Express every importance as a percentage of the largest one
     relative = 100.0 * (raw / raw.max())
     order = np.argsort(relative)
     bar_centres = np.arange(order.shape[0]) + .5
     axis = self.fig.add_subplot(2, 2, self.plotnum)
     axis.barh(bar_centres, relative[order], align='center')
     axis.set_yticks(bar_centres)
     # Shorter display names for three of the variables
     short_names = {'OverallRank': 'Rank', 'LSDAS_GPA': 'GPA', 'year': 'Year'}
     labels = np.array([short_names.get(name, name)
                        for name in lu.student_problem_vars()])
     axis.set_yticklabels(labels[order].tolist())
     axis.set_xlabel('Relative Importance: {0}'.format(key))
     self.plotnum += 1
Пример #4
0
 def gen_data(self, treat):
     """ Generate random dataset

     For every year of the selected period a random sample of applicants
     is drawn (np.random.choice, i.e. with replacement by default) and
     the sampled applicants' rows are stacked into one DataFrame.

     treat: truthy selects the years 2010-2012, falsy selects 2007-2009.

     Returns a DataFrame with the columns from lu.student_problem_vars()
     plus 'school'.  On the serial path an 'id' column holds the
     applicant's position within that year's sample; the multiprocessing
     path delegates row construction to gen_data_task.
     NOTE(review): both reset_index calls keep the previous index as a
     column, so the result also carries index columns; left unchanged in
     case callers rely on them.

     Raises whatever pd.concat raises when a year's sample is empty.
     """
     years = [2010, 2011, 2012] if treat else [2007, 2008, 2009]
     app_data_treat = self.app_data.loc[
         self.app_data['year'].isin(years)
     ].reset_index(drop=True)
     data = []
     for year in np.unique(app_data_treat['year']):
         app_data_year = app_data_treat.loc[
             app_data_treat['year'] == year
         ].reset_index(drop=True)
         # Scale for memory: sample a fifth of the year's applications.
         # Floor division plus int() keeps the size integral on Python 3,
         # where "/" would hand np.random.choice a float.
         n_students = int(gen_n_apps(year).tolist()[0] // 5)
         students = np.random.choice(app_data_year['user'], size=n_students)
         # Build a new list instead of appending to the helper's result
         keep_vars = list(lu.student_problem_vars()) + ['school']
         if self.opts['multiprocessing']:
             # The manager process is only needed to share results with
             # the workers, so start it here and always shut it down.
             manager = mp.Manager()
             try:
                 out = manager.list()
                 mp_args = ((app_data_year, students, keep_vars, out, i)
                            for i in range(len(students)))
                 pool = mp.Pool(processes=lc.N_THREADS)
                 try:
                     pool.map(gen_data_task, mp_args)
                 finally:
                     pool.close()
                     pool.join()
                 # Copy the frames out before the manager goes away
                 frames = list(out)
             finally:
                 manager.shutdown()
         else:
             frames = []
             for i, student in enumerate(students):
                 # .copy() so adding 'id' never writes into a view of
                 # app_data_year (avoids SettingWithCopyWarning)
                 out_data = app_data_year.loc[
                     app_data_year['user'] == student, keep_vars
                 ].copy()
                 out_data['id'] = i
                 frames.append(out_data)
         data_year = pd.concat(frames)
         data_year.reset_index(inplace=True)
         data.append(data_year)
     data = pd.concat(data)
     data.reset_index(inplace=True)
     return data
Пример #5
0
 def __init__(self, app_data, firststage, opts):
     """ Store the constructor inputs and the student variable list

     app_data, firststage and opts are kept on the instance unchanged.
     NOTE(review): this is the last definition in the visible excerpt, so
     further initialisation may follow beyond what is shown here.
     """
     self.app_data = app_data
     # No dataset yet; starts out as None
     self.data = None
     self.firststage = firststage
     self.opts = opts
     # Variable names supplied by lu.student_problem_vars()
     self.rhs = lu.student_problem_vars()