# NOTE: these scripts assume the project's usual aliases (module paths not
# shown in this excerpt): EventGenerator as EG, ModelUpdater as MU,
# EventSelector as ES, TrainDataUpdater as TDU, DataTomographer as DT, plus
# numpy as np, pandas as pd, matplotlib.pyplot as plt, datetime, os, and
# `from time import time`.


def analyze(eg1, eg2, eg3, analysis_events, es, m1, m2, m3, m1u, m2u, m3u,
            chunk, seed_events, update_events):
    print '--- Data Tomographer ---'

    # create analysis events
    x1a, y1a = eg1.get(analysis_events)
    x2a, y2a = eg2.get(analysis_events)
    x3a, y3a = eg3.get(analysis_events)

    # pass events through the current models' filter
    xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                       models=(m1, m2, m3), event_gains=(1., 1., 1.))
    x1, x2, x3 = xs
    y1, y2, y3 = ys

    print 'Current model events at %d:' % chunk
    print x1.shape[0], x2.shape[0], x3.shape[0]

    # unbiased analysis data vs data selected by the current models
    dt = DT([x1a, x2a, x3a], [y1a, y2a, y3a],
            [x1, x2, x3], [y1, y2, y3],
            [m1, m2, m3])

    file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)
    dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
               saveas='unbiased_feature_kl_' + file_descriptor)
    dt.plot_stagewise(metric='logloss', verbose=False,
                      saveas='unbiased_stagewise_logloss_' + file_descriptor)

    if chunk > 0:
        assert m1u is not None
        assert m2u is not None
        assert m3u is not None

        # pass events through the updated models' filter
        xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                           models=(m1u, m2u, m3u), event_gains=(1., 1., 1.))
        x1u, x2u, x3u = xs
        y1u, y2u, y3u = ys

        print 'Updated model events at %d:' % chunk
        print x1u.shape[0], x2u.shape[0], x3u.shape[0]

        # data selected by the current models vs data selected by the updated
        # models; saved under 'biased_' names so the plots above are not
        # overwritten
        dt = DT([x1, x2, x3], [y1, y2, y3],
                [x1u, x2u, x3u], [y1u, y2u, y3u],
                [m1u, m2u, m3u])

        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas='biased_feature_kl_' + file_descriptor)
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas='biased_stagewise_logloss_' + file_descriptor)
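
# For orientation, a minimal sketch of one plausible 'competing_streams'
# selection rule as used by es.filter above: route each aligned event to the
# stream whose gain-weighted model score is highest, and pass everything
# through when no models exist yet. This is an illustrative assumption, not
# the EventSelector's actual implementation; `predict_proba` assumes
# sklearn-style models.
import numpy as np


def competing_streams_filter_sketch(xs, ys, models, event_gains):
    if all(m is None for m in models):
        return xs, ys  # no models yet: keep every event in its own stream
    # one row of class-1 scores per stream, one column per event
    scores = np.vstack([g * m.predict_proba(x)[:, 1]
                        for g, m, x in zip(event_gains, models, xs)])
    winners = scores.argmax(axis=0)  # index of the winning stream per event
    xs_out = [x[winners == i] for i, x in enumerate(xs)]
    ys_out = [y[winners == i] for i, y in enumerate(ys)]
    return xs_out, ys_out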
def main():
    event_type = 'chisq'      # distribution of the hidden score for each stream
    seed_events = 500         # number of events to use on the first round of training
    update_events = 1500      # number of total events occurring in each round of batch update
    analysis_events = 1000    # number of events to use on each round of analysis
    ps = [0.5, 0.5, 0.5]      # fraction of class-1 examples in each stream
    seeds = [42, 13, 79]      # random seeds for each stream
    gs = [1., 1., 1.]         # gains to use in weighing each stream's probability
    num_inputs = 10           # number of inputs in each stream
    classifier_kind = 'gbm'   # classifier to use
    criterion = 'competing_streams'  # type of selection condition
    batch_updates = 12        # number of batch updates to run for the models
    file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)  # used for figure names

    datetimestr = datetime.datetime.now().strftime('%Y%B%d-%H%M')
    dirname = event_type + '-' + datetimestr
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    save_metadata(event_type, seed_events, update_events, analysis_events, ps,
                  seeds, num_inputs, classifier_kind, criterion, batch_updates,
                  file_descriptor, dirname)

    pn = plot_namer(dirname=dirname)

    # EventGenerators
    eg1 = EG(seed=seeds[0], num_inputs=num_inputs, kind=event_type, balance=ps[0])
    eg2 = EG(seed=seeds[1], num_inputs=num_inputs, kind=event_type, balance=ps[1])
    eg3 = EG(seed=seeds[2], num_inputs=num_inputs, kind=event_type, balance=ps[2])

    # ModelUpdaters
    mu1 = MU(kind=classifier_kind)
    mu2 = MU(kind=classifier_kind)
    mu3 = MU(kind=classifier_kind)

    # EventSelector
    es = ES(criterion=criterion)

    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    tdua = TDU(num_events=analysis_events)

    x1old, x2old, x3old = None, None, None
    y1old, y2old, y3old = None, None, None
    x1old_an, x2old_an, x3old_an = None, None, None
    y1old_an, y2old_an, y3old_an = None, None, None

    # global behavior: optimal logloss and KL distributions at each batch update
    ll_cols = ['update_index', 'logloss_S1', 'logloss_S2', 'logloss_S3']
    kl_cols = ['update_index', 'KL_S1', 'KL_S2', 'KL_S3']
    df_lgls = pd.DataFrame(columns=ll_cols)
    df_kl = pd.DataFrame(columns=kl_cols)

    for batch_update in range(batch_updates):

        # on the first iteration use seed events, otherwise use update_events
        if batch_update == 0:
            events = seed_events
        else:
            events = update_events

        # create train stream events
        x1r, y1r = eg1.get(events)
        x2r, y2r = eg2.get(events)
        x3r, y3r = eg3.get(events)

        # create analysis stream events
        x1a, y1a = eg1.get(analysis_events)
        x2a, y2a = eg2.get(analysis_events)
        x3a, y3a = eg3.get(analysis_events)

        # pass events through the current models' filter
        if batch_update == 0:
            xs, ys = es.filter(xs=(x1r, x2r, x3r), ys=(y1r, y2r, y3r),
                               models=(None, None, None), event_gains=ps)
            xsaf, ysaf = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                                   models=(None, None, None), event_gains=ps)
        else:
            xs, ys = es.filter(xs=(x1r, x2r, x3r), ys=(y1r, y2r, y3r),
                               models=(m1, m2, m3), event_gains=ps)
            xsaf, ysaf = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                                   models=(m1, m2, m3), event_gains=ps)
        x1, x2, x3 = xs
        y1, y2, y3 = ys
        x1af, x2af, x3af = xsaf
        y1af, y2af, y3af = ysaf

        print '---- Event Selector ----'
        print 'New events at %d:' % batch_update
        print x1.shape[0], x2.shape[0], x3.shape[0]

        # update train data
        X1u, Y1u = tdu.update(x1old, y1old, x1, y1)
        X2u, Y2u = tdu.update(x2old, y2old, x2, y2)
        X3u, Y3u = tdu.update(x3old, y3old, x3, y3)
        X1ua, Y1ua = tdua.update(x1old_an, y1old_an, x1af, y1af)
        X2ua, Y2ua = tdua.update(x2old_an, y2old_an, x2af, y2af)
        X3ua, Y3ua = tdua.update(x3old_an, y3old_an, x3af, y3af)

        # update models using the new data
        m1 = mu1.train(X1u, Y1u, learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1],
                       n_estimators=[250], subsample=0.5, max_depth=[2, 3],
                       random_state=13, folds=5)
        m2 = mu2.train(X2u, Y2u, learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1],
                       n_estimators=[250], subsample=0.5, max_depth=[2, 3],
                       random_state=13, folds=5)
        m3 = mu3.train(X3u, Y3u, learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1],
                       n_estimators=[250], subsample=0.5, max_depth=[2, 3],
                       random_state=13, folds=5)

        # lookahead: pass analysis events through the updated models' filter
        xsaf, ysaf = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                               models=(m1, m2, m3), event_gains=ps)
        x1afnew, x2afnew, x3afnew = xsaf
        y1afnew, y2afnew, y3afnew = ysaf

        # look at distribution shifts and algorithm performance
        print '--- Data Tomographer ---'
        print 'Old model events at %d:' % batch_update
        print x1af.shape[0], x2af.shape[0], x3af.shape[0]
        print ''

        # unbiased data vs old biased data on the updated models
        dt = DT(xrefs=[x1af, x2af, x3af], yrefs=[y1af, y2af, y3af],
                xus=[x1a, x2a, x3a], yus=[y1a, y2a, y3a],
                models=[m1, m2, m3])
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas=pn('unbiased_feature_kl_' + file_descriptor + str(int(time()))))
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas=pn('unbiased_stagewise_logloss_' + file_descriptor + str(int(time()))))

        # question: does the distribution of data through the model converge to some value?
        kls = dt.kuhl_leib(ntiles=10, rule='auto', prior=1e-8, verbose=False)
        mean_kls = [np.mean(kl) for kl in kls]
        df = pd.DataFrame(data=[[batch_update] + mean_kls], columns=kl_cols)
        df_kl = df_kl.append(df, ignore_index=True)

        # lookahead: old biased data vs new biased data on the updated models
        dt = DT(xrefs=[x1af, x2af, x3af], yrefs=[y1af, y2af, y3af],
                xus=[x1afnew, x2afnew, x3afnew], yus=[y1afnew, y2afnew, y3afnew],
                models=[m1, m2, m3])
        dt.plot_hist(ntiles=10, rule='auto', minimal=True, plot_selection=([2], [9]),
                     x_axis=(-3.5, 3.5),
                     saveas=pn('biased_feature_histogram_' + str(int(time()))),
                     color='b', edgecolor='none', alpha=0.5)
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas=pn('biased_feature_kl_' + file_descriptor))
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas=pn('biased_stagewise_logloss_' + file_descriptor + str(int(time()))))

        # question: does the logloss on future data converge to some value?
        ll_af, ll_afnew = dt.stagewise_metric(metric='logloss', verbose=False)
        df = pd.DataFrame(data=[[batch_update] + [lls[-1] for lls in ll_afnew]], columns=ll_cols)
        df_lgls = df_lgls.append(df, ignore_index=True)

        # carry over "old" data for the next iteration
        x1old, x2old, x3old = X1u, X2u, X3u
        y1old, y2old, y3old = Y1u, Y2u, Y3u
        x1old_an, x2old_an, x3old_an = X1ua, X2ua, X3ua
        y1old_an, y2old_an, y3old_an = Y1ua, Y2ua, Y3ua

    plt.figure()
    df_kl[kl_cols[1:]].plot()
    plt.savefig(pn(event_type + 'mean_kl_' + file_descriptor), bbox_inches='tight')
    plt.close()

    plt.figure()
    df_lgls[ll_cols[1:]].plot()
    plt.savefig(pn(event_type + 'logloss_' + file_descriptor), bbox_inches='tight')
    plt.close()
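
# `plot_namer` is only used as pn = plot_namer(dirname=dirname) followed by
# pn('figure_name'), so it presumably maps a figure name to a path inside the
# run directory. A minimal sketch consistent with that usage (the '.png'
# suffix is an assumption):
import os


def plot_namer_sketch(dirname, suffix='.png'):
    """Return a callable that prefixes figure names with `dirname`."""
    return lambda name: os.path.join(dirname, name + suffix)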
def main():
    seed_events = 100
    update_events = 30
    analysis_events = 1000
    p1, p2, p3 = 0.5, 0.5, 0.5

    # EventGenerators
    eg1 = EG(seed=42, num_inputs=10, kind='chisq', balance=p1)
    eg2 = EG(seed=13, num_inputs=10, kind='chisq', balance=p2)
    eg3 = EG(seed=79, num_inputs=10, kind='chisq', balance=p3)

    # ModelUpdaters
    mu1 = MU(kind='gbm')
    mu2 = MU(kind='gbm')
    mu3 = MU(kind='gbm')

    # EventSelector
    es = ES(criterion='competing_streams')

    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    atdu = TDU(num_events=analysis_events)

    # create seed events
    X1, Y1 = eg1.get(seed_events)
    X2, Y2 = eg2.get(seed_events)
    X3, Y3 = eg3.get(seed_events)

    # train the initial models
    m1 = mu1.train(X1, Y1, learning_rate=[0.01, 0.03, 0.1],
                   n_estimators=[50, 100, 150, 200, 300], subsample=0.5,
                   max_depth=[2, 3], random_state=13, folds=3)
    m2 = mu2.train(X2, Y2, learning_rate=[0.01, 0.03, 0.1],
                   n_estimators=[50, 100, 150, 200, 300], subsample=0.5,
                   max_depth=[2, 3], random_state=13, folds=3)
    m3 = mu3.train(X3, Y3, learning_rate=[0.01, 0.03, 0.1],
                   n_estimators=[50, 100, 150, 200, 300], subsample=0.5,
                   max_depth=[2, 3], random_state=13, folds=3)

    for chunk in range(10):

        # create update events
        x1r, y1r = eg1.get(update_events)
        x2r, y2r = eg2.get(update_events)
        x3r, y3r = eg3.get(update_events)

        # pass events through the current models' filter
        xs, ys = es.filter(xs=(x1r, x2r, x3r), ys=(y1r, y2r, y3r),
                           models=(m1, m2, m3), event_gains=(p1, p2, p3))
        x1, x2, x3 = xs
        y1, y2, y3 = ys

        print 'New events at %d:' % chunk
        print x1.shape[0], x2.shape[0], x3.shape[0]

        # update train data
        X1u, Y1u = tdu.update(X1, Y1, x1, y1)
        X2u, Y2u = tdu.update(X2, Y2, x2, y2)
        X3u, Y3u = tdu.update(X3, Y3, x3, y3)

        # update models using the new data, keeping the old models around
        m1o, m2o, m3o = m1, m2, m3
        m1 = mu1.train(X1u, Y1u, learning_rate=[0.01, 0.03, 0.1],
                       n_estimators=[50, 100, 150, 200, 300], subsample=0.5,
                       max_depth=[2, 3], random_state=13, folds=3)
        m2 = mu2.train(X2u, Y2u, learning_rate=[0.01, 0.03, 0.1],
                       n_estimators=[50, 100, 150, 200, 300], subsample=0.5,
                       max_depth=[2, 3], random_state=13, folds=3)
        m3 = mu3.train(X3u, Y3u, learning_rate=[0.01, 0.03, 0.1],
                       n_estimators=[50, 100, 150, 200, 300], subsample=0.5,
                       max_depth=[2, 3], random_state=13, folds=3)

        # carry over "old" data for the next iteration
        X1, X2, X3 = X1u, X2u, X3u
        Y1, Y2, Y3 = Y1u, Y2u, Y3u

        # look at distribution shifts and algorithm performance
        print '--- Data Tomographer ---'

        # create analysis events
        x1a, y1a = eg1.get(analysis_events)
        x2a, y2a = eg2.get(analysis_events)
        x3a, y3a = eg3.get(analysis_events)

        # pass analysis events through the old models' filter
        xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                           models=(m1o, m2o, m3o), event_gains=(1., 1., 1.))
        x1o, x2o, x3o = xs
        y1o, y2o, y3o = ys
        print 'Old model events at %d:' % chunk
        print x1o.shape[0], x2o.shape[0], x3o.shape[0]

        # pass analysis events through the updated models' filter
        xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                           models=(m1, m2, m3), event_gains=(1., 1., 1.))
        x1, x2, x3 = xs
        y1, y2, y3 = ys
        print 'New model events at %d:' % chunk
        print x1.shape[0], x2.shape[0], x3.shape[0]

        dt = DT([x1o, x2o, x3o], [y1o, y2o, y3o],
                [x1, x2, x3], [y1, y2, y3],
                [m1o, m2o, m3o])

        file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas='feature_kl_' + file_descriptor)
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas='stagewise_logloss_' + file_descriptor)
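
# For orientation, a minimal sketch of the sliding-window behaviour that
# tdu.update(X_old, Y_old, x_new, y_new) appears to provide above: append the
# newly selected events and keep only the most recent `num_events` rows. The
# real TrainDataUpdater may sample or weight instead; this is an assumption
# based on how TDU(num_events=...) is constructed.
import numpy as np


def sliding_window_update_sketch(x_old, y_old, x_new, y_new, num_events):
    if x_old is None:  # first round: nothing accumulated yet
        x_all, y_all = x_new, y_new
    else:
        x_all = np.vstack([x_old, x_new])
        y_all = np.concatenate([y_old, y_new])
    return x_all[-num_events:], y_all[-num_events:]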
def main():
    event_types = ["chisq", "chisq", "chisq"]  # distribution of the hidden score for each stream
    seed_events = 500       # number of events to use on the first round of training
    update_events = 1500    # number of total events occurring in each round of batch update
    analysis_events = 1000  # number of events to use on each round of analysis
    ps = [0.4, 0.5, 0.6]    # fraction of class-1 examples in each stream
    seeds = [42, 13, 79]    # random seeds for each stream
    gs = [1.0, 1.0, 1.0]    # gains to use in weighing each stream's probability
    num_inputs = 10         # number of inputs in each stream
    classifier_kinds = ["gbm", "gbm", "gbm"]  # classifier to use in each stream
    criterion = "competing_streams"  # type of selection condition
    batch_updates = 12      # number of batch updates to run for the models
    file_descriptor = "seed%d_update%d_" % (seed_events, update_events)  # used for figure names

    datetimestr = datetime.datetime.now().strftime("%Y%B%d-%H%M")
    dirname = str(len(event_types)) + "_streams-" + datetimestr
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    save_metadata(
        event_types,
        seed_events,
        update_events,
        analysis_events,
        ps,
        seeds,
        num_inputs,
        classifier_kinds,
        criterion,
        batch_updates,
        file_descriptor,
        dirname,
    )

    pn = plot_namer(dirname=dirname)

    # EventGenerators
    egs = [
        EG(seed=seeds[ix], num_inputs=num_inputs, kind=event_type, balance=ps[ix])
        for ix, event_type in enumerate(event_types)
    ]

    # ModelUpdaters
    mus = [MU(kind=classifier_kind) for classifier_kind in classifier_kinds]

    # EventSelector
    es = ES(criterion=criterion)

    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    tdua = TDU(num_events=analysis_events)

    xolds = [None for e in event_types]
    yolds = [None for e in event_types]
    xold_ans = [None for e in event_types]
    yold_ans = [None for e in event_types]

    # global behavior: optimal logloss and KL distributions at each batch update
    ll_cols = ["update_index"] + ["logloss_S%d" % ix for ix, e in enumerate(event_types)]
    kl_cols = ["update_index"] + ["KL_S%d" % ix for ix, e in enumerate(event_types)]
    df_lgls = pd.DataFrame(columns=ll_cols)
    df_kl = pd.DataFrame(columns=kl_cols)

    for batch_update in range(batch_updates):

        # on the first iteration use seed events, otherwise use update_events
        if batch_update == 0:
            events = seed_events
        else:
            events = update_events

        # create train stream events
        xrs, yrs = [], []
        for eg in egs:
            xi, yi = eg.get(events)
            xrs.append(xi)
            yrs.append(yi)

        # create analysis stream events
        xas, yas = [], []
        for eg in egs:
            xi, yi = eg.get(analysis_events)
            xas.append(xi)
            yas.append(yi)

        # pass events through the current models' filter
        if batch_update == 0:
            xs, ys = es.filter(xs=xrs, ys=yrs, models=[None for mu in mus], event_gains=gs)
            xsaf, ysaf = es.filter(xs=xas, ys=yas, models=[None for mu in mus], event_gains=gs)
        else:
            xs, ys = es.filter(xs=xrs, ys=yrs, models=ms, event_gains=gs)
            xsaf, ysaf = es.filter(xs=xas, ys=yas, models=ms, event_gains=gs)

        print "---- Event Selector ----"
        print "New events at %d:" % batch_update
        print " ".join(str(xi.shape[0]) for xi in xs)

        # update train data, one stream at a time
        Xus, Yus, Xuas, Yuas = [], [], [], []
        for ix in range(len(event_types)):
            Xu, Yu = tdu.update(xolds[ix], yolds[ix], xs[ix], ys[ix])
            Xua, Yua = tdua.update(xold_ans[ix], yold_ans[ix], xsaf[ix], ysaf[ix])
            Xus.append(Xu)
            Yus.append(Yu)
            Xuas.append(Xua)
            Yuas.append(Yua)

        # update models using the new data
        ms = []
        for mu, Xu, Yu in zip(mus, Xus, Yus):
            ms.append(
                mu.train(
                    Xu,
                    Yu,
                    learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1],
                    n_estimators=[250],
                    subsample=0.5,
                    max_depth=[2, 3],
                    random_state=13,
                    folds=5,
                )
            )

        # lookahead: pass analysis events through the updated models' filter
        xsafnew, ysafnew = es.filter(xs=xas, ys=yas, models=ms, event_gains=gs)

        # look at distribution shifts and algorithm performance
        print "--- Data Tomographer ---"
        print "Old model events at %d:" % batch_update
        print " ".join(str(xi.shape[0]) for xi in xsaf)
        print ""

        # unbiased data vs old biased data on the updated models
        dt = DT(xrefs=xsaf, yrefs=ysaf, xus=xas, yus=yas, models=ms)
        dt.plot_kl(
            ntiles=10,
            rule="auto",
            prior=1e-8,
            verbose=False,
            saveas=pn("unbiased_feature_kl_" + file_descriptor + str(int(time()))),
        )
        dt.plot_stagewise(
            metric="logloss",
            verbose=False,
            saveas=pn("unbiased_stagewise_logloss_" + file_descriptor + str(int(time()))),
        )

        # question: does the distribution of data through the models converge to some value?
        kls = dt.kuhl_leib(ntiles=10, rule="auto", prior=1e-8, verbose=False)
        mean_kls = [np.mean(kl) for kl in kls]
        df = pd.DataFrame(data=[[batch_update] + mean_kls], columns=kl_cols)
        df_kl = df_kl.append(df, ignore_index=True)

        # lookahead: old biased data vs new biased data on the updated models
        dt = DT(xrefs=xsaf, yrefs=ysaf, xus=xsafnew, yus=ysafnew, models=ms)
        dt.plot_hist(
            ntiles=10,
            rule="auto",
            minimal=True,
            plot_selection=([2], [9]),
            x_axis=(-3.5, 3.5),
            saveas=pn("biased_feature_histogram_" + str(int(time()))),
            color="b",
            edgecolor="none",
            alpha=0.5,
        )
        dt.plot_kl(ntiles=10, rule="auto", prior=1e-8, verbose=False, saveas=pn("biased_feature_kl_" + file_descriptor))
        dt.plot_stagewise(
            metric="logloss", verbose=False, saveas=pn("biased_stagewise_logloss_" + file_descriptor + str(int(time())))
        )

        # question: does the logloss on future data converge to some value?
        ll_af, ll_afnew = dt.stagewise_metric(metric="logloss", verbose=False)
        df = pd.DataFrame(data=[[batch_update] + [lls[-1] for lls in ll_afnew]], columns=ll_cols)
        df_lgls = df_lgls.append(df, ignore_index=True)

        # carry over "old" data for the next iteration
        xolds, yolds = Xus, Yus
        xold_ans, yold_ans = Xuas, Yuas

    plt.figure()
    df_kl[kl_cols[1:]].plot()
    plt.savefig(pn("mean_kl_" + file_descriptor), bbox_inches="tight")
    plt.close()

    plt.figure()
    df_lgls[ll_cols[1:]].plot()
    plt.savefig(pn("logloss_" + file_descriptor), bbox_inches="tight")
    plt.close()
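
# `save_metadata` is called with the full run configuration before training
# starts. A minimal sketch of such a helper, assuming it simply serialises the
# configuration next to the figures (the real helper's format and filename are
# unknown):
import json
import os


def save_metadata_sketch(dirname, **config):
    """Write the run configuration to <dirname>/metadata.json."""
    with open(os.path.join(dirname, "metadata.json"), "w") as fh:
        json.dump(config, fh, indent=2, sort_keys=True)

# e.g. save_metadata_sketch(dirname, event_types=event_types, ps=ps, seeds=seeds)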