def analyze(eg1, eg2, eg3, analysis_events, es, m1, m2, m3, m1u, m2u, m3u, chunk, seed_events, update_events):
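    """Filter fresh analysis events through the current (and, after the first
    chunk, the updated) models and plot feature-KL and stagewise-logloss
    diagnostics with the DataTomographer.

    Assumed module-level imports, inferred from usage in this file: datetime,
    os, time.time, numpy as np, pandas as pd, matplotlib.pyplot as plt, and
    the project-local EG, MU, ES, TDU, DT, plot_namer, and save_metadata
    helpers.
    """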
    print('--- Data Tomographer ---')
    # create events
    x1a, y1a = eg1.get(analysis_events)
    x2a, y2a = eg2.get(analysis_events)
    x3a, y3a = eg3.get(analysis_events)

    # pass events through models filter
    xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                       models=(m1, m2, m3), event_gains=(1., 1., 1.))
    x1, x2, x3 = xs
    y1, y2, y3 = ys
    print('Current model events at %d:' % chunk)
    print(x1.shape[0], x2.shape[0], x3.shape[0])

    dt = DT([x1a, x2a, x3a], [y1a, y2a, y3a], [x1, x2, x3], [y1, y2, y3], [m1, m2, m3])
    file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)
    dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False, saveas='unbiased_feature_kl_'+file_descriptor)
    dt.plot_stagewise(metric='logloss', verbose=False, saveas='unbiased_stagewise_logloss_'+file_descriptor)

    if chunk > 0:
        assert m1u is not None
        assert m2u is not None
        assert m3u is not None
        # pass events through models filter
        xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                           models=(m1u, m2u, m3u), event_gains=(1., 1., 1.))
        x1u, x2u, x3u = xs
        y1u, y2u, y3u = ys
        print('Updated model events at %d:' % chunk)
        print(x1u.shape[0], x2u.shape[0], x3u.shape[0])

        dt = DT([x1, x2, x3], [y1, y2, y3], [x1u, x2u, x3u], [y1u, y2u, y3u], [m1u, m2u, m3u])
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False, saveas='biased_feature_kl_'+file_descriptor)
        dt.plot_stagewise(metric='logloss', verbose=False, saveas='biased_stagewise_logloss_'+file_descriptor)
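

# For reference, a minimal sketch of the plot_namer helper used below. This
# assumes plot_namer simply joins the output directory onto each bare figure
# name and appends an image extension; the real project helper may differ.
def plot_namer_sketch(dirname, suffix='.png'):
    return lambda fname: os.path.join(dirname, fname + suffix)
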
def main():
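    """Run the three-stream batch-update experiment: select competing events,
    grow the training sets, retrain the models, and record KL and logloss
    trajectories across batch updates."""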
    event_type = 'chisq'  # distribution of the hidden score for each stream
    seed_events = 500  # number of events to use on the first round of training
    update_events = 1500  # number of total events occurring in each round of batch update
    analysis_events = 1000  # number of events to use on each round of analysis
    ps = [0.5, 0.5, 0.5]  # fraction of class 1 examples in each stream
    seeds = [42, 13, 79]  # random seeds for each stream
    gs = [1., 1., 1.]  # gains to use in weighing each stream probability
    num_inputs = 10  # number of inputs in each stream
    classifier_kind = 'gbm'  # classifier to use
    criterion = 'competing_streams'  # type of selection condition
    batch_updates = 12  # number of batch updates to run for the models
    file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)  # will be used for figure names
    datetimestr = datetime.datetime.now().strftime("%Y%B%d-%H%M")
    dirname = event_type + '-' + datetimestr
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    save_metadata(event_type, seed_events, update_events, analysis_events, ps, seeds, num_inputs,
                  classifier_kind, criterion, batch_updates, file_descriptor, dirname)
    pn = plot_namer(dirname=dirname)

    # EventGenerators
    eg1 = EG(seed=seeds[0], num_inputs=num_inputs, kind=event_type, balance=ps[0])
    eg2 = EG(seed=seeds[1], num_inputs=num_inputs, kind=event_type, balance=ps[1])
    eg3 = EG(seed=seeds[2], num_inputs=num_inputs, kind=event_type, balance=ps[2])

    # ModelUpdaters
    mu1 = MU(kind=classifier_kind)
    mu2 = MU(kind=classifier_kind)
    mu3 = MU(kind=classifier_kind)

    # EventSelector
    es = ES(criterion=criterion)
    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    tdua = TDU(num_events=analysis_events)

    x1old, x2old, x3old = None, None, None
    y1old, y2old, y3old = None, None, None
    x1old_an, x2old_an, x3old_an = None, None, None
    y1old_an, y2old_an, y3old_an = None, None, None

    # global behavior: optimal logloss, and KL distributions at each batch update
    ll_cols = ['update_index', 'logloss_S1', 'logloss_S2', 'logloss_S3']
    kl_cols = ['update_index', 'KL_S1', 'KL_S2', 'KL_S3']
    df_lgls = pd.DataFrame(columns=ll_cols)
    df_kl = pd.DataFrame(columns=kl_cols)

    for batch_update in range(batch_updates):
        # create train stream events
        if batch_update == 0:  # on the first iteration use seed events, otherwise use update_events
            events = seed_events
        else:
            events = update_events
        x1r, y1r = eg1.get(events)
        x2r, y2r = eg2.get(events)
        x3r, y3r = eg3.get(events)
        # create analysis stream events
        x1a, y1a = eg1.get(analysis_events)
        x2a, y2a = eg2.get(analysis_events)
        x3a, y3a = eg3.get(analysis_events)

        # pass events through current models filter
        if batch_update == 0:
            xs, ys = es.filter(xs=(x1r, x2r, x3r), ys=(y1r, y2r, y3r),
                               models=(None, None, None), event_gains=gs)
            xsaf, ysaf = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                                   models=(None, None, None), event_gains=gs)
        else:
            xs, ys = es.filter(xs=(x1r, x2r, x3r), ys=(y1r, y2r, y3r),
                               models=(m1, m2, m3), event_gains=gs)
            xsaf, ysaf = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                                   models=(m1, m2, m3), event_gains=gs)
        x1, x2, x3 = xs
        y1, y2, y3 = ys
        x1af, x2af, x3af = xsaf
        y1af, y2af, y3af = ysaf
        print('---- Event Selector ----')
        print('New events at %d:' % batch_update)
        print(x1.shape[0], x2.shape[0], x3.shape[0])

        # update train data
        X1u, Y1u = tdu.update(x1old, y1old, x1, y1)
        X2u, Y2u = tdu.update(x2old, y2old, x2, y2)
        X3u, Y3u = tdu.update(x3old, y3old, x3, y3)
        X1ua, Y1ua = tdua.update(x1old_an, y1old_an, x1af, y1af)
        X2ua, Y2ua = tdua.update(x2old_an, y2old_an, x2af, y2af)
        X3ua, Y3ua = tdua.update(x3old_an, y3old_an, x3af, y3af)

        # update models using new data
        m1 = mu1.train(X1u, Y1u, learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1], n_estimators=[250],
                       subsample=0.5, max_depth=[2, 3], random_state=13, folds=5)
        m2 = mu2.train(X2u, Y2u, learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1], n_estimators=[250],
                       subsample=0.5, max_depth=[2, 3], random_state=13, folds=5)
        m3 = mu3.train(X3u, Y3u, learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1], n_estimators=[250],
                       subsample=0.5, max_depth=[2, 3], random_state=13, folds=5)

        # lookahead: pass events through updated models filter
        xsaf, ysaf = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                               models=(m1, m2, m3), event_gains=gs)
        x1afnew, x2afnew, x3afnew = xsaf
        y1afnew, y2afnew, y3afnew = ysaf

        # look at distribution shifts and algorithm performance
        print('--- Data Tomographer ---')
        print('Old model events at %d:' % batch_update)
        print(x1af.shape[0], x2af.shape[0], x3af.shape[0])
        print('')

        # unbiased data vs old biased data on updated model
        dt = DT(xrefs=[x1af, x2af, x3af], yrefs=[y1af, y2af, y3af],
                xus=[x1a, x2a, x3a], yus=[y1a, y2a, y3a],
                models=[m1, m2, m3])
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False,
                   saveas=pn('unbiased_feature_kl_' + file_descriptor + str(int(time()))))
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas=pn('unbiased_stagewise_logloss_' + file_descriptor + str(int(time()))))
        # question: Does the distribution of data through model converge to some value?
        kls = dt.kuhl_leib(ntiles=10, rule='auto', prior=1e-8, verbose=False)
        mean_kls = [np.mean(kl) for kl in kls]
        df = pd.DataFrame(data=[[batch_update] + mean_kls], columns=kl_cols)
        df_kl = pd.concat([df_kl, df], ignore_index=True)

        # lookahead: old biased data vs new biased data on updated model
        dt = DT(xrefs=[x1af, x2af, x3af], yrefs=[y1af, y2af, y3af],
                xus=[x1afnew, x2afnew, x3afnew], yus=[y1afnew, y2afnew, y3afnew],
                models=[m1, m2, m3])
        dt.plot_hist(ntiles=10, rule='auto', minimal=True, plot_selection=([2], [9]), x_axis=(-3.5, 3.5),
                     saveas=pn('biased_feature_histogram_' + str(int(time()))), color='b', edgecolor='none', alpha=0.5)
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False, saveas=pn('biased_feature_kl_'+file_descriptor))
        dt.plot_stagewise(metric='logloss', verbose=False,
                          saveas=pn('biased_stagewise_logloss_'+file_descriptor + str(int(time()))))
        # question: Does the logloss on future data converge to some value?
        ll_af, ll_afnew = dt.stagewise_metric(metric='logloss', verbose=False)
        df = pd.DataFrame(data=[[batch_update] + [lls[-1] for lls in ll_afnew]], columns=ll_cols)
        df_lgls = pd.concat([df_lgls, df], ignore_index=True)

        # create "old" data for next iteration
        x1old, x2old, x3old = X1u, X2u, X3u
        y1old, y2old, y3old = Y1u, Y2u, Y3u
        x1old_an, x2old_an, x3old_an = X1ua, X2ua, X3ua
        y1old_an, y2old_an, y3old_an = Y1ua, Y2ua, Y3ua

    df_kl[kl_cols[1:]].plot()
    plt.savefig(pn(event_type + '_mean_kl_' + file_descriptor), bbox_inches='tight')
    plt.close()

    df_lgls[ll_cols[1:]].plot()
    plt.savefig(pn(event_type + '_logloss_' + file_descriptor), bbox_inches='tight')
    plt.close()
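

# A minimal sketch of the per-feature statistic that DataTomographer.kuhl_leib
# is assumed to compute above: bin a reference sample and an updated sample on
# the reference's ntile edges, smooth with a small prior, and take the discrete
# KL divergence. This is an illustration only; the real method may pick bin
# edges differently (e.g. via its `rule` argument).
def kl_divergence_sketch(ref, upd, ntiles=10, prior=1e-8):
    edges = np.percentile(ref, np.linspace(0, 100, ntiles + 1))
    p, _ = np.histogram(ref, bins=edges)
    q, _ = np.histogram(upd, bins=edges)
    p = p.astype(float) / p.sum() + prior  # smooth empty bins with the prior
    q = q.astype(float) / q.sum() + prior
    p, q = p / p.sum(), q / q.sum()  # renormalize after smoothing
    return float(np.sum(p * np.log(p / q)))
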
def main():
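    """Smaller, fixed-parameter variant of the experiment: seed three models,
    then run ten update rounds and compare old vs. updated models on fresh
    analysis events."""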
    seed_events = 100
    update_events = 30
    analysis_events = 1000
    p1, p2, p3 = 0.5, 0.5, 0.5

    # EventGenerators
    eg1 = EG(seed=42, num_inputs=10, kind='chisq', balance=p1)
    eg2 = EG(seed=13, num_inputs=10, kind='chisq', balance=p2)
    eg3 = EG(seed=79, num_inputs=10, kind='chisq', balance=p3)

    # ModelUpdaters
    mu1 = MU(kind='gbm')
    mu2 = MU(kind='gbm')
    mu3 = MU(kind='gbm')

    # EventSelector
    es = ES(criterion='competing_streams')
    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    atdu = TDU(num_events=analysis_events)

    # create events
    X1, Y1 = eg1.get(seed_events)
    X2, Y2 = eg2.get(seed_events)
    X3, Y3 = eg3.get(seed_events)

    # train models
    m1 = mu1.train(X1, Y1, learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                   subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)
    m2 = mu2.train(X2, Y2, learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                   subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)
    m3 = mu3.train(X3, Y3, learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                   subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)

    for chunk in range(10):
        # create events
        x1r, y1r = eg1.get(update_events)
        x2r, y2r = eg2.get(update_events)
        x3r, y3r = eg3.get(update_events)

        # pass events through current models filter
        xs, ys = es.filter(xs=(x1r, x2r, x3r), ys=(y1r, y2r, y3r),
                           models=(m1, m2, m3), event_gains=(p1, p2, p3))
        x1, x2, x3 = xs
        y1, y2, y3 = ys
        print('New events at %d:' % chunk)
        print(x1.shape[0], x2.shape[0], x3.shape[0])

        # update train data
        X1u, Y1u = tdu.update(X1, Y1, x1, y1)
        X2u, Y2u = tdu.update(X2, Y2, x2, y2)
        X3u, Y3u = tdu.update(X3, Y3, x3, y3)

        # keep the current models for comparison, then retrain on the new data
        m1o, m2o, m3o = m1, m2, m3

        m1 = mu1.train(X1u, Y1u, learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                       subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)
        m2 = mu2.train(X2u, Y2u, learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                       subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)
        m3 = mu3.train(X3u, Y3u, learning_rate=[0.01, 0.03, 0.1], n_estimators=[50, 100, 150, 200, 300],
                       subsample=0.5, max_depth=[2, 3], random_state=13, folds=3)

        # create "old" data for next iteration
        X1, X2, X3 = X1u, X2u, X3u
        Y1, Y2, Y3 = Y1u, Y2u, Y3u

        # look at distribution shifts and algorithm performance
        print('--- Data Tomographer ---')
        # create events
        x1a, y1a = eg1.get(analysis_events)
        x2a, y2a = eg2.get(analysis_events)
        x3a, y3a = eg3.get(analysis_events)

        # pass events through the previous (pre-update) models filter
        xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                           models=(m1o, m2o, m3o), event_gains=(1., 1., 1.))
        x1o, x2o, x3o = xs
        y1o, y2o, y3o = ys
        print('Old model events at %d:' % chunk)
        print(x1o.shape[0], x2o.shape[0], x3o.shape[0])

        # pass events through updated models filter
        xs, ys = es.filter(xs=(x1a, x2a, x3a), ys=(y1a, y2a, y3a),
                           models=(m1, m2, m3), event_gains=(1., 1., 1.))
        x1, x2, x3 = xs
        y1, y2, y3 = ys
        print('New model events at %d:' % chunk)
        print(x1.shape[0], x2.shape[0], x3.shape[0])

        dt = DT([x1o, x2o, x3o], [y1o, y2o, y3o], [x1, x2, x3], [y1, y2, y3], [m1o, m2o, m3o])
        file_descriptor = 'seed%d_update%d_' % (seed_events, update_events)
        dt.plot_kl(ntiles=10, rule='auto', prior=1e-8, verbose=False, saveas='feature_kl_'+file_descriptor)
        dt.plot_stagewise(metric='logloss', verbose=False, saveas='stagewise_logloss_'+file_descriptor)
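

# A minimal sketch of the 'competing_streams' criterion assumed for
# EventSelector.filter in these experiments: at each event slot every stream
# offers one event, each stream's own model scores it (weighted by that
# stream's gain), and only the winning stream keeps the labeled event.
# Streams without a model yet bid a random score. This is an assumption about
# the selector's behavior, not its actual implementation.
def competing_streams_sketch(xs, ys, models, event_gains, rng=np.random):
    n = min(x.shape[0] for x in xs)
    scores = np.column_stack([
        g * (m.predict_proba(x[:n])[:, 1] if m is not None else rng.rand(n))
        for x, m, g in zip(xs, models, event_gains)
    ])
    winners = scores.argmax(axis=1)  # index of the highest-bidding stream per slot
    kept_xs = [x[:n][winners == i] for i, x in enumerate(xs)]
    kept_ys = [y[:n][winners == i] for i, y in enumerate(ys)]
    return kept_xs, kept_ys
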
def main():
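    """List-based generalization of the batch-update experiment to an
    arbitrary number of event streams (parameters mirror the three-stream
    version above)."""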
    event_types = ["chisq", "chisq", "chisq"]  # distribution of the hidden score for each stream
    seed_events = 500  # number of events to use on the first round of training
    update_events = 1500  # number of total events occurring in each round of batch update
    analysis_events = 1000  # number of events to use on each round of analysis
    ps = [0.4, 0.5, 0.6]  # fraction of class 1 examples in each stream
    seeds = [42, 13, 79]  # random seeds for each stream
    gs = [1.0, 1.0, 1.0]  # gains to use in weighing each stream probability
    num_inputs = 10  # number of inputs in each stream
    classifier_kinds = ["gbm", "gbm", "gbm"]  # classifier to use
    criterion = "competing_streams"  # type of selection condition
    batch_updates = 12  # number of batch updates to run for the models
    file_descriptor = "seed%d_update%d_" % (seed_events, update_events)  # will be used for figure names
    datetimestr = datetime.datetime.now().strftime("%Y%B%d-%H%M")
    dirname = str(len(event_types)) + "_streams-" + datetimestr
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    save_metadata(
        event_types,
        seed_events,
        update_events,
        analysis_events,
        ps,
        seeds,
        num_inputs,
        classifier_kinds,
        criterion,
        batch_updates,
        file_descriptor,
        dirname,
    )
    pn = plot_namer(dirname=dirname)

    # EventGenerators
    egs = []
    for ix, event_type in enumerate(event_types):
        egs.append(EG(seed=seeds[ix], num_inputs=num_inputs, kind=event_type, balance=ps[ix]))

    # ModelUpdaters
    mus = []
    for ix, classifier_kind in enumerate(classifier_kinds):
        mus.append(MU(kind=classifier_kind))

    # EventSelector
    es = ES(criterion=criterion)
    # TrainDataUpdaters
    tdu = TDU(num_events=seed_events)
    tdua = TDU(num_events=analysis_events)

    xolds = [None for e in event_types]
    yolds = [None for e in event_types]
    xold_ans = [None for e in event_types]
    yold_ans = [None for e in event_types]

    # global behavior: optimal logloss, and KL distributions at each batch update
    ll_cols = ["update_index"] + ["logloss_S%d" % ix for ix, e in event_types]
    kl_cols = ["update_index"] + ["KL_S%d" % ix for ix, e in event_types]
    df_lgls = pd.DataFrame(columns=ll_cols)
    df_kl = pd.DataFrame(columns=kl_cols)

    for batch_update in range(batch_updates):
        if batch_update == 0:  # on the first iteration use seed events, otherwise use update_events
            events = seed_events
        else:
            events = update_events
        # create train stream events
        xrs, yrs = [], []
        for eg in egs:
            xi, yi = eg.get(events)
            xrs.append(xi)
            yrs.append(yi)
        # create analysis stream events
        xas, yas = [], []
        for eg in egs:
            xi, yi = eg.get(analysis_events)
            xas.append(xi)
            yas.append(yi)

        # pass events through current models filter
        if batch_update == 0:
            xs, ys = es.filter(xs=xrs, ys=yrs, models=[None for mu in mus], event_gains=gs)
            xsaf, ysaf = es.filter(xs=xas, ys=yas, models=[None for mu in mus], event_gains=gs)
        else:
            xs, ys = es.filter(xs=xrs, ys=yrs, models=ms, event_gains=gs)
            xsaf, ysaf = es.filter(xs=xas, ys=yas, models=ms, event_gains=gs)
        msg = ""
        for xi in xs:
            msg += str(xi.shape[0]) + " "
        print "---- Event Selector ----"
        print "New events at %d:" % batch_update
        print msg

        # update train data
        Xus, Yus = [], []
        for xold, yold, xi, yi in zip(xolds, yolds, xs, ys):
            Xi, Yi = tdu.update(xold, yold, xi, yi)
            Xus.append(Xi)
            Yus.append(Yi)
        Xuas, Yuas = [], []
        for xold, yold, xi, yi in zip(xold_ans, yold_ans, xsaf, ysaf):
            Xi, Yi = tdua.update(xold, yold, xi, yi)
            Xuas.append(Xi)
            Yuas.append(Yi)

        # update models using new data
        ms = []
        for mu, Xi, Yi in zip(mus, Xus, Yus):
            ms.append(
                mu.train(
                    Xi,
                    Yi,
                    learning_rate=[0.005, 0.01, 0.03, 0.06, 0.1],
                    n_estimators=[250],
                    subsample=0.5,
                    max_depth=[2, 3],
                    random_state=13,
                    folds=5,
                )
            )

        # lookahead: pass events through updated models filter
        xsafnew, ysafnew = es.filter(xs=xas, ys=yas, models=ms, event_gains=gs)

        # look at distribution shifts and algorithm performance
        print("--- Data Tomographer ---")
        print("Old model events at %d:" % batch_update)
        print(" ".join(str(xi.shape[0]) for xi in xsaf))
        print("")

        # unbiased data vs old biased data on updated model
        dt = DT(xrefs=xsaf, yrefs=ysaf, xus=xas, yus=yas, models=ms)
        dt.plot_kl(
            ntiles=10,
            rule="auto",
            prior=1e-8,
            verbose=False,
            saveas=pn("unbiased_feature_kl_" + file_descriptor + str(int(time()))),
        )
        dt.plot_stagewise(
            metric="logloss",
            verbose=False,
            saveas=pn("unbiased_stagewise_logloss_" + file_descriptor + str(int(time()))),
        )
        # question: Does the distribution of data through model converge to some value?
        kls = dt.kuhl_leib(ntiles=10, rule="auto", prior=1e-8, verbose=False)
        mean_kls = [np.mean(kl) for kl in kls]
        df = pd.DataFrame(data=[[batch_update] + mean_kls], columns=kl_cols)
        df_kl = pd.concat([df_kl, df], ignore_index=True)

        # lookahead: old biased data vs new biased data on updated model
        dt = DT(xrefs=xsaf, yrefs=ysaf, xus=xsafnew, yus=ysafnew, models=ms)
        dt.plot_hist(
            ntiles=10,
            rule="auto",
            minimal=True,
            plot_selection=([2], [9]),
            x_axis=(-3.5, 3.5),
            saveas=pn("biased_feature_histogram_" + str(int(time()))),
            color="b",
            edgecolor="none",
            alpha=0.5,
        )
        dt.plot_kl(ntiles=10, rule="auto", prior=1e-8, verbose=False, saveas=pn("biased_feature_kl_" + file_descriptor))
        dt.plot_stagewise(
            metric="logloss", verbose=False, saveas=pn("biased_stagewise_logloss_" + file_descriptor + str(int(time())))
        )
        # question: Does the logloss on future data converge to some value?
        ll_af, ll_afnew = dt.stagewise_metric(metric="logloss", verbose=False)
        df = pd.DataFrame(data=[[batch_update] + [lls[-1] for lls in ll_afnew]], columns=ll_cols)
        df_lgls = pd.concat([df_lgls, df], ignore_index=True)

        # create "old" data for next iteration
        xolds, yolds = Xus, Yus
        xold_ans, yold_ans = Xuas, Yuas

    df_kl[kl_cols[1:]].plot()
    plt.savefig(pn("-".join(event_types) + "_mean_kl_" + file_descriptor), bbox_inches="tight")
    plt.close()

    df_lgls[ll_cols[1:]].plot()
    plt.savefig(pn("-".join(event_types) + "_logloss_" + file_descriptor), bbox_inches="tight")
    plt.close()
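

# A minimal sketch of what TrainDataUpdater.update is assumed to do in the
# loops above: append newly selected events to the accumulated training set
# and keep only the most recent num_events rows. The real updater may weight
# or subsample instead; this is an illustration of the rolling-window idea.
def train_data_update_sketch(x_old, y_old, x_new, y_new, num_events):
    if x_old is None:  # first round: no accumulated data yet
        X, Y = x_new, y_new
    else:
        X = np.vstack([x_old, x_new])
        Y = np.concatenate([y_old, y_new])
    return X[-num_events:], Y[-num_events:]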