def __init__(self, child, groupby, aggregates, window):
    """
    Set up a grouping operator over *child*.

    child      -- upstream QueryOperator supplying tuples.
    groupby    -- descriptor set of the fields to group on.
    aggregates -- aggregate field descriptors to evaluate per group.
    window     -- emission/reset period, in seconds.
    """
    QueryOperator.__init__(self)
    self.child = child
    # can_query_stream_impl() reads self.child, so the cache is filled
    # only after the child reference is in place.
    self.can_query_stream_cache = self.can_query_stream_impl()
    self.groupby = groupby
    self.aggregates = aggregates
    self.window = window
    # The aggregator does the actual bucketing/windowing work.
    self.aggregator = Aggregator(aggregates, groupby, window)
class GroupBy(QueryOperator):
    """
    Buckets results coming from the child operator by a set of grouping
    fields, applies the configured aggregate function(s) to each bucket,
    and emits results / resets the buckets every ``window`` seconds.
    """

    def __init__(self, child, groupby, aggregates, window):
        """
        child      -- upstream QueryOperator supplying tuples.
        groupby    -- descriptor set of the fields to group on.
        aggregates -- aggregate field descriptors to evaluate per group.
        window     -- emission/reset period, in seconds.
        """
        QueryOperator.__init__(self)
        self.child = child
        # can_query_stream_impl() reads self.child, so the cache is
        # filled only after the child reference is in place.
        self.can_query_stream_cache = self.can_query_stream_impl()
        self.groupby = groupby
        self.aggregates = aggregates
        self.window = window
        self.aggregator = Aggregator(self.aggregates, self.groupby, self.window)

    def filter(self, updates, return_passes, return_fails):
        """Feed the child's passing updates to the aggregator and return
        whatever the aggregator emits; fails are never produced."""
        if not return_passes:
            return (None, None)
        passes, _ = self.child.filter(updates, return_passes, return_fails)
        emissions = []
        emissions.extend(self.aggregator.update(passes))
        return (emissions, None)

    def filter_params(self):
        """Delegate filter parameters to the child operator."""
        return self.child.filter_params()

    def can_query_stream(self):
        """Return the value cached at construction time."""
        return self.can_query_stream_cache

    def can_query_stream_impl(self):
        """Stream-queryable exactly when the child is."""
        return self.child.can_query_stream()

    def assign_descriptor(self, tuple_descriptor):
        """Record *tuple_descriptor* here and on the aggregator, then pass
        the child an extended descriptor that also carries the grouping
        fields, the aggregate fields and the tweet-creation field."""
        self.tuple_descriptor = tuple_descriptor
        self.aggregator.tuple_descriptor = tuple_descriptor
        child_descriptor = self.groupby.duplicate()
        for agg in self.aggregates:
            child_descriptor.add_descriptor(agg)
        child_descriptor.add_descriptor(TwitterFields.created_field)
        # Aliases are irrelevant here; only the field descriptors matter.
        for fd in tuple_descriptor.descriptors.values():
            child_descriptor.add_descriptor(fd)
        self.child.assign_descriptor(child_descriptor)
def launch_test(test_params):
    """
    Instantiate the base classifiers and the aggregators, run the
    experiment main loop and print the resulting accuracy of every
    predictor.

    Parameters
    ----------
    test_params : dict
        Experiment description. Required keys: "dataset", "n", "tnorm",
        "tnorm_ada", "clone", "random", "dummy", "mode", "sent",
        "iter_max", "rand_state". Optional keys: "beta", "names",
        "adversary", "noisy".

    Raises
    ------
    ValueError
        If the dataset name or one of the t-norm names is unknown.
    """
    ##########################################
    # PARAMETERS BELOW SHOULD NOT BE CHANGED #
    ##########################################
    rate_resolution = 0.01

    # --- Dataset ----------------------------------------------------------
    if test_params["dataset"] in ['moons', 'circles', 'blobs', 'neoblobs']:
        n = test_params["n"]
        mybeta = test_params.get("beta", 0.5)
        Data = synthDataset(test_params["dataset"], n, beta=mybeta)
        synth_data = True
    else:
        raise ValueError("Unknown dataset name.")
    n_class = Data.n_class
    Ns = Data.Ns

    # Optional adversarial / noisy classifier specs. Each spec is a triple
    # (index of the attacked base clf, count, attack parameter).
    # BUGFIX: the construction loops below previously indexed
    # test_params["adversary"] / ["noisy"] unconditionally and raised
    # KeyError when the keys were absent, although the counts here were
    # guarded; .get with an empty default makes both paths consistent.
    adversary_specs = test_params.get("adversary", [])
    noisy_specs = test_params.get("noisy", [])
    # NOTE(review): n_adver / n_bad are computed but not used below in this
    # chunk — kept for parity with the original; confirm they are unneeded.
    n_adver = sum(spec[1] for spec in adversary_specs)
    n_bad = sum(spec[1] for spec in noisy_specs)

    # Base classifier names; default to one decision tree per source.
    names = test_params.get("names", ["Tree"] * Ns)

    # --- Hyper-parameter grids --------------------------------------------
    if test_params["tnorm"] == 'Aczel-Alsina':
        lambda_range = np.logspace(0, 1, 101)
        lambda_default = 5.0
    elif test_params["tnorm"] == 'convex':
        lambda_range = np.linspace(0.0, 1.0, 101)
        lambda_default = 0.5
    else:
        raise ValueError('Unknown tnorm name.')
    if test_params["tnorm_ada"] == 'convex':
        lambda_range_ada = np.linspace(0.0, 1.0, 101)
        lambda_default_ada = 0.5
    elif test_params["tnorm_ada"] == 'Aczel-Alsina':
        lambda_range_ada = np.logspace(0, 1, 101)
        lambda_default_ada = 1.0
    else:
        raise ValueError('Unknown tnorm name.')
    r_range = np.logspace(-1, 2, 201)
    alpha_range = np.linspace(0, 1, 101)
    regul_range = np.logspace(-2, 2, 101)
    rho_range = np.logspace(-2, 2, 101)

    # --- Base classifiers -------------------------------------------------
    clf = []
    predictors = []
    for i in range(Ns):
        # The name values are mutually exclusive, so elif is equivalent to
        # the original if-chain but stops at the first match.
        name = names[i]
        if name == 'Tree':
            clf.append(DecisionTreeClassifier(max_depth=2))
        elif name == 'Reg Log':
            clf.append(LogisticRegression(penalty='l2', C=1.0,
                                          solver='lbfgs'))
        elif name == 'NBC':
            clf.append(GaussianNB())
        elif name == 'QDA':
            clf.append(QuadraticDiscriminantAnalysis())
        elif name == 'SVM_lin':
            clf.append(SVC(kernel="linear", C=0.025))
        elif name == 'SVM_nlin':
            clf.append(SVC(gamma=2, C=1))
        elif name == 'GP':
            clf.append(GaussianProcessClassifier(1.0 * RBF(1.0)))
        elif name == 'RF':
            clf.append(RandomForestClassifier(max_depth=5, n_estimators=10,
                                              max_features=1))
        elif name == 'MLP':
            clf.append(MLPClassifier(alpha=1))
        elif name == 'Ada':
            clf.append(AdaBoostClassifier())
        elif name == 'kNN':
            clf.append(KNeighborsClassifier(n_neighbors=5))
        predictors.append(Predictor('Base Clf ' + str(i + 1), 'base_clf',
                                    clf[i], rate_resolution, 'k'))

    # --- Adversarial / noisy wrappers around base classifiers -------------
    adv = []
    for i, spec in enumerate(adversary_specs):
        for _ in range(spec[1]):
            adv.append(Adversary(clf[spec[0]], n_class, spec[2]))
            # BUGFIX: attach the adversary just created. The original used
            # adv[i], which re-attached the first wrapper when a spec asks
            # for several copies and raised IndexError when an earlier
            # spec had a count of 0.
            predictors.append(Predictor('Advers. Clf ' + str(i + 1),
                                        'base_clf', adv[-1],
                                        rate_resolution, 'k'))
            Ns += 1
    bad = []
    for i, spec in enumerate(noisy_specs):
        for _ in range(spec[1]):
            bad.append(NoisyClf(clf[spec[0]], n_class, spec[2]))
            # BUGFIX: same wrong-index problem as adv[i] above.
            predictors.append(Predictor('Noisy Clf ' + str(i + 1),
                                        'base_clf', bad[-1],
                                        rate_resolution, 'k'))
            Ns += 1

    # --- Clone / random / constant baselines ------------------------------
    for i in range(test_params["clone"]):
        predictors.append(predictors[0].clone())
        predictors[-1].name = 'Clone ' + str(i + 1)
        clf.append(predictors[-1].machine)
        Ns += 1
    for i in range(test_params["random"]):
        clf.append(DummyClassifier(strategy='uniform'))
        predictors.append(Predictor('Random Clf ' + str(i + 1), 'base_clf',
                                    clf[-1], rate_resolution, 'k'))
        Ns += 1
    for i in range(test_params["dummy"]):
        clf.append(DummyClassifier(strategy='most_frequent'))
        predictors.append(Predictor('Constant Clf ' + str(i + 1), 'base_clf',
                                    clf[-1], rate_resolution, 'k'))
        Ns += 1

    # --- Aggregators ------------------------------------------------------
    selec = Aggregator('selection', Ns, n_class)
    predictors.append(Predictor('Selected Clf.', 'agg', selec,
                                rate_resolution, 'b'))
    wvote = Aggregator('weighted_vote', Ns, n_class,
                       params={"r_range": r_range, "expo": False})
    predictors.append(Predictor('Weighted Vote Ens.', 'agg', wvote,
                                rate_resolution, 'orange'))
    # Exponential variant: relies on the Aggregator's default for "expo".
    expow = Aggregator('weighted_vote', Ns, n_class,
                       params={"r_range": r_range})
    predictors.append(Predictor('Expo. Weighted Vote Ens.', 'agg', expow,
                                rate_resolution, 'brown'))
    naive = Aggregator('naive', Ns, n_class, params={"method": 'indep'})
    predictors.append(Predictor('Naive Bayes.', 'agg', naive,
                                rate_resolution, '--m'))
    spocc = Aggregator('spocc', Ns, n_class,
                       {"tnorm": test_params["tnorm"],
                        "hyper": lambda_default,
                        "hyper_range": lambda_range})
    predictors.append(Predictor('SPOCC (' + test_params["tnorm"] + ')',
                                'agg', spocc, rate_resolution, 'm'))
    adaspocc = Aggregator('adaspocc', Ns, n_class,
                          {"tnorm": test_params["tnorm_ada"],
                           "hyper": lambda_default_ada,
                           "hyper_range": lambda_range_ada,
                           "alpha_range": alpha_range,
                           "rho": 1,
                           "rho_range": rho_range})
    predictors.append(Predictor('adaSPOCC (' + test_params["tnorm_ada"] + ')',
                                'agg', adaspocc, rate_resolution, 'm'))
    stack = Aggregator('stacked_logreg', Ns, n_class,
                       {"regul_range": regul_range})
    predictors.append(Predictor('Stacked Log. Reg.', 'agg', stack,
                                rate_resolution, 'r'))
    clf_ctl = LogisticRegression(penalty='l2', C=1.0)
    predictors.append(Predictor('Centralized Clf.', 'global', clf_ctl,
                                rate_resolution, '--g'))
    if test_params["dataset"] not in ['20newsgroup', 'mnist', 'drive']:
        bayes = Aggregator('bayes', Ns, n_class)
        predictors.append(Predictor('Bayes Agg.', 'agg', bayes,
                                    rate_resolution, ':g'))
    if synth_data:
        optim = Oracle(test_params["dataset"])
        predictors.append(Predictor('Optimal Clf.', 'oracle', optim,
                                    rate_resolution, 'g'))

    # --- Run the experiment and report ------------------------------------
    main_loop_partial = partial(main_loop, n=n, Data=Data,
                                mode=test_params["mode"], Ns=Ns,
                                predictors=predictors,
                                sent=test_params["sent"],
                                iter_max=test_params["iter_max"])
    results = main_loop_partial(test_params["rand_state"])
    accuracy, conf_matrix, failed_loc = results
    for i, predictor in enumerate(predictors):
        print(predictor.name + " has accuracy " + str(accuracy[i]))