def _unit_test_params(cls):
    """Yield the model/metric parameter sets used by the generic unit tests."""
    # Two hand-picked learning rates.
    yield {
        "models": [
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=1e-2)),
            ),
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=1e-1)),
            ),
        ],
        "metric": metrics.MAE(),
    }
    # A sweep over four learning rates, built with a comprehension.
    yield {
        "models": [
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)),
            )
            for lr in (1e-4, 1e-3, 1e-2, 1e-1)
        ],
        "metric": metrics.MAE(),
    }
class RiverML:
    """Bundles the fraud-detection model together with its scoring metric."""

    # Scale the features, then classify with logistic regression.
    model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(),
    )

    # ROCAUC is updated incrementally while the model trains.
    metric = metrics.ROCAUC()
def __init__(self):
    """Build a scaler + logistic-regression pipeline driven by SGD(0.1)."""
    sgd = optim.SGD(0.1)
    self.model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(sgd),
    )
    self.metric = metrics.Accuracy()
    self.count = 0
def __init__(self, my_id=1, bootstrap_servers='', list_of_partitions=None,
             request_topic='', inference_topic='', group_id='my_grp'):
    """Set up the anomaly-detection model and the Kafka producer/consumer.

    :param my_id: numeric identifier of this worker
    :param bootstrap_servers: Kafka bootstrap server string
    :param list_of_partitions: partition numbers of the request topic to
        consume from (``None`` is treated as "no partitions")
    :param request_topic: topic from which requests are read
    :param inference_topic: topic to which inference results are written
    :param group_id: Kafka consumer group id
    """
    # MinMax scaling feeds Half-Space Trees, a streaming anomaly detector.
    self.model = compose.Pipeline(
        preprocessing.MinMaxScaler(),
        anomaly.HalfSpaceTrees(seed=42),
    )
    self.metric = metrics.ROCAUC()
    self.my_id = my_id
    self.t = request_topic
    self.result_t = inference_topic
    self.my_grp_id = group_id
    self.result_t_p = 8
    self.bootstrap_servers = bootstrap_servers

    # BUG FIX: the original signature used a mutable default argument
    # (list_of_partitions=[]), which is shared across all calls; normalise
    # a None default here instead.
    if list_of_partitions is None:
        list_of_partitions = []
    self.tls = [TopicPartition(self.t, p) for p in list_of_partitions]
    print(self.tls)

    producer_conf = {
        'bootstrap.servers': bootstrap_servers,
        'sasl.mechanism': 'PLAIN',
        'security.protocol': 'SASL_SSL',
        'ssl.ca.location': '/tmp/cacert.pem',
        'sasl.username': '******',
        'sasl.password': '******',
        'client.id': 'test-sw-1',
    }
    self.producer = Producer(producer_conf)

    consumer_conf = {
        'bootstrap.servers': bootstrap_servers,
        'sasl.mechanism': 'PLAIN',
        'security.protocol': 'SASL_SSL',
        'sasl.username': '******',
        'sasl.password': '******',
        'ssl.ca.location': '/tmp/cacert.pem',
        'group.id': group_id,
        'auto.offset.reset': 'latest',
    }
    # Consume only the explicitly assigned partitions of the request topic.
    self.consumer = Consumer(consumer_conf)
    self.consumer.assign(self.tls)
def _unit_test_params(cls):
    """Return the model/metric configuration used by the generic unit tests."""
    learning_rates = (0.01, 0.1)
    return {
        "models": [
            compose.Pipeline(
                preprocessing.StandardScaler(),
                linear_model.LinearRegression(optimizer=optim.SGD(lr=lr)),
            )
            for lr in learning_rates
        ],
        "metric": metrics.MAE(),
    }
def test_list_of_funcs():
    """A bare list of functions should act as a union of transformers."""

    def f(x):
        return {"f": 1}

    def g(x):
        return {"g": 2}

    def times_2(x):
        return {key: 2 * value for key, value in x.items()}

    expected = {"f": 2, "g": 4}

    pipeline = compose.Pipeline([f, g], times_2)
    assert pipeline.transform_one(None) == expected

    piped = [f, g] | compose.FuncTransformer(times_2)
    assert piped.transform_one(None) == expected
def test_no_learn_unsupervised_one(func):
    """With learn_unsupervised=False the scaler's counts must not move."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )
    scaler = pipeline.steps["scale"]
    for i in range(100):
        x = {"a": i, "b": i}
        before = dict(scaler.counts)
        func(pipeline, x, learn_unsupervised=True)
        after_learn = dict(scaler.counts)
        func(pipeline, x, learn_unsupervised=False)
        after_no_learn = dict(scaler.counts)
        # The first call updates the scaler, the second must be a no-op.
        assert before != after_learn
        assert after_learn == after_no_learn
def test_predict_class_given_unseen_features():
    """Unseen tokens yield equal likelihoods; ties break by insertion order."""
    model = compose.Pipeline(
        ("tokenize", feature_extraction.BagOfWords()),
        ("nb", naive_bayes.MultinomialNB(alpha=1)),
    )
    for sentence, label in [("cloudy cold", 0), ("sunny warm", 1)]:
        model = model.learn_one(sentence, label)

    nb = model["nb"]

    # Model parameters that feed the likelihood computation.
    assert nb.n_terms == 4
    assert nb.class_totals[0] == 2
    assert nb.class_totals[1] == 2

    # Given new, unseen text, predict the label.
    text = "new word"
    tokens = model["tokenize"].transform_one(text)
    cp = nb.p_feature_given_class

    # P(new|0) = (N_new_0 + 1) / (N_0 + N_terms)
    assert cp("new", 0) == (0 + 1) / (2 + 4)

    # class_totals[0] == class_totals[1] and both words are unseen, so all
    # class-conditional probabilities coincide.
    assert cp("new", 0) == cp("word", 0)
    assert cp("new", 0) == cp("new", 1)
    assert cp("new", 0) == cp("word", 1)

    jll = nb.joint_log_likelihood(tokens)
    # Identical likelihoods and priors give identical joint log-likelihoods.
    assert jll[0] == jll[1]
    # P(0|new word) = P(new|0) * P(word|0) * P(0)
    assert jll[0] == math.log(cp("new", 0) * cp("word", 0) * (1 / 2))

    # The tie resolves to 0, the first label added to nb.class_counts.
    assert model.predict_one(text) == 0
def test_no_learn_unsupervised_score_one():
    """score_one with learn_unsupervised=False must leave the scaler untouched."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("anomaly", anomaly.HalfSpaceTrees()),
    )
    scaler = pipeline.steps["scale"]
    for i in range(100):
        x = {"a": i, "b": i}
        before = dict(scaler.counts)
        pipeline.score_one(x, learn_unsupervised=True)
        during = dict(scaler.counts)
        pipeline.score_one(x, learn_unsupervised=False)
        after = dict(scaler.counts)
        # Scoring with learning updates the scaler; without it, nothing moves.
        assert before != during
        assert during == after
def __init__(self, step, name):
    """Create a worker with its own synchronous-SGD model and metric set."""
    self.name = name
    self.optimizer = SynchronousSGD(0.01, name, None)
    self.model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(self.optimizer),
    )
    self.metrics = [
        metrics.Accuracy(),
        metrics.MAE(),
        metrics.RMSE(),
        metrics.Precision(),
        metrics.Recall(),
    ]
    self.count = 0
    # Report every 50 observations unless a step was given.
    self.step = 50 if step is None else int(step)
def test_learn_one_warm_up_mode():
    """learn_one outside warm-up mode must not refit unsupervised steps."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )
    scaler = pipeline["scale"]
    for i in range(100):
        x, y = {"a": i, "b": i}, bool(i % 2)
        before = dict(scaler.counts)
        with utils.warm_up_mode():
            pipeline.learn_one(x, y)
        warmed = dict(scaler.counts)
        pipeline.learn_one(x, y)
        after = dict(scaler.counts)
        # Warm-up updates the scaler; a plain learn_one afterwards does not.
        assert before != warmed
        assert warmed == after
def test_learn_many_warm_up_mode():
    """learn_many outside warm-up mode must not refit unsupervised steps."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )
    scaler = pipeline["scale"]
    total, batch = 100, 5
    for start in range(0, total, batch):
        rows = range(start, min(start + batch, total))
        X = pd.DataFrame([{"a": i, "b": i} for i in rows])
        y = pd.Series([bool(i % 2) for i in rows])
        before = dict(scaler.counts)
        with utils.warm_up_mode():
            pipeline.learn_many(X, y)
        warmed = dict(scaler.counts)
        pipeline.learn_many(X, y)
        after = dict(scaler.counts)
        # Warm-up updates the scaler; a plain learn_many afterwards does not.
        assert before != warmed
        assert warmed == after
"optimizer": [ (optim.SGD, {"lr": [1, 2]}), ( optim.Adam, { "beta_1": [0.1, 0.01, 0.001], "lr": [0.1, 0.01, 0.001, 0.0001], }, ), ] } }, 2 + 3 * 4, ), ( compose.Pipeline(("Scaler", None), linear_model.LinearRegression()), { "Scaler": [ preprocessing.MinMaxScaler(), preprocessing.MaxAbsScaler(), preprocessing.StandardScaler(), ], "LinearRegression": {"optimizer": {"lr": [1e-1, 1e-2, 1e-3]}}, }, 3 * 3, ), ], ) def test_expand_param_grid_count(model, param_grid, count): assert len(utils.expand_param_grid(model, param_grid)) == count
# Classification pipeline whose scaler and classifier are each chosen by the
# AutoML search from the candidate lists below.
AUTOML_CLASSIFICATION_PIPELINE = compose.Pipeline(
    (
        'Scaler',
        PipelineHelperTransformer([
            ('StandardScaler', preprocessing.StandardScaler()),
            ('MinMaxScaler', preprocessing.MinMaxScaler()),
            ('MinAbsScaler', preprocessing.MaxAbsScaler()),
            # Candidates left out for now: RobustScaler,
            # AdaptiveStandardScaler, LDA.
        ]),
    ),
    # A 'FeatureExtractor' stage (PolynomialExtender / RBFSampler) could be
    # inserted here later.
    (
        'Classifier',
        PipelineHelperClassifier([
            ('HT', tree.HoeffdingTreeClassifier()),
            ('LR', linear_model.LogisticRegression()),
            ('GNB', naive_bayes.GaussianNB()),
            ('KNN', neighbors.KNNClassifier()),
            # Candidates left out for now: ExtremelyFastDecisionTree,
            # HoeffdingAdaptiveTree, MultinomialNB, PAClassifier,
            # AdaptiveRandomForest.
        ]),
    ),
)
"Tokyo Tokyo", "Macao Macao new", "new", ] def yield_batch_unseen_data(): yield from [pd.Series(x) for x in yield_unseen_data()] @pytest.mark.parametrize( "inc_model, batch_model, bag, sk_model", [ pytest.param( compose.Pipeline( ("tokenize", feature_extraction.BagOfWords(lowercase=False)), ("model", model(alpha=alpha)), ), compose.Pipeline( ("tokenize", feature_extraction.BagOfWords(lowercase=False)), ("model", model(alpha=alpha)), ), feature_extraction.BagOfWords(lowercase=False), sk_model(alpha=alpha), id=f"{model.__name__} - {alpha}", ) for model, sk_model in [ (naive_bayes.MultinomialNB, sk_naive_bayes.MultinomialNB), (naive_bayes.BernoulliNB, sk_naive_bayes.BernoulliNB), (naive_bayes.ComplementNB, sk_naive_bayes.ComplementNB), ] for alpha in [alpha for alpha in range(1, 4)] ], )
from river import tree
from river import compose
from river import preprocessing
from river import linear_model
from tqdm import tqdm

# Instantiated but not wired into the pipeline below.
scaler = preprocessing.StandardScaler()
log_reg = linear_model.LinearRegression()

hf_tree = tree.HoeffdingTreeClassifier(
    grace_period=100,
    split_confidence=1e-5,
)

# The pipeline currently consists of the Hoeffding tree alone.
model = compose.Pipeline()
model |= hf_tree

# Online training: feed the rows one at a time.
for index, raw in tqdm(train_features.iterrows(), total=train_features.shape[0]):
    model.learn_one(raw, train_target[index])

# Evaluation on the held-out rows.
correct_cnt = 0
for index, raw in tqdm(test_features.iterrows(), total=test_features.shape[0]):
    test_pred = model.predict_one(raw)
    correct_cnt += 1 if test_pred == test_target[index] else 0

print("test accuracy: {} %".format(correct_cnt / test_features.shape[0]))
'beta_1': [.1, .01, .001], 'lr': [.1, .01, .001, .0001] })] }, 2 + 3 * 4), (preprocessing.StandardScaler() | linear_model.LinearRegression(), { 'LinearRegression': { 'optimizer': [(optim.SGD, { 'lr': [1, 2] }), (optim.Adam, { 'beta_1': [.1, .01, .001], 'lr': [.1, .01, .001, .0001] })] } }, 2 + 3 * 4), (compose.Pipeline(('Scaler', None), linear_model.LinearRegression()), { 'Scaler': [ preprocessing.MinMaxScaler(), preprocessing.MaxAbsScaler(), preprocessing.StandardScaler() ], 'LinearRegression': { 'optimizer': { 'lr': [1e-1, 1e-2, 1e-3] } } }, 3 * 3) ]) def test_expand_param_grid_count(model, param_grid, count): assert len(utils.expand_param_grid(model, param_grid)) == count
task=datasets.base.REG, n_features=1, n_samples=1440) def __iter__(self): return stream.iter_csv(self.path, target='interval_qps', converters={'interval_qps': int}) def get_ordinal_date(x): return {'ordinal_date': int(x['secs_elapsed'])} model = compose.Pipeline( ('ordinal_date', compose.FuncTransformer(get_ordinal_date)), ('scale', preprocessing.MinMaxScaler()), ('lin_reg', linear_model.LinearRegression())) from river import metrics import matplotlib.pyplot as plt # target_data = "../log_traces/Mixgraph/1000_0.0000073_45000/report.csv" target_data = "../log_traces/StorageMaterial.NVMeSSD/12CPU/64MB/report.csv_1180" import os target_data = os.path.abspath(target_data) def evaluate_model(model): metric = metrics.Rolling(metrics.MAE(), 12)
dataset['similarity'] = similarity(dataset['title'], dataset['text']) return dataset train = train_tuple[:] test = test_tuple[:] #Passive Aggressive Classifier PA_model = compose.Pipeline( ('features', compose.TransformerUnion( ('pipe1', compose.Pipeline(('select_numeric_features', compose.Select('length', 'punct%', 'similarity')), ('scale', preprocessing.MinMaxScaler()))), ('pipe2', compose.Pipeline( ('select_text_features', compose.Select('content')), ('tfidf', feature_extraction.TFIDF(on='content')))))), ('modeling', linear_model.PAClassifier())) metric = metrics.ROCAUC() train1 = train[:] PA_score1 = [] y_pred_l1 = [] y_l1 = [] for x, y in train1: x = text_processing(x) y_pred = PA_model.predict_one(x) y_pred_l1.append(y_pred)
tracks = [ ('Random RBF', random_rbf_track), ('AGRAWAL', agrawal_track), ('Anomaly Sine', anomaly_sine_track), ('Concept Drift', concept_drift_track), ('Hyperplane', hyperplane_track), ('Mixed', mixed_track), ('SEA', sea_track), ('Sine', sine_track), ('STAGGER', stagger_track) ] estimator1 = compose.Pipeline( preprocessing.StandardScaler(), #feature_extraction.PolynomialExtender(), tree.HoeffdingTreeClassifier() ) estimator2 = compose.Pipeline( pipelinehelper.PipelineHelperTransformer([ ('scaler', preprocessing.StandardScaler()) ]), #feature_extraction.PolynomialExtender(), ('classifier', tree.HoeffdingTreeClassifier()) ) estimator3 = compose.Pipeline( ('scaler', preprocessing.StandardScaler()), pipelinehelper.PipelineHelperClassifier([ ('classifier', tree.HoeffdingTreeClassifier()) ])
# First 5000 examples become the training split; the rest is held out.
dataset_tuple_a = dataset_tuple[:5000]
train = dataset_tuple_a[:]
dataset_tuple_b = dataset_tuple[5000:]

# Two feature branches: scaled numeric columns plus TF-IDF of the clean title.
Logistic_model = compose.Pipeline(
    (
        'features',
        compose.TransformerUnion(
            (
                'pipe1',
                compose.Pipeline(
                    ('drop_non_features',
                     compose.Discard('body', 'date', 'subject', 'text',
                                     'title', 'title_clean')),
                    ('scale', preprocessing.StandardScaler()),
                ),
            ),
            (
                'pipe2',
                compose.Pipeline(
                    ('drop_non_featuress',
                     compose.Discard('body', 'body_len', 'body_num', 'date',
                                     'punct%', 'subject', 'text', 'title',
                                     'title_len', 'title_num')),
                    ('tfidf', feature_extraction.TFIDF(on='title_clean')),
                ),
            ),
        ),
    ),
    ('modeling', linear_model.LogisticRegression()),
)
from river import compose
from river import preprocessing
from river import linear_model
from river import metrics
from river import datasets
from river import optim

optimizer = optim.SGD(0.1)
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    linear_model.LogisticRegression(optimizer),
)

metric = metrics.ROCAUC()
precision = metrics.Precision()

# Progressive validation: predict first, then learn from the example.
for x, y in datasets.Phishing():
    # Probability estimates feed ROC AUC; a hard class label feeds precision.
    y_proba = model.predict_proba_one(x)
    # BUG FIX: the original passed the dict of class probabilities to
    # Precision.update, which expects a predicted class label.
    y_pred = model.predict_one(x)
    model.learn_one(x, y)
    metric.update(y, y_proba)
    precision.update(y, y_pred)

print(metric)
print(precision)
def get_ordinal_data(x): return {'ordinal_data': x['month'].toordinal()} def get_month(x): return { calendar.month_name[month]: month == x['month'].month for month in range(1, 13) } # To monthly trend by one-hot encoding the month name model = compose.Pipeline( ('features', compose.TransformerUnion( ('ordinal_date', compose.FuncTransformer(get_ordinal_data)), ('month', compose.FuncTransformer(get_month)), )), ('scale', preprocessing.StandardScaler), ('lin_reg', linear_model.LinearRegression(intercept_lr=0, optimizer=optim.SGD(0.05)))) model = time_series.Detrender(regressor=model, window_size=12) dates = [] dates_pred = [] y_trues = [] y_preds = [] images = [] def elevate_model(model):