def test_standard_scaler_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""
    X = pd.read_csv(datasets.TrumpApproval().path)

    ss = preprocessing.StandardScaler()

    for xb in np.array_split(X, 10):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        ss.fit_many(xb[cols])
def test_add_remove_columns():
    """Checks that no exceptions are raised whenever columns are dropped and/or added."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    lin_reg = lm.LinearRegression()

    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        # Pick half of the columns at random
        cols = np.random.choice(X.columns, len(X.columns) // 2, replace=False)
        lin_reg.fit_many(xb[cols], yb)
def test_one_many_consistent():
    """Checks that using fit_one or fit_many produces the same result."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    one = lm.LinearRegression()
    for x, y in stream.iter_pandas(X, Y):
        one.fit_one(x, y)

    many = lm.LinearRegression()
    # Feed single-row batches so that each fit_many call sees exactly one observation
    for xb, yb in zip(np.array_split(X, len(X)), np.array_split(Y, len(Y))):
        many.fit_many(xb, yb)

    for i in X:
        assert math.isclose(one.weights[i], many.weights[i])
def test_standard_scaler_one_many_consistent():
    """Checks that using fit_one or fit_many produces the same result."""
    X = pd.read_csv(datasets.TrumpApproval().path)

    one = preprocessing.StandardScaler()
    for x, _ in stream.iter_pandas(X):
        one.fit_one(x)

    many = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        many.fit_many(xb)

    for i in X:
        assert math.isclose(one.counts[i], many.counts[i])
        assert math.isclose(one.means[i], many.means[i])
        assert math.isclose(one.vars[i], many.vars[i])
def test_shuffle_columns():
    """Checks that fit_many works identically whether columns are shuffled or not."""
    X = pd.read_csv(datasets.TrumpApproval().path)
    Y = X.pop('five_thirty_eight')

    normal = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        normal.fit_many(xb, yb)

    shuffled = lm.LinearRegression()
    for xb, yb in zip(np.array_split(X, 10), np.array_split(Y, 10)):
        cols = np.random.permutation(X.columns)
        shuffled.fit_many(xb[cols], yb)

    for i in X:
        assert math.isclose(normal.weights[i], shuffled.weights[i])
def test_standard_scaler_shuffle_columns():
    """Checks that fit_many works identically whether columns are shuffled or not."""
    X = pd.read_csv(datasets.TrumpApproval().path)

    normal = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        normal.fit_many(xb)

    shuffled = preprocessing.StandardScaler()
    for xb in np.array_split(X, 10):
        cols = np.random.permutation(X.columns)
        shuffled.fit_many(xb[cols])

    for i in X:
        # Compare against the unshuffled scaler; the original asserts compared
        # shuffled with itself, which passes trivially
        assert math.isclose(normal.counts[i], shuffled.counts[i])
        assert math.isclose(normal.means[i], shuffled.means[i])
        assert math.isclose(normal.vars[i], shuffled.vars[i])
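# The batch/streaming consistency checked above only holds if the running
# statistics are merged with a numerically exact rule. Below is a minimal
# sketch of such a merge (the parallel update of Chan et al.), assuming each
# column's state is a (count, mean, var) triple with population variance;
# this is an illustration, not necessarily how StandardScaler is implemented.
def merge_stats(n_a, mean_a, var_a, n_b, mean_b, var_b):
    """Merges the running count/mean/variance of two disjoint samples."""
    n = n_a + n_b
    delta = mean_b - mean_a
    mean = mean_a + delta * n_b / n
    # Combine the sums of squared deviations, then renormalize
    m2 = var_a * n_a + var_b * n_b + delta ** 2 * n_a * n_b / n
    return n, mean, m2 / n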
def yield_datasets(model):

    from creme import base
    from creme import datasets
    from creme import stream
    from sklearn import datasets as sk_datasets

    model = guess_model(model)

    if isinstance(model, (base.BinaryClassifier, base.MultiClassifier)):
        yield datasets.Phishing()
    if isinstance(model, base.MultiClassifier):
        yield datasets.ImageSegments().take(500)
    if isinstance(model, base.Regressor):
        yield datasets.TrumpApproval()
    if isinstance(model, base.MultiOutputRegressor):
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())
    if isinstance(model, base.MultiOutputClassifier):
        yield datasets.Music()
def yield_datasets(model):

    from creme import base
    from creme import compose
    from creme import datasets
    from creme import preprocessing
    from creme import stream
    from sklearn import datasets as sk_datasets

    model = guess_model(model)

    # Classification
    if isinstance(model, (base.BinaryClassifier, base.MultiClassifier)):
        yield datasets.Phishing()

    # Multi-class classification
    if isinstance(model, base.MultiClassifier):
        yield datasets.ImageSegments().take(500)

    # Regression
    if isinstance(model, base.Regressor):
        yield datasets.TrumpApproval()

    # Multi-output regression
    if isinstance(model, base.MultiOutputRegressor):

        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""
            def __iter__(self):
                oh = (compose.SelectType(str) | preprocessing.OneHotEncoder()) + compose.SelectType(int)
                for x, y in datasets.SolarFlare():
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Multi-output classification
    if isinstance(model, base.MultiOutputClassifier):
        yield datasets.Music()
def yield_datasets(model):

    import itertools  # needed for islice below

    from creme import base
    from creme import datasets
    from creme import stream
    from sklearn import datasets as sk_datasets

    model = guess_model(model)

    if isinstance(model, (base.BinaryClassifier, base.MultiClassifier)):
        yield datasets.Phishing()
    if isinstance(model, base.MultiClassifier):
        yield datasets.ImageSegments().take(500)
    if isinstance(model, base.Regressor):
        yield datasets.TrumpApproval()
    if isinstance(model, base.MultiOutputRegressor):
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())
    if isinstance(model, base.MultiOutputClassifier):
        yeast = stream.iter_sklearn_dataset(sk_datasets.fetch_openml('yeast', version=4))
        yield itertools.islice(yeast, 100)
def test_lin_reg_sklearn_coherence():
    """Checks that the sklearn and creme implementations produce the same results."""

    class SquaredLoss:
        """sklearn removes the leading 2 from the gradient of the squared loss."""
        def gradient(self, y_true, y_pred):
            return y_pred - y_true

    ss = preprocessing.StandardScaler()
    cr = lm.LinearRegression(optimizer=optim.SGD(.01), loss=SquaredLoss())
    sk = sklm.SGDRegressor(learning_rate='constant', eta0=.01, alpha=.0)

    for x, y in datasets.TrumpApproval():
        x = ss.fit_one(x).transform_one(x)
        cr.fit_one(x, y)
        sk.partial_fit([list(x.values())], [y])

    for i, w in enumerate(cr.weights.values()):
        assert math.isclose(w, sk.coef_[i])

    assert math.isclose(cr.intercept, sk.intercept_[0])
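# Why the SquaredLoss shim above is needed: creme's squared loss is
# (y_pred - y_true) ** 2 with gradient 2 * (y_pred - y_true), whereas sklearn
# optimizes (y_pred - y_true) ** 2 / 2, whose gradient is just
# (y_pred - y_true). A quick standalone sanity check of the two conventions:
def creme_gradient(y_true, y_pred):
    return 2 * (y_pred - y_true)

def sklearn_gradient(y_true, y_pred):
    return y_pred - y_true

assert creme_gradient(1., 3.) == 2 * sklearn_gradient(1., 3.)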
def yield_datasets(model):

    from creme import compose
    from creme import datasets
    from creme import preprocessing
    from creme import stream
    from creme import utils
    from sklearn import datasets as sk_datasets

    # Classification
    if utils.inspect.isclassifier(model):
        yield datasets.Phishing()

        # Multi-class classification
        if model._multiclass:
            yield datasets.ImageSegments().take(500)

    # Regression
    if utils.inspect.isregressor(model):
        yield datasets.TrumpApproval()

    # Multi-output regression
    if utils.inspect.ismoregressor(model):

        # 1
        yield stream.iter_sklearn_dataset(sk_datasets.load_linnerud())

        # 2
        class SolarFlare:
            """One-hot encoded version of `datasets.SolarFlare`."""
            def __iter__(self):
                oh = (compose.SelectType(str) | preprocessing.OneHotEncoder()) + compose.SelectType(int)
                for x, y in datasets.SolarFlare():
                    yield oh.transform_one(x), y

        yield SolarFlare()

    # Multi-output classification
    if utils.inspect.ismoclassifier(model):
        yield datasets.Music()
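# A minimal sketch of how yield_datasets might be consumed by a pytest-style
# harness. The harness below (test name, deep-copying, fitting loop) is an
# illustrative assumption, not the library's actual test entry point.
import copy

from creme import linear_model
from creme import preprocessing


def test_fit_on_each_dataset():
    model = preprocessing.StandardScaler() | linear_model.LinearRegression()
    for dataset in yield_datasets(model):
        fresh = copy.deepcopy(model)  # a clean, untrained copy per dataset
        for x, y in dataset:
            fresh.fit_one(x, y)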
        norm = utils.math.norm(p, order=2)
        for j in p:
            p[j] /= norm
        yield p


@pytest.mark.parametrize(
    'lm, dataset',
    [
        pytest.param(
            lm(optimizer=copy.deepcopy(optimizer), initializer=initializer, l2=0),
            dataset,
            id=f'{lm.__name__} - {optimizer} - {initializer}'
        )
        for lm, dataset in [
            (lm.LinearRegression, datasets.TrumpApproval()),
            (lm.LogisticRegression, datasets.Bananas())
        ]
        for optimizer, initializer in itertools.product(
            [
                optim.AdaBound(),
                optim.AdaDelta(),
                optim.AdaGrad(),
                optim.AdaMax(),
                optim.Adam(),
                optim.AMSGrad(),
                # TODO: check momentum optimizers
                # optim.Momentum(),
                # optim.NesterovMomentum(),
                optim.RMSProp(),
                optim.SGD()
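# The parametrization above is truncated and the decorated test body is not
# shown. A minimal sketch of a consumer of this parametrization, assuming the
# intent is only that every (model, optimizer, initializer) combination trains
# without error; the test name and body are illustrative assumptions.
def test_fit_runs(lm, dataset):
    for x, y in dataset:
        lm.fit_one(x, y)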
    logger.debug(f"Overall model accuracy: {mod_acc} \n\n")

    if callback is not None:
        history.append(callback(res))

    step_count += 1

    return step_count >= min_steps, history


def stock_data():
    # Resolve the CSV path relative to this file; the original chained .cwd()
    # onto the path, which discards everything before it and returns the
    # current working directory instead
    base_dir = Path(__file__).resolve().parent.parent / 'data' / 'stock_data.csv'
    return pd.read_csv(str(base_dir))


X_y = datasets.TrumpApproval()


def main():
    df = stock_data()
    df = ta.utils.dropna(df)
    df = format_timeseries_dataframe(df, "Timestamp")
    df = format_look_ahead(df, "Close", size=-4)
    df = df.dropna()  # dropna returns a new frame, so the result must be assigned
    df['log_returns'] = 0
    # Label 1 when the price rises, -1 when it falls, 0 otherwise; the original
    # first np.where used (cond, 1, 1), which labelled every row 1
    df['log_returns'] = np.where(df["Close_future"] > df["Close"], 1, df['log_returns'])
    df['log_returns'] = np.where(df["Close_future"] < df["Close"], -1, df['log_returns'])
    df = fibonacci(df)
    df = fibonacci_rsi(df)
    # df = super_hyper_mega_average_true_range(df)
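# An equivalent and arguably clearer way to build the three-way label is a
# single np.select call instead of chained np.where calls. This is an
# alternative sketch; label_returns is a hypothetical helper, not part of the
# pipeline above.
def label_returns(df):
    """Returns 1 where the price rises, -1 where it falls, 0 otherwise."""
    conditions = [df["Close_future"] > df["Close"], df["Close_future"] < df["Close"]]
    return np.select(conditions, [1, -1], default=0)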
from creme import datasets
from creme import linear_model
from creme import preprocessing
import dill
import requests


if __name__ == '__main__':

    host = 'http://localhost:5000'

    # Set a flavor
    r = requests.post(host + '/api/init', json={'flavor': 'regression'})
    assert r.status_code == 201

    # Upload a model
    model = preprocessing.StandardScaler() | linear_model.LinearRegression()
    r = requests.post(host + '/api/model', data=dill.dumps(model))
    assert r.status_code == 201

    # Train on some data
    for x, y in datasets.TrumpApproval().take(30):
        r = requests.post(host + '/api/learn', json={
            'features': x,
            'ground_truth': y
        })
        assert r.status_code == 201
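    # After training, predictions could be requested in the same style. The
    # '/api/predict' route, payload, and 200 status are assumptions inferred
    # from the routes above, not taken from the server's documented API.
    for x, _ in datasets.TrumpApproval().take(3):
        r = requests.post(host + '/api/predict', json={'features': x})
        assert r.status_code == 200
        print(r.json())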