Example #1
            'mutation_proba': 0.2,
            'n_generations': 80,
            'crossover_independent_proba': 0.5,
            'mutation_independent_proba': 0.05,
            'tournament_size': 5,
            'n_gen_no_change': 10,
            'caching': True,
            'n_jobs': -1
        }
        pipe = Pipeline([
            ('scaler', MinMaxScaler()),
            ('SVC', GeneticSelectionCV(**params)),
        ])


if __name__ == '__main__':
    freeze_support()
    logger.setup(filename='../feature_selection.log',
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='correlation')
    main()
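
The parameter block above is truncated at the top, so for context here is a minimal, hedged sketch of how such a genetic feature-selection pipeline could be assembled with sklearn-genetic's GeneticSelectionCV; the SVC base estimator and the synthetic data are assumptions, not part of the original snippet.

from genetic_selection import GeneticSelectionCV
from sklearn.datasets import make_classification
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

X, y = make_classification(n_samples=200, n_features=20, random_state=0)

selector = GeneticSelectionCV(
    estimator=SVC(kernel='linear'),      # assumed base estimator
    cv=5,
    scoring='accuracy',
    n_population=50,
    crossover_proba=0.5,
    mutation_proba=0.2,
    n_generations=80,
    crossover_independent_proba=0.5,
    mutation_independent_proba=0.05,
    tournament_size=5,
    n_gen_no_change=10,
    caching=True,
    n_jobs=-1,
)
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('SVC', selector),
])
pipe.fit(X, y)
print(pipe.named_steps['SVC'].support_)  # boolean mask of the selected features
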
Example #2
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
            print("--- end ---")


if __name__ == '__main__':
    logger.setup(filename='../build_boosted_model.log',
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_boosted_model')
    main()
        with open(estFile.format(_sym), 'wb') as f:
            pickle.dump(clf, f)
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats
        }
        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)

        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}

        # print({
        #     # 'features':sel_features
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })
        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
        print("--- end ---")


if __name__ == '__main__':
    logger.setup(filename='../dataset_info_svc.log',
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='dataset_info_bagging')
    main()
Example #4
            sfm.transform(input)
            sup = sfm.get_support()
            sel_features = [c for c, p in zip(features.columns, sup) if p]
            importances = {
                features.columns[i]: v
                for i, v in enumerate(clf.named_steps.c.feature_importances_)
            }
            labeled_importances = {
                str(k): v
                for k, v in sorted(importances.items(),
                                   key=lambda item: -item[1])
            }
            hyperparameters[_sym] = {
                'estimator': estFile.format(_sym),
                'stats': stats,
                'features': sel_features,
                'feature_importances': labeled_importances
            }
            with open(resultFile, 'w') as f:  # Save results at every update
                json.dump(hyperparameters, f, indent=4)
            print("--- end ---")


if __name__ == '__main__':
    logger.setup(filename='../dataset_info_randomforest_sfm.log',
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='dataset_info_randomforest_sfm')
    main()
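
The snippet above reads from an already-built `sfm` selector and a fitted pipeline `clf` whose final step is named 'c'; a minimal sketch of how those objects might be constructed is shown below (the pipeline layout, the random forest, and the 'mean' threshold are assumptions, not the project's actual code).

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

X_train, y_train = make_classification(n_samples=300, n_features=25, random_state=0)

clf = Pipeline([
    ('s', MinMaxScaler()),
    ('c', RandomForestClassifier(n_estimators=100, random_state=0)),
])
clf.fit(X_train, y_train)

# Reuse the fitted forest's feature_importances_ without refitting it
sfm = SelectFromModel(clf.named_steps['c'], prefit=True, threshold='mean')
support = sfm.get_support()   # boolean mask, as consumed in the snippet above
print(int(support.sum()), 'features kept')
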
Example #5
    'QTUM',
    'TRX',
    'USDT',
    'VEN',
    'WAVES',
    'XEM',
    'XMR',
    'XRP',
    'ZEC',
    'ZRX'
]

logger.setup(
    filename='../job_test.log',
    filemode='w',
    root_level=logging.DEBUG,
    log_level=logging.DEBUG,
    logger='job_test'
)

ohlcv = pd.read_csv("./data/result/ohlcv.csv", sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
chain = pd.read_csv("./data/result/blockchains.csv", sep=',', encoding='utf-8', index_col='Date', parse_dates=True)

for _sym in SYMBOLS:
    s = Symbol(_sym, ohlcv=ohlcv, blockchain=chain[[c for c in chain.columns if c.startswith(_sym)]], column_map={
        'open': _sym+'_Open',
        'high': _sym+'_High',
        'low': _sym+'_Low',
        'close': _sym,
        'volume': _sym+'_Volume'
    })
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5, scoring='accuracy', n_jobs='auto', test_size=0.3, use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    if ',' in scoring:
        scoring = scoring.split(',')
    # if scoring is precision, make scorer manually to suppress zero_division warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}

    if n_jobs == 'auto':
        n_jobs = os.cpu_count()
    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)

    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target

        # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
        # replace infinity values with nan so that they can later be imputed to a finite value
        features = features.dropna(axis='columns', how='all').dropna().replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]

        features = features.replace([np.inf, -np.inf], np.nan)
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index, columns=features.columns)
        X_train, X_test, y_train, y_test = train_test_split(features.values, target.values, shuffle=False, test_size=test_size)
        # Summarize distribution
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
        #cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                preprocessing=any_preprocessing('my_pre'),
                                algo=tpe.suggest,
                                max_evals=100,
                                trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")

        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}

        # Plot learning curve for the classifier
        #est = p.estimator
        #est.set_params(**best_params)

        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        #plt.tight_layout()
        _train_ax = [ axes[0][0], axes[0][1], axes[0][2] ]
        #plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym), X_train, y_train, axes=_train_ax, cv=cv)

        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])

        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])

        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()

        # Test ensemble's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set\n{}".format(classification_report(y_test, predictions2)))

        report = {
            'training_set': {
                'features':X_train.shape[1],
                'records':X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true':[y for y in y_train],
                'y_pred':[y for y in predictions1]
            },
            'test_set': {
                'features':X_test.shape[1],
                'records':X_test.shape[0],
                'class_distribution':get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }
        # If the classifier has a feature_importances attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v) for k, v in sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i] for i, s in enumerate(clf.support_) if s]
        train_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct']) for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct']) for k, d in get_class_distribution(y_test).items()]

        logger.info('Model evaluation: \n'
              '== Training set ==\n'
              '\t # Features: {} | # Records: {}\n '
              '\tClass distribution:\n{}\n'
              '\tAccuracy: {}\n'
              '\tPrecision: {}\n'
              '\tMSE: {}\n' \
              '\tRecall: {}\n' \
              '\tF1: {}\n' \
              '== Test set ==\n'
              '\t # Features: {} | # Records: {}\n '
              '\tClass distribution:\n{}\n'
              '\tAccuracy: {}\n'
              '\tPrecision: {}\n'
              '\tMSE: {}\n' \
              '\tRecall: {}\n' \
              '\tF1: {}\n' \
              .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                      report['training_set']['accuracy'], report['training_set']['precision'], report['training_set']['mse'],
                      report['training_set']['recall'], report['training_set']['f1'],
                      X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                      report['test_set']['accuracy'], report['test_set']['precision'], report['test_set']['mse'],
                      report['test_set']['recall'], report['test_set']['f1']
                      )
        )

        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model':model_path,
            'params':params_path,
            'report':report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
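
A hedged usage sketch for the build_model defined above, meant to be appended to the same module; the dataset and pipeline names are borrowed from other snippets in this listing, and the experiment label is hypothetical.

if __name__ == '__main__':
    experiment_index = build_model(
        dataset='all_merged.index_improved',  # index name given after the dot
        pipeline='debug_xgboost',             # module expected under pipelines/
        experiment='hyperopt_test',           # free-form label (hypothetical)
        scoring='accuracy',
        n_jobs='auto',
        test_size=0.3,
        expanding_window=5,                   # 5-split expanding TimeSeriesSplit
    )
    print(list(experiment_index.keys()))
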
Example #7
def build_model(dataset,
                pipeline,
                experiment,
                param_grid=None,
                cv=5,
                scoring='accuracy',
                n_jobs='auto',
                test_size=0.3,
                use_target=None,
                expanding_window=False):
    # Define log file path for this run
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)
    os.makedirs('./results/{}_{}_{}'.format(dataset, pipeline, experiment),
                exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')

    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)

    # Parameter grid argument:
    # - If None, use the pipeline-defined grid
    # - If string, parse it as JSON
    # - If dict, use it as-is (do nothing)
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)
    # Target argument
    # Determines the target feature name (the system supports different classification targets)
    # If not supplied, use the pipeline-defined target.
    current_target = p.TARGET if not use_target else use_target

    logger.info('Start processing: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in dataset_index.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            # ToDo: use lib.dataset.features.load_symbol instead of manually reading csv's
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            targets = pd.read_csv(data['target_csv'],
                                  sep=',',
                                  encoding='utf-8',
                                  index_col='Date',
                                  parse_dates=True)

            # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
            # replace infinity values with nan so that they can later be imputed to a finite value
            # in the pipeline's "Imputing" stage
            features = features.dropna(axis='columns',
                                       how='all').replace([np.inf, -np.inf],
                                                          np.nan)
            target = targets.loc[features.index][current_target]

            # Split available data in train and test set.
            # Perform grid search with cross-validation on the training set,
            # Then ToDo: instantiate a MLStrategy and test the model on the test set, in sliding window fashion
            X_train, X_test, y_train, y_test = train_test_split(
                features.values,
                target.values,
                shuffle=False,
                test_size=test_size)

            # Log before and after grid search to track execution time
            # Grid search logic moved to its own method for cleanliness
            logger.info("Start Grid search")
            gscv = grid_search(p.estimator,
                               param_grid,
                               X_train,
                               y_train,
                               cv=cv,
                               n_jobs=n_jobs,
                               expanding_window=expanding_window,
                               scoring=scoring)
            logger.info("End Grid search")
            labels, predictions = test_model(p.estimator, gscv.best_params_,
                                             30, X_train, y_train, X_test,
                                             y_test)
            report = classification_report(labels,
                                           predictions,
                                           output_dict=True)

            # Create a Report instance from the grid search results, and add it to this experiment's collection
            _report = Report(_sym, current_target, cv)
            _report.set_close(targets.loc[features.index].close)
            _report.set_dataset_columns(features.columns)
            _report.set_train_dataset(X_train, y_train)
            _report.set_test_dataset(X_test, y_test)
            _report.set_model(p.estimator)
            _report.set_params(gscv.best_params_)
            _report.set_cv(gscv.best_estimator_, gscv.best_score_,
                           gscv.cv_results_)
            reports.add_report(_report)
            reports.save()

            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
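
The grid_search and test_model helpers used above are project code that does not appear in this listing. Purely as an illustration of the call signature, a minimal wrapper around scikit-learn's GridSearchCV with the optional expanding-window split might look like the sketch below; it is an assumption, not the project's implementation.

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

def grid_search(estimator, param_grid, X, y, cv=5, n_jobs=-1,
                expanding_window=False, scoring='accuracy'):
    # Use an expanding-window split for time-ordered data when requested
    if expanding_window:
        cv = TimeSeriesSplit(n_splits=expanding_window)
    gscv = GridSearchCV(estimator, param_grid, cv=cv,
                        n_jobs=n_jobs, scoring=scoring, refit=True)
    gscv.fit(X, y)
    return gscv   # exposes best_params_, best_estimator_, best_score_, cv_results_
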
            predictions1 = clf.predict(X_train)
            train_report = classification_report(y_train,
                                                 predictions1,
                                                 output_dict=True)
            print(classification_report(y_train, predictions1))
            logger.info("Classification report on test set")
            predictions2 = clf.predict(X_test)
            test_report = classification_report(y_test,
                                                predictions2,
                                                output_dict=True)
            print(classification_report(y_test, predictions2))
            stats = {
                'score': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'test_score': accuracy_score(y_test, predictions2),
                'test_mse': mean_squared_error(y_test, predictions2),
                'train_report': train_report,
                'test_report': test_report,
            }
            print(stats)
            print("--- end ---")


if __name__ == '__main__':
    logger.setup(filename='../test_pickled_model.log',
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='test_pickled_model')
    main()
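
The main() above evaluates a classifier restored from disk; a minimal sketch of loading one of the models pickled in the earlier snippets is shown below (the concrete path is only an assumed example of the './results/{dataset}_{pipeline}_{experiment}/models/' layout).

import pickle

models_dir = './results/all_merged_debug_xgboost_hyperopt_test/models/'  # assumed layout
with open('{}{}.p'.format(models_dir, 'BTC'), 'rb') as f:
    clf = pickle.load(f)   # fitted estimator, ready for clf.predict(...)
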
Example #9
from lib.log import logger
from matplotlib import pyplot as plt
import logging
import numpy as np
import os
import pandas as pd
import pickle
import json
# load_dataset and the target_* helpers used below are project functions
# (presumably from the project's dataset module); their import is not shown here.

def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():

        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
        # Replace nan with infinity so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)

        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)

        print("--- end ---")

if __name__ == '__main__':
    logger.setup(
        filename='../blockchain_features.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='blockchain_features'
    )
    main()
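
target_price_variation and target_binned_price_variation are project helpers whose code is not part of this listing. A rough pandas-only stand-in, assuming the target is the direction of the next period's close, could look like this sketch; the shift(-1) look-ahead is an assumption.

import pandas as pd

def naive_binned_target(close: pd.Series) -> pd.Series:
    # Percent variation of the next close, binned into 2 classes:
    # 1 = price goes up, 0 = otherwise (assumed semantics).
    pct = close.pct_change().shift(-1)
    return (pct > 0).astype(int)
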
            'xls': 'data/datasets/all_merged/excel/{}_faceted.xlsx'.format(_sym.lower()),
            'target_csv': 'data/datasets/all_merged/csv/{}_target.csv'.format(_sym.lower()),
            'target_xls': 'data/datasets/all_merged/excel/{}_target.xlsx'.format(_sym.lower()),
            'features': {
                'price_history': [c for c in history_facet.columns],
                'trend': [c for c in trend_facet.columns],
                'volatility': [c for c in volatility_facet.columns],
                'volume': [c for c in volume_facet.columns],
                'chain': [c for c in chain_facet.columns],
            }
        }
        logger.info('Saved {} in data/datasets/all_merged/'.format(_sym))
    with open('data/datasets/all_merged/index_faceted.json', 'w') as f:
        json.dump(index, f, sort_keys=True, indent=4)


if __name__ == '__main__':
    logger.setup(filename='../build_dataset.log',
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_dataset')
    build_merged_dataset()
    build_atsa_dataset('all_merged')
    build_improved_dataset('all_merged')
    build_faceted_dataset('all_merged')
Example #11
def build_model(dataset,
                pipeline,
                experiment,
                current_target='class',
                test_size=0.3):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline,
                                                     experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline,
                                                       experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(
        dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(
        dataset, pipeline, experiment)

    scoring = make_scorer(precision_score, zero_division=1, average='micro')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file,
                 filemode='w',
                 root_level=logging.DEBUG,
                 log_level=logging.DEBUG,
                 logger='build_model')
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset,
                                 return_index=True,
                                 index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    logger.info('Start experiment: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in {'BTC': dataset_index['BTC']}.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            features = pd.read_csv(data['csv'],
                                   sep=',',
                                   encoding='utf-8',
                                   index_col='Date',
                                   parse_dates=True)
            targets = pd.read_csv(data['target_csv'],
                                  sep=',',
                                  encoding='utf-8',
                                  index_col='Date',
                                  parse_dates=True)

            # Drop columns whose values are all NaN, as well as rows with ANY nan value, then
            # replace infinity values with nan so that they can later be imputed to a finite value
            features = features.dropna(
                axis='columns', how='all').dropna().replace([np.inf, -np.inf],
                                                            np.nan)
            target = targets.loc[features.index][current_target]

            #X_train, X_test, y_train, y_test = train_test_split(features, target, shuffle=False, test_size=test_size)

            all_size = features.shape[0]
            train_size = int(all_size * (1 - test_size))
            features = detabularise(
                features[[c for c in features.columns if 'close' in c]])
            X_train = features.iloc[0:train_size]
            y_train = target.iloc[0:train_size]
            X_test = features.iloc[train_size:all_size]
            y_test = target.iloc[train_size:all_size]
            # Summarize distribution
            logger.info("Start Grid search")
            clf = ShapeletTransformClassifier(time_contract_in_mins=5)
            clf.fit(X_train, y_train)
            print('{} Score: {}'.format(_sym, clf.score(X_test, y_test)))
            pred = clf.predict(X_test)
            print(classification_report(y_test, pred))
            logger.info("End Grid search")

            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
Example #12
                     )  # nargs='?', default='all_merged.index_improved',
 args = parser.parse_args()
 os.makedirs('./equities/{}/'.format(args.name), exist_ok=True)
 models_cache_dir = './equities/{}/models'.format(args.name)
 os.makedirs(models_cache_dir, exist_ok=True)
 db_file = './equities/{}/status.db'.format(args.name)
 if os.path.exists(db_file):
     os.remove(db_file)
 engine = create_engine('sqlite:///' + db_file)
 Base.metadata.bind = engine
 session_factory = sessionmaker(bind=engine)
 DBSession = scoped_session(session_factory)
 migrate(db_file)  # Create status database and tables
 logger.setup(filename='./equities/{}/log.txt'.format(args.name),
              filemode='w',
              root_level=logging.DEBUG,
              log_level=logging.DEBUG,
              logger='equity')
 # result = trailing_window_day(pipeline_name='debug_xgboost',
 #                        parameters='./results/timedspline_safe_debug_xgboost_splines_experiment_171020_040945/',
 #                        dataset='timedspline_safe',
 #                        symbols=['ADA', 'BTC'],
 #                        day='2018-08-01',
 #                        window_size=150
 #                        )
 beg = '2018-06-01'
 end = '2018-09-01'
 _symbols = [
     'ADA',
     'BCH',
     'BNB',
 parser = argparse.ArgumentParser(
     description='Build and tune models, collect results')
 parser.add_argument('-n', dest='name', nargs='?', default='equity_test',
                     help="Name for the current equity")  # nargs='?', default='all_merged.index_improved',
 args = parser.parse_args()
 # Create status directory and DB file
 base_dir = './equities/{}/'.format(args.name)
 os.makedirs(base_dir, exist_ok=True)
 db_file = '{}/status.db'.format(base_dir)
 # if os.path.exists(db_file):
 #     os.remove(db_file)
 # Setup logging
 logger.setup(
     filename='{}/log.txt'.format(base_dir),
     filemode='w',
     root_level=logging.DEBUG,
     log_level=logging.DEBUG,
     logger='equity'
 )
 # Create SQLAlchemy engine and bind the models
 engine = create_engine('sqlite:///' + db_file)
 Base.metadata.bind = engine
 session_factory = sessionmaker(bind=engine)
 DBSession = scoped_session(session_factory)
 # Create status database and tables (only if db is new)
 migrate(db_file)
 # Create exchange instance
 exchange = Exchange(DBSession)
 strategies = {}
 for s in SYMBOLS:
     # Deposit initial asset amount
from lib.log import logger
import pandas as pd
from old.lib.plotter import correlation
import lib.dataset as builder
import json

INTERACTIVE_FIGURE = False
SYMBOLS = [
    'ADA', 'BCH', 'BNB', 'BTC', 'BTG', 'DASH', 'DOGE', 'EOS', 'ETC', 'ETH',
    'IOT', 'LTC', 'LINK', 'NEO', 'QTUM', 'TRX', 'USDT', 'VEN', 'WAVES', 'XEM',
    'XMR', 'XRP', 'ZEC', 'ZRX'
]

logger.setup(filename='../dataset_ohlcv_social.log',
             filemode='w',
             root_level=logging.DEBUG,
             log_level=logging.DEBUG,
             logger='dataset ohlcv_social')

index = {}
for _sym in SYMBOLS:
    ohlcv = pd.read_csv("./data/preprocessed/ohlcv/csv/{}.csv".format(
        _sym.lower()),
                        sep=',',
                        encoding='utf-8',
                        index_col='Date',
                        parse_dates=True)
    cm = pd.read_csv(
        "./data/preprocessed/cryptocompare_social/csv/{}.csv".format(
            _sym.lower()),
        sep=',',