    'mutation_proba': 0.2,
    'n_generations': 80,
    'crossover_independent_proba': 0.5,
    'mutation_independent_proba': 0.05,
    'tournament_size': 5,
    'n_gen_no_change': 10,
    'caching': True,
    'n_jobs': -1
}
pipe = Pipeline([
    ('scaler', MinMaxScaler()),
    ('SVC', GeneticSelectionCV(**params)),
])

if __name__ == '__main__':
    freeze_support()
    logger.setup(
        filename='../feature_selection.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='correlation'
    )
    main()
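# Hedged usage sketch (not in the original source): fitting the feature
# selection pipeline above and reading back the selected columns. Assumes
# sklearn-genetic's GeneticSelectionCV, which exposes a boolean support_ mask
# after fitting; X (a DataFrame) and y are assumed to be the prepared
# features and target.
pipe.fit(X, y)
selector = pipe.named_steps['SVC']
selected = [c for c, keep in zip(X.columns, selector.support_) if keep]
print('Selected {} features: {}'.format(len(selected), selected))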
    logger.info("Classification report on train set")
    predictions1 = clf.predict(X_train)
    train_report = classification_report(y_train, predictions1, output_dict=True)
    print(classification_report(y_train, predictions1))

    logger.info("Classification report on test set")
    predictions2 = clf.predict(X_test)
    test_report = classification_report(y_test, predictions2, output_dict=True)
    print(classification_report(y_test, predictions2))

    stats = {
        'score': accuracy_score(y_train, predictions1),
        'mse': mean_squared_error(y_train, predictions1),
        'test_score': accuracy_score(y_test, predictions2),
        'test_mse': mean_squared_error(y_test, predictions2),
        'train_report': train_report,
        'test_report': test_report,
    }
    print(stats)
    print("--- end ---")

if __name__ == '__main__':
    logger.setup(
        filename='../build_boosted_model.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_boosted_model'
    )
    main()
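# Side note (not from the original source): for 0/1 class labels the squared
# error of a prediction is 1 exactly when it is wrong, so mean_squared_error
# reduces to the error rate, i.e. 1 - accuracy. A minimal sketch:
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error

y_true = np.array([0, 1, 1, 0, 1])
y_pred = np.array([0, 1, 0, 0, 1])
assert mean_squared_error(y_true, y_pred) == 1 - accuracy_score(y_true, y_pred)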
        with open(estFile.format(_sym), 'wb') as f:
            pickle.dump(clf, f)
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats
        }
        # feature_importances = np.mean([
        #     p.named_steps.c.feature_importances_ for p in clf.estimators_
        # ], axis=0)
        # importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
        # labeled = {str(k): v for k, v in sorted(importances.items(), key=lambda item: -item[1])}
        # print({
        #     # 'features': sel_features
        #     'feature_importances': labeled,
        #     # 'rank': {l: i + 1 for i, l in enumerate(labeled.keys())},
        # })
        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
    print("--- end ---")

if __name__ == '__main__':
    logger.setup(
        filename='../dataset_info_svc.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='dataset_info_bagging'
    )
    main()
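# A working version of the importance averaging that is commented out above,
# offered as a hedged sketch: it assumes clf is a fitted sklearn
# BaggingClassifier whose base estimator is a Pipeline with a tree-based step
# named 'c' (as the commented code implies), and that X is the training
# DataFrame. Names mirror the commented-out code.
feature_importances = np.mean(
    [p.named_steps.c.feature_importances_ for p in clf.estimators_],
    axis=0
)
importances = {X.columns[i]: v for i, v in enumerate(feature_importances)}
labeled = {
    str(k): float(v)
    for k, v in sorted(importances.items(), key=lambda item: -item[1])
}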
        sfm.transform(input)
        sup = sfm.get_support()
        sel_features = [c for c, p in zip(features.columns, sup) if p]
        importances = {
            features.columns[i]: v
            for i, v in enumerate(clf.named_steps.c.feature_importances_)
        }
        labeled_importances = {
            str(k): v
            for k, v in sorted(importances.items(), key=lambda item: -item[1])
        }
        hyperparameters[_sym] = {
            'estimator': estFile.format(_sym),
            'stats': stats,
            'features': sel_features,
            'feature_importances': labeled_importances
        }
        with open(resultFile, 'w') as f:  # Save results at every update
            json.dump(hyperparameters, f, indent=4)
    print("--- end ---")

if __name__ == '__main__':
    logger.setup(
        filename='../dataset_info_randomforest_sfm.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='dataset_info_randomforest_sfm'
    )
    main()
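# For context, a minimal hedged sketch of how the SelectFromModel instance
# (sfm) used above is typically created from an already-fitted tree-based
# step. The 'c' step name matches the code above; the threshold value is an
# assumption, not from the original source.
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(clf.named_steps.c, prefit=True, threshold='mean')
X_selected = sfm.transform(features.values)  # keeps the columns flagged by get_support()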
    'QTUM', 'TRX', 'USDT', 'VEN', 'WAVES', 'XEM', 'XMR', 'XRP', 'ZEC', 'ZRX'
]
logger.setup(
    filename='../job_test.log',
    filemode='w',
    root_level=logging.DEBUG,
    log_level=logging.DEBUG,
    logger='job_test'
)
ohlcv = pd.read_csv("./data/result/ohlcv.csv", sep=',', encoding='utf-8',
                    index_col='Date', parse_dates=True)
chain = pd.read_csv("./data/result/blockchains.csv", sep=',', encoding='utf-8',
                    index_col='Date', parse_dates=True)
for _sym in SYMBOLS:
    s = Symbol(
        _sym,
        ohlcv=ohlcv,
        blockchain=chain[[c for c in chain.columns if c.startswith(_sym)]],
        column_map={
            'open': _sym + '_Open',
            'high': _sym + '_High',
            'low': _sym + '_Low',
            'close': _sym,
            'volume': _sym + '_Volume'
        }
    )
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5,
                scoring='accuracy', n_jobs='auto', test_size=0.3,
                use_target=None, expanding_window=False):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    if ',' in scoring:
        scoring = scoring.split(',')
    # If scoring is precision, make the scorer manually to suppress
    # zero_division warnings in case of heavy bias
    if scoring == 'precision':
        scoring = make_scorer(precision_score, zero_division=1)
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(
        filename=log_file,
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_model'
    )
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    experiment_index = {}
    if n_jobs == 'auto':
        n_jobs = os.cpu_count()
    # Load parameter grid argument
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)
    logger.info('Start experiment: {} using {} on {}'.format(experiment, pipeline, dataset))
    for _sym, data in dataset_index.items():
        logger.info('Start processing: {}'.format(_sym))
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                              index_col='Date', parse_dates=True)
        current_target = p.TARGET if not use_target else use_target
        # Drop columns whose values are all NaN, as well as rows with ANY NaN
        # value, then replace infinity values with NaN so that they can later
        # be imputed to a finite value
        features = features.dropna(axis='columns', how='all').dropna().replace([np.inf, -np.inf], np.nan)
        target = targets.loc[features.index][current_target]
        imputer = SimpleImputer()
        imputer.fit(features.values)
        feat_imp_values = imputer.transform(features.values)
        features = pd.DataFrame(feat_imp_values, index=features.index, columns=features.columns)
        X_train, X_test, y_train, y_test = train_test_split(
            features.values, target.values, shuffle=False, test_size=test_size)
        logger.info("Start Hyperopt search")
        if expanding_window:
            cv = TimeSeriesSplit(n_splits=expanding_window)
            # cv = sliding_window_split(X_train, 0.1)
        est = HyperoptEstimator(classifier=any_classifier('my_clf'),
                                preprocessing=any_preprocessing('my_pre'),
                                algo=tpe.suggest,
                                max_evals=100,
                                trial_timeout=120)
        est.fit(X_train, y_train)
        logger.info("End Hyperopt search")
        # Take the fitted ensemble with tuned hyperparameters
        clf = est.best_model()['learner']
        best_score = est.score(X_train, y_train)
        best_params = {}
        # Plot evaluation curves for the classifier
        # est = p.estimator
        # est.set_params(**best_params)
        _, axes = plt.subplots(3, 3, figsize=(20, 12), dpi=200, constrained_layout=True)
        # plt.tight_layout()
        _train_ax = [axes[0][0], axes[0][1], axes[0][2]]
        # plot_learning_curve(est, "{} - Learning curves (Train)".format(_sym),
        #                     X_train, y_train, axes=_train_ax, cv=cv)
        axes[1][0].set_title("{} - ROC (Train)".format(_sym))
        plot_roc_curve(clf, X_train, y_train, ax=axes[1][0])
        axes[1][1].set_title("{} - Precision/Recall (Train)".format(_sym))
        plot_precision_recall_curve(clf, X_train, y_train, ax=axes[1][1])
        axes[1][2].set_title("{} - Confusion matrix (Train)".format(_sym))
        plot_confusion_matrix(clf, X_train, y_train, cmap='Blues', ax=axes[1][2])
        axes[2][0].set_title("{} - ROC (Test)".format(_sym))
        plot_roc_curve(clf, X_test, y_test, ax=axes[2][0])
        axes[2][1].set_title("{} - Precision/Recall (Test)".format(_sym))
        # Bug fix: the original passed the training set here, which would plot
        # the train curve under the test title
        plot_precision_recall_curve(clf, X_test, y_test, ax=axes[2][1])
        axes[2][2].set_title("{} - Confusion matrix (Test)".format(_sym))
        plot_confusion_matrix(clf, X_test, y_test, cmap='Oranges', ax=axes[2][2])
        curve_path = '{}{}_learning_curve.png'.format(reports_dir, _sym)
        plt.savefig(curve_path)
        plt.close()
        # Test the ensemble's performance on training and test sets
        predictions1 = clf.predict(X_train)
        train_report = classification_report(y_train, predictions1, output_dict=True)
        logger.info("Classification report on train set:\n{}".format(
            classification_report(y_train, predictions1)))
        predictions2 = clf.predict(X_test)
        test_report = classification_report(y_test, predictions2, output_dict=True)
        logger.info("Classification report on test set:\n{}".format(
            classification_report(y_test, predictions2)))
        report = {
            'training_set': {
                'features': X_train.shape[1],
                'records': X_train.shape[0],
                'class_distribution': get_class_distribution(y_train),
                'classification_report': train_report,
                'accuracy': accuracy_score(y_train, predictions1),
                'mse': mean_squared_error(y_train, predictions1),
                'precision': precision_score(y_train, predictions1),
                'recall': recall_score(y_train, predictions1),
                'f1': f1_score(y_train, predictions1),
                'y_true': [y for y in y_train],
                'y_pred': [y for y in predictions1]
            },
            'test_set': {
                'features': X_test.shape[1],
                'records': X_test.shape[0],
                'class_distribution': get_class_distribution(y_test),
                'classification_report': test_report,
                'accuracy': accuracy_score(y_test, predictions2),
                'precision': precision_score(y_test, predictions2),
                'mse': mean_squared_error(y_test, predictions2),
                'recall': recall_score(y_test, predictions2),
                'f1': f1_score(y_test, predictions2),
                'y_true': [y for y in y_test],
                'y_pred': [y for y in predictions2]
            }
        }
        # If the classifier has a feature_importances attribute, save it in the report
        feature_importances = None
        if hasattr(clf, 'feature_importances_'):
            feature_importances = clf.feature_importances_
        elif hasattr(clf, 'named_steps') and hasattr(clf.named_steps, 'c') \
                and hasattr(clf.named_steps.c, 'feature_importances_'):
            feature_importances = clf.named_steps.c.feature_importances_
        if feature_importances is not None:
            importances = {features.columns[i]: v for i, v in enumerate(feature_importances)}
            labeled = {str(k): float(v) for k, v in sorted(importances.items(), key=lambda item: -item[1])}
            report['feature_importances'] = labeled
        if hasattr(clf, 'ranking_'):
            report['feature_rank'] = {features.columns[i]: s for i, s in enumerate(clf.ranking_)}
        if hasattr(clf, 'support_'):
            report['feature_support'] = [features.columns[i] for i, s in enumerate(clf.support_) if s]
        train_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct'])
                      for k, d in get_class_distribution(y_train).items()]
        test_dist = ['\t\tClass {}:\t{}\t({}%)'.format(k, d['count'], d['pct'])
                     for k, d in get_class_distribution(y_test).items()]
        logger.info('Model evaluation:\n'
                    '== Training set ==\n'
                    '\t# Features: {} | # Records: {}\n'
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    '== Test set ==\n'
                    '\t# Features: {} | # Records: {}\n'
                    '\tClass distribution:\n{}\n'
                    '\tAccuracy: {}\n'
                    '\tPrecision: {}\n'
                    '\tMSE: {}\n'
                    '\tRecall: {}\n'
                    '\tF1: {}\n'
                    .format(X_train.shape[1], X_train.shape[0], '\n'.join(train_dist),
                            report['training_set']['accuracy'],
                            report['training_set']['precision'],
                            report['training_set']['mse'],
                            report['training_set']['recall'],
                            report['training_set']['f1'],
                            X_test.shape[1], X_test.shape[0], '\n'.join(test_dist),
                            report['test_set']['accuracy'],
                            report['test_set']['precision'],
                            report['test_set']['mse'],
                            report['test_set']['recall'],
                            report['test_set']['f1']))
        # Save a pickle dump of the model
        model_path = '{}{}.p'.format(models_dir, _sym)
        with open(model_path, 'wb') as f:
            pickle.dump(clf, f)
        # Save the model's parameters
        params_path = '{}{}_parameters.json'.format(models_dir, _sym)
        with open(params_path, 'w') as f:
            json.dump(best_params, f, indent=4)
        # Save the report for this model
        report_path = '{}{}.json'.format(reports_dir, _sym)
        with open(report_path, 'w') as f:
            json.dump(report, f, indent=4)
        # Update the experiment's index with the new results, and save it
        experiment_index[_sym] = {
            'model': model_path,
            'params': params_path,
            'report': report_path
        }
        with open(experiment_index_file, 'w') as f:
            json.dump(experiment_index, f, indent=4)
        logger.info("--- {} end ---".format(_sym))
    return experiment_index
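# Hedged usage sketch (not from the original source): how this Hyperopt-based
# build_model might be invoked. The dataset/pipeline names below are taken
# from commented-out examples elsewhere in this repo; the experiment label is
# a hypothetical placeholder.
experiment_index = build_model(
    dataset='all_merged.index_improved',  # dataset.index_name pair
    pipeline='debug_xgboost',
    experiment='hyperopt_run',            # hypothetical experiment label
    scoring='precision',
    test_size=0.3,
    expanding_window=5  # 5-split expanding-window CV instead of plain k-fold
)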
def build_model(dataset, pipeline, experiment, param_grid=None, cv=5,
                scoring='accuracy', n_jobs='auto', test_size=0.3,
                use_target=None, expanding_window=False):
    # Define log file path for this run
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    os.makedirs('./results/{}_{}_{}'.format(dataset, pipeline, experiment), exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file, filemode='w', root_level=logging.DEBUG,
                 log_level=logging.DEBUG, logger='build_model')
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True)
    # Dynamically import the pipeline we want to use for building the model
    p = importlib.import_module('pipelines.' + pipeline)
    # Parameter grid argument:
    # - If None, use the pipeline-defined grid
    # - If string, parse it as JSON
    # - If dict, use it as-is (do nothing)
    if param_grid is None:
        param_grid = p.PARAMETER_GRID
    elif isinstance(param_grid, str):
        with open(param_grid, 'r') as f:
            param_grid = json.load(f)
    # Target argument: determines the target feature name (the system supports
    # different classification targets). If not supplied, use the
    # pipeline-defined target.
    current_target = p.TARGET if not use_target else use_target
    logger.info('Start processing: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    for _sym, data in dataset_index.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            # ToDo: use lib.dataset.features.load_symbol instead of manually reading CSVs
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                                  index_col='Date', parse_dates=True)
            # Drop columns whose values are all NaN, then replace infinity
            # values with NaN so that they can later be imputed to a finite
            # value in the pipeline's "Imputing" stage
            features = features.dropna(axis='columns', how='all').replace([np.inf, -np.inf], np.nan)
            target = targets.loc[features.index][current_target]
            # Split available data in train and test set.
            # Perform grid search with cross-validation on the training set,
            # then ToDo: instantiate a MLStrategy and test the model on the
            # test set, in sliding window fashion
            X_train, X_test, y_train, y_test = train_test_split(
                features.values, target.values, shuffle=False, test_size=test_size)
            # Log before and after grid search to track execution time;
            # grid search logic moved to its own method for cleanliness
            logger.info("Start Grid search")
            gscv = grid_search(p.estimator, param_grid, X_train, y_train, cv=cv,
                               n_jobs=n_jobs, expanding_window=expanding_window,
                               scoring=scoring)
            logger.info("End Grid search")
            labels, predictions = test_model(p.estimator, gscv.best_params_, 30,
                                             X_train, y_train, X_test, y_test)
            report = classification_report(labels, predictions, output_dict=True)
            # Create a Report instance from the grid search results,
            # and add it to this experiment's collection
            _report = Report(_sym, current_target, cv)
            _report.set_close(targets.loc[features.index].close)
            _report.set_dataset_columns(features.columns)
            _report.set_train_dataset(X_train, y_train)
            _report.set_test_dataset(X_test, y_test)
            _report.set_model(p.estimator)
            _report.set_params(gscv.best_params_)
            _report.set_cv(gscv.best_estimator_, gscv.best_score_, gscv.cv_results_)
            reports.add_report(_report)
            reports.save()
            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
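# A hedged sketch of what the grid_search helper used above might look like,
# based on how expanding_window maps to TimeSeriesSplit elsewhere in this
# codebase. The repository's actual implementation is not shown in this
# excerpt, so treat this as an illustrative assumption.
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

def grid_search(estimator, param_grid, X, y, cv=5, n_jobs=-1,
                expanding_window=False, scoring='accuracy'):
    # With expanding_window=N, use an N-split expanding-window CV so folds
    # never train on data that comes after the validation slice
    if expanding_window:
        cv = TimeSeriesSplit(n_splits=expanding_window)
    gscv = GridSearchCV(estimator, param_grid=param_grid, cv=cv,
                        n_jobs=n_jobs, scoring=scoring)
    gscv.fit(X, y)
    return gscv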
    logger.info("Classification report on train set")
    predictions1 = clf.predict(X_train)
    train_report = classification_report(y_train, predictions1, output_dict=True)
    print(classification_report(y_train, predictions1))

    logger.info("Classification report on test set")
    predictions2 = clf.predict(X_test)
    test_report = classification_report(y_test, predictions2, output_dict=True)
    print(classification_report(y_test, predictions2))

    stats = {
        'score': accuracy_score(y_train, predictions1),
        'mse': mean_squared_error(y_train, predictions1),
        'test_score': accuracy_score(y_test, predictions2),
        'test_mse': mean_squared_error(y_test, predictions2),
        'train_report': train_report,
        'test_report': test_report,
    }
    print(stats)
    print("--- end ---")

if __name__ == '__main__':
    logger.setup(
        filename='../test_pickled_model.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='test_pickled_model'
    )
    main()
import logging
import json
import os
import pickle

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from lib.log import logger
# load_dataset and the target_* helpers below are assumed to come from the
# repository's dataset library (the exact import is not shown in this excerpt)

def main():
    index = load_dataset('all_merged', return_index=True)
    for _sym, data in index.items():
        features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                               index_col='Date', parse_dates=True)
        # Replace infinity with NaN so that it can later be imputed to a finite value
        features = features.replace([np.inf, -np.inf], np.nan)
        # Derive target classes from closing price
        target_pct = target_price_variation(features['close'])
        target = target_binned_price_variation(target_pct, n_bins=2)
        # target = target_discrete_price_variation(target_pct)
    print("--- end ---")

if __name__ == '__main__':
    logger.setup(
        filename='../blockchain_features.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='blockchain_features'
    )
    main()
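# Hedged sketch of the target derivation, assuming target_price_variation
# returns the next-period percent change of the close and
# target_binned_price_variation discretizes it into n_bins classes. These
# helpers live in the repository's dataset library; this is only an
# illustrative equivalent, not the actual implementation.
import pandas as pd

def target_price_variation_sketch(close: pd.Series, periods: int = 1) -> pd.Series:
    # Percent variation of the close over the next `periods` steps
    return close.pct_change(periods=periods).shift(-periods)

def target_binned_price_variation_sketch(pct: pd.Series, n_bins: int = 2) -> pd.Series:
    # Quantile-based binning into n_bins integer-labeled classes
    return pd.qcut(pct, q=n_bins, labels=False)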
            'xls': 'data/datasets/all_merged/excel/{}_faceted.xlsx'.format(_sym.lower()),
            'target_csv': 'data/datasets/all_merged/csv/{}_target.csv'.format(_sym.lower()),
            'target_xls': 'data/datasets/all_merged/excel/{}_target.xlsx'.format(_sym.lower()),
            'features': {
                'price_history': [c for c in history_facet.columns],
                'trend': [c for c in trend_facet.columns],
                'volatility': [c for c in volatility_facet.columns],
                'volume': [c for c in volume_facet.columns],
                'chain': [c for c in chain_facet.columns],
            }
        }
        logger.info('Saved {} in data/datasets/all_merged/'.format(_sym))
    with open('data/datasets/all_merged/index_faceted.json', 'w') as f:
        json.dump(index, f, sort_keys=True, indent=4)

if __name__ == '__main__':
    logger.setup(
        filename='../build_dataset.log',
        filemode='w',
        root_level=logging.DEBUG,
        log_level=logging.DEBUG,
        logger='build_dataset'
    )
    build_merged_dataset()
    build_atsa_dataset('all_merged')
    build_improved_dataset('all_merged')
    build_faceted_dataset('all_merged')
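# For reference, a minimal hedged sketch of how downstream scripts can consume
# the faceted index written above; the key names follow the dict built in this
# excerpt, the rest is illustrative.
import json

with open('data/datasets/all_merged/index_faceted.json') as f:
    faceted_index = json.load(f)
for sym, entry in faceted_index.items():
    print(sym, entry['target_csv'], sorted(entry['features'].keys()))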
def build_model(dataset, pipeline, experiment, current_target='class', test_size=0.3):
    models_dir = './results/{}_{}_{}/models/'.format(dataset, pipeline, experiment)
    reports_dir = './results/{}_{}_{}/reports/'.format(dataset, pipeline, experiment)
    experiment_index_file = './results/{}_{}_{}/index.json'.format(dataset, pipeline, experiment)
    log_file = './results/{}_{}_{}/model_build.log'.format(dataset, pipeline, experiment)
    scoring = make_scorer(precision_score, zero_division=1, average='micro')
    os.makedirs(models_dir, exist_ok=True)
    os.makedirs(reports_dir, exist_ok=True)
    # Setup logging
    logger.setup(filename=log_file, filemode='w', root_level=logging.DEBUG,
                 log_level=logging.DEBUG, logger='build_model')
    index_name = 'index'
    if '.' in dataset:
        splits = dataset.split(".")
        dataset = splits[0]
        index_name = splits[1]
    # Load the dataset index
    dataset_index = load_dataset(dataset, return_index=True, index_name=index_name)
    logger.info('Start experiment: {} using {} on {} with target {}'.format(
        experiment, pipeline, dataset, current_target))
    reports = ReportCollection(dataset, pipeline, experiment)
    # Only process BTC for this experiment
    for _sym, data in {'BTC': dataset_index['BTC']}.items():
        try:
            logger.info('Start processing: {}'.format(_sym))
            features = pd.read_csv(data['csv'], sep=',', encoding='utf-8',
                                   index_col='Date', parse_dates=True)
            targets = pd.read_csv(data['target_csv'], sep=',', encoding='utf-8',
                                  index_col='Date', parse_dates=True)
            # Drop columns whose values are all NaN, as well as rows with ANY
            # NaN value, then replace infinity values with NaN so that they
            # can later be imputed to a finite value
            features = features.dropna(axis='columns', how='all').dropna().replace([np.inf, -np.inf], np.nan)
            target = targets.loc[features.index][current_target]
            # X_train, X_test, y_train, y_test = train_test_split(features, target, shuffle=False, test_size=test_size)
            # Chronological train/test split: the first (1 - test_size) share
            # of the records is used for training, the rest for testing
            all_size = features.shape[0]
            train_size = int(all_size * (1 - test_size))
            features = detabularise(features[[c for c in features.columns if 'close' in c]])
            X_train = features.iloc[0:train_size]
            y_train = target.iloc[0:train_size]
            X_test = features.iloc[train_size:all_size]
            y_test = target.iloc[train_size:all_size]
            logger.info("Start training")
            clf = ShapeletTransformClassifier(time_contract_in_mins=5)
            clf.fit(X_train, y_train)
            print('{} Score: {}'.format(_sym, clf.score(X_test, y_test)))
            pred = clf.predict(X_test)
            print(classification_report(y_test, pred))
            logger.info("End training")
            logger.info("--- {} end ---".format(_sym))
        except Exception as e:
            logger.error(
                "Exception while building model pipeline: {} dataset: {} symbol: {}\nException:\n{}"
                .format(pipeline, dataset, _sym, e))
            traceback.print_exc()
    return reports
)
# nargs='?', default='all_merged.index_improved',
args = parser.parse_args()

os.makedirs('./equities/{}/'.format(args.name), exist_ok=True)
models_cache_dir = './equities/{}/models'.format(args.name)
os.makedirs(models_cache_dir, exist_ok=True)
db_file = './equities/{}/status.db'.format(args.name)
if os.path.exists(db_file):
    os.remove(db_file)
engine = create_engine('sqlite:///' + db_file)
Base.metadata.bind = engine
session_factory = sessionmaker(bind=engine)
DBSession = scoped_session(session_factory)
migrate(db_file)  # Create status database and tables
logger.setup(
    filename='./equities/{}/log.txt'.format(args.name),
    filemode='w',
    root_level=logging.DEBUG,
    log_level=logging.DEBUG,
    logger='equity'
)
# result = trailing_window_day(pipeline_name='debug_xgboost',
#                              parameters='./results/timedspline_safe_debug_xgboost_splines_experiment_171020_040945/',
#                              dataset='timedspline_safe',
#                              symbols=['ADA', 'BTC'],
#                              day='2018-08-01',
#                              window_size=150)
beg = '2018-06-01'
end = '2018-09-01'
_symbols = [
    'ADA', 'BCH', 'BNB',
parser = argparse.ArgumentParser(description='Build and tune models, collect results')
parser.add_argument('-n', dest='name', nargs='?', default='equity_test',
                    help="Name for the current equity")
# nargs='?', default='all_merged.index_improved',
args = parser.parse_args()

# Create status directory and DB file
base_dir = './equities/{}/'.format(args.name)
os.makedirs(base_dir, exist_ok=True)
db_file = '{}/status.db'.format(base_dir)
# if os.path.exists(db_file):
#     os.remove(db_file)

# Setup logging
logger.setup(
    filename='{}/log.txt'.format(base_dir),
    filemode='w',
    root_level=logging.DEBUG,
    log_level=logging.DEBUG,
    logger='equity'
)
# Create SQLAlchemy engine and bind the models
engine = create_engine('sqlite:///' + db_file)
Base.metadata.bind = engine
session_factory = sessionmaker(bind=engine)
DBSession = scoped_session(session_factory)
# Create status database and tables (only if db is new)
migrate(db_file)
# Create exchange instance
exchange = Exchange(DBSession)
strategies = {}
for s in SYMBOLS:
    # Deposit initial asset amount
import logging
import json

import pandas as pd

from lib.log import logger
from old.lib.plotter import correlation
import lib.dataset as builder

INTERACTIVE_FIGURE = False
SYMBOLS = [
    'ADA', 'BCH', 'BNB', 'BTC', 'BTG', 'DASH', 'DOGE', 'EOS', 'ETC', 'ETH',
    'IOT', 'LTC', 'LINK', 'NEO', 'QTUM', 'TRX', 'USDT', 'VEN', 'WAVES', 'XEM',
    'XMR', 'XRP', 'ZEC', 'ZRX'
]
logger.setup(
    filename='../dataset_ohlcv_social.log',
    filemode='w',
    root_level=logging.DEBUG,
    log_level=logging.DEBUG,
    logger='dataset_ohlcv_social'
)
index = {}
for _sym in SYMBOLS:
    ohlcv = pd.read_csv("./data/preprocessed/ohlcv/csv/{}.csv".format(_sym.lower()),
                        sep=',', encoding='utf-8', index_col='Date', parse_dates=True)
    cm = pd.read_csv("./data/preprocessed/cryptocompare_social/csv/{}.csv".format(_sym.lower()),
                     sep=',',