def compare_by_weekend(csv_path=r'data\J_2019.csv'):
    """Compare fitted models across the weekend/workday split of the data.

    Args:
        csv_path (str): Path to the input CSV. Defaults to the 2019
            J-station file used throughout this module (kept as the old
            hard-coded value for backward compatibility).

    Returns:
        DataFrame: results from ``dproc.compare_by_component``.
    """
    df = pd.read_csv(csv_path)
    df = dproc.clean_data(df)
    # 'vikend_delovnik' (= weekend/workday) component, months 1-10.
    df_results = dproc.compare_by_component(
        df, 'vikend_delovnik', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        models_type, [0, 0], [''], save_file_to='vikendi.pdf')
    return df_results
def compare_by_weather(csv_path=r'data\J_2019.csv'):
    """Compare fitted models across weather ('vreme') categories.

    Args:
        csv_path (str): Path to the input CSV. Defaults to the 2019
            J-station file used throughout this module (kept as the old
            hard-coded value for backward compatibility).

    Returns:
        DataFrame: results from ``dproc.compare_by_component``.
    """
    df = pd.read_csv(csv_path)
    df = dproc.clean_data(df)
    # Rows without a recorded weather value cannot be grouped — drop them.
    df = df.dropna(subset=['vreme'])
    df_results = dproc.compare_by_component(
        df, 'vreme', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        models_type, [0, 0], [''], save_file_to='vreme.pdf')
    return df_results
def compare_by_day(csv_path=r'data\J_2019.csv'):
    """Compare fitted models per day ('dan'), plotted on a 2x1 grid.

    Args:
        csv_path (str): Path to the input CSV. Defaults to the 2019
            J-station file used throughout this module (kept as the old
            hard-coded value for backward compatibility).

    Returns:
        DataFrame: results from ``dproc.compare_by_component``.
    """
    df = pd.read_csv(csv_path)
    df = dproc.clean_data(df)
    # Group flags [0,0,0,0,1,1,0] map the seven days onto the two named
    # groups 'delovnik' (workday) and 'vikend' (weekend).
    df_results = dproc.compare_by_component(
        df, 'dan', [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        models_type, [0, 0, 0, 0, 1, 1, 0], ['delovnik', 'vikend'],
        rows=2, cols=1, save_file_to='dnevi.pdf')
    return df_results
def get_averaged_btc(fiat='EUR'):
    """Fetch BTC weighted prices from several exchanges and merge them.

    Pulls the 'BCHARTS/<EXCHANGE><FIAT>' dataset from Quandl for each
    exchange, merges the 'Weighted Price' columns into one frame and
    returns the cleaned result.

    Args:
        fiat (str): Fiat currency code appended to the exchange code.

    Returns:
        The cleaned, merged price DataFrame.
    """
    from data_processing import merge_dfs_on_column, clean_data

    exchanges = ['COINBASE', 'BITSTAMP', 'ITBIT', 'KRAKEN']
    # One Quandl dataset per exchange, keyed by exchange name.
    exchange_data = {
        name: get_quandl_data('BCHARTS/{0}{1}'.format(name, fiat))
        for name in exchanges
    }
    btc_fiat_datasets = merge_dfs_on_column(
        list(exchange_data.values()),
        list(exchange_data.keys()),
        'Weighted Price')
    return clean_data(btc_fiat_datasets)
def split_data(df):
    """Clean *df*, split it 90/10 into train/test and strip outliers.

    Outlier statistics for 'min_duration' are fitted on the training
    split and then re-applied to the test split, so both sets are
    filtered with the same mean/std. Both results are persisted.

    Args:
        df (DataFrame): raw input data.

    Returns:
        tuple: ``(train, test)`` DataFrames.
    """
    cleaned = clean_data(df)
    # Fixed seed keeps the split reproducible across runs.
    train_set, test_set = cleaned.randomSplit([0.9, 0.1], seed=123)
    # Fit outlier bounds on train; remove_outliers also returns them.
    train_set, mean_dur, std_dur = remove_outliers(train_set, 'min_duration')
    # Reuse the training statistics on the test set (no refitting).
    test_set = remove_outliers(test_set, 'min_duration', False, mean_dur, std_dur)
    train_set.persist()
    test_set.persist()
    return train_set, test_set
def compare_by_type(csv_path=r'data\J_2019.csv'):
    """Compare fitted models across vehicle-type columns.

    Compares the 'osebno_v' (passenger), 'avtobus' (bus) and
    'tovorno_v' (freight) columns, plotted on a 2x1 grid.

    Args:
        csv_path (str): Path to the input CSV. Defaults to the 2019
            J-station file used throughout this module (kept as the old
            hard-coded value for backward compatibility).

    Returns:
        DataFrame: results from ``dproc.compare_by_component``.
    """
    # Display labels for the plots (plain dict literal; the old
    # dict({...}) wrapper was a redundant copy).
    slo_names = {
        "osebno_v": "osebna v.",
        "avtobus": "avtobusi",
        "tovorno_v": "tovorna v.",
    }
    df = pd.read_csv(csv_path)
    df = dproc.clean_data(df)
    # Group flags [0,1,0] put passenger+freight in one group, buses in
    # the other, matching the two group names below.
    df_results = dproc.compare_by_component(
        df, ['osebno_v', 'avtobus', 'tovorno_v'],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        models_type, [0, 1, 0], ['osebna in tovorna vozila', 'avtobusi'],
        labels=slo_names, save_file_to='tipi_vozil.pdf',
        rows=2, cols=1, multiple_cols=True, main_name='tipi_vozil')
    return df_results
print("") if __name__ == '__main__': # load BART data print ('Loading BART data.') df_bart_import = pd.read_pickle('../data/bart/df_bart_hourly.pkl') df_bart_hourly = df_bart_import.copy() df_bart = data_processing.clean_data_bart(df_bart_hourly) # load forecast.io data from SQL database print ('Loading weather data.') df_forecast = sql_helper.db_load_weather() df = data_processing.clean_data(df_forecast, features=['dayofweek', 'holiday', 'dayofyear', 'pressure', 'apparenttemperaturemin-3']) # assign X and y values for model df_train = df.copy() y = df_bart['20110101':'20151231'].counts_normed.values X = df_train['20110101':'20151231'].values # build a classifier clf = RandomForestRegressor() # use a full grid over all parameters param_grid = {'max_depth': [10, 15, 20, 30, None], 'max_features': ['sqrt', 'log2', None], 'min_samples_split': [1, 2, 4, 6, 8], 'min_samples_leaf': [1, 2, 4, 6, 8],
'precipintensitymax', 'pressure', 'temperaturemax', 'temperaturemin', 'time', 'windspeed' ] #update dataframe with specified features df_daily = df_daily[colnames] # update SQL database sql_helper.db_update(df_daily, colnames) #load forecast from database engine = sqlalchemy.create_engine("postgres://postgres@/forecast") conn = engine.connect() df_predict_import = pd.read_sql('''SELECT * FROM forecast_daily ORDER BY time DESC LIMIT 45''', con=engine) conn.close() engine.dispose() df_predict = data_processing.clean_data(df_predict_import, features=[ 'dayofweek', 'holiday', 'dayofyear', 'pressure', 'apparenttemperaturemin-3' ]) df = predict(df_predict) print 'Predictions:' print df
# Build a frame from the daily forecast payload; normalise columns to
# lowercase so they match the feature names used below.
df_daily = pd.DataFrame(forecast.json['daily']['data'])
df_daily.columns = [x.lower() for x in df_daily.columns]

# specify columns/features to keep
colnames = ['apparenttemperaturemax', 'apparenttemperaturemin',
            'precipintensity', 'precipintensitymax', 'pressure',
            'temperaturemax', 'temperaturemin', 'time', 'windspeed']

# update dataframe with specified features
df_daily = df_daily[colnames]

# update SQL database
sql_helper.db_update(df_daily, colnames)

# load the most recent 45 forecast rows back from the database
engine = sqlalchemy.create_engine("postgres://postgres@/forecast")
# NOTE(review): read_sql below is passed `con=engine`, so this explicit
# connection appears unused — verify before removing it.
conn = engine.connect()
df_predict_import = pd.read_sql('''SELECT * FROM forecast_daily ORDER BY time DESC LIMIT 45''', con=engine)
conn.close()
engine.dispose()

df_predict = data_processing.clean_data(
    df_predict_import,
    features=['dayofweek', 'holiday', 'dayofyear', 'pressure',
              'apparenttemperaturemin-3'])

df = predict(df_predict)

# Use print() calls (valid on Python 2 and 3) instead of the old
# Python-2-only `print x` statements, matching the rest of the file.
print('Predictions:')
print(df)
if __name__ == '__main__': # load BART data print('Loading BART data.') df_bart_import = pd.read_pickle('../data/bart/df_bart_hourly.pkl') df_bart_hourly = df_bart_import.copy() df_bart = data_processing.clean_data_bart(df_bart_hourly) # load forecast.io data from SQL database print('Loading weather data.') df_forecast = sql_helper.db_load_weather() df = data_processing.clean_data(df_forecast, features=[ 'dayofweek', 'holiday', 'dayofyear', 'pressure', 'apparenttemperaturemin-3' ]) # assign X and y values for model df_train = df.copy() y = df_bart['20110101':'20151231'].counts_normed.values X = df_train['20110101':'20151231'].values # build a classifier clf = RandomForestRegressor() # use a full grid over all parameters param_grid = { 'max_depth': [10, 15, 20, 30, None], 'max_features': ['sqrt', 'log2', None],