Example #1
def compare_by_weekend():
    df = pd.read_csv(r'data\J_2019.csv')
    df = dproc.clean_data(df)
    # 'vikend_delovnik' flags weekend vs. weekday (Slovenian)
    df_results = dproc.compare_by_component(df,
                                            'vikend_delovnik',
                                            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                                            models_type, [0, 0], [''],
                                            save_file_to='vikendi.pdf')
    return df_results
Example #2
def compare_by_weather():
    df = pd.read_csv(r'data\J_2019.csv')
    df = dproc.clean_data(df)
    df = df.dropna(subset=['vreme'])
    # 'vreme' is Slovenian for weather
    df_results = dproc.compare_by_component(df,
                                            'vreme',
                                            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                                            models_type, [0, 0], [''],
                                            save_file_to='vreme.pdf')
    return df_results
Example #3
def compare_by_day():
    df = pd.read_csv(r'data\J_2019.csv')
    df = dproc.clean_data(df)
    # 'dan' (day) is split into 'delovnik' (weekday) and 'vikend' (weekend)
    df_results = dproc.compare_by_component(df,
                                            'dan',
                                            [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                                            models_type, [0, 0, 0, 0, 1, 1, 0],
                                            ['delovnik', 'vikend'],
                                            rows=2,
                                            cols=1,
                                            save_file_to='dnevi.pdf')

    return df_results
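The dproc module itself is not shown; from the calls above one can infer a signature roughly like the stub below (parameter names are hypothetical, and the 1-10 list presumably selects months):

def compare_by_component(df, component, months, models_type, group_flags,
                         group_names, rows=1, cols=1, save_file_to=None,
                         **kwargs):
    """Hypothetical stub: fit the configured models to counts grouped by
    `component` over `months` and save a comparison figure to
    `save_file_to`, returning a DataFrame of results."""
    raise NotImplementedError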
Example #4
def get_averaged_btc(fiat='EUR'):
    from data_processing import merge_dfs_on_column, clean_data

    exchanges = ['COINBASE', 'BITSTAMP', 'ITBIT', 'KRAKEN']

    exchange_data = {}

    for exchange in exchanges:
        exchange_code = 'BCHARTS/{0}{1}'.format(exchange, fiat)
        btc_exchange_df = get_quandl_data(exchange_code)
        exchange_data[exchange] = btc_exchange_df

    btc_fiat_datasets = merge_dfs_on_column(list(exchange_data.values()),
                                            list(exchange_data.keys()),
                                            'Weighted Price')

    return clean_data(btc_fiat_datasets)
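A plausible implementation of the imported merge_dfs_on_column helper, assuming it simply collects the given column from each exchange's dataframe into one combined frame (the real data_processing source is not shown):

import pandas as pd

def merge_dfs_on_column(dataframes, labels, col):
    # Pull `col` from each dataframe; pandas aligns the series on the index.
    return pd.DataFrame({label: df[col]
                         for label, df in zip(labels, dataframes)})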
Example #5
def split_data(df):
    """Cleans, splits and removes outliers from test and training set.

    Args:
        df (DataFrame)

    Returns:
        train (DataFrame)
        test (DataFrame)

    """
    clean_df = clean_data(df)
    train, test = clean_df.randomSplit([0.9, 0.1], seed=123)
    train, dur_mean, dur_std = remove_outliers(train, 'min_duration')
    test = remove_outliers(test, 'min_duration', False, dur_mean, dur_std)
    train.persist()
    test.persist()
    return train, test
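A minimal sketch of the remove_outliers helper assumed above (hypothetical; only its call signature is visible). When it computes the statistics it returns them, so the test set can be filtered with the training-set mean and standard deviation:

from pyspark.sql import functions as F

def remove_outliers(df, col, compute_stats=True, mean=None, std=None):
    # Drop rows more than three standard deviations from the column mean.
    if compute_stats:
        row = df.select(F.mean(col).alias('mu'),
                        F.stddev(col).alias('sigma')).first()
        mean, std = row['mu'], row['sigma']
    filtered = df.filter(F.abs(F.col(col) - mean) <= 3 * std)
    return (filtered, mean, std) if compute_stats else filtered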
Example #6
def compare_by_type():
    # Slovenian display labels: passenger vehicles, buses, freight vehicles
    slo_names = {
        "osebno_v": "osebna v.",
        "avtobus": "avtobusi",
        "tovorno_v": "tovorna v.",
    }
    df = pd.read_csv(r'data\J_2019.csv')
    df = dproc.clean_data(df)
    df_results = dproc.compare_by_component(
        df, ['osebno_v', 'avtobus', 'tovorno_v'],
        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
        models_type, [0, 1, 0], ['osebna in tovorna vozila', 'avtobusi'],
        labels=slo_names,
        save_file_to='tipi_vozil.pdf',
        rows=2,
        cols=1,
        multiple_cols=True,
        main_name='tipi_vozil')
    return df_results
Example #7
        print("")

if __name__ == '__main__':

    # load BART data
    print('Loading BART data.')
    df_bart_import = pd.read_pickle('../data/bart/df_bart_hourly.pkl')
    df_bart_hourly = df_bart_import.copy()
    df_bart = data_processing.clean_data_bart(df_bart_hourly)

    # load forecast.io data from SQL database
    print('Loading weather data.')
    df_forecast = sql_helper.db_load_weather()

    df = data_processing.clean_data(df_forecast,
                                    features=[
                                        'dayofweek', 'holiday', 'dayofyear',
                                        'pressure', 'apparenttemperaturemin-3'
                                    ])

    # assign X and y values for model
    df_train = df.copy()
    y = df_bart['20110101':'20151231'].counts_normed.values
    X = df_train['20110101':'20151231'].values

    # build a classifier
    clf = RandomForestRegressor()

    # use a full grid over all parameters
    param_grid = {'max_depth': [10, 15, 20, 30, None],
                  'max_features': ['sqrt', 'log2', None],
                  'min_samples_split': [1, 2, 4, 6, 8],
                  'min_samples_leaf': [1, 2, 4, 6, 8]}
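The snippet ends at the parameter grid; the "full grid over all parameters" comment suggests scikit-learn's GridSearchCV, so a plausible (hypothetical) continuation would be:

    from sklearn.model_selection import GridSearchCV

    grid_search = GridSearchCV(clf, param_grid=param_grid, cv=5, n_jobs=-1)
    grid_search.fit(X, y)
    print('Best parameters:', grid_search.best_params_)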
Example #8
    # specify columns/features to keep
    colnames = [
        'apparenttemperaturemax', 'apparenttemperaturemin', 'precipintensity',
        'precipintensitymax', 'pressure', 'temperaturemax', 'temperaturemin',
        'time', 'windspeed'
    ]

    # update dataframe with specified features
    df_daily = df_daily[colnames]

    # update SQL database
    sql_helper.db_update(df_daily, colnames)

    # load forecast from database
    engine = sqlalchemy.create_engine("postgresql://postgres@/forecast")
    conn = engine.connect()
    df_predict_import = pd.read_sql('''SELECT *
                                       FROM forecast_daily
                                       ORDER BY time DESC LIMIT 45''',
                                    con=engine)
    conn.close()
    engine.dispose()

    df_predict = data_processing.clean_data(df_predict_import,
                                            features=[
                                                'dayofweek', 'holiday',
                                                'dayofyear', 'pressure',
                                                'apparenttemperaturemin-3'
                                            ])

    df = predict(df_predict)
    print('Predictions:')
    print(df)
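The predict helper is defined elsewhere in the project; a minimal sketch of what it might do (hypothetical, assuming the tuned regressor was saved with joblib under a made-up file name):

import joblib
import pandas as pd

def predict(df_features):
    clf = joblib.load('rf_model.pkl')  # hypothetical path
    preds = clf.predict(df_features.values)
    return pd.DataFrame({'prediction': preds}, index=df_features.index)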
Example #9
    df_daily = pd.DataFrame(forecast.json['daily']['data'])
    df_daily.columns = [x.lower() for x in df_daily.columns]

    # specify columns/features to keep
    colnames = ['apparenttemperaturemax', 'apparenttemperaturemin',
                'precipintensity', 'precipintensitymax', 'pressure',
                'temperaturemax', 'temperaturemin', 'time', 'windspeed']

    # update dataframe with specified features
    df_daily = df_daily[colnames]

    # update SQL database
    sql_helper.db_update(df_daily, colnames)

    # load forecast from database
    engine = sqlalchemy.create_engine("postgresql://postgres@/forecast")
    conn = engine.connect()
    df_predict_import = pd.read_sql('''SELECT *
                                       FROM forecast_daily
                                       ORDER BY time DESC LIMIT 45''', con=engine)
    conn.close()
    engine.dispose()

    df_predict = data_processing.clean_data(df_predict_import,
                                            features=[
                                                'dayofweek', 'holiday',
                                                'dayofyear', 'pressure',
                                                'apparenttemperaturemin-3'
                                            ])

    df = predict(df_predict)
    print('Predictions:')
    print(df)
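sql_helper.db_update is likewise not shown; one plausible (hypothetical) implementation appends the fetched forecast rows to the forecast_daily table:

import sqlalchemy

def db_update(df, colnames):
    # Append the selected forecast columns to the forecast_daily table.
    engine = sqlalchemy.create_engine("postgresql://postgres@/forecast")
    df[colnames].to_sql('forecast_daily', engine,
                        if_exists='append', index=False)
    engine.dispose()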
Example #10
if __name__ == '__main__':

    # load BART data
    print('Loading BART data.')
    df_bart_import = pd.read_pickle('../data/bart/df_bart_hourly.pkl')
    df_bart_hourly = df_bart_import.copy()
    df_bart = data_processing.clean_data_bart(df_bart_hourly)

    # load forecast.io data from SQL database
    print('Loading weather data.')
    df_forecast = sql_helper.db_load_weather()

    df = data_processing.clean_data(df_forecast,
                                    features=[
                                        'dayofweek', 'holiday', 'dayofyear',
                                        'pressure', 'apparenttemperaturemin-3'
                                    ])

    # assign X and y values for model
    df_train = df.copy()
    y = df_bart['20110101':'20151231'].counts_normed.values
    X = df_train['20110101':'20151231'].values

    # build a classifier
    clf = RandomForestRegressor()

    # use a full grid over all parameters
    param_grid = {
        'max_depth': [10, 15, 20, 30, None],
        'max_features': ['sqrt', 'log2', None],
        'min_samples_split': [1, 2, 4, 6, 8],
        'min_samples_leaf': [1, 2, 4, 6, 8]
    }