Example #1
def main(ops, start, end, es_host, es_port, folder, download, download_folder, verbose, concurrency):

    if not ops:
        raise click.UsageError('No Argument provided. Use --help if you need help')

    accepted_args = {
        'es': elasticsearch_updater,
        's3': s3_writer,
        'disk': file_writer
    }

    writers = []
    for op in ops:
        if op in accepted_args.keys():
            writers.append(accepted_args[op])
        else:
            raise click.UsageError('Operation (%s) is not supported' % op)

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()

    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.ERROR)

    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    if 'es' in ops:
        global es
        es = Elasticsearch([{
            'host': es_host,
            'port': es_port
        }])

        create_index(es_index, es_type)

    if not start and not end:
        delta = timedelta(days=3)
        start = date.today() - delta
        start = '{0}-{1}-{2}'.format(start.year, start.month, start.day)

    csv_reader(folder, writers, start_date=start, end_date=end, download=download, download_path=download_folder,
               num_worker_threads=concurrency)
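
The click decorators that feed main() its parameters are not part of this example. A minimal sketch of how they might be wired up is shown below; the option names, defaults, and help texts are assumptions for illustration, not the original project's definitions.

import click

@click.command()
@click.argument('ops', nargs=-1)
@click.option('--start', default=None, help='start date, e.g. 2017-1-1')
@click.option('--end', default=None, help='end date')
@click.option('--es-host', default='localhost')
@click.option('--es-port', default=9200, type=int)
@click.option('--folder', default='.')
@click.option('--download', is_flag=True)
@click.option('--download-folder', default='download')
@click.option('--verbose', is_flag=True)
@click.option('--concurrency', default=4, type=int)
def main(ops, start, end, es_host, es_port, folder, download,
         download_folder, verbose, concurrency):
    """Stub body; the real work is done by the main() above."""
    click.echo('ops=%s start=%s end=%s' % (ops, start, end))

if __name__ == '__main__':
    main()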
Example #2
def main(ops, start, end, es_host, es_port, folder, download,
         aws, download_folder, verbose, concurrency):

    if not ops:
        raise click.UsageError('No Argument provided. Use --help if you need help')

    accepted_args = {
        'es': elasticsearch_updater,
        's3': s3_writer,
        'disk': file_writer,
        'thumbs': thumbnail_writer,
        'db': dynamodb_updater
    }

    writers = []
    for op in ops:
        if op in accepted_args.keys():
            writers.append(accepted_args[op])
        else:
            raise click.UsageError('Operation (%s) is not supported' % op)

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()

    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.ERROR)

    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    if 'es' in ops or 'thumbs' in ops:
        create_index(es_index, es_type, es_host, es_port, aws=aws)

    if not start and not end:
        delta = timedelta(days=3)
        start = date.today() - delta
        start = '{0}-{1}-{2}'.format(start.year, start.month, start.day)

    csv_reader(folder, writers, start_date=start, end_date=end, download=download, download_path=download_folder,
               num_worker_threads=concurrency, es_host=es_host, es_port=es_port, aws=aws)
Example #3
def insert_csv_data():
    session = Session()
    try:
        data = csv_reader()
        print('Data import in process...')
        # json_dump('btw17_data.json', data)
        if insert_counties(data) == 'SUCCESS':
            print('Data import successful')
    finally:
        session.close()
        print('DONE.')
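
Session and insert_counties come from the surrounding project and are not shown here. As a rough illustration only, a SQLAlchemy session factory of the kind this pattern assumes could be set up like this (the connection URL is a placeholder):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# Placeholder database URL; the real project configures its own engine.
engine = create_engine('sqlite:///btw17.db')
Session = sessionmaker(bind=engine)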
Example #4
def scan_file(path):
    with open(path, "r") as f_obj:
        statistics = reader.csv_reader(f_obj)
    return statistics
Example #5
def main():

    # create a folder "output" under the current folder to hold the results
    create_folders('output', ['acf-pacf', 'trends', 'pkl'])

    #turn on logging
    fname = './output/' + 'datastats_' + datetime.now().time().strftime(
        '%Hh%Mm') + '.txt'
    logging.basicConfig(filename=fname,
                        level=logging.INFO,
                        format='%(message)s')

    df = csv_reader(s.filename,
                    field_names=s.field_names,
                    desc=s.desc_names,
                    nrows=5000)

    # ALPHA needs to be tuned according to the training sample size
    ALPHA = s.set_alpha(df.shape[0])

    # Time-consuming!... just refer to the attached file Infogain.txt
    # logging.info('\n' * 2)
    # logging.info('===================================================================================')
    # logging.info('                       Information Gain by different variables                     ')
    # logging.info('===================================================================================')
    #
    # condits=['SlicedStart','Vehicle','Weekday','Weekend_not','Service','SlicedStart+Weekday']
    # gain, summary = check_info_gain(df, s.field_names, condits)
    #
    # logging.info("""\n We observed that :
    # \t 1. Start time is relevant, indicating day trends exist with respect to Start time
    # \t 2. Vehicle has some relevance, but likely due to outliers (some group is very small).
    # \t 3. Weekdays has little relevance.
    # \t 4. (Start, Weekdays) has some relevance, but likely due to outliers, ignored for now.\n """)

    logging.info('\n' * 2)
    logging.info(
        '==================================================================================='
    )
    logging.info(
        '                         Daily bus dispatch intervals                              '
    )
    logging.info(
        '==================================================================================='
    )
    logging.info(
        'Check the day trend plot of dispatch interval in the folder ./output/trends'
    )

    check_bus_dispatch_interval(df,
                                alpha=ALPHA,
                                resample_interval=s.RESAMPLING_INTERVAL)

    logging.info("""The trend plot shows
         \t 1.the dispatch interval varies over a day.
          \t 2.The standard deviation over the smoothing window indicates that the dispatch interval varies greatly between different days.\n
    Based on the above observation and considering the randomness in bus dispatch, it does not make much sense to evenly resample the daily data over time. Instead, we just treat each bus run in a day as a sequential data point.\n"""
                 )

    logging.info('\n' * 2)
    logging.info(
        '==================================================================================='
    )
    logging.info(
        '              Average Day Trends of All the Stop/Travel Times                      '
    )
    logging.info(
        '==================================================================================='
    )
    logging.info(
        'Check the day trend plots in the folder ./output/trends. The plots are downsampled for plotting'
    )
    df, trends, df_res = denoise_extract_trend(
        df,
        field_names=s.field_names,
        alpha=ALPHA,
        verbose=VERBOSE,
        resample_interval=s.RESAMPLING_INTERVAL)

    logging.info('\n' * 2)
    logging.info(
        '==================================================================================='
    )
    logging.info(
        '              cross-correlation between the detrend residuals                      '
    )
    logging.info(
        '==================================================================================='
    )

    check_xcorr(df_res, s.field_names)

    logging.info(
        '\n We observe that the detrended residuals are mostly not cross-correlated; only a few pairs show low correlation. '
    )

    # logging.info('\n' * 2)
    # logging.info('===================================================================================')
    # logging.info('          information gain of categorical variables on residuals after detrending          ')
    # logging.info('===================================================================================')
    #
    # condits=['SlicedStart','Vehicle','Weekday','Weekend_not','Service','SlicedStart+Weekday']
    # gain_res, summary_res = check_info_gain(df_res.join(df[s.desc_names]), s.field_names, condits)
    #
    # logging.info('\n We observe that information gain Start time.')
    #

    logging.info('\n' * 2)
    logging.info(
        '==================================================================================='
    )
    logging.info(
        '       Investigate p,d,q orders for ARIMA model fitting                             '
    )
    logging.info(
        '==================================================================================='
    )
    logging.info(
        '\nThe d-order is determined iteratively by differencing + ADF tests. ')
    logging.info(
        """Tentative p,q are chosen by investigating the autocorrelation and partial autocorrelation plots. Check the ACF-PACF plots in the folder ./output/acf-pacf"""
    )

    orders = get_pdq(df, df_res, s.field_names)
    with open('./output/pkl/orders.pkl', 'wb') as f:
        pkl.dump(orders, f)

    logging.info('\n')
    logging.info(
        '               ARIMA(p,d,q) order - by Acf-Pacf confidence level check                           '
    )
    logging.info(
        '--------------------------------------------------------------------------------------------------'
    )
    logging.info(orders.to_string(line_width=100))

    logging.info(
        """\nFor %d out of %d variables, the detrended residuals appear to be white noise. For these stops/spans, the trend
    average corresponding to the Start time will be a good prediction. """ %
        ((orders.loc['residual'] == (0, 0, 0)).sum(), orders.shape[1]))

    # TODO: The p, q identified are tentative. The p, q values should be manually adjusted by visually checking the shapes of the ACF and PACF.
    # For example, the p for an AR term (q for an MA term) should be removed if the PACF (ACF) has an obvious cut-off pattern.
    # Some series have a high-order AR term, indicating under-differencing.

    logging.info('\n' * 2)
    logging.info(
        """\n ----------------------    The data exploration is over!    ------------------------------  """
    )

    logging.shutdown()
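
get_pdq() is project code; the "differencing + ADF test" search for the d-order described in the log messages above could, as a rough sketch, look like the following (estimate_d is a hypothetical helper built on statsmodels' adfuller, not the project's implementation):

from statsmodels.tsa.stattools import adfuller

def estimate_d(series, max_d=2, alpha=0.05):
    """Difference a pandas Series until the ADF test rejects a unit root."""
    d = 0
    x = series.dropna()
    while d <= max_d:
        p_value = adfuller(x.values)[1]  # adfuller returns (stat, p-value, ...)
        if p_value < alpha:              # stationary enough: stop differencing
            return d
        x = x.diff().dropna()
        d += 1
    return max_d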
Example #6
def main(ops, csv, start, end, folder, download, download_folder, verbose, concurrency, product):
    global ipfs_api
    
    if not ops:
        raise click.UsageError('No Argument provided. Use --help if you need help')
    
    ipfs_api = ipfsapi.connect('127.0.0.1', 5001)
    ipfs_id = ipfs_api.id()
    #print "*** ipfs_id", ipfs_id
    #sys.exit(-1)
    
    accepted_args = {
        'es': elasticsearch_updater,
        's3': s3_writer,
        'disk': file_writer
    }
    
    writers = []
    for op in ops:
        if op in accepted_args.keys():
            writers.append(accepted_args[op])
        else:
            raise click.UsageError('Operation (%s) is not supported' % op)
    
    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    
    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.ERROR)
    
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    
    print "folder", folder, "csv", csv
    
    if 'es' in ops:
        global es
        
        es_port = int(os.getenv('ES_PORT', 80))
        es_host = os.getenv('ES_HOST', "NOT_AVAILABLE")
        
        logger.info("Connecting to Elastic Search %s:%d", es_host, es_port)
        es = Elasticsearch([{
            'host': es_host,
            'port': es_port
        }])
        
        create_index(es_index, es_type)
    
    if not start and not end:
        delta = timedelta(days=3)
        start = date.today() - delta
        start = '{0}/{1}/{2}'.format(start.month, start.day, start.year - 2000)
    
    # Simple product upload
    if product:
        print "product", product
        if not os.path.exists(product):
            print "Product does not exist", product
            
        with open(product) as data:
            json_data = json.load(data)
            data.close()
        #print json_data
        logger.info('processing %s' % json_data['scene_id'])
        for w in writers:
            w(download_folder, json_data)
        
        print "Done"
        sys.exit(0)
        
    csv_reader(csv, folder, writers, start_date=start, end_date=end, download=download, download_path=download_folder,
               num_worker_threads=concurrency)
Example #7
def main():

    # create a folder "output" under the current folder to hold results; trained models are saved in 'pkl'
    create_folders('output', ['pkl'])
    # Write results to a file; return stdout to the screen with sys.stdout.back_to_screen() if needed.
    fname = './output/' + "models" + datetime.now().time().strftime(
        "%Hh%Mm") + ".txt"
    logging.basicConfig(filename=fname,
                        level=logging.INFO,
                        format='%(message)s')

    df = csv_reader(s.filename,
                    field_names=s.field_names,
                    desc=s.desc_names,
                    split=s.split)

    # ALPHA needs to be tuned according to the training sample size
    ALPHA = s.set_alpha(df.shape[0])

    # denoise (forward mode); can be altered to run online
    df, trends, df_res = denoise_extract_trend(
        df,
        field_names=s.field_names,
        alpha=ALPHA,
        verbose=VERBOSE,
        resample_interval=s.RESAMPLING_INTERVAL)

    # select, estimate and pickle the ARIMA models for the original series and the detrending residuals (ARIMA with trend term)

    models_0 = {}
    d_0 = {}
    models_trend = {}
    d_trend = {}

    dummy_mean = df[s.field_names].mean()
    df_zeroed = df[s.field_names] - dummy_mean

    # get tentative pdq orders
    with open('./output/pkl/orders.pkl', 'rb') as f:
        pdq = pd.read_pickle(f)

    for name, dff, d, mdl in zip(['series', 'residual'], [df_zeroed, df_res],
                                 [d_0, d_trend], [models_0, models_trend]):
        for f in s.field_names:
            d[f], mdl[f] = model_select_fit(dff[f].dropna().values,
                                            pdq.loc[name, f],
                                            VERBOSE=VERBOSE >= 2)
            if VERBOSE >= 1:
                logging.info('\n' * 4 +
                             'For {} -  {} after {}-order differencing'.format(
                                 f, name, d[f]))
                logging.info('\nNone' if mdl[f] is None else mdl[f].summary())

    # pickle all we need for the model with a trend term
    with open('./output/pkl/models_trend.pkl', 'wb') as f:
        pkl.dump(trends, f)
        pkl.dump(s.RESAMPLING_INTERVAL, f)
        pkl.dump(d_trend, f)
        pkl.dump(models_trend, f)

    # pickle the model for the original series
    with open('./output/pkl/models_0.pkl', 'wb') as f:
        pkl.dump(dummy_mean, f)
        pkl.dump(d_0, f)
        pkl.dump(models_0, f)

    # The dummy mean of the training set will also be used for comparing with the forecasts.
    with open('./output/pkl/models_dummy.pkl', 'wb') as f:
        pkl.dump(dummy_mean, f)

    logging.info('\n' * 2)
    logging.info(
        'The models are saved as models_trend.pkl, models_0.pkl, and models_dummy.pkl'
    )
    logging.info(
        """\n ----------------------    Model training is over!    ------------------------------  """
    )

    logging.shutdown()
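
model_select_fit() is also project code. Under the assumption that it fits a single ARIMA(p, d, q) model per series using the tentative orders loaded from orders.pkl, a minimal sketch with statsmodels could look like this (fit_arima is a hypothetical name; the real routine may use a different API or add model selection):

from statsmodels.tsa.arima.model import ARIMA

def fit_arima(values, order):
    """Fit ARIMA(p, d, q) to a 1-D array; return d and the fitted results object."""
    p, d, q = order
    if (p, d, q) == (0, 0, 0):
        return d, None  # nothing to fit: the series already looks like white noise
    results = ARIMA(values, order=(p, d, q)).fit()
    return d, results  # results.summary() gives the report logged above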
Example #8
def add_word(metrics):
    """Fill the metrics dictionary with the computed ECG statistics.

    :param metrics: dictionary to populate
    :type metrics: dictionary
    :return: metrics
    :rtype: dictionary
    """
    metrics["mean_hr_bpm"] = mean_heart_rate
    metrics["voltage_extremes"] = voltage_extremes
    metrics["duration"] = duration
    metrics["num_beats"] = num_beats
    metrics["beats"] = beats

    return metrics


if __name__ == "__main__":
    data_file = 'test_data1.csv'
    logging.config.fileConfig('logger_config.ini', disable_existing_loggers=False)
    data = csv_reader(data_file)
    time_array = data[0]
    voltage_array = data[1]
    validate(time_array, voltage_array)
    beats = beats_test(threshold=0.7, voltage_array=voltage_array,
                       time_array=time_array
                       )
    num_beats = num_beats_test(threshold=0.7, voltage_array=voltage_array)
    duration = duration(time_array=time_array)
    mean_heart_rate = mean_beats(threshold=0.7, voltage_array=voltage_array,
                                 time_array=time_array
                                 )
    voltage_extremes = voltage_extremes(voltage_array=voltage_array)
    my_dictionary = create_metrics()
    my_dictionary = add_word(my_dictionary)
    logging.info(my_dictionary)
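
csv_reader() here is expected to return the time and voltage columns as data[0] and data[1]. A hypothetical stdlib-only sketch of such a reader (not the original module):

import csv

def csv_reader(path):
    """Parse a two-column time,voltage CSV into two lists of floats."""
    times, voltages = [], []
    with open(path, newline='') as f:
        for row in csv.reader(f):
            try:
                t, v = float(row[0]), float(row[1])
            except (ValueError, IndexError):
                continue  # skip header lines and malformed rows
            times.append(t)
            voltages.append(v)
    return times, voltages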
Example #9
from reader import csv_reader
from preprocess import preprocess
from model_select import model_select
from plot_bar_chart import plot_bar_chart
from model_training import model_training
from test import test

path = 'Route24304Train.csv'

(x_date, x_vehicle, x_service, x_start, x_duration, x_spans,
 x_weekday_not) = csv_reader(path)
(x_weekday_not1, x_start1, x_stops1, x_travels1, x_date1, x_weekday_not2,
 x_stops2, x_travels2, x_start2, x_date2) = \
    preprocess(x_date, x_start, x_duration, x_spans, x_weekday_not)

(e_t_t, e_m_t, e_t_s, e_m_s, std_t_t, std_m_t, std_t_s,
 std_m_s) = model_select(x_weekday_not1, x_start1, x_stops1, x_travels1,
                         x_date1)
plot_bar_chart(e_m_t, std_m_t)
plot_bar_chart(e_m_s, std_m_s)
plot_bar_chart(e_t_t, std_t_t)
plot_bar_chart(e_t_s, std_t_s)

model_training(x_weekday_not1, x_start1, x_stops1, x_travels1, x_date1)
(MAE_travel_left, MAE_stops_left, MAE_travel_top, MAE_stop_top, MAE_stop_top,
 MAE_travel_main) = test(x_weekday_not2, x_start2, x_stops2, x_travels2, x_date2)
Example #10
def main():
    """
    The performance of 5 models is compared:
        (1) ARIMA,
        (2) ARIMA with average day trend template,
        (3) day trend average,
        (4) day trend + ewma filtered detrending residual
        (5) dummy mean
    The results are saved in a log file.

    """
    # Outliers are removed before being used for forecasting.
    # But outliers are included in evaluation, so the forecasts are compared with the real raw data.

    create_folders('output', ['forecasts'])
    # turn on logging
    fname = './output/' + "forecasts" + datetime.now().time().strftime(
        "%Hh%Mm") + ".txt"
    logging.basicConfig(filename=fname,
                        level=logging.INFO,
                        format='%(message)s')

    df_test = csv_reader(s.filename,
                         field_names=s.field_names,
                         desc=s.desc_names,
                         split=s.split - 1)

    df_test_original = df_test.copy(
    )  # keep the original data for performance evaluation

    # cleaned observations for forecasting
    # now rmvoutliers_fill is applied to an individual day series, while in the training phase it was applied to an aligned series of all the historical data.
    # use ewm with a larger ALPHA, or a moving window with a smaller window
    for f in s.field_names:
        rmvoutliers_fill(df_test[f], fill='ewm', n=1)

    mdl_list = [
        'ARIMA', 'ARIMA_trend', 'average_trend', 'ewma_trend', 'ewma',
        'dummy_means'
    ]
    err_mae = pd.DataFrame(None, mdl_list, s.field_names)
    err_quant = pd.DataFrame(None, mdl_list, s.field_names)
    for mdl in mdl_list:
        df_hat = forecast(df_test, s.field_names, mdlname=mdl)
        # mean absolute error and 75% quantile
        err_mae.loc[mdl, :] = (df_test_original[s.field_names] -
                               df_hat).abs().mean()
        err_quant.loc[mdl, :] = (df_test_original[s.field_names] -
                                 df_hat).abs().quantile(q=0.75)

    if VERBOSE >= 1:
        # plot an example day series for all the stops.
        mdl_list2 = ['ARIMA_trend', 'average_trend', 'ewma_trend']
        ts = pd.DataFrame(None, columns=mdl_list2)

        date = '2016-07-22'

        for f in s.field_names:
            ts['Original'] = df_test_original.loc[date, f]
            for mdl in mdl_list2:
                ts[mdl] = forecast(df_test[date], [f], mdlname=mdl)

            ax = ts.plot(title="Stop-travel time forecasting for " + f + ' (' +
                         date + ')')

            import matplotlib.dates as mdates
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M'))
            plt.tight_layout()

            fig = ax.get_figure()
            fig.savefig('./output/forecasts/' + f + '.pdf')
            plt.close(fig)

    logging.info('\n' * 2)
    logging.info(
        '======================================================================================================'
    )
    logging.info(
        '         Performance Evaluation of Stop/travel Time Forecast for Next Bus (in seconds)                   '
    )
    logging.info(
        '======================================================================================================'
    )

    logging.info('\n' * 2)
    logging.info(
        '                                   Mean Absolute Error                                          '
    )
    logging.info(
        '--------------------------------------------------------------------------------------------------'
    )
    logging.info(err_mae.to_string(line_width=100))
    logging.info('\n' * 2)
    logging.info('The average mean absolute error over all the stops:\n')
    logging.info(err_mae.mean(axis=1))

    logging.info('\n' * 2)
    logging.info(
        '                           75% Quantile of Forecasting Error                                   '
    )
    logging.info(
        '--------------------------------------------------------------------------------------------------'
    )
    logging.info(err_quant.to_string(line_width=100))
    logging.info('\n' * 2)
    logging.info('The average 75% error quantile over all the stops:\n')
    logging.info(err_quant.mean(axis=1))

    i = 0
    k = 10
    methods = ['average_trend', 'ewma_trend', 'dummy_means']
    while i * k < len(s.field_names):
        ax = plot_bar_chart(err_mae.loc[methods].values[:, i * k:i * k + k],
                            err_quant.loc[methods].values[:, i * k:i * k + k],
                            'Absolute Forecast Error and 75% Quantile',
                            s.field_names[i * k:i * k + k],
                            legend=methods)
        i += 1
        fig = ax.get_figure()
        fig.savefig('./output/forecast_error_' + str(i) + '.pdf')
        plt.close(fig)

    logging.info('\n' * 2)
    logging.info(
        """\n ----------------------    The testing is over!    ------------------------------  """
    )

    logging.shutdown()
Example #11
from reader import csv_reader
from cleaner import cleaner
import pandas as pd

with open('receita.csv', 'r') as csv_file:
    dict_list = csv_reader(csv_file)

file_cleaned = cleaner(dict_list)
imports = pd.DataFrame(file_cleaned)
imports.to_csv(
    r'C:\Users\CalebeLadis\PycharmProjects\csv-reader\receita2.csv',
    index=None,
    header=True)
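
csv_reader and cleaner are the project's own modules. A minimal DictReader-based sketch of a reader that would produce the list of row dicts consumed above (an assumption about its behavior, not the original code):

import csv

def csv_reader(csv_file):
    """Return each CSV row as a dict keyed by the header row."""
    return [dict(row) for row in csv.DictReader(csv_file)]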