def main(ops, start, end, es_host, es_port, folder, download, download_folder,
         verbose, concurrency):
    if not ops:
        raise click.UsageError('No Argument provided. Use --help if you need help')

    accepted_args = {
        'es': elasticsearch_updater,
        's3': s3_writer,
        'disk': file_writer
    }

    writers = []
    for op in ops:
        if op in accepted_args:
            writers.append(accepted_args[op])
        else:
            raise click.UsageError('Operation (%s) is not supported' % op)

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.ERROR)

    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    if 'es' in ops:
        global es
        es = Elasticsearch([{'host': es_host, 'port': es_port}])
        create_index(es_index, es_type)

    if not start and not end:
        delta = timedelta(days=3)
        start = date.today() - delta
        start = '{0}-{1}-{2}'.format(start.year, start.month, start.day)

    csv_reader(folder, writers, start_date=start, end_date=end, download=download,
               download_path=download_folder, num_worker_threads=concurrency)
def main(ops, start, end, es_host, es_port, folder, download, aws,
         download_folder, verbose, concurrency):
    if not ops:
        raise click.UsageError('No Argument provided. Use --help if you need help')

    accepted_args = {
        'es': elasticsearch_updater,
        's3': s3_writer,
        'disk': file_writer,
        'thumbs': thumbnail_writer,
        'db': dynamodb_updater
    }

    writers = []
    for op in ops:
        if op in accepted_args:
            writers.append(accepted_args[op])
        else:
            raise click.UsageError('Operation (%s) is not supported' % op)

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.ERROR)

    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    if 'es' in ops or 'thumbs' in ops:
        create_index(es_index, es_type, es_host, es_port, aws=aws)

    if not start and not end:
        delta = timedelta(days=3)
        start = date.today() - delta
        start = '{0}-{1}-{2}'.format(start.year, start.month, start.day)

    csv_reader(folder, writers, start_date=start, end_date=end, download=download,
               download_path=download_folder, num_worker_threads=concurrency,
               es_host=es_host, es_port=es_port, aws=aws)
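# A possible click wiring for the main() above. This is only a sketch: the
# option names, defaults, and help texts are assumptions and are not taken
# from the original project, which defines its own command decorators.
import click


@click.command()
@click.argument('ops', nargs=-1)
@click.option('--start', default=None, help='Start date (optional)')
@click.option('--end', default=None, help='End date (optional)')
@click.option('--es-host', default='localhost')
@click.option('--es-port', default=9200, type=int)
@click.option('--folder', default=None)
@click.option('--download', is_flag=True)
@click.option('--aws', is_flag=True)
@click.option('--download-folder', default=None)
@click.option('--verbose', is_flag=True)
@click.option('--concurrency', default=4, type=int)
def cli(ops, start, end, es_host, es_port, folder, download, aws,
        download_folder, verbose, concurrency):
    main(ops, start, end, es_host, es_port, folder, download, aws,
         download_folder, verbose, concurrency)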
def insert_csv_data():
    session = Session()
    try:
        data = csv_reader()
        print('Data import in process...')
        # json_dump('btw17_data.json', data)
        if insert_counties(data) == 'SUCCESS':
            print('Data import successful')
    finally:
        session.close()
        print('DONE.')
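# A minimal sketch of the Session factory that insert_csv_data() above relies on,
# assuming SQLAlchemy. The engine URL is a placeholder, not the project's actual
# database configuration.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///btw17.db')  # placeholder connection string
Session = sessionmaker(bind=engine)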
def scan_file(path):
    with open(path, "r") as f_obj:
        statistics = reader.csv_reader(f_obj)
    return statistics
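# Example use of scan_file(); the CSV path below is hypothetical.
statistics = scan_file('data/sample.csv')
print(statistics)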
def main():
    # Create an "output" folder under the current folder to hold results
    create_folders('output', ['acf-pacf', 'trends', 'pkl'])

    # Turn on logging
    fname = './output/' + 'datastats_' + datetime.now().time().strftime('%Hh%Mm') + '.txt'
    logging.basicConfig(filename=fname, level=logging.INFO, format='%(message)s')

    df = csv_reader(s.filename, field_names=s.field_names, desc=s.desc_names, nrows=5000)

    # ALPHA needs to be tuned according to the training sample size
    ALPHA = s.set_alpha(df.shape[0])

    # Time-consuming!... just refer to the attached file Infogain.txt
    # logging.info('\n' * 2)
    # logging.info('===================================================================================')
    # logging.info('                 Information Gain by different variables                 ')
    # logging.info('===================================================================================')
    #
    # condits = ['SlicedStart', 'Vehicle', 'Weekday', 'Weekend_not', 'Service', 'SlicedStart+Weekday']
    # gain, summary = check_info_gain(df, s.field_names, condits)
    #
    # logging.info("""\n We observed that :
    # \t 1. Start time is relevant, indicating day trends exist with respect to Start time
    # \t 2. Vehicle has some relevance, but likely due to outliers (some group is very small).
    # \t 3. Weekdays have little relevance.
    # \t 4. (Start, Weekdays) has some relevance, but likely due to outliers, ignored for now.\n """)

    logging.info('\n' * 2)
    logging.info('===================================================================================')
    logging.info('                          Daily bus dispatch intervals                            ')
    logging.info('===================================================================================')
    logging.info('Check the day trend plot of the dispatch interval in the folder ./output/trends')

    check_bus_dispatch_interval(df, alpha=ALPHA, resample_interval=s.RESAMPLING_INTERVAL)

    logging.info("""The trend plot shows
    \t 1. The dispatch interval varies over a day.
    \t 2. The standard deviation over the smoothing window indicates that the dispatch interval varies greatly between different days.\n
    Based on the above observations, and considering the randomness in bus dispatch, it does not make much sense to evenly resample the daily data over time.
    Instead, we just treat each bus run in a day as a sequential data point.\n""")

    logging.info('\n' * 2)
    logging.info('===================================================================================')
    logging.info('                 Average Day Trends of All the Stop/Travel Times                  ')
    logging.info('===================================================================================')
    logging.info('Check the day trend plots in the folder ./output/trends. The plots are downsampled for plotting')

    df, trends, df_res = denoise_extract_trend(
        df, field_names=s.field_names, alpha=ALPHA, verbose=VERBOSE,
        resample_interval=s.RESAMPLING_INTERVAL)

    logging.info('\n' * 2)
    logging.info('===================================================================================')
    logging.info('                Cross-correlation between the detrending residuals                ')
    logging.info('===================================================================================')

    check_xcorr(df_res, s.field_names)

    logging.info('\n We observe that the residuals after detrending are mostly not cross-correlated. '
                 'Only a few pairs show low correlation.\n')

    # logging.info('\n' * 2)
    # logging.info('===================================================================================')
    # logging.info('   information gain of categorical variables on residuals after detrending   ')
    # logging.info('===================================================================================')
    #
    # condits = ['SlicedStart', 'Vehicle', 'Weekday', 'Weekend_not', 'Service', 'SlicedStart+Weekday']
    # gain_res, summary_res = check_info_gain(df_res.join(df[s.desc_names]), s.field_names, condits)
    #
    # logging.info('\n We observe that information gain Start time.')

    logging.info('\n' * 2)
    logging.info('===================================================================================')
    logging.info('                Investigate p,d,q orders for ARIMA model fitting                  ')
    logging.info('===================================================================================')
    logging.info('\nThe d-order is determined iteratively by differencing + ADF test.')
    logging.info("""Tentative p,q are chosen by investigating the autocorrelation and partial autocorrelation plots.
    Check the ACF-PACF plots in the folder ./output/acf-pacf""")

    orders = get_pdq(df, df_res, s.field_names)
    with open('./output/pkl/orders.pkl', 'wb') as f:
        pkl.dump(orders, f)

    logging.info('\n')
    logging.info('       ARIMA(p,d,q) order - by ACF-PACF confidence level check       ')
    logging.info('--------------------------------------------------------------------------------------------------')
    logging.info(orders.to_string(line_width=100))
    logging.info("""\nFor %d out of %d variables, the detrending residuals appear as white noise.
    For these stops/spans, the trend average corresponding to the Start time will be a good prediction. """
                 % ((orders.loc['residual'] == (0, 0, 0)).sum(), orders.shape[1]))

    # TODO: The p, q identified are tentative. The p, q values should be manually altered by visually
    # checking the shape of the ACF and PACF. For example, the p for the AR term (q for the MA term)
    # should be removed if the PACF (ACF) has an obvious cut-off pattern.
    # Some series have a high-order AR term, indicating under-differencing.

    logging.info('\n' * 2)
    logging.info("""\n    ----------------------  The data exploration is over!  ------------------------------   """)
    logging.shutdown()
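# A minimal sketch of the "differencing + ADF test" idea mentioned above for
# picking the d-order: difference the series until the ADF test rejects a unit
# root. The significance level and maximum order are assumptions, and this
# helper is illustrative rather than the project's actual get_pdq() logic.
import numpy as np
from statsmodels.tsa.stattools import adfuller


def choose_d(series, alpha=0.05, max_d=2):
    x = np.asarray(series, dtype=float)
    for d in range(max_d + 1):
        pvalue = adfuller(x, autolag='AIC')[1]  # index 1 is the ADF p-value
        if pvalue < alpha:
            return d        # stationary at this differencing order
        x = np.diff(x)      # difference once more and test again
    return max_d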
def main(ops, csv, start, end, folder, download, download_folder, verbose,
         concurrency, product):
    global ipfs_api

    if not ops:
        raise click.UsageError('No Argument provided. Use --help if you need help')

    ipfs_api = ipfsapi.connect('127.0.0.1', 5001)
    ipfs_id = ipfs_api.id()
    # print("*** ipfs_id", ipfs_id)
    # sys.exit(-1)

    accepted_args = {
        'es': elasticsearch_updater,
        's3': s3_writer,
        'disk': file_writer
    }

    writers = []
    for op in ops:
        if op in accepted_args:
            writers.append(accepted_args[op])
        else:
            raise click.UsageError('Operation (%s) is not supported' % op)

    logger.setLevel(logging.DEBUG)
    ch = logging.StreamHandler()
    if verbose:
        ch.setLevel(logging.INFO)
    else:
        ch.setLevel(logging.ERROR)

    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    print("folder", folder, "csv", csv)

    if 'es' in ops:
        global es
        es_port = int(os.getenv('ES_PORT', 80))
        es_host = os.getenv('ES_HOST', "NOT_AVAILABLE")
        logger.info("Connecting to Elastic Search %s:%d", es_host, es_port)
        es = Elasticsearch([{'host': es_host, 'port': es_port}])
        create_index(es_index, es_type)

    if not start and not end:
        delta = timedelta(days=3)
        start = date.today() - delta
        start = '{0}/{1}/{2}'.format(start.month, start.day, start.year - 2000)

    # Simple product upload
    if product:
        print("product", product)
        if not os.path.exists(product):
            print("Product does not exist", product)
        with open(product) as data:
            json_data = json.load(data)
        # print(json_data)
        logger.info('processing %s' % json_data['scene_id'])
        for w in writers:
            w(download_folder, json_data)
        print("Done")
        sys.exit(0)

    csv_reader(csv, folder, writers, start_date=start, end_date=end,
               download=download, download_path=download_folder,
               num_worker_threads=concurrency)
def main():
    # Create an "output" folder under the current folder to hold results; trained models are saved in 'pkl'
    create_folders('output', ['pkl'])

    # Write results to file; return stdout back to the screen by sys.stdout.back_to_screen() if needed.
    fname = './output/' + "models" + datetime.now().time().strftime("%Hh%Mm") + ".txt"
    logging.basicConfig(filename=fname, level=logging.INFO, format='%(message)s')

    df = csv_reader(s.filename, field_names=s.field_names, desc=s.desc_names, split=s.split)

    # ALPHA needs to be tuned according to the training sample size
    ALPHA = s.set_alpha(df.shape[0])

    # Denoise in forward mode; can be altered to run online
    df, trends, df_res = denoise_extract_trend(
        df, field_names=s.field_names, alpha=ALPHA, verbose=VERBOSE,
        resample_interval=s.RESAMPLING_INTERVAL)

    # Select, estimate and pickle the ARIMA models for the original series
    # and for the detrending residuals (ARIMA with a trend term)
    models_0 = {}
    d_0 = {}
    models_trend = {}
    d_trend = {}

    dummy_mean = df[s.field_names].mean()
    df_zeroed = df[s.field_names] - dummy_mean

    # Get tentative pdq orders
    with open('./output/pkl/orders.pkl', 'rb') as f:
        pdq = pd.read_pickle(f)

    for name, dff, d, mdl in zip(['series', 'residual'], [df_zeroed, df_res],
                                 [d_0, d_trend], [models_0, models_trend]):
        for f in s.field_names:
            d[f], mdl[f] = model_select_fit(dff[f].dropna().values,
                                            pdq.loc[name, f],
                                            VERBOSE=VERBOSE >= 2)
            if VERBOSE >= 1:
                logging.info('\n' * 4 +
                             'For {} - {} after {}-order differencing'.format(f, name, d[f]))
                logging.info('\nNone' if mdl[f] is None else mdl[f].summary())

    # Pickle all we need for the model with a trend term
    with open('./output/pkl/models_trend.pkl', 'wb') as f:
        pkl.dump(trends, f)
        pkl.dump(s.RESAMPLING_INTERVAL, f)
        pkl.dump(d_trend, f)
        pkl.dump(models_trend, f)

    # Pickle the model for the original series
    with open('./output/pkl/models_0.pkl', 'wb') as f:
        pkl.dump(dummy_mean, f)
        pkl.dump(d_0, f)
        pkl.dump(models_0, f)

    # The dummy mean of the training set will also be used for comparing with the forecasts.
    with open('./output/pkl/models_dummy.pkl', 'wb') as f:
        pkl.dump(dummy_mean, f)

    logging.info('\n' * 2)
    logging.info('The models are saved as models_trend.pkl, models_0.pkl, and models_dummy.pkl')
    logging.info("""\n    ----------------------  Model training is over!  ------------------------------   """)
    logging.shutdown()
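# A minimal sketch of how the pickled artifacts written above could be read back.
# Objects dumped sequentially into one file must be loaded back in the same order;
# the local variable names here are illustrative only.
import pickle as pkl

with open('./output/pkl/models_trend.pkl', 'rb') as f:
    trends = pkl.load(f)
    resampling_interval = pkl.load(f)
    d_trend = pkl.load(f)
    models_trend = pkl.load(f)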
    :return: metrics
    :rtype: dictionary
    """
    metrics["mean_hr_bpm"] = mean_heart_rate
    metrics["voltage_extremes"] = voltage_extremes
    metrics["duration"] = duration
    metrics["num_beats"] = num_beats
    metrics["beats"] = beats
    return metrics


if __name__ == "__main__":
    data_file = 'test_data1.csv'
    logging.config.fileConfig('logger_config.ini',
                              disable_existing_loggers=False)
    data = csv_reader(data_file)
    time_array = data[0]
    voltage_array = data[1]
    validate(time_array, voltage_array)
    beats = beats_test(threshold=0.7,
                       voltage_array=voltage_array,
                       time_array=time_array)
    num_beats = num_beats_test(threshold=0.7, voltage_array=voltage_array)
    duration = duration(time_array=time_array)
    mean_heart_rate = mean_beats(threshold=0.7,
                                 voltage_array=voltage_array,
                                 time_array=time_array)
    voltage_extremes = voltage_extremes(voltage_array=voltage_array)
    my_dictionary = create_metrics()
    my_dictionary = add_word(my_dictionary)
    logging.info(my_dictionary)
from reader import csv_reader
from preprocess import preprocess
from model_select import model_select
from plot_bar_chart import plot_bar_chart
from model_training import model_training
from test import test

path = 'Route24304Train.csv'

(x_date, x_vehicle, x_service, x_start, x_duration, x_spans,
 x_weekday_not) = csv_reader(path)

(x_weekday_not1, x_start1, x_stops1, x_travels1, x_date1,
 x_weekday_not2, x_stops2, x_travels2, x_start2, x_date2) = \
    preprocess(x_date, x_start, x_duration, x_spans, x_weekday_not)

(e_t_t, e_m_t, e_t_s, e_m_s, std_t_t, std_m_t, std_t_s, std_m_s) = \
    model_select(x_weekday_not1, x_start1, x_stops1, x_travels1, x_date1)

plot_bar_chart(e_m_t, std_m_t)
plot_bar_chart(e_m_s, std_m_s)
plot_bar_chart(e_t_t, std_t_t)
plot_bar_chart(e_t_s, std_t_s)

model_training(x_weekday_not1, x_start1, x_stops1, x_travels1, x_date1)

(MAE_travel_left, MAE_stops_left, MAE_travel_top, MAE_stop_top,
 MAE_stop_main, MAE_travel_main) = \
    test(x_weekday_not2, x_start2, x_stops2, x_travels2, x_date2)
def main(): """ The performance of 5 models are compared: (1) ARIMA, (2) ARIMA with average day trend template, (3) day trend average, (4) day trend + ewma filtered detrending residual (5) dummy mean The results are save in a log file. """ # Outliers are removed before being used for forecasting. # But outliers are included in evaluation, so the forecasts are compared with the real raw data. create_folders('output', ['forecasts']) # turn on logging fname = './output/' + "forecasts" + datetime.now().time().strftime( "%Hh%Mm") + ".txt" logging.basicConfig(filename=fname, level=logging.INFO, format='%(message)s') df_test = csv_reader(s.filename, field_names=s.field_names, desc=s.desc_names, split=s.split - 1) df_test_original = df_test.copy( ) # keep the original data for performance evaluation # cleaned observations for forecasting # now rmvoutliers_fill is on an individual day series, while in training phase, it's on an aligned series of all the historical data. # use ewm with a larger ALPHAl, or a moving window with smaller window for f in s.field_names: rmvoutliers_fill(df_test[f], fill='ewm', n=1) mdl_list = [ 'ARIMA', 'ARIMA_trend', 'average_trend', 'ewma_trend', 'ewma', 'dummy_means' ] err_mae = pd.DataFrame(None, mdl_list, s.field_names) err_quant = pd.DataFrame(None, mdl_list, s.field_names) for mdl in mdl_list: df_hat = forecast(df_test, s.field_names, mdlname=mdl) # mean absolute error and 75% quantile err_mae.loc[mdl, :] = (df_test_original[s.field_names] - df_hat).abs().mean() err_quant.loc[mdl, :] = (df_test_original[s.field_names] - df_hat).abs().quantile(q=0.75) if VERBOSE >= 1: # plot an example day series for all the stops. mdl_list2 = ['ARIMA_trend', 'average_trend', 'ewma_trend'] ts = pd.DataFrame(None, columns=mdl_list2) date = '2016-07-22' for f in s.field_names: ts['Orignal'] = df_test_original.loc[date, f] for mdl in mdl_list2: ts[mdl] = forecast(df_test[date], [f], mdlname=mdl) ax = ts.plot(title="Stop-travel time forecasting for " + f + ' (' + date + ')') import matplotlib.dates as mdates ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M')) plt.tight_layout() fig = ax.get_figure() fig.savefig('./output/forecasts/' + f + '.pdf') plt.close(fig) logging.info('\n' * 2) logging.info( '======================================================================================================' ) logging.info( ' Performance Evaluation of Stop/travel Time Forecast for Next Bus (in seconds) ' ) logging.info( '======================================================================================================' ) logging.info('\n' * 2) logging.info( ' Mean Absolute Error ' ) logging.info( '--------------------------------------------------------------------------------------------------' ) logging.info(err_mae.to_string(line_width=100)) logging.info('\n' * 2) logging.info('The average mean absolute error over all the stops:\n') logging.info(err_mae.mean(axis=1)) logging.info('\n' * 2) logging.info( ' 75% Quantile of Forecasting Error ' ) logging.info( '--------------------------------------------------------------------------------------------------' ) logging.info(err_quant.to_string(line_width=100)) logging.info('\n' * 2) logging.info('The average 75% error quantile over all the stops:\n') logging.info(err_quant.mean(axis=1)) i = 0 k = 10 methods = ['average_trend', 'ewma_trend', 'dummy_means'] while i * k < s.field_names.__len__(): ax = plot_bar_chart(err_mae.loc[methods].values[:, i * k:i * k + k], err_quant.loc[methods].values[:, i:i + k], 'Absolute Forcast Error and 
75% Quantile', s.field_names[i * k:i * k + k], legend=methods) i += 1 fig = ax.get_figure() fig.savefig('./output/forecast_error_' + str(i) + '.pdf') plt.close(fig) logging.info('\n' * 2) logging.info( """\n ---------------------- The testing is over! ------------------------------ """ ) logging.shutdown()
from reader import csv_reader
from cleaner import cleaner
import pandas as pd

with open('receita.csv', 'r') as csv_file:
    dict_list = csv_reader(csv_file)

file_cleaned = cleaner(dict_list)
imports = pd.DataFrame(file_cleaned)
export_csv = imports.to_csv(
    r'C:\Users\CalebeLadis\PycharmProjects\csv-reader\receita2.csv',
    index=None, header=True)