def _parse_sample(line):
    """Parse one ``|``-separated record into ``(user_id, ts, rssi, ap)``.

    The first field is the user id, the second and third fields are the
    integer timestamp and RSSI, and the last field (stripped of surrounding
    whitespace) is the access point identifier.
    """
    fields = line.split("|")
    return fields[0], int(fields[1]), int(fields[2]), fields[-1].strip()


def _plot_time_slice(samples, user_id):
    """Print and plot one time slice of ``(ts, rssi, ap)`` samples."""
    df = pd.DataFrame(samples)
    df = get_df_with_index(df, df[0])
    # Column 2 holds the access point, column 1 the RSSI reading.
    df_0 = df[df[2] == '14E4E6E186A4']
    df_1 = df[df[2] == 'EC172FE3B340']
    print("df: %s, df_0: %s, df_1: %s, user_id: %s" % (df, df_0, df_1, user_id))
    plt.plot(df_0[1])
    plt.plot(df_1[1])
    plt.title(user_id)
    plt.xlabel('time')
    plt.ylabel('rssi')
    plt.show()


def run(date='20170501', max_files=100, max_gap=120):
    """Plot per-access-point RSSI for each time slice of each input file.

    Every file returned by ``get_paths(date)`` (at most ``max_files`` of
    them) is split into time slices: a new slice starts whenever two
    consecutive records are more than ``max_gap`` timestamp units apart.
    Each slice is printed and plotted for the two access points of interest.

    Compared with the previous implementation this version:

    * stops at the end of the path list instead of raising ``IndexError``
      when fewer than ``max_files`` files exist;
    * reads every file once instead of twice;
    * skips empty files and blank lines instead of crashing on them;
    * also plots the final slice of every file, which used to be dropped
      because a slice was only flushed when a later gap was found;
    * lets exceptions propagate naturally (the old ``except ... raise e``
      wrapper did nothing).

    :param date: date key handed to ``get_paths``.
    :param max_files: maximum number of files to process.
    :param max_gap: largest timestamp difference between consecutive
        records that still belongs to the same slice.
    """
    paths_list = get_paths(date)
    for path in paths_list[:max_files]:
        with open(path, 'r') as fr:
            records = [_parse_sample(line) for line in fr if line.strip()]
        if not records:
            continue

        # The user id shown on the plots is taken from the first record.
        user_id = records[0][0]
        current_slice = []
        prev_ts = None
        for _, ts, rssi, ap in records:
            if prev_ts is not None and ts - prev_ts > max_gap:
                _plot_time_slice(current_slice, user_id)
                current_slice = []
            current_slice.append((ts, rssi, ap))
            prev_ts = ts
        # Flush the trailing slice, which has no following gap to trigger it.
        _plot_time_slice(current_slice, user_id)
def make_paths(self):
    """Fill in the path and table attributes of this object.

    ``self.paths`` and ``self.local_tables`` are only computed when they are
    currently falsy; ``self.table_path_list`` and ``self.table_path`` are
    recomputed on every call.
    """
    # NOTE(review): the source had lost its indentation; the two table-path
    # assignments are assumed to sit outside the `local_tables` guard.
    if not self.paths:
        self.paths = get_paths(self.database.graph, self.name)

    if not self.local_tables:
        # make_local_tables returns both table collections in one call.
        (self.local_tables,
         self.one_to_many_tables) = make_local_tables(self.paths)

    self.table_path_list = create_table_path_list(self.paths)
    self.table_path = create_table_path(self.table_path_list, self.name)
def main():
    """Train LightGBM models for every seed/frequency pair and blend the results.

    Reads the preprocessed train and submission-format files, adds static and
    ratio features, fills missing temperatures with the train median, then
    calls ``tune_params`` for every seed and frequency. Seeds whose submission
    file already exists are skipped. Finally the per-seed submission files
    are passed to ``average``.

    Changes relative to the previous implementation:

    * the running loss table is built from a list of rows because
      ``DataFrame.append`` no longer exists in pandas 2.x;
    * group-by column selection uses a list (``[['single', 'blended']]``)
      because tuple-style selection is no longer accepted;
    * temperatures are filled by assigning the column back instead of calling
      ``fillna(inplace=True)`` on an attribute-accessed column, which is
      chained assignment and is not guaranteed to modify the frame;
    * the submission file name is built in one place.
    """
    prtime('starting lgb.py PREPROCESS_VERSION =', PREPROCESS_VERSION, 'OUTPUT_VERSION = ', OUTPUT_VERSION)
    IDIR, ODIR = get_paths()

    def submission_path(seed):
        # Output file written for one seed; also used to detect finished seeds.
        return ODIR + 'lw'+str(OUTPUT_VERSION)+'_submission_lgb_'+str(seed)+'.csv'

    train = read32(ODIR + 'train_updated_v'+str(PREPROCESS_VERSION)+'.csv')
    prtime('train reading done')
    gc.collect()

    prtime('reading submission_format')
    submission_format = read32(ODIR + 'submission_format_updated_v'+str(PREPROCESS_VERSION)+'.csv')
    metadata = pd.read_csv(IDIR + 'metadata.csv').set_index('SiteId')

    prtime('generating features for submission')
    train = get_static_features(train, metadata)
    train = get_ratio_features(train)
    submission_format = get_static_features(submission_format, metadata)
    submission_format = get_ratio_features(submission_format)
    print(train.dtypes)
    print(train.memory_usage())
    print(submission_format.dtypes)
    print(submission_format.memory_usage())

    submission_frequency = pd.read_csv(IDIR + 'submission_frequency.csv')

    # Both frames are filled with the *train* median; filling train with its
    # own median does not move that median, so it is computed once.
    median_temperature = np.nanmedian(train.Temperature)
    train['Temperature'] = train['Temperature'].fillna(median_temperature)
    submission_format['Temperature'] = submission_format['Temperature'].fillna(median_temperature)

    freqs = [900000000000, 3600000000000, 86400000000000]
    seeds = [14, 15, 16, 17, 18]

    loss_columns = ['freq', 'seed', 'single', 'blended']
    loss_rows = []
    for seed in seeds:
        if os.path.isfile(submission_path(seed)):
            print('skipping training for seed', seed, ' file already exists')
            continue
        for freq in freqs:
            best_loss = tune_params(train, submission_format, submission_frequency, freq,
                                    n_attempts=NUM_ATTEMPTS, random_seed=seed)
            loss_rows.append({'freq': freq, 'seed': seed,
                              'single': best_loss['single'],
                              'blended': best_loss['blended']})
            # NOTE(review): indentation was lost in the source; the progress
            # report is assumed to run after every (seed, freq) pair.
            best_losses = pd.DataFrame(loss_rows, columns=loss_columns)
            print('best losses so far = ', best_losses)
            print(best_losses.groupby('freq')[['single', 'blended']].mean())

    filenames = [submission_path(seed) for seed in seeds]
    average(filenames)
import pandas as pd
import numpy as np
from util import get_paths, prtime
import multiprocessing
from multiprocessing import Process, Queue
import time
import traceback
import sys
import gc
import lgb
import os
from collections import OrderedDict

IDIR, ODIR = get_paths()
VERSION = 12  # Version of preprocessed files, used in output file names like 'train_updated_vNN.csv'


def my_nancount(a):
    """Return how many entries of ``a`` are NaN."""
    nan_mask = np.isnan(a)
    return np.sum(nan_mask)


# Calculating historical aggregates
# df - source dataframe (train set)
# TestTimestamp - start of test period, no data at this point or beyond is used
# period - amount of time before TestTimestamp used to calculate aggregates
# target col - column to calculate averages (can be Value, Temperature, ...)
# cols - columns to group by (i.e. we are getting aggregate values for the same values in these columns in the past)
# col_values - current values in these columns (for example, current time and day of week)
This module takes in numpy arrays of the B-Tax final and intermediate calculations and then puts them into Pandas Dataframes in a format suitable for tabular representation in the web app. Last updated: 8/2/2016. """ # Import packages import os.path import sys import pandas as pd import numpy as np import cPickle as pickle from util import get_paths, read_from_egg globals().update(get_paths()) def CBO_compare(vars_by_asset): """Function to compare B-Tax output to CBO calcuations :param user_params: The user input for implementing reforms :type user_params: dictionary :returns: METR (by industry and asset) and METTR (by asset) :rtype: DataFrame """ # read in CBO file CBO_data = pd.read_excel(os.path.join(_REF_DIR, 'effective_taxrates.xls'), sheetname='Full detail', header=1, skiprows=0, skip_footer=8) CBO_data.columns = [col.encode('ascii', 'ignore') for col in CBO_data] CBO_data.rename(columns = {'Top page (Rows 3-35): Equipment Bottom page (Rows 36-62): All Other ':'Asset Type'}, inplace = True)
def infer():
    """
    Main method.
    For paths specified in input_paths, computes prediction and then saves.
    input_paths can be a list of paths or a path to a directory of x files or
    a path to a csv file with paths in each line of the file.

    Command-line arguments ``--input_paths``, ``--output_dir_path`` and
    ``--model_path`` override the corresponding entries of ``conf``.
    When ``conf["max_n_preds"]`` is set, the inputs are shuffled and only
    that many are processed.

    The output directory is now created with ``exist_ok=True`` so that a
    directory appearing between the existence check and the creation no
    longer raises ``FileExistsError``.
    """
    if conf["rand_seed"] is not None:
        random.seed(conf["rand_seed"])

    #parsing possible command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_paths", type=str, nargs="?",
        help="path to CSV list of input paths or input file or dir with files",
        default=conf["input_paths"])
    parser.add_argument(
        "--output_dir_path", type=str, nargs="?",
        help="path to directory to save predictions",
        default=conf["output_dir_path"])
    parser.add_argument(
        "--model_path", type=str, nargs="?",
        help="path directory containing meta-graph and weights for model",
        default=conf["model_path"])
    args = parser.parse_args()

    input_paths = util.get_paths(args.input_paths)
    output_dir_path = args.output_dir_path
    model_path = args.model_path

    # Optionally restrict the run to a random subset of the inputs.
    if conf["max_n_preds"] is not None:
        random.shuffle(input_paths)
        input_paths = input_paths[:conf["max_n_preds"]]

    #creating base dir if needed
    os.makedirs(output_dir_path, exist_ok=True)
    #creating preds dir
    preds_dir = mk_preds_dir(output_dir_path, "preds")

    #meta-model
    meta_model = model.MetaModel(**conf["meta_model_kwargs"])

    with tf.Session(graph=tf.Graph()) as sess:
        #loading model weights
        print("loading model from '{}'...".format(model_path),
              flush=True, end=" ")
        model.load(sess, model_path)
        meta_model.set_params_from_colls()
        print("done")

        #building functions
        load_fn = conf["load_fn"]
        pre_proc_fn = conf["pre_proc_fn"]
        save_y_pred_fn = conf["save_y_pred_fn"]

        #prediction function is a composition
        def _pred_fn(x):
            return predict(x, meta_model.get_pred_fn(sess))

        if conf["hmirr_averaged_predict"]:
            def pred_fn(x):
                return hmirr_averaged_predict(x, _pred_fn)
        else:
            pred_fn = _pred_fn

        #iterating over images doing predictions
        pred_times = []
        for path in input_paths:
            print("on file '{}'".format(path))

            #loading
            x = load_fn(path)
            # Remember the last two dimensions so the saver can use the
            # original shape after pre-processing has changed x.
            orig_x_shape = x.shape[-2:]
            print('x shape, dtype:', x.shape, x.dtype)

            #pre-processing
            x = pre_proc_fn(x)
            print('[pre-proc] x shape, dtype:', x.shape, x.dtype)

            #predicting
            print("\tpredicting...", flush=True, end=" ")
            start_time = time.time()
            y_pred = pred_fn(x)
            pred_time = time.time() - start_time
            pred_times.append(pred_time)
            print("done. took {:.4f} seconds".format(pred_time), end=" | ")
            print("y_pred shape:", y_pred.shape)

            #saving
            y_pred_path = get_y_pred_path(path, preds_dir)
            save_y_pred_fn(y_pred_path, y_pred, orig_x_shape)
            print("\tsaved y_pred to '{}'".format(y_pred_path))

        print("\ndone prediction on {} files in {:.4f}s (avg {:.4f}s)".format(
            len(input_paths), sum(pred_times), get_mean(pred_times)))
        print("saved preds in '{}'".format(preds_dir))
def main(args):
    """Embed LFW images with a saved model and query a nearest-neighbour index.

    Loads the model found in ``args.model_dir``, runs a forward pass on the
    images listed by ``args.lfw_names`` / ``args.lfw_dir``, builds a
    ``PannsIndex`` over the first 50 embeddings and prints the paths of the
    4 nearest neighbours of embedding number 8.

    Timing now uses ``time.perf_counter()``; ``time.clock()`` was removed in
    Python 3.8 and raises ``AttributeError`` there. The index-build timers
    were dropped because their values were never read.

    :param args: parsed arguments providing ``lfw_names``, ``lfw_dir``,
        ``lfw_file_ext``, ``model_dir`` and ``lfw_batch_size``.
    """
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Read the file containing the pairs used for testing
            names = util.read_names(os.path.expanduser(args.lfw_names))

            # Get the paths for the corresponding images
            paths, actual_issame = util.get_paths(
                os.path.expanduser(args.lfw_dir), names, args.lfw_file_ext)
            print(paths)

            # Load the model
            print('Model directory: %s' % args.model_dir)
            meta_file, ckpt_file = util.get_model_filenames(
                os.path.expanduser(args.model_dir))
            print('Metagraph file: %s' % meta_file)
            print('Checkpoint file: %s' % ckpt_file)
            util.load_model(args.model_dir, meta_file, ckpt_file)

            # Get input and output tensors
            default_graph = tf.get_default_graph()
            images_placeholder = default_graph.get_tensor_by_name("input:0")
            embeddings = default_graph.get_tensor_by_name("embeddings:0")
            phase_train_placeholder = default_graph.get_tensor_by_name(
                "phase_train:0")

            image_size = images_placeholder.get_shape()[1]
            embedding_size = embeddings.get_shape()[1]
            print('Image Size: %s' % str(image_size))
            print('Embedding Size: %s' % str(embedding_size))

            # Run forward pass to calculate embeddings
            print('Calculating embeddings')
            batch_size = args.lfw_batch_size
            nrof_images = len(paths)
            nrof_batches = int(math.ceil(1.0 * nrof_images / batch_size))
            emb_array = np.zeros((nrof_images, embedding_size))

            runStart = time.perf_counter()
            for i in range(nrof_batches):
                start_index = i * batch_size
                end_index = min((i + 1) * batch_size, nrof_images)
                paths_batch = paths[start_index:end_index]
                images = util.load_data(paths_batch, image_size)
                feed_dict = {
                    images_placeholder: images,
                    phase_train_placeholder: False
                }
                emb_array[start_index:end_index, :] = sess.run(
                    embeddings, feed_dict=feed_dict)
                # NOTE(review): only the first batch is embedded; the rows of
                # all later batches stay zero. Kept as in the original,
                # which looks like a deliberate shortcut - confirm.
                break
            runEnd = time.perf_counter()
            print('Time to calculate embeddings: %d' % (runEnd - runStart))

            # create an index of Euclidean distance
            # NOTE(review): dimension 128, the 50 indexed vectors and the
            # query row 8 are hard-coded; fewer than 50 images raises
            # IndexError below - confirm the intended dataset size.
            p = PannsIndex(dimension=128, metric='euclidean')
            for i in range(0, 50):
                p.add_vector(emb_array[i][:])
            p.parallelize(True)
            p.build(40)

            # pick one face and find its 4 nearest neighbours
            results = p.query(emb_array[8][:], 4)
            print([paths[x[0]] for x in results])
import numpy as np
import lightgbm as lgb
import gc
import sys
import os
from util import get_paths, prtime, update_params
from sklearn.model_selection import train_test_split
from collections import OrderedDict
import psutil

GLOBAL_PARAMS_UPDATE = False # If true, best parameters from the previous iteration are used in the next one
PREPROCESS_VERSION = 12 # Version of preprocessed files
OUTPUT_VERSION = 1247 # Version of output files to generate
IDIR, ODIR = get_paths() # Folders with original and generated data
NUM_ATTEMPTS = 10 # Number of attempts to optimize parameters and create better blended solution for each validation fold
RETRAIN = True # Retrain model on the whole training set (train+eval) after evaluation is done
VAL_SIZE = 0.3 # Validation set ratio

def get_ratio(s1, s2):
    """Return ``s1 / s2`` with missing results replaced by 1, cast to float32.

    NOTE(review): only NaN results are replaced; presumably a non-zero value
    divided by zero still yields an infinite ratio - confirm that is intended.
    """
    return (s1/s2).fillna(1).astype(np.float32)

# Calculating static features, not using historical data
def get_static_features(df, metadata):
    """Add calendar-derived feature columns to ``df`` in place.

    The visible part converts ``Timestamp`` to datetime and adds ``Doy``
    (day of year), ``Time`` (hour and minute expressed as a fraction of a
    day) and ``DowTime`` (``Dow`` plus ``Time``). ``Dow`` is read but not
    created here, so it must already be present in ``df``.

    NOTE(review): the rest of this function, including any use of
    ``metadata`` and the return value, is outside the reviewed excerpt.
    """
    df['Timestamp'] = pd.to_datetime(df.Timestamp)
    df['Doy'] = df.Timestamp.dt.dayofyear.astype(np.float32)
    # Hour and minute combined into a fraction of a day.
    df['Time'] = (df.Timestamp.dt.hour/24.0+df.Timestamp.dt.minute/(24.0*60.0)).astype(np.float32)
    df['DowTime'] = df['Dow']+df['Time']
------------------------------------------------------------------------------- This module takes in numpy arrays of the B-Tax final and intermediate calculations and then puts them into Pandas Dataframes in a format suitable for tabular representation in the web app. Last updated: 8/2/2016. """ # Import packages import os.path import sys import pandas as pd import numpy as np import cPickle as pickle from util import get_paths, read_from_egg globals().update(get_paths()) def CBO_compare(vars_by_asset): """Function to compare B-Tax output to CBO calcuations :param user_params: The user input for implementing reforms :type user_params: dictionary :returns: METR (by industry and asset) and METTR (by asset) :rtype: DataFrame """ # read in CBO file CBO_data = pd.read_excel(os.path.join(_REF_DIR, 'effective_taxrates.xls'), sheetname='Full detail', header=1, skiprows=0,
""" Analyzes the uses of the noscript tag. """ from util import get_paths, parse_csv_line, as_bool from util import benchmark_columns as columns from urllib.parse import urlparse import os import lxml.html bm_file_path, _, noscript_dir_path, _ = get_paths() # Category definitions. cat_alt = "alternative_content" cat_track = "tracking_metrics" cat_other = "other" def warn_tag(tag, url): """ Warns about a tag not being recognized properly. """ print("Unrecognized tag for {}: {} {}".format(url.hostname, tag.tag, tag.attrib)) def is_url_relative(url): """ True if a URL is relative, False otherwise. """ return url[0] == "/" and url[1] != "/" def one_of_in(lst, val): """
""" Computes the median script execution time for every website (doesn't work). """ from util import get_paths, parse_csv_line from util import benchmark_columns as columns import os, json _, metrics_dir_path, _, _ = get_paths() metrics_file_names = os.listdir(metrics_dir_path) for file_name in metrics_file_names: # Skip metrics taken when JS was disabled. if "nojs" in file_name: continue file_path = os.path.join(metrics_dir_path, file_name) # List to load the JSON array with browser metrics into. metrics = [] # List of script execution durations. script_timings = [] with open(file_path, "r") as f: metrics = json.load(f) # Compute the difference in time between the current and # the previous sample taken. for i in range(1, len(metrics)): t = metrics[i]["ScriptDuration"] - metrics[i - 1]["ScriptDuration"]
def train():
    """Train the model configured in ``conf`` and save checkpoints.

    Command-line arguments ``--output_dir_path``, ``--pre_trained_model_path``,
    ``--train_set`` and ``--val_set`` override the matching ``conf`` entries.
    A fresh output directory is created and populated, the graph is either
    built from scratch or loaded from the pre-trained model, and
    ``trloop.train_loop`` is run with the train/validation/logging/saving
    callbacks defined below. A ``final`` checkpoint is written at the end.

    NOTE(review): the source had lost its indentation; the block structure
    below is a reconstruction and should be checked against the original
    file, in particular whether the final checkpoint save belongs inside the
    ``finally`` clause.
    """
    #parsing possible command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_dir_path', type=str, nargs='?',
        help='path to directory to save train data',
        default=conf['output_dir_path'])
    parser.add_argument('--pre_trained_model_path', type=str, nargs='?',
        help='path to pre-trained model',
        default=conf['pre_trained_model_path'])
    parser.add_argument('--train_set', type=str, nargs='?',
        help='path to csv list of train set paths',
        default=conf['train_set'])
    parser.add_argument('--val_set', type=str, nargs='?',
        help='path to csv list of validation set paths',
        default=conf['val_set'])
    args = parser.parse_args()

    #getting output_dir_path
    output_dir_path = args.output_dir_path
    #getting pre_trained_model_path
    pre_trained_model_path = args.pre_trained_model_path
    #getting train_set
    train_set = util.get_paths(args.train_set)
    #getting val
    val_set = util.get_paths(args.val_set)

    out_dir = util.mk_model_dir(output_dir_path)
    print('created out dir \'{}\', populating...'.format(out_dir),
        flush=True, end=' ')
    populate_out_dir(out_dir, train_set, val_set)
    print('done.')

    #meta-model
    # Copy the configured kwargs so that adding a seed does not modify conf.
    meta_model_kwargs = dict(conf['meta_model_kwargs'])
    if 'rand_seed' not in meta_model_kwargs:
        # NOTE(review): this addition raises TypeError if conf['rand_seed']
        # is None - confirm that a seed is always configured for training.
        meta_model_kwargs['rand_seed'] = conf['rand_seed'] + 2
    meta_model = model.MetaModel(**meta_model_kwargs)

    #creating logging object
    # NOTE(review): the log file handle opened here is never closed in this
    # function; presumably util.Tee does not close it either - confirm.
    log = util.Tee([
        sys.stdout,
        open(os.path.join(out_dir, 'etc', 'train-log', 'train.log'), 'w')
    ])

    #building graph
    if pre_trained_model_path is None:
        log.print('[info] building graph for the first time')
        graph = meta_model.build_graph()
    else:
        # An empty graph; it is filled by model.load() inside the session.
        graph = tf.Graph()

    #tensorboard logging paths
    summ_dir = os.path.join(out_dir, 'etc', 'train-log', 'summaries')

    #training session
    with tf.Session(graph=graph) as sess:
        #if first time training, creates graph collections for model params
        #else, loads model weights and params from collections
        if pre_trained_model_path is None:
            sess.run(
                tf.group(tf.global_variables_initializer(),
                    tf.local_variables_initializer()))
            meta_model.mk_params_colls(graph=graph)
        else:
            log.print('[info] loading graph/weights from \'{}\''.format(
                pre_trained_model_path))
            model.load(sess, pre_trained_model_path)
            meta_model.set_params_from_colls(graph=graph)

        #building functions
        #train function: computes loss
        _train_fn = meta_model.get_train_fn(sess)
        def train_fn(x, y_true):
            # Feeds the configured learning rate on every training step.
            return _train_fn(
                x, y_true,
                {meta_model.params['learning_rate']: conf['learning_rate']})

        #test function: returns a dict with pairs metric_name: metric_value
        _test_fn = meta_model.get_test_fn(sess)
        def test_fn(x, y_true):
            metrics_values = _test_fn(x, y_true)
            return OrderedDict(
                zip(meta_model.params['metrics'].keys(), metrics_values))

        #save model function: given epoch and iter number, saves checkpoint
        def save_model_fn(epoch=None, it=None, name=None):
            # An explicit name takes precedence over the epoch/iteration name.
            if name is None:
                path = os.path.join(out_dir, 'self', 'ckpts',
                    'epoch-{}_it-{}'.format(epoch, it))
            else:
                path = os.path.join(out_dir, 'self', 'ckpts',
                    '{}'.format(name))
            model.save(sess, path, overwrite=True)
            print(' saved checkpoint to \'{}\''.format(path))

        #test
        if conf['use_tensorboard']:
            #tensorboard summary writers
            train_writer = tf.summary.FileWriter(os.path.join(
                summ_dir, 'train'), graph=graph)
            val_writer = tf.summary.FileWriter(os.path.join(summ_dir, 'val'),
                graph=graph)

            #running tensorboard
            cmd = ['tensorboard', '--logdir={}'.format(summ_dir)]
            cmd.extend('--{}={}'.format(k, v) \
                for k, v in conf['tensorboard_params'].items())
            log.print('[info] running \'{}\''.format(' '.join(cmd)))
            # NOTE(review): the tensorboard process is never waited on or
            # terminated here, and its piped stdout/stderr are never read -
            # confirm this is acceptable for long runs.
            proc = sp.Popen(cmd, stdout=sp.PIPE, stderr=sp.PIPE)

            _log_fn = meta_model.get_summary_fn(sess)
            def log_fn(x, y_true, its, train=True):
                # Writes the summary to the train or validation writer and
                # flushes that writer every 10th iteration count.
                summ = _log_fn(x, y_true)
                if train:
                    train_writer.add_summary(summ, its)
                    if its % 10 == 0:
                        train_writer.flush()
                else:
                    val_writer.add_summary(summ, its)
                    if its % 10 == 0:
                        val_writer.flush()
        else:
            log_fn = None

        #main train loop
        print('calling train loop')
        try:
            trloop.train_loop(
                train_set=train_set,
                train_fn=train_fn,
                n_epochs=conf['n_epochs'],
                val_set=val_set,
                val_fn=test_fn,
                val_every_its=conf['val_every_its'],
                patience=conf['patience'],
                log_every_its=conf['log_every_its'],
                log_fn=log_fn,
                save_model_fn=save_model_fn,
                save_every_its=conf['save_every_its'],
                batch_gen_kw=conf['batch_gen_kw'],
                log_batch_gen_kw=conf['log_batch_gen_kw'],
                better_loss_tol=conf['better_loss_tol'],
                verbose=conf['verbose'],
                print_fn=log.print,
            )
        except KeyboardInterrupt:
            # Ctrl-C stops training but still falls through to the final save.
            print('Keyboard Interrupt event.')
        finally:
            #closing tensorboard writers
            if conf['use_tensorboard']:
                train_writer.close()
                val_writer.close()

        #saving model on final state
        path = os.path.join(out_dir, 'self', 'ckpts', 'final')
        print('saving checkpoint to \'{}\'...'.format(path), flush=True)
        model.save(sess, path, overwrite=True)

    print('\ndone.', flush=True)
def process_init(sender=None, conf=None, **kwargs):
    """Load the model and compute image embeddings on one specific worker.

    Does nothing unless ``sender.hostname`` equals ``'worker1@harshitpc'``.
    On that worker it loads the model from ``model_dir``, embeds images
    listed via ``lfw_names`` / ``lfw_dir`` and stores the result in the
    module-level ``emb_array``.

    NOTE(review): ``lfw_names``, ``lfw_dir``, ``lfw_file_ext``, ``model_dir``,
    ``lfw_batch_size`` and ``emb_array`` are read from module scope and are
    not defined in the reviewed excerpt. The function may also continue past
    the excerpt, and the source had lost its indentation, so the nesting
    shown here is a reconstruction.
    """
    if sender.hostname == 'worker1@harshitpc':
        with tf.Graph().as_default():
            with tf.Session() as sess:
                # Read the file containing the pairs used for testing
                #readStart = time.clock()
                names = util.read_names(os.path.expanduser(lfw_names))
                #print(names)

                # Get the paths for the corresponding images
                paths, actual_issame = util.get_paths(
                    os.path.expanduser(lfw_dir), names, lfw_file_ext)
                #readEnd = time.clock()
                print("Done Initializing")

                # Load the model
                #loadStart = time.clock()
                print('Model directory: %s' % model_dir)
                meta_file, ckpt_file = util.get_model_filenames(
                    os.path.expanduser(model_dir))
                #run_metadata = tf.RunMetadata()
                print('Metagraph file: %s' % meta_file)
                print('Checkpoint file: %s' % ckpt_file)
                util.load_model(model_dir, meta_file, ckpt_file)
                #loadEnd = time.clock()

                # Get input and output tensors
                images_placeholder = tf.get_default_graph().get_tensor_by_name(
                    "input:0")
                embeddings = tf.get_default_graph().get_tensor_by_name(
                    "embeddings:0")
                phase_train_placeholder = tf.get_default_graph(
                ).get_tensor_by_name("phase_train:0")

                image_size = 160 # Warning. This was hardcoded. General should be ---> images_placeholder.get_shape()[1]
                embedding_size = 128 #Warning. This was hardcoded. General should be ---> embeddings.get_shape()[1]
                #print('Embedding Size: %s' %str(embedding_size))

                # Run forward pass to calculate embeddings
                print('Calculating embeddings')
                batch_size = lfw_batch_size
                nrof_images = len(paths)
                nrof_batches = int(math.ceil(1.0 * nrof_images / batch_size))

                #INCREDIBLY STUPID STUFF TO FOLLOW. WILL STRUCTURE THE CODE PROPERLY TO AVOID THIS LATER.
                # The embeddings are written into a module-level array so
                # that code outside this function can read them.
                global emb_array
                # NOTE(review): time.clock() was removed in Python 3.8;
                # time.perf_counter() is the usual replacement, but any
                # matching end-of-timing call must be changed with it.
                runStart = time.clock()
                #run_metadata = tf.RunMetadata()
                for i in range(nrof_batches):
                    start_index = i * batch_size
                    end_index = min((i + 1) * batch_size, nrof_images)
                    paths_batch = paths[start_index:end_index]
                    images = util.load_data(paths_batch, image_size)
                    feed_dict = {
                        images_placeholder: images,
                        phase_train_placeholder: False
                    }
                    emb_array[start_index:end_index, :] = sess.run(
                        embeddings, feed_dict=feed_dict)
                    # NOTE(review): leaves the loop after the first batch, so
                    # later batches are never embedded - confirm intended.
                    break
if argc < 2: print("usage: {} outputdir plot".format(sys.argv[0])) exit() action = sys.argv[2] plot_types = ["hist_load", "hist_domload", "hist_idle"] if action not in plot_types: print("plot argument must be one of: {}".format(", ".join(plot_types))) exit() # matplotlib is quite massive so we're only importing it now. import matplotlib import matplotlib.pyplot as plt bm_file_path, _, _, _ = get_paths() bm_results_file_path = append_to_filename(bm_file_path, "_results") # Read results file into rows field. rows = [] with open(bm_results_file_path) as f: # Skip CSV header. next(f) for line in f: line = parse_csv_line(line) line[1] = as_bool(line[1]) line[2] = as_bool(line[2]) for i in range(3, len(line)):
# Parse arguments args = parse_arguments(argv[1:]) # Generate dummy data print('Loading data...') data_dir = args.data_dir _, _, X_test, y_test = ember.read_vectorized_features(data_dir, scale=args.scale) X_test = np.array(X_test) X_test = X_test[y_test != -1] y_test = y_test[y_test != -1] model_dir = args.model_dir path_dict = get_paths(model_dir) json_file = open(path_dict['graph'], 'r') loaded_model_json = json_file.read() json_file.close() model = model_from_json(loaded_model_json) model.load_weights(path_dict['model']) with open(path_dict['scaler'], 'rb') as f: scaler = pkl.load(f) X_test = scaler.transform(X_test) X_test = np.expand_dims(X_test, axis=-1) y_test = keras.utils.to_categorical(y_test, num_classes=2) # ROC curve y_pred = model.predict(X_test) fpr, tpr, thresholds = roc_curve(np.argmax(y_test, axis=1),
#!/usr/bin/env python3 # -*- coding: utf-8 -*- import os import sys import re import pandas as pd import util # パス取得 paths = util.get_paths() data_path = paths['data_path'] input_path = paths['input_path'] input_raw_path = paths['input_raw_path'] input_unzip_path = paths['input_unzip_path'] def merge_data(): """ ### サンプルデータ作成 - kaggleのリクルートホールディングスのデータを加工 """ # データ解凍・読み込み for fname in ['air_visit_data', 'air_store_info']: util.unzip(f'{input_raw_path}/{fname}.csv.zip', input_unzip_path) df_visit = pd.read_csv(f'{input_unzip_path}/air_visit_data.csv') df_store = pd.read_csv(f'{input_unzip_path}/air_store_info.csv') (df_visit.merge(df_store, on='air_store_id', how='left').assign( pref_name=lambda x: x['air_area_name'].str.split(' ').str.get(0).str. replace('Tōkyō-to', '東京都').str.replace('Ōsaka-fu', '大阪府').str.replace(
""" Removes redundant files in the subdirectories of the output file that don't belong to any row present in the main table. """ from util import get_paths, parse_csv_line, try_remove from util import benchmark_columns as columns import os bm_file_path, metrics_dir_path, noscript_dir_path, screenshots_dir_path = get_paths( ) # List all subdirectory contents. metrics_dir_list = os.listdir(metrics_dir_path) noscript_dir_list = os.listdir(noscript_dir_path) screenshots_dir_list = os.listdir(screenshots_dir_path) with open(bm_file_path, "r") as f: # Skip CSV header. next(f) # Remove the files from the directory listings that are # referenced in the main table. In the end, the lists will # only contain files that need to be deleted. for line in f: data_file_name = parse_csv_line(line)[columns["dataFileName"]] try_remove(metrics_dir_list, data_file_name + ".json") try_remove(noscript_dir_list, data_file_name + ".html") try_remove(screenshots_dir_list, data_file_name + ".png")