def verify_paths(params):
    """Print a diagnostic when the expected result file or task directory is missing.

    In one shot mode only ``params.result_file`` is looked at and the function
    returns right after that check.  Otherwise ``params.task_dir`` has to be a
    directory; when it is, ``walk_runs(params, check_result_exists)`` is called
    to check each run in it.  Nothing is returned and nothing is raised here.
    """
    if params.one_shot:
        # one shot mode: a single result file is all that matters
        result_file_found = os.path.isfile(params.result_file)
        if not result_file_found:
            print(params.task_dir, " one_shot result file does not exist")
        return

    # regular mode: every run under the task directory gets checked
    if not os.path.isdir(params.task_dir):
        print(params.task_dir, " directory does not exist")
        return
    walk_runs(params, check_result_exists)
def update_db(params, db):
    """Add the task's new or updated runs to its table in ``db`` and commit.

    The task table is created first (``create_table``).  ``walk_runs`` is then
    given two callbacks defined below: ``check_last_runs_table`` reads the
    newest parsed date and highest run number already stored, and
    ``add_run_to_db`` inserts the rows of one run's result file.

    Side effects on ``params``: ``last_run`` and ``last_parsed_date`` are set,
    and ``last_run`` is incremented for every run that gets added.
    """
    # check if table for task exists; if not then create it
    task_table_name = params.task_table_name
    create_table(params, db, task_table_name)

    def check_last_runs_table(runs):
        """Load the latest parsed date and run number stored for the task."""
        natural_sort(runs)
        cursor = db.cursor()
        cursor.execute("SELECT MAX({}), MAX({}) FROM {}".format(
            "parsed_date", "run", params.task_table_name))
        row = cursor.fetchone()
        if row[0]:
            # storing in database truncates decimal, so add 1 second
            last_parsed_date = row[0] + 1
            last_run = row[1]
            print("last parsed date", last_parsed_date)
            print("last run", last_run)
            params.last_parsed_date = last_parsed_date
            params.last_run = last_run
        else:
            # first run, nothing in table yet
            print("first population")
            params.last_run = 0
            # add_run_to_db compares against last_parsed_date; give it a
            # value that accepts every file unless one was already provided
            params.last_parsed_date = getattr(params, "last_parsed_date", 0)

    def add_run_to_db(params, run):
        """Insert the rows of one run's result file, adding new columns first."""
        run_number = get_trailing_num(run)
        resfilename = get_result_file(params, params.run_prefix, run_number)
        try:
            parsed_date = os.path.getmtime(resfilename)
        except OSError:
            print("file {} not found; skipping".format(resfilename))
            return
        # throw away unless newer than latest or run number greater than maximum
        if parsed_date <= params.last_parsed_date and run_number <= params.last_run:
            return

        params.last_run += 1
        print("run {} added ({}) ({})".format(run_number, params.last_run, parsed_date))

        with open(resfilename, 'r') as res:
            # make sure table is compatible with run data by inserting any new columns
            # always called "run" in table (converted from whatever prefix users use)
            result_params = ["run", "parsed_date"]
            result_params.extend(res.readline().split('\t'))
            # a trailing tab leaves the bare newline as a bogus last column
            if result_params[-1] == '\n':
                result_params.pop()
            result_params = ['"{}"'.format(p.strip()) for p in result_params]

            # peek at the first data line, then go back to the presample
            # location for normal line iteration
            pre_sample_pos = res.tell()
            result_params_sample = res.readline().split('\t')
            res.seek(pre_sample_pos, os.SEEK_SET)

            # add new column to table
            for c, column in enumerate(result_params):
                if column not in params.tracked_columns:
                    print("ADDING {} as new column".format(column))
                    # -2 accounts for run and parsed date
                    # NOTE(review): for the two synthetic columns this index is
                    # negative and picks a value from the end of the sample row
                    add_column_to_table(params, db, column,
                                        result_params_sample[c - 2])

            # add value rows
            rows_to_add = []
            for line in res:
                # run number and parsed_date are always recorded
                result_params_val = [params.last_run, parsed_date]
                result_params_val.extend(line.split('\t'))
                if result_params_val[-1] == '\n':
                    result_params_val.pop()
                # something must be wrong here
                if len(result_params_val) > len(result_params):
                    message = ("There are {} values for only {} parameters in run {}; "
                               "skipping run")
                    print(message.format(len(result_params_val),
                                         len(result_params), run_number))
                    # skip this run
                    params.last_run -= 1
                    return
                # for when the last column value is the empty string
                while len(result_params_val) < len(result_params):
                    result_params_val.append('')
                rows_to_add.append(tuple(
                    convert_strictest(param) if param != nullval else None
                    for param in result_params_val))

        param_placeholders = ("?," * len(result_params)).rstrip(',')
        # specify columns to insert into since ordering of columns in file may not match table
        insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
            params.task_table_name, ','.join(result_params), param_placeholders)
        cursor = db.cursor()
        cursor.executemany(insert_rows_command, rows_to_add)

    walk_runs(params, add_run_to_db, check_last_runs_table)
    db.commit()
def verify_paths(params):
    """Check the runs of ``params.task_dir``, or report that the directory is missing.

    When ``params.task_dir`` is a directory, ``walk_runs(params,
    check_result_exists)`` is called; otherwise a message is printed.  Nothing
    is returned.
    """
    task_dir_found = os.path.isdir(params.task_dir)
    if not task_dir_found:
        print(params.task_dir, " directory does not exist")
        return
    walk_runs(params, check_result_exists)
def update_db(params, db):
    """Insert every run's result rows into the task table of ``db`` and commit.

    The task table is created first (``create_table``).  ``walk_runs`` is then
    given two callbacks defined below: ``check_runs_match_table`` warns when
    the table already holds a run number higher than any run found, and
    ``add_run_to_db`` inserts the rows of one run's result file.
    """
    # check if table for task exists; if not then create it
    task_table_name = params.task_table_name
    create_table(params, db, task_table_name)

    def check_runs_match_table(runs):
        """Verify that the max run in task_dir is >= the max run in the database."""
        natural_sort(runs)
        if not runs:
            # no runs found, so there is nothing to compare against
            return
        highest_run = get_trailing_num(runs[-1])
        cursor = db.cursor()
        # the run column is always called "run" in the table, whatever
        # prefix users use for their run directories
        cursor.execute("SELECT MAX({}) FROM {}".format("run", params.task_table_name))
        row = cursor.fetchone()
        # row[0] is empty on the first run, when nothing is in the table yet
        if row[0] and highest_run < row[0]:
            message = ("stored run ({}) is higher than existing run ({}); "
                       "consider running with --clean to remake task table")
            print(message.format(row[0], highest_run))

    def add_run_to_db(params, run):
        """Insert the rows of one run's result file, adding new columns first."""
        resfilename = get_result_file(params, run)
        run_number = get_trailing_num(run)
        try:
            parsed_date = os.path.getmtime(resfilename)
        except OSError:
            print("file {} not found; skipping".format(resfilename))
            return

        with open(resfilename, 'r') as res:
            # make sure table is compatible with run data by inserting any new columns
            # always called "run" in table (converted from whatever prefix users use)
            result_params = ["run", "parsed_date"]
            result_params.extend(res.readline().split('\t'))
            # a trailing tab leaves the bare newline as a bogus last column
            if result_params[-1] == '\n':
                result_params.pop()
            result_params = ['"{}"'.format(p.strip()) for p in result_params]

            # peek at the first data line, then go back to the presample
            # location for normal line iteration
            pre_sample_pos = res.tell()
            result_params_sample = res.readline().split('\t')
            res.seek(pre_sample_pos, os.SEEK_SET)

            # add new column to table
            for c, column in enumerate(result_params):
                if column not in params.tracked_columns:
                    print("ADDING {} as new column".format(column))
                    # -2 accounts for run and parsed date
                    # NOTE(review): for the two synthetic columns this index is
                    # negative and picks a value from the end of the sample row
                    add_column_to_table(params, db, column,
                                        result_params_sample[c - 2])

            # add value rows
            rows_to_add = []
            for line in res:
                # run number and parsed_date are always recorded
                result_params_val = [run_number, parsed_date]
                result_params_val.extend(line.split('\t'))
                if result_params_val[-1] == '\n':
                    result_params_val.pop()
                # something must be wrong here
                if len(result_params_val) > len(result_params):
                    message = ("There are {} values for only {} parameters in run {}; "
                               "skipping run")
                    print(message.format(len(result_params_val),
                                         len(result_params), run_number))
                    # skip this run
                    return
                # for when the last column value is the empty string
                while len(result_params_val) < len(result_params):
                    result_params_val.append('')
                rows_to_add.append(tuple(
                    convert_strictest(param) if param != nullval else None
                    for param in result_params_val))

        param_placeholders = ("?," * len(result_params)).rstrip(',')
        # specify columns to insert into since ordering of columns in file may not match table
        insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
            params.task_table_name, ','.join(result_params), param_placeholders)
        cursor = db.cursor()
        cursor.executemany(insert_rows_command, rows_to_add)

    walk_runs(params, add_run_to_db, check_runs_match_table)
    db.commit()
def update_db(params, db):
    """Add the task's new or updated runs to its table in ``db`` and commit.

    The task table is created first (``create_table``).  ``walk_runs`` is then
    given two callbacks defined below: ``check_last_runs_table`` reads the
    newest parsed date and highest run number already stored, and
    ``add_run_to_db`` parses one run's result file with ``csv.reader`` (using
    ``params.delimiter``) and inserts its rows.

    Side effects on ``params``: ``last_run`` and ``last_parsed_date`` are set,
    and ``last_run`` is incremented for every run that gets added.
    """
    # check if table for task exists; if not then create it
    create_table(params, db)

    def check_last_runs_table(runs):
        """Load the latest parsed date and run number stored for the task."""
        natural_sort(runs)
        cursor = db.cursor()
        cursor.execute("SELECT MAX({}), MAX({}) FROM {}".format(
            "parsed_date", "run", params.task_table_name))
        row = cursor.fetchone()
        if row[0]:
            # storing in database truncates decimal, so add 1 second
            last_parsed_date = row[0] + 1
            last_run = row[1]
            print("last parsed date", last_parsed_date)
            print("last run", last_run)
            params.last_parsed_date = last_parsed_date
            params.last_run = last_run
        else:
            # first run, nothing in table yet
            print("first population")
            params.last_run = 0
            # add_run_to_db compares against last_parsed_date; give it a
            # value that accepts every file unless one was already provided
            params.last_parsed_date = getattr(params, "last_parsed_date", 0)

    def add_run_to_db(params, run):
        """Insert the rows of one run's result file, adding new columns first."""
        resfilename = get_result_file(params, run)
        run_number = get_trailing_num(run)
        try:
            parsed_date = os.path.getmtime(resfilename)
        except OSError:
            print("file {} not found; skipping".format(resfilename))
            return
        # throw away unless newer than latest or run number greater than maximum
        if parsed_date <= params.last_parsed_date and run_number <= params.last_run:
            return

        params.last_run += 1
        print("run {} added ({}) ({})".format(run_number, params.last_run, parsed_date))

        # text mode so csv.reader receives str rows on Python 3 as well
        with open(resfilename, 'r') as res:
            csvreader = csv.reader(res, delimiter=params.delimiter)
            try:
                header = next(csvreader)
                first_row = next(csvreader)
            except StopIteration:
                # empty file or header without any data row: nothing to insert
                print("file {} has no data rows; skipping".format(resfilename))
                params.last_run -= 1
                return

            # make sure table is compatible with run data by inserting any new columns
            # always called "run" in table (converted from whatever prefix users use)
            result_params = ["run", "parsed_date"]
            result_params.extend(header)
            # a trailing delimiter shows up as an empty (or None) last column name
            empty_end = not result_params[-1]
            if empty_end:
                result_params.pop()
            result_params = ['"{}"'.format(p.strip()) for p in result_params]
            print(result_params)

            # the first data row doubles as the sample handed to add_column_to_table
            # NOTE(review): unlike later rows it is not checked for having
            # more values than there are columns
            result_params_sample = [params.last_run, parsed_date]
            result_params_sample.extend(first_row)
            if empty_end:
                result_params_sample.pop()
            while len(result_params_sample) < len(result_params):
                result_params_sample.append('')

            # add new column to table
            for c, column in enumerate(result_params):
                if column not in params.tracked_columns:
                    print("ADDING {} as new column".format(column))
                    add_column_to_table(params, db, column, result_params_sample[c])

            # add value rows
            rows_to_add = [tuple(
                convert_strictest(param) if param != nullval else None
                for param in result_params_sample)]
            for line in csvreader:
                # run number and parsed_date are always recorded
                result_params_val = [params.last_run, parsed_date]
                result_params_val.extend(line)
                if empty_end:
                    result_params_val.pop()
                # something must be wrong here
                if len(result_params_val) > len(result_params):
                    message = ("There are {} values for only {} parameters in run {}; "
                               "skipping run")
                    print(message.format(len(result_params_val),
                                         len(result_params), run_number))
                    # skip this run
                    params.last_run -= 1
                    return
                # for padding when columns have unequal depth
                while len(result_params_val) < len(result_params):
                    result_params_val.append('')
                rows_to_add.append(tuple(
                    convert_strictest(param) if param != nullval else None
                    for param in result_params_val))
            print("rows to add")
            print(rows_to_add)

        param_placeholders = ("?," * len(result_params)).rstrip(',')
        # specify columns to insert into since ordering of columns in file may not match table
        insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
            params.task_table_name, ','.join(result_params), param_placeholders)
        cursor = db.cursor()
        cursor.executemany(insert_rows_command, rows_to_add)

    walk_runs(params, add_run_to_db, check_last_runs_table)
    db.commit()