예제 #1
0
def verify_paths(params):
    """Report missing inputs for the task described by ``params``.

    In one-shot mode only ``params.result_file`` is checked.  Otherwise
    ``params.task_dir`` must be a directory, in which case ``params`` is handed
    to ``walk_runs`` together with ``check_result_exists``.  Problems are
    printed rather than raised, and the function always returns ``None``.
    """
    if not params.one_shot:
        # regular mode: every run under the task directory gets inspected
        if not os.path.isdir(params.task_dir):
            print(params.task_dir, " directory does not exist")
            return
        walk_runs(params, check_result_exists)
        return

    # one shot mode: a single result file is all that matters
    result_present = os.path.isfile(params.result_file)
    if not result_present:
        print(params.task_dir, " one_shot result file does not exist")
예제 #2
0
def update_db(params, db):
    """Add the result rows of new or updated runs of a task to its table.

    Makes sure the task table exists (``create_table``), then hands two local
    callbacks to ``walk_runs``: ``check_last_runs_table`` records on ``params``
    what is already stored, and ``add_run_to_db`` loads one run's result file.
    The database is committed once ``walk_runs`` returns.

    Args:
        params: task settings object.  Read here: ``task_table_name``,
            ``run_prefix``, ``tracked_columns``, ``last_parsed_date``,
            ``last_run``.  Written here: ``last_parsed_date``, ``last_run``.
        db: database connection providing ``cursor()`` and ``commit()``.
    """
    # check if table for task exists; if not then create it
    task_table_name = params.task_table_name
    create_table(params, db, task_table_name)

    # load up latest run and parsed date for task
    def check_last_runs_table(runs):
        """Store the table's newest parsed date and highest run on ``params``."""
        # return value is discarded, so natural_sort presumably sorts `runs`
        # in place -- TODO confirm against natural_sort
        natural_sort(runs)
        cursor = db.cursor()
        cursor.execute("SELECT MAX({}), MAX({}) FROM {}".format(
            "parsed_date", "run", params.task_table_name))
        row = cursor.fetchone()
        if row[0]:
            # storing in database truncates decimal, so add 1 second
            last_parsed_date = row[0] + 1
            last_run = row[1]
            print("last parsed date", last_parsed_date)
            print("last run", last_run)
            params.last_parsed_date = last_parsed_date
            params.last_run = last_run
        else:
            print("first population")
            # NOTE(review): last_parsed_date is not assigned on this path, yet
            # add_run_to_db reads it -- presumably params carries a default;
            # confirm
            params.last_run = 0
        # else first run, nothing in table yet

    def add_run_to_db(params, run):
        """Insert the rows of ``run``'s result file unless the run is already stored.

        A run is skipped when its result file is missing, when it is neither
        newer than ``params.last_parsed_date`` nor numbered above
        ``params.last_run``, or when a data row has more values than the header
        has columns.
        """
        run_number = get_trailing_num(run)
        resfilename = get_result_file(params, params.run_prefix, run_number)
        try:
            # the result file's modification time is what gets stored as parsed_date
            parsed_date = os.path.getmtime(resfilename)
            # throw away unless newer than latest or run number greater than maximum
            if parsed_date <= params.last_parsed_date and run_number <= params.last_run:
                return
        except OSError:
            print("file {} not found; skipping".format(resfilename))
            return

        # the run is stored under the next sequential number, which can differ
        # from the number in the run's name
        params.last_run += 1
        print("run {} added ({}) ({})".format(run_number, params.last_run,
                                              parsed_date))

        with open(resfilename, 'r') as res:
            # make sure table is compatible with run data by inserting any new columns
            # always called "run" in table (converted from whatever prefix users use)
            result_params = ["run", "parsed_date"]
            # first line of the file is the tab separated header
            result_params.extend(res.readline().split('\t'))
            # a trailing tab in the header leaves a lone newline as last field
            if result_params[-1] == '\n':
                result_params.pop()
            # wrap every column name in double quotes for use in the SQL text
            result_params = [
                "".join(('\"', p.strip(), '\"')) for p in result_params
            ]

            # peek at the first data row to get one sample value per column
            pre_sample_pos = res.tell()
            result_params_sample = res.readline().split('\t')
            # go back to presample location for normal line iteration
            res.seek(pre_sample_pos, os.SEEK_SET)

            # add new column to table
            # NOTE(review): for c in (0, 1) the index c - 2 is negative and
            # picks a sample from the end of the row; presumably "run" and
            # "parsed_date" are always tracked already -- confirm
            for c in range(len(result_params)):
                if result_params[c] not in params.tracked_columns:
                    print("ADDING {} as new column".format(result_params[c]))
                    add_column_to_table(
                        params, db, result_params[c], result_params_sample[
                            c - 2])  # -2 accounts for run and parsed date

            # add value rows
            rows_to_add = []
            for line in res:
                # run number and parsed_date are always recorded
                result_params_val = [params.last_run, parsed_date]
                result_params_val.extend(line.split('\t'))
                # same trailing-tab handling as for the header
                if result_params_val[-1] == '\n':
                    result_params_val.pop()
                # something must be wrong here
                if len(result_params_val) > len(result_params):
                    print(
                        "There are {} values for only {} parameters in run {}; \
                        skipping run".format(len(result_params_val),
                                             len(result_params), run_number))
                    # skip this run
                    # undo the increment above so the number can be reused;
                    # nothing has been inserted for this run yet
                    params.last_run -= 1
                    return

                # for when the last column value is the empty string
                while len(result_params_val) < len(result_params):
                    result_params_val.append('')

                # values equal to nullval are stored as NULL, everything else
                # goes through convert_strictest
                rows_to_add.append(
                    tuple(
                        convert_strictest(param) if param != nullval else None
                        for param in result_params_val))

            # one "?" placeholder per column
            param_placeholders = ("?," * len(result_params)).rstrip(',')
            # specify columns to insert into since ordering of columns in file may not match table
            insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
                params.task_table_name, ','.join(result_params),
                param_placeholders)
            cursor = db.cursor()
            cursor.executemany(insert_rows_command, rows_to_add)

    # presumably walk_runs calls check_last_runs_table with the list of runs
    # before calling add_run_to_db per run -- TODO confirm against walk_runs
    walk_runs(params, add_run_to_db, check_last_runs_table)
    db.commit()
예제 #3
0
def verify_paths(params):
    """Check that the task directory exists before inspecting its runs.

    When ``params.task_dir`` is a directory, ``params`` is passed to
    ``walk_runs`` along with ``check_result_exists``; otherwise a message
    naming the directory is printed.  Always returns ``None``.
    """
    task_dir_found = os.path.isdir(params.task_dir)
    if not task_dir_found:
        print(params.task_dir, " directory does not exist")
        return
    walk_runs(params, check_result_exists)
def update_db(params, db):
    """Insert the result rows of every run of the task into its database table.

    The task table is created if needed (``create_table``), then ``walk_runs``
    is handed two local callbacks: ``check_runs_match_table``, which warns when
    the table already stores a higher run number than any run it is given, and
    ``add_run_to_db``, which loads one run's result file.  The database is
    committed once ``walk_runs`` returns.

    Args:
        params: task settings object; ``task_table_name`` and
            ``tracked_columns`` are read here and the object is passed on to
            the helpers.
        db: database connection providing ``cursor()`` and ``commit()``.
    """
    # check if table for task exists; if not then create it
    task_table_name = params.task_table_name
    create_table(params, db, task_table_name)

    # verify that the max runs in the task_dir is >= to the max runs in the database
    def check_runs_match_table(runs):
        """Warn when the table holds a run number above the highest run in ``runs``."""
        if not runs:
            # nothing to compare against; runs[-1] below would raise IndexError
            return
        natural_sort(runs)
        highest_run = get_trailing_num(runs[-1])
        cursor = db.cursor()
        # The run column is always named "run" in the table (add_run_to_db
        # inserts into it under that name), whatever prefix the run
        # directories use, so query that name rather than params.run_prefix.
        cursor.execute("SELECT MAX({}) FROM {}".format("run", params.task_table_name))
        stored_run = cursor.fetchone()[0]
        # MAX() is NULL/None on an empty table: first run, nothing stored yet
        if stored_run and highest_run < stored_run:
            print("stored run ({}) is higher than existing run ({}); "
                  "consider running with --clean to remake task table".format(stored_run, highest_run))

    def add_run_to_db(params, run):
        """Insert every data row of ``run``'s result file into the task table.

        The run is skipped (with a message) when its result file is missing or
        when a data row has more values than the header has columns.
        """
        resfilename = get_result_file(params, run)
        run_number = get_trailing_num(run)
        try:
            # the result file's modification time is what gets stored as parsed_date
            parsed_date = os.path.getmtime(resfilename)
        except OSError:
            print("file {} not found; skipping".format(resfilename))
            return

        with open(resfilename, 'r') as res:
            # Header line: tab separated column names, preceded by the two
            # columns that are always recorded.  The run column is always
            # called "run" in the table (converted from whatever prefix users use).
            result_params = ["run", "parsed_date"]
            result_params.extend(res.readline().split('\t'))
            if result_params[-1] == '\n':
                # a trailing tab in the header leaves a lone newline as last field
                result_params.pop()
            # wrap every column name in double quotes for use in the SQL text
            result_params = ['"{}"'.format(p.strip()) for p in result_params]

            # peek at the first data row for sample values, then rewind so the
            # row loop below still sees that row
            pre_sample_pos = res.tell()
            result_params_sample = res.readline().split('\t')
            res.seek(pre_sample_pos, os.SEEK_SET)

            # make sure table is compatible with run data by inserting any new columns
            # NOTE(review): for the first two columns the index below is
            # negative and picks a sample from the end of the row; presumably
            # "run" and "parsed_date" are always tracked already -- confirm
            for idx, column in enumerate(result_params):
                if column not in params.tracked_columns:
                    print("ADDING {} as new column".format(column))
                    # -2 accounts for run and parsed date
                    add_column_to_table(params, db, column, result_params_sample[idx - 2])

            # add value rows
            rows_to_add = []
            for line in res:
                # run number and parsed_date are always recorded
                result_params_val = [run_number, parsed_date]
                result_params_val.extend(line.split('\t'))
                if result_params_val[-1] == '\n':
                    # same trailing-tab handling as for the header
                    result_params_val.pop()
                # something must be wrong here
                if len(result_params_val) > len(result_params):
                    print("There are {} values for only {} parameters in run {}; "
                          "skipping run".format(len(result_params_val), len(result_params), run_number))
                    # skip this run; nothing has been inserted for it yet
                    return

                # for when the last column value is the empty string
                while len(result_params_val) < len(result_params):
                    result_params_val.append('')

                # values equal to nullval are stored as NULL, everything else
                # goes through convert_strictest
                rows_to_add.append(tuple(
                    convert_strictest(param) if param != nullval else None
                    for param in result_params_val))

            # one "?" placeholder per column
            param_placeholders = ("?," * len(result_params)).rstrip(',')
            # specify columns to insert into since ordering of columns in file may not match table
            insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
                params.task_table_name,
                ','.join(result_params),
                param_placeholders)
            cursor = db.cursor()
            cursor.executemany(insert_rows_command, rows_to_add)

    walk_runs(params, add_run_to_db, check_runs_match_table)
    db.commit()
def verify_paths(params):
    """Make sure the task directory is present, then examine its runs.

    A missing ``params.task_dir`` is reported with a printed message.  An
    existing one causes ``walk_runs`` to be called with ``params`` and
    ``check_result_exists``.  The return value is always ``None``.
    """
    if not os.path.isdir(params.task_dir):
        # nothing to walk; report and bail out
        print(params.task_dir, " directory does not exist")
        return None
    walk_runs(params, check_result_exists)
    return None
예제 #6
0
def update_db(params, db):
    """Add the result rows of new or updated runs of a task to its table.

    Makes sure the task table exists (``create_table``), then hands two local
    callbacks to ``walk_runs``: ``check_last_runs_table`` records on ``params``
    what is already stored, and ``add_run_to_db`` loads one run's result file
    through ``csv.reader``.  The database is committed once ``walk_runs``
    returns.

    Args:
        params: task settings object.  Read here: ``task_table_name``,
            ``delimiter``, ``tracked_columns``, ``last_parsed_date``,
            ``last_run``.  Written here: ``last_parsed_date``, ``last_run``.
        db: database connection providing ``cursor()`` and ``commit()``.
    """
    # check if table for task exists; if not then create it
    create_table(params, db)
    # load up latest run and parsed date for task
    def check_last_runs_table(runs):
        """Store the table's newest parsed date and highest run on ``params``."""
        # return value is discarded, so natural_sort presumably sorts `runs`
        # in place -- TODO confirm against natural_sort
        natural_sort(runs)
        cursor = db.cursor()
        cursor.execute("SELECT MAX({}), MAX({}) FROM {}".format("parsed_date", "run", params.task_table_name))
        row = cursor.fetchone()
        if row[0]:
            # storing in database truncates decimal, so add 1 second
            last_parsed_date = row[0] + 1
            last_run = row[1]
            print("last parsed date", last_parsed_date)
            print("last run", last_run)
            params.last_parsed_date = last_parsed_date
            params.last_run = last_run
        else:
            print("first population")
            # NOTE(review): last_parsed_date is not assigned on this path, yet
            # add_run_to_db reads it -- presumably params carries a default;
            # confirm
            params.last_run = 0
        # else first run, nothing in table yet

    def add_run_to_db(params, run):
        """Insert the rows of ``run``'s result file unless the run is already stored.

        A run is skipped when its result file is missing, when it is neither
        newer than ``params.last_parsed_date`` nor numbered above
        ``params.last_run``, or when a data row after the first has more values
        than the header has columns.
        """
        resfilename = get_result_file(params, run)
        run_number = get_trailing_num(run)
        try:
            # the result file's modification time is what gets stored as parsed_date
            parsed_date = os.path.getmtime(resfilename)
            # throw away unless newer than latest or run number greater than maximum
            if parsed_date <= params.last_parsed_date and run_number <= params.last_run:
                return
        except OSError:
            print("file {} not found; skipping".format(resfilename))
            return

        # the run is stored under the next sequential number, which can differ
        # from the number in the run's name
        params.last_run += 1
        print("run {} added ({}) ({})".format(run_number, params.last_run, parsed_date))

        # NOTE(review): binary mode for csv.reader and csvreader.next() are
        # Python 2 idioms; Python 3 would need text mode and next(csvreader)
        # -- confirm the target interpreter
        with open(resfilename, 'rb') as res:
            # make sure table is compatible with run data by inserting any new columns
            # always called "run" in table (converted from whatever prefix users use)
            csvreader = csv.reader(res, delimiter=params.delimiter)
            result_params = ["run", "parsed_date"]
            # first record of the file is the header
            result_params.extend(csvreader.next())
            # a trailing delimiter leaves an empty last field; remember that so
            # the same field can be dropped from every data row
            empty_end = False
            if not result_params[-1]:   # empty or None
                empty_end = True
                result_params.pop()
            # wrap every column name in double quotes for use in the SQL text
            result_params = ["".join(('\"',p.strip(),'\"')) for p in result_params]
            print(result_params)

            # the first data row doubles as the per-column sample for new
            # columns; it is consumed here and inserted below with the rest
            # NOTE(review): a file with a header but no data row makes this
            # call raise StopIteration; the row is also not subject to the
            # too-many-values check applied to later rows
            result_params_sample = [params.last_run, parsed_date]
            result_params_sample.extend(csvreader.next())
            if empty_end:
                result_params_sample.pop()
            while len(result_params_sample) < len(result_params):
                result_params_sample.append('')

            # add new column to table
            for c in range(len(result_params)):
                if result_params[c] not in params.tracked_columns:
                    print("ADDING {} as new column".format(result_params[c]))
                    add_column_to_table(params, db, result_params[c], result_params_sample[c])

            # add value rows
            # values equal to nullval are stored as NULL, everything else goes
            # through convert_strictest
            rows_to_add = [tuple(convert_strictest(param) if param != nullval else None for param in result_params_sample)]
            for line in csvreader:
                # run number and parsed_date are always recorded
                result_params_val = [params.last_run, parsed_date]
                result_params_val.extend(line)
                if empty_end:
                    result_params_val.pop()
                # something must be wrong here
                if len(result_params_val) > len(result_params):
                    print("There are {} values for only {} parameters in run {}; \
                        skipping run".format(len(result_params_val), len(result_params), run_number))
                    # skip this run
                    # undo the increment above so the number can be reused;
                    # nothing has been inserted for this run yet
                    params.last_run -= 1
                    return

                # for padding when columns have unequal depth
                while len(result_params_val) < len(result_params):
                    result_params_val.append('')

                rows_to_add.append(tuple(convert_strictest(param) if param != nullval else None for param in result_params_val))

            print("rows to add")
            print(rows_to_add)
            # one "?" placeholder per column
            param_placeholders = ("?,"*len(result_params)).rstrip(',')
            # specify columns to insert into since ordering of columns in file may not match table
            insert_rows_command = "INSERT OR IGNORE INTO {} ({}) VALUES ({})".format(
                params.task_table_name,
                ','.join(result_params), 
                param_placeholders)
            cursor = db.cursor()
            cursor.executemany(insert_rows_command, rows_to_add)



    # presumably walk_runs calls check_last_runs_table with the list of runs
    # before calling add_run_to_db per run -- TODO confirm against walk_runs
    walk_runs(params, add_run_to_db, check_last_runs_table)
    db.commit()