Example #1
def get_jurisdiction_roles():
    jurisdiction_roles = []
    for role in current_user.roles:
        if not role.name:
            logger.warning("User Role %s has no name", role)
            continue
        parts = role.name.split('_', maxsplit=1)
        if len(parts) != 2:
            logger.warning(
                "User role %s does not have two parts, "
                "cannot process into jurisdiction and event type", role.name)
            continue
        jurisdiction, event_type = parts
        try:
            schema_file = load_schema_file(event_type)
        except FileNotFoundError:
            logger.warning(
                'User belongs to event_type %s that has no schema file',
                event_type)
            continue
        jurisdiction_roles.append({
            'jurisdictionSlug': jurisdiction,
            'jurisdiction': PRETTY_JURISDICTION_MAP.get(jurisdiction, jurisdiction),
            'eventTypeSlug': event_type,
            'eventType': schema_file.get('name'),
        })
    return jurisdiction_roles
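Every example in this listing calls load_schema_file(event_type), which is not shown. A minimal sketch of what such a helper might look like, assuming the goodtables schemas live as JSON files named after the event type (the schemas/ directory and the .json naming are assumptions; the returned dict must carry the 'name', 'fields', and 'primaryKey' keys the callers use):

import json
import os

SCHEMA_DIRECTORY = 'schemas'  # assumed location of the goodtables schema files


def load_schema_file(event_type):
    # Raises FileNotFoundError when no schema exists for the event type,
    # which get_jurisdiction_roles() above relies on.
    path = os.path.join(SCHEMA_DIRECTORY, '{}.json'.format(event_type))
    with open(path) as schema_file:
        return json.load(schema_file)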
Example #2
def add_missing_fields(event_type, infilename):
    goodtables_schema = load_schema_file(event_type)
    schema_fields = goodtables_schema['fields']
    outfilename = infilename + '.filled'
    delimiter = infer_delimiter(infilename)
    with open(infilename, 'r', newline='') as infileobj, \
            open(outfilename, 'w', newline='') as outfileobj:
        reader = csv.DictReader(lower_first(infileobj), delimiter=delimiter)
        writer = csv.DictWriter(
            outfileobj,
            fieldnames=[field['name'] for field in schema_fields],
            quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        try:
            for line in reader:
                newline = {}
                for field in schema_fields:
                    field_name = field['name']
                    if field_name not in line or not line[field_name]:
                        if field['type'] == 'integer':
                            newline[field_name] = None
                        else:
                            newline[field_name] = ''
                    else:
                        if field['type'] == 'string':
                            newline[field_name] = line[field_name].strip()
                        else:
                            newline[field_name] = line[field_name]
                writer.writerow(newline)
        except Exception as e:
            raise ValueError(
                f'Line {reader.line_num} has error: {e}') from e
    return outfilename
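add_missing_fields (and validate_header below) also leans on an infer_delimiter helper that is not part of the listing. A plausible sketch built on the standard library's csv.Sniffer; the sample size and candidate delimiters are assumptions:

import csv


def infer_delimiter(filename, sample_size=8192):
    # Sniff the delimiter from the first few KB of the file; ',' and '|'
    # both appear in this codebase, so offer them as candidates.
    with open(filename, 'r', newline='') as infileobj:
        sample = infileobj.read(sample_size)
    return csv.Sniffer().sniff(sample, delimiters=',|\t').delimiter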
Example #3
def create_merged_table(jurisdiction, event_type, db_session):
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    full_column_list = master_table_column_list(goodtables_schema)
    create = create_statement_from_column_list(full_column_list,
                                               master_table_name,
                                               goodtables_schema['primaryKey'])
    # create table if it does not exist
    logging.info('Assembled create-if-not-exists table statement: %s', create)
    db_session.execute(create)
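generate_master_table_name and create_statement_from_column_list are project helpers whose call sites pin down their shapes: the first takes (jurisdiction, event_type), the second takes (column_list, table_name, primary_key), where column_list appears to hold (name, sql_type) pairs (see example #7). Rough sketches under those assumptions; the naming convention and generated SQL are guesses:

def generate_master_table_name(jurisdiction, event_type):
    # Assumed convention: one master table per jurisdiction/event-type pair.
    return '{}_{}_master'.format(jurisdiction, event_type)


def create_statement_from_column_list(column_list, table_name, primary_key):
    # column_list is assumed to be (name, sql_type) pairs.
    columns = ', '.join('"{}" {}'.format(name, sql_type)
                        for name, sql_type in column_list)
    pk = ', '.join('"{}"'.format(col) for col in primary_key)
    return 'create table if not exists "{}" ({}, primary key ({}))'.format(
        table_name, columns, pk)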
Example #4
def validate_header(event_type, filename_without_all_fields):
    goodtables_schema = load_schema_file(event_type)
    schema_fields = goodtables_schema['fields']
    delimiter = infer_delimiter(filename_without_all_fields)
    required_field_names = set(
        field['name'] for field in schema_fields
        if field.get('constraints', {}).get('required', False))
    with open(filename_without_all_fields, 'r', newline='') as infileobj:
        reader = csv.DictReader(lower_first(infileobj), delimiter=delimiter)
        first_line = next(reader)
        for required_field_name in required_field_names:
            if required_field_name not in first_line:
                raise ValueError(
                    f"Field name {required_field_name} is required for {event_type} schema but is not present"
                )
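lower_first, used here and in add_missing_fields, evidently normalizes the header row before csv.DictReader parses it, so field names can be compared against the schema. A minimal sketch, assuming its only job is to lowercase the first line:

def lower_first(iterator):
    # Lowercase the header line so field names match the schema; pass
    # every other line through untouched.
    for index, line in enumerate(iterator):
        yield line.lower() if index == 0 else line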
Example #5
def create_and_populate_merged_table(table_name, data, db_engine):
    schema = load_schema_file('test')
    n_fields = len(schema['fields'])
    for row in data:
        assert len(row) == n_fields, \
            "sample merged data must have same # of fields as test schema"
    placeholder_string = ', '.join(['%s'] * n_fields)
    master_table_name = generate_master_table_name('test', 'test')
    column_list = master_table_column_list(schema)
    create = create_statement_from_column_list(column_list, master_table_name,
                                               schema['primaryKey'])
    db_engine.execute(create)
    for row in data:
        db_engine.execute(
            'insert into "{}" values ({}, now(), now())'.format(
                master_table_name, placeholder_string), *row)
Example #6
def create_and_populate_raw_table(raw_table, data, db_engine):
    schema = load_schema_file('test')
    n_fields = len(schema['fields'])
    for row in data:
        assert len(row) == n_fields, \
            "sample raw data must have same # of fields as test schema"
    placeholder_string = ', '.join(['%s'] * n_fields)

    create = create_statement_from_goodtables_schema(schema, raw_table)
    db_engine.execute(create)
    for row in data:
        db_engine.execute(
            'insert into "{}" values ({})'.format(raw_table,
                                                  placeholder_string), *row)
    db_engine.execute(
        'insert into upload_log (id, jurisdiction_slug, event_type_slug) values (%s, %s, %s)',
        raw_table, 'test', 'test')
Example #7
def upsert_raw_table_to_master(raw_table_name, jurisdiction, event_type,
                               upload_id, db_session):
    create_merged_table(jurisdiction, event_type, db_session)
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    base_column_list = column_list_from_goodtables_schema(goodtables_schema)
    # use new postgres 'on conflict' functionality to upsert
    update_statements = [
        ' "{column}" = EXCLUDED."{column}"'.format(column=column_def[0])
        for column_def in base_column_list
    ]
    start_ts = datetime.today()
    insert_sql = '''
        insert into {master}
        select raw.*, '{new_ts}' inserted_ts, '{new_ts}' updated_ts, row_number() over ()::text || '{event_type}' as matched_id
        from "{raw}" as raw
        on conflict ({primary_key})
        do update set {update_string}, updated_ts = '{new_ts}'
    '''.format(raw=raw_table_name,
               master=master_table_name,
               event_type=event_type,
               primary_key=', '.join([
                   "\"{}\"".format(col)
                   for col in goodtables_schema['primaryKey']
               ]),
               update_string=', '.join(update_statements),
               new_ts=start_ts.isoformat())
    logging.info('Executing insert: %s', insert_sql)
    db_session.execute(insert_sql)
    end_ts = datetime.today()
    merge_log = MergeLog(
        upload_id=upload_id,
        total_unique_rows=total_unique_rows(raw_table_name,
                                            goodtables_schema['primaryKey'],
                                            db_session),
        new_unique_rows=new_unique_rows(master_table_name, start_ts,
                                        db_session),
        merge_start_timestamp=start_ts,
        merge_complete_timestamp=end_ts,
    )
    db_session.add(merge_log)
    db_session.execute('drop table "{}"'.format(raw_table_name))
    db_session.commit()
    return merge_log.id
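total_unique_rows and new_unique_rows are assumed to be small count helpers feeding the MergeLog record. Plausible sketches; the exact SQL is a guess, but the inserted_ts comparison follows from the insert statement above, which stamps newly inserted rows with start_ts:

def total_unique_rows(raw_table_name, primary_key, db_session):
    # Count distinct primary-key combinations in the raw upload.
    pk = ', '.join('"{}"'.format(col) for col in primary_key)
    return db_session.execute(
        'select count(*) from (select distinct {} from "{}") pks'.format(
            pk, raw_table_name)).scalar()


def new_unique_rows(master_table_name, start_ts, db_session):
    # Rows stamped with this merge run's timestamp were newly inserted.
    return db_session.execute(
        "select count(*) from {} where inserted_ts = '{}'".format(
            master_table_name, start_ts.isoformat())).scalar()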
Example #8
def copy_raw_table_to_db(full_path, event_type, upload_id, db_engine):
    goodtables_schema = load_schema_file(event_type)
    logging.info('Loaded schema: %s', goodtables_schema)
    table_name = generate_raw_table_name(upload_id)
    create_statement = create_statement_from_goodtables_schema(
        goodtables_schema, table_name)
    logging.info('Assembled create table statement: %s', create_statement)
    db_engine.execute(create_statement)
    logging.info('Successfully created table')
    primary_key = primary_key_statement(goodtables_schema['primaryKey'])
    with open_sesame(full_path, 'rb') as infile:
        conn = db_engine.raw_connection()
        cursor = conn.cursor()
        copy_stmt = 'copy "{}" from stdin with csv force not null {} header delimiter as \',\''.format(
            table_name, primary_key)
        try:
            cursor.copy_expert(copy_stmt, infile)
            conn.commit()
        except psycopg2.IntegrityError as e:
            error_message = str(e)
            conn.rollback()
            if 'duplicate key value violates unique constraint' not in error_message:
                raise
            error_message_lines = error_message.split('\n')
            if len(error_message_lines) < 3:
                raise
            line_no_match = re.match(r'^.*(line \d+)', error_message_lines[2])
            if not line_no_match:
                raise
            line_no = line_no_match.group(1)
            raise ValueError(
                f"Duplicate key value found on {line_no}. {error_message_lines[1]}"
            )
        finally:
            conn.close()
    logging.info('Successfully loaded file')
    return table_name
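open_sesame is presumably a drop-in replacement for open() that also handles non-local paths (the name and call site are all we have to go on). A local-only sketch under that assumption:

from contextlib import contextmanager


@contextmanager
def open_sesame(path, mode='rb'):
    # Assumed behavior: dispatch on the path scheme (e.g. s3://) and fall
    # back to the local filesystem; only the local case is sketched here.
    with open(path, mode) as filehandle:
        yield filehandle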
Example #9
def write_matches_to_db(db_engine, event_type, jurisdiction,
                        matches_filehandle):
    goodtables_schema = load_schema_file(event_type)
    table_name = generate_master_table_name(event_type=event_type,
                                            jurisdiction=jurisdiction)
    logging.info('Writing matches for %s / %s to table %s', event_type,
                 jurisdiction, table_name)
    reader = csv.reader(matches_filehandle, delimiter='|')
    ordered_column_names = next(reader)
    matches_filehandle.seek(0)

    # 1. create pseudo-temporary table for the raw matches file
    # use the CSV's column order but grab the definitions from the goodtables schema
    unordered_column_list = column_list_from_goodtables_schema(
        goodtables_schema)
    primary_key = goodtables_schema['primaryKey']

    all_columns = [
        ('matched_id', 'varchar')
    ] + [col for col in unordered_column_list if col[0] in primary_key]
    column_definitions = dict((col[0], col) for col in all_columns)
    ordered_column_list = [
        column_definitions[ordered_column_name]
        for ordered_column_name in ordered_column_names
    ]
    logging.info('Final column list for temporary matches-only table: %s',
                 ordered_column_list)
    create = create_statement_from_column_list(ordered_column_list, table_name,
                                               primary_key)
    temp_table_name = 'temp_matched_merge_tbl'
    create = create.replace(table_name, temp_table_name)
    logging.info(create)
    db_engine.execute(create)

    # 2. copy data from the filehandle into the temporary table
    conn = db_engine.raw_connection()
    cursor = conn.cursor()
    pk = ','.join(primary_key)
    copy_stmt = 'copy {} from stdin with csv header delimiter as \'|\' force not null {}'.format(
        temp_table_name, pk)
    try:
        logging.info(copy_stmt)
        cursor.copy_expert(copy_stmt, matches_filehandle)
        logging.info('Status message after COPY: %s', cursor.statusmessage)
        for notice in conn.notices:
            logging.info('Notice from database connection: %s', notice)
        conn.commit()
        cursor = conn.cursor()
        cursor.execute('select * from {} limit 5'.format(temp_table_name))
        logging.info('First five rows: %s', [row for row in cursor])
        big_query = """
update {matched_table} as m set matched_id = regexp_replace(tmp.matched_id::text, '[^\w]', '', 'g')
        from {temp_table_name} tmp where ({pk}) """.format(
            create=create,
            matched_table=table_name,
            temp_table_name=temp_table_name,
            pk=' and '.join([
                'tmp.{col} = m.{col}'.format(col=col) for col in primary_key
            ]))
        logging.info('Updating matches in %s with rows from %s', table_name,
                     temp_table_name)
        logging.info(big_query)
        cursor.execute(big_query)
        logging.info('Status message after UPDATE: %s', cursor.statusmessage)
        conn.commit()
    except Exception as e:
        logging.error(
            'Error encountered! Rolling back merge of matched ids. Original error: %s',
            str(e))
        conn.rollback()
    finally:
        db_engine.execute('drop table if exists {}'.format(temp_table_name))
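column_list_from_goodtables_schema, which examples #7 and #9 both lean on, evidently maps goodtables fields to (name, sql_type) pairs. A sketch under that assumption; the type mapping is a guess:

def column_list_from_goodtables_schema(goodtables_schema):
    # Assumed mapping from goodtables field types to Postgres types;
    # deliberately incomplete, defaulting unknown types to varchar.
    type_map = {'string': 'varchar', 'integer': 'integer', 'date': 'date'}
    return [(field['name'], type_map.get(field['type'], 'varchar'))
            for field in goodtables_schema['fields']]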
Example #10
import argparse
import os
from collections import Counter

parser = argparse.ArgumentParser(description='Test a DDJ schema file')
parser.add_argument('--jurisdiction', dest='jurisdiction', type=str)
parser.add_argument('--event-type', dest='event_type', type=str)
parser.add_argument('--row-limit', dest='row_limit', type=int)
parser.add_argument('--print-errors', dest='print_errors_of_type', type=str)

args = parser.parse_args()
path = '{}/{}/{}'.format(os.environ['MATCHING_DATA_PATH'], args.jurisdiction,
                         args.event_type)
print(args.jurisdiction)
print(args.event_type)
print(args.row_limit)
print(args.print_errors_of_type)
schema = load_schema_file(args.event_type)
colname_lookup = dict((colindex + 1, field['name'])
                      for colindex, field in enumerate(schema['fields']))
print(colname_lookup)
report = tasks.fill_and_validate(args.event_type, path, args.row_limit)
print(report['error-count'])
counter = Counter()
print('\n\n\n\n\n')
print('------------------------------')
print('PRINTING FIRST 10 ERRORS')
print('------------------------------')
print('\n\n\n\n\n')
for error_num in range(0, min(report['error-count'], 10)):
    print(report['tables'][0]['errors'][error_num])
for error in report['tables'][0]['errors']:
    # Assumed intent of the loop: tally error codes and honor --print-errors.
    counter[error['code']] += 1
    if error['code'] == args.print_errors_of_type:
        print(error)
print(counter)