def get_jurisdiction_roles():
    jurisdiction_roles = []
    for role in current_user.roles:
        if not role.name:
            logger.warning("User Role %s has no name", role)
            continue
        parts = role.name.split('_', maxsplit=1)
        if len(parts) != 2:
            logger.warning(
                "User role %s does not have two parts, "
                "cannot process into jurisdiction and event type", role.name)
            continue
        jurisdiction, event_type = parts
        try:
            schema_file = load_schema_file(event_type)
        except FileNotFoundError:
            logger.warning(
                'User belongs to event_type %s that has no schema file',
                event_type)
            continue
        jurisdiction_roles.append({
            'jurisdictionSlug': jurisdiction,
            'jurisdiction': PRETTY_JURISDICTION_MAP.get(jurisdiction, jurisdiction),
            'eventTypeSlug': event_type,
            'eventType': schema_file.get('name')
        })
    return jurisdiction_roles
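# Illustrative note on the role-name convention assumed above (the example role
# name and values here are hypothetical, not taken from real configuration):
# a role named 'il_jail_bookings' splits on the first underscore into the
# jurisdiction slug 'il' and the event type slug 'jail_bookings', producing
#   {'jurisdictionSlug': 'il',
#    'jurisdiction': PRETTY_JURISDICTION_MAP.get('il', 'il'),
#    'eventTypeSlug': 'jail_bookings',
#    'eventType': <the 'name' entry of the jail_bookings schema file>}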
def add_missing_fields(event_type, infilename):
    goodtables_schema = load_schema_file(event_type)
    schema_fields = goodtables_schema['fields']
    outfilename = infilename + '.filled'
    delimiter = infer_delimiter(infilename)
    with open(infilename, 'rb') as infileobj, \
            open(outfilename, 'w', newline='') as outfileobj:
        reader = csv.DictReader(lower_first(infileobj), delimiter=delimiter)
        writer = csv.DictWriter(
            outfileobj,
            fieldnames=[field['name'] for field in schema_fields],
            quoting=csv.QUOTE_MINIMAL)
        writer.writeheader()
        try:
            for line in reader:
                newline = {}
                for field in schema_fields:
                    field_name = field['name']
                    if field_name not in line or not line[field_name]:
                        # empty integer columns become NULL, everything else an empty string
                        newline[field_name] = None if field['type'] == 'integer' else ''
                    elif field['type'] == 'string':
                        newline[field_name] = line[field_name].strip()
                    else:
                        newline[field_name] = line[field_name]
                writer.writerow(newline)
        except Exception as e:
            raise ValueError('Line {} has error: {}'.format(reader.line_num, e))
    return outfilename
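# The helpers infer_delimiter and lower_first used above are defined elsewhere
# in the codebase; the sketches below are assumptions about what they might do,
# not the actual implementations (they assume csv is already imported).
def infer_delimiter_sketch(filename, sample_size=8192):
    # Guess the delimiter by sniffing the first few kilobytes of the file.
    with open(filename, 'r') as fileobj:
        sample = fileobj.read(sample_size)
    return csv.Sniffer().sniff(sample, delimiters=',|\t;').delimiter


def lower_first_sketch(lines):
    # Decode (if the file was opened in binary mode) and lowercase only the
    # header line, so DictReader keys match the schema's lowercase field names;
    # pass the remaining lines through unchanged.
    iterator = iter(lines)
    first = next(iterator)
    first = first.decode('utf-8') if isinstance(first, bytes) else first
    yield first.lower()
    for line in iterator:
        yield line.decode('utf-8') if isinstance(line, bytes) else line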
def create_merged_table(jurisdiction, event_type, db_session):
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    full_column_list = master_table_column_list(goodtables_schema)
    create = create_statement_from_column_list(full_column_list,
                                               master_table_name,
                                               goodtables_schema['primaryKey'])
    # create table if it does not exist
    logging.info('Assembled create-if-not-exists table statement: %s', create)
    db_session.execute(create)
def validate_header(event_type, filename_without_all_fields):
    goodtables_schema = load_schema_file(event_type)
    schema_fields = goodtables_schema['fields']
    delimiter = infer_delimiter(filename_without_all_fields)
    required_field_names = set(
        field['name'] for field in schema_fields
        if field.get('constraints', {}).get('required', False))
    with open(filename_without_all_fields, 'rb') as infileobj:
        reader = csv.DictReader(lower_first(infileobj), delimiter=delimiter)
        first_line = next(reader)
        for required_field_name in required_field_names:
            if required_field_name not in first_line:
                raise ValueError(
                    f"Field name {required_field_name} is required for {event_type} schema but is not present"
                )
def create_and_populate_merged_table(table_name, data, db_engine):
    schema = load_schema_file('test')
    n_fields = len(schema['fields'])
    for row in data:
        assert len(row) == n_fields, \
            "sample merged data must have same # of fields as test schema"
    placeholder_string = ', '.join(['%s'] * n_fields)
    master_table_name = generate_master_table_name('test', 'test')
    column_list = master_table_column_list(schema)
    create = create_statement_from_column_list(column_list, master_table_name,
                                               schema['primaryKey'])
    db_engine.execute(create)
    for row in data:
        db_engine.execute(
            'insert into "{}" values ({}, now(), now())'.format(
                master_table_name, placeholder_string), *row)
def create_and_populate_raw_table(raw_table, data, db_engine):
    schema = load_schema_file('test')
    n_fields = len(schema['fields'])
    for row in data:
        assert len(row) == n_fields, \
            "sample raw data must have same # of fields as test schema"
    placeholder_string = ', '.join(['%s'] * n_fields)
    create = create_statement_from_goodtables_schema(schema, raw_table)
    db_engine.execute(create)
    for row in data:
        db_engine.execute(
            'insert into "{}" values ({})'.format(raw_table, placeholder_string),
            *row)
    db_engine.execute(
        'insert into upload_log (id, jurisdiction_slug, event_type_slug) values (%s, %s, %s)',
        raw_table, 'test', 'test')
def upsert_raw_table_to_master(raw_table_name, jurisdiction, event_type,
                               upload_id, db_session):
    create_merged_table(jurisdiction, event_type, db_session)
    master_table_name = generate_master_table_name(jurisdiction, event_type)
    goodtables_schema = load_schema_file(event_type)
    base_column_list = column_list_from_goodtables_schema(goodtables_schema)
    # use new postgres 'on conflict' functionality to upsert
    update_statements = [
        ' "{column}" = EXCLUDED."{column}"'.format(column=column_def[0])
        for column_def in base_column_list
    ]
    start_ts = datetime.today()
    insert_sql = '''
        insert into {master}
        select raw.*,
            '{new_ts}' inserted_ts,
            '{new_ts}' updated_ts,
            row_number() over ()::text || '{event_type}' as matched_id
        from "{raw}" as raw
        on conflict ({primary_key})
        do update set {update_string}, updated_ts = '{new_ts}'
    '''.format(
        raw=raw_table_name,
        master=master_table_name,
        event_type=event_type,
        primary_key=', '.join(
            '"{}"'.format(col) for col in goodtables_schema['primaryKey']),
        update_string=', '.join(update_statements),
        new_ts=start_ts.isoformat())
    logging.info('Executing insert: %s', insert_sql)
    db_session.execute(insert_sql)
    end_ts = datetime.today()
    merge_log = MergeLog(
        upload_id=upload_id,
        total_unique_rows=total_unique_rows(
            raw_table_name, goodtables_schema['primaryKey'], db_session),
        new_unique_rows=new_unique_rows(master_table_name, start_ts, db_session),
        merge_start_timestamp=start_ts,
        merge_complete_timestamp=end_ts,
    )
    db_session.add(merge_log)
    db_session.execute('drop table "{}"'.format(raw_table_name))
    db_session.commit()
    return merge_log.id
def copy_raw_table_to_db(full_path, event_type, upload_id, db_engine):
    goodtables_schema = load_schema_file(event_type)
    logging.info('Loaded schema: %s', goodtables_schema)
    table_name = generate_raw_table_name(upload_id)
    create_statement = create_statement_from_goodtables_schema(
        goodtables_schema, table_name)
    logging.info('Assembled create table statement: %s', create_statement)
    db_engine.execute(create_statement)
    logging.info('Successfully created table')
    primary_key = primary_key_statement(goodtables_schema['primaryKey'])
    with open_sesame(full_path, 'rb') as infile:
        conn = db_engine.raw_connection()
        cursor = conn.cursor()
        copy_stmt = 'copy "{}" from stdin with csv force not null {} header delimiter as \',\''.format(
            table_name, primary_key)
        try:
            cursor.copy_expert(copy_stmt, infile)
            conn.commit()
        except psycopg2.IntegrityError as e:
            error_message = str(e)
            conn.rollback()
            if 'duplicate key value violates unique constraint' not in error_message:
                raise
            error_message_lines = error_message.split('\n')
            if len(error_message_lines) < 3:
                raise
            line_no_match = re.match(r'^.*(line \d+)', error_message_lines[2])
            if not line_no_match:
                raise
            line_no = line_no_match.group(1)
            raise ValueError(
                f"Duplicate key value found on {line_no}. {error_message_lines[1]}"
            )
        finally:
            conn.close()
    logging.info('Successfully loaded file')
    return table_name
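# A hedged usage sketch of how the two functions above chain together (the file
# path, slugs, upload id, engine, and session names are illustrative only):
#   raw_table = copy_raw_table_to_db('/tmp/upload.csv', 'bookings', upload_id, db_engine)
#   merge_log_id = upsert_raw_table_to_master(raw_table, 'il', 'bookings',
#                                             upload_id, db_session)
# copy_raw_table_to_db bulk-loads the CSV into a per-upload raw table, and
# upsert_raw_table_to_master then upserts that raw table into the jurisdiction /
# event-type master table and drops the raw table.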
def write_matches_to_db(db_engine, event_type, jurisdiction, matches_filehandle):
    goodtables_schema = load_schema_file(event_type)
    table_name = generate_master_table_name(event_type=event_type,
                                            jurisdiction=jurisdiction)
    logging.info('Writing matches for %s / %s to table %s', event_type,
                 jurisdiction, table_name)
    reader = csv.reader(matches_filehandle, delimiter='|')
    ordered_column_names = next(reader)
    matches_filehandle.seek(0)

    # 1. create pseudo-temporary table for the raw matches file
    # use the CSV's column order but grab the definitions from the goodtables schema
    unordered_column_list = column_list_from_goodtables_schema(goodtables_schema)
    primary_key = goodtables_schema['primaryKey']
    all_columns = [
        ('matched_id', 'varchar')
    ] + [col for col in unordered_column_list if col[0] in primary_key]
    column_definitions = dict((col[0], col) for col in all_columns)
    ordered_column_list = [
        column_definitions[ordered_column_name]
        for ordered_column_name in ordered_column_names
    ]
    logging.info('Final column list for temporary matches-only table: %s',
                 ordered_column_list)
    create = create_statement_from_column_list(ordered_column_list, table_name,
                                               primary_key)
    temp_table_name = 'temp_matched_merge_tbl'
    create = create.replace(table_name, temp_table_name)
    logging.info(create)
    db_engine.execute(create)

    # 2. copy data from the matches filehandle into the temporary table
    conn = db_engine.raw_connection()
    cursor = conn.cursor()
    pk = ','.join(primary_key)
    copy_stmt = 'copy {} from stdin with csv header delimiter as \'|\' force not null {}'.format(
        temp_table_name, pk)
    try:
        logging.info(copy_stmt)
        cursor.copy_expert(copy_stmt, matches_filehandle)
        logging.info('Status message after COPY: %s', cursor.statusmessage)
        for notice in conn.notices:
            logging.info('Notice from database connection: %s', notice)
        conn.commit()
        cursor = conn.cursor()
        cursor.execute('select * from {} limit 5'.format(temp_table_name))
        logging.info('First five rows: %s', [row for row in cursor])

        # 3. update matched_id in the master table from the temporary table,
        # joining on the primary key
        big_query = """
            update {matched_table} as m
            set matched_id = regexp_replace(tmp.matched_id::text, '[^\w]', '', 'g')
            from {temp_table_name} tmp
            where ({pk})
        """.format(
            matched_table=table_name,
            temp_table_name=temp_table_name,
            pk=' and '.join(
                'tmp.{col} = m.{col}'.format(col=col) for col in primary_key))
        logging.info('Updating matches in %s with rows from %s', table_name,
                     temp_table_name)
        logging.info(big_query)
        cursor.execute(big_query)
        logging.info('Status message after UPDATE: %s', cursor.statusmessage)
        conn.commit()
    except Exception as e:
        logging.error(
            'Error encountered! Rolling back merge of matched ids. Original error: %s',
            str(e))
        conn.rollback()
    finally:
        db_engine.execute('drop table if exists {}'.format(temp_table_name))
import argparse
import os
from collections import Counter

parser = argparse.ArgumentParser(description='Test a DDJ schema file')
parser.add_argument('--jurisdiction', dest='jurisdiction', type=str)
parser.add_argument('--event-type', dest='event_type', type=str)
parser.add_argument('--row-limit', dest='row_limit', type=int)
parser.add_argument('--print-errors', dest='print_errors_of_type', type=str)
args = parser.parse_args()
path = '{}/{}/{}'.format(os.environ['MATCHING_DATA_PATH'], args.jurisdiction,
                         args.event_type)
print(args.jurisdiction)
print(args.event_type)
print(args.row_limit)
print(args.print_errors_of_type)
schema = load_schema_file(args.event_type)
colname_lookup = dict((colindex + 1, field['name'])
                      for colindex, field in enumerate(schema['fields']))
print(colname_lookup)
report = tasks.fill_and_validate(args.event_type, path, args.row_limit)
print(report['error-count'])
counter = Counter()
print('\n\n\n\n\n')
print('------------------------------')
print('PRINTING FIRST 10 ERRORS')
print('------------------------------')
print('\n\n\n\n\n')
for error_num in range(0, min(report['error-count'], 10)):
    print(report['tables'][0]['errors'][error_num])
for error in report['tables'][0]['errors']:
    # assumption: tally errors by (code, offending column name) and print any
    # errors of the type requested via --print-errors
    column_name = colname_lookup.get(error.get('column-number'))
    counter[(error['code'], column_name)] += 1
    if args.print_errors_of_type and error['code'] == args.print_errors_of_type:
        print(error)
print(counter)