import csv as csv_module
import json
import logging
import os
from multiprocessing import Pool
from time import time as t_time

from sqlalchemy import MetaData, create_engine
from sqlalchemy.engine.reflection import Inspector
from sqlalchemy.exc import OperationalError

# Project-local helpers (json_comment_filter, get_latest_version, the
# get_*_dir() config accessors, unzip_single, iterate_fields,
# table_from_fields, drop_create_table, create_views and the *_tup wrappers)
# are assumed to be defined elsewhere in this module/package.


def iterate_unzipped(file_prefixes=None):
    """ Yield (fprefix, filename, file_path, file_fields) for each unzipped
        feed file that has a spec in file-fields.json, unzipping (or
        re-unzipping after a version change) as needed """
    log = logging.getLogger('iterate_unzipped')
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    for fprefix in sorted(file_prefixes):
        version = get_latest_version(fprefix).lstrip('F')
        unzip_dir = os.path.join(get_unzip_dir(), fprefix)
        if not os.path.isdir(unzip_dir):
            unzip([fprefix])
        else:
            with open(os.path.join(unzip_dir, '.version.' + fprefix), 'r') as f:
                unzip_version = f.read().strip()
            if unzip_version != version:
                log.warning('%s: Newer ZIP file available, unzipping again'
                            % (fprefix))
                unzip([fprefix])
        existing = os.listdir(unzip_dir)
        for filename in sorted(existing):
            if filename in ['DAT'] or filename.startswith('.version'):
                continue
            if filename not in file_fields[fprefix]:
                log.warning('%s: Missing spec for %s' % (fprefix, filename))
                continue
            log.debug('%s: Found spec for %s' % (fprefix, filename))
            file_path = os.path.join(unzip_dir, filename)
            yield fprefix, filename, file_path, file_fields
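# A minimal usage sketch for iterate_unzipped(), assuming 'RJFA' is one of
# the feed prefixes in file-fields.json (the prefix is illustrative, borrowed
# from the csv() docstring below):
#
#     for fprefix, filename, file_path, file_fields in iterate_unzipped(['RJFA']):
#         print('%s/%s at %s' % (fprefix, filename, file_path))
#         print('record types:', sorted(file_fields[fprefix][filename]))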
def unzip(file_prefixes=None):
    """ Extract downloaded zip files into their own folder under /feeds/ """
    log = logging.getLogger('targets_unzip')
    stime = t_time()
    if not file_prefixes:
        file_fields = json_comment_filter(
            json.load(open('file-fields.json', 'r')))
        file_prefixes = file_fields.keys()
    todo = []
    for ftname in file_prefixes:
        if ftname.lower().endswith('.zip'):
            zipfilename = ftname
            ftname = ftname[:-4]
        else:
            lv = get_latest_version(ftname)
            zipfilename = ftname + lv + '.ZIP'
        zpath = os.path.join(get_download_dir(), zipfilename)
        if False:  # don't multiprocess
            unzip_single(zpath, ftname)
        else:
            todo.append((zpath, ftname))
    if todo:
        with Pool() as pool:
            for _ in pool.imap_unordered(unzip_single_tup, todo):
                pass
    log.debug('unzip: %ds total time' % (t_time() - stime))
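# Pool.imap_unordered passes each work item as a single argument, so the
# *_tup helpers used here and below (unzip_single_tup, file_to_csv_tup,
# csv_to_table_tup) are assumed to be thin tuple-unpacking wrappers defined
# elsewhere in this module, along the lines of:
#
#     def unzip_single_tup(args):
#         return unzip_single(*args)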
def csv(file_prefixes=None):
    """ Transform each file in downloaded ZIP to csv format under a /csv/
        directory specified in local.cfg.
        Files with multiple record types are output to multiple csv files,
        e.g. /RJFAF123.TOC becomes
            /csv/RJFA-TOC-T.123.csv (main train operating company ids and names)
            /csv/RJFA-TOC-F.123.csv (additional toc fare ids)
    """
    log = logging.getLogger('targets_csv')
    stime = t_time()
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    versions = {}
    done = []
    todo = []
    for fprefix, filename, file_path, file_fields in iterate_unzipped(
            file_prefixes):
        if fprefix not in versions:
            version = get_latest_version(fprefix).lstrip('F')
            versions[fprefix] = version
        if False:  # don't multiprocess
            _, _, csv_files = file_to_csv(fprefix, filename, file_path,
                                          file_fields)
            done.extend(csv_files)
            if csv_files:
                log.info('Finished processing %s/%s - %d csv file(s)'
                         % (fprefix, filename, len(csv_files)))
        else:
            todo.append((fprefix, filename, file_path, file_fields))
    if todo:
        n = 1
        with Pool() as pool:
            for fprefix, filename, csv_files in pool.imap_unordered(
                    file_to_csv_tup, todo):
                csv_msg = ''
                if len(csv_files) > 1:
                    csv_msg = '- %d csv files' % (len(csv_files))
                if len(csv_files) > 0:
                    log.info('Finished processing %s/%s (%d of %d) %s'
                             % (fprefix, filename, n, len(todo), csv_msg))
                n += 1
                done.extend(csv_files)
    # remove old versions of files
    csv_dir = get_csv_dir()
    for fname in os.listdir(csv_dir):
        if fname.endswith('.csv') and fname not in done and fname.split(
                '-')[0] in file_prefixes:
            os.unlink(os.path.join(csv_dir, fname))
    for fprefix in versions:
        version_file = os.path.join(csv_dir, '.version.' + fprefix)
        with open(version_file, 'w') as vf:
            vf.write(versions[fprefix] + '\n')
    log.debug('csv: %ds total time' % (t_time() - stime))
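# A sketch of what a csv() run leaves behind, assuming the RJFA feed from
# the docstring above at latest version 123 (values illustrative):
#
#     csv(['RJFA'])
#     # csv dir now contains RJFA-TOC-T.123.csv, RJFA-TOC-F.123.csv, ...
#     # plus a .version.RJFA stamp file; CSVs from older RJFA versions
#     # have been unlinked.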
def csv_to_table(engine, metadata, fprefix, filename, record_type, fields,
                 pks=[], csv_path=None):
    """ WARNING: this drops and recreates tables
        table schema is as specified in file-fields.json and field-pks.json
    """
    log = logging.getLogger('targets_postgresql_csv_to_table')
    version = get_latest_version(fprefix).lstrip('F')
    if csv_path is None:
        if record_type:
            csv_name = '%s-%s-%s.%s.csv' % (fprefix, filename, record_type,
                                            version)
        else:
            csv_name = '%s-%s.%s.csv' % (fprefix, filename, version)
        csv_path = os.path.join(get_remote_csv_dir(), csv_name)
    table = table_from_fields(engine, metadata, fprefix, filename,
                              record_type, fields, pks)
    inspector = Inspector.from_engine(engine)
    # True when the table already exists, i.e. we are *re*creating it
    creating = table.name in inspector.get_table_names()
    connection = engine.connect()
    trans = connection.begin()
    try:
        drop_create_table(connection, table)
        # data insert using COPY method
        force_not_null = ''
        if pks and pks != ['invalid']:
            force_not_null = ', FORCE_NOT_NULL ("%s")' % ('", "'.join(
                [p.lower() for p in pks]))
        connection.execute("""COPY "%s" FROM '%s' WITH (FORMAT CSV, HEADER%s);
        """ % (table.name, csv_path, force_not_null))
        trans.commit()
    except OperationalError as oe:
        trans.rollback()
        if 'No such file or directory' in str(oe):
            if creating:
                log.warning('%s not found, table kept as-is' % (csv_path))
            else:
                log.warning('%s not found, no table created' % (csv_path))
            table.name = None
        else:
            raise
    return table.name, creating
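# For illustration, the COPY statement built above for fprefix='RJFA',
# filename='TOC', record_type='T' and pks=['TOC_ID'] (a hypothetical key,
# with a hypothetical lowercased table name) would read:
#
#     COPY "rjfa_toc_t" FROM '/path/to/csv/RJFA-TOC-T.123.csv'
#         WITH (FORMAT CSV, HEADER, FORCE_NOT_NULL ("toc_id"));
#
# FORCE_NOT_NULL keeps key columns as blanks rather than NULLs, matching the
# postgresql() docstring below.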
def file_to_csv(fprefix, filename, file_path=None, file_fields=None):
    """ A file can have multiple record types. Output separate CSV files for
        each one """
    version = get_latest_version(fprefix).lstrip('F')
    if file_path is None:
        file_path = os.path.join(get_download_dir(), fprefix, filename)
    if file_fields is None:
        file_fields = json_comment_filter(
            json.load(open('file-fields.json', 'r')))
    csv_files = {}
    csv_writers = {}
    try:
        fields = file_fields[fprefix][filename]
        for record in iterate_fields(file_path, fields):
            k = record.get('RECORD_TYPE', '')
            if k not in csv_writers:
                fsuffix = ''
                if k != '':
                    fsuffix = '-' + k
                csv_filename = (fprefix + '-' + filename + fsuffix
                                + '.' + version + '.csv')
                csv_path = os.path.join(get_csv_dir(), csv_filename)
                # newline='' is the documented way to open files for the
                # csv module
                csv_files[csv_filename] = open(csv_path, 'w', newline='')
                fieldnames = [f[0] if not isinstance(f, str) else f
                              for f in fields[k]]
                fieldnames = [f for f in fieldnames
                              if f not in ['RECORD_TYPE', 'UPDATE_MARKER']]
                csv_writers[k] = csv_module.DictWriter(
                    csv_files[csv_filename], fieldnames=fieldnames)
                csv_writers[k].writeheader()
            if k:
                del record['RECORD_TYPE']  # contained in filename
            csv_writers[k].writerow(record)
    finally:
        for f in csv_files.values():
            f.close()
    return fprefix, filename, list(csv_files.keys())
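# The fields spec consumed above comes from file-fields.json; the shape this
# function implies (field names illustrative, not the real spec) is roughly:
#
#     {"RJFA": {"TOC": {
#         "T": ["RECORD_TYPE", "TOC_ID", "TOC_NAME"],
#         "F": ["RECORD_TYPE", "FARE_TOC_ID", "TOC_ID"]
#     }}}
#
# Each entry is either a plain field name or a tuple whose first element is
# the name; RECORD_TYPE and UPDATE_MARKER are dropped from the CSV header.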
def postgresql(file_prefixes=None):
    """ Move CSV files into corresponding postgresql tables using bulk
        postgresql COPY command.
        Table names and columns are lowercased for ease of working in SQL.
        Types conversion:
            Date: Applicable columns ending in '_DATE'
            Time: Applicable columns ending in '_TIME'
        The CSV files must be on the same server as the postgres db and
        readable by the postgres process.
        Composite primary keys have blanks (rather than null) in columns.
    """
    log = logging.getLogger('targets_postgresql')
    stime = t_time()
    dburi = get_dburi()
    engine = create_engine(dburi)
    # trigger connection-related exceptions now, e.g. if db doesn't exist
    connection = engine.connect()
    metadata = MetaData()
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    field_pks = json_comment_filter(json.load(open('field-pks.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    todo = []
    for fprefix in sorted(file_prefixes):
        csv_dir = get_remote_csv_dir()
        if not os.path.exists(csv_dir) and csv_dir != get_csv_dir():
            # We don't have access to the directory we'll be COPYing from.
            # Versions are included in CSV filenames so the db server will
            # fail to COPY from an old file
            pass
        else:
            if not os.path.exists(csv_dir):
                csv([fprefix])
            else:
                with open(os.path.join(csv_dir, '.version.' + fprefix),
                          'r') as f:
                    csv_version = f.read().strip()
                if csv_version != get_latest_version(fprefix).lstrip('F'):
                    log.warning(
                        '%s: Newer version available, converting to CSV again'
                        % (fprefix))
                    csv([fprefix])
        for filename in file_fields[fprefix]:
            for record_type, fields in file_fields[fprefix][filename].items():
                pks = field_pks.get(fprefix, {}).get(filename,
                                                     {}).get(record_type, [])
                if not fields:
                    log.warning('%s: Missing spec for %s %s'
                                % (fprefix, filename, record_type))
                    continue
                if False:  # don't multiprocess
                    table_name, creating = csv_to_table(
                        engine, metadata, fprefix, filename, record_type,
                        fields, pks)
                    if table_name and creating:
                        log.info('Finished recreating %s' % (table_name))
                    elif table_name:
                        log.info('Finished creating %s' % (table_name))
                else:
                    todo.append((fprefix, filename, record_type, fields, pks))
    if todo:
        n = 1
        with Pool() as pool:
            for table_name, creating in pool.imap_unordered(
                    csv_to_table_tup, todo):
                if table_name and creating:
                    log.info('Finished recreating %s (%d of %d)'
                             % (table_name, n, len(todo)))
                elif table_name:
                    log.info('Finished creating %s (%d of %d)'
                             % (table_name, n, len(todo)))
                n += 1
    if full_view_refresh:  # module-level flag, assumed defined elsewhere
        create_views(connection)
    log.debug('csv to postgresql: %ds total time' % (t_time() - stime))
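# A minimal end-to-end sketch, assuming local.cfg points at a usable
# download/csv layout and a reachable database (feed prefix illustrative):
#
#     unzip(['RJFA'])       # extract the downloaded ZIP
#     csv(['RJFA'])         # write one CSV per record type
#     postgresql(['RJFA'])  # bulk COPY the CSVs into postgres tables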