Example #1
def iterate_unzipped(file_prefixes=None):
    """
    Yield (fprefix, filename, file_path, file_fields) for each unzipped
    feed file that has a spec in file-fields.json, unzipping feeds (or
    re-unzipping when a newer ZIP is available) as needed.
    """
    log = logging.getLogger('iterate_unzipped')
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    for fprefix in sorted(file_prefixes):
        version = get_latest_version(fprefix).lstrip('F')
        unzip_dir = os.path.join(get_unzip_dir(), fprefix)
        if not os.path.isdir(unzip_dir):
            unzip([fprefix])
        else:
            with open(os.path.join(unzip_dir, '.version.' + fprefix), 'r') as f:
                unzip_version = f.read().strip()
            if unzip_version != version:
                log.warning('%s: Newer ZIP file available, unzipping again' % (fprefix))
                unzip([fprefix])
        existing = os.listdir(unzip_dir)
        for filename in sorted(existing):
            if filename in ['DAT'] or filename.startswith('.version'):
                continue
            if filename not in file_fields[fprefix]:
                log.warning('%s: Missing spec for %s' % (fprefix, filename))
                continue
            log.debug('%s: Found spec for %s' % (fprefix, filename))
            file_path = os.path.join(unzip_dir, filename)
            yield fprefix, filename, file_path, file_fields
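
A minimal usage sketch for the generator above, assuming the module-level helpers and file-fields.json it relies on are available ('RJFA' is the fares feed prefix used in the later examples):

for fprefix, filename, file_path, file_fields in iterate_unzipped(['RJFA']):
    print(fprefix, filename, file_path)
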
Example #2
def unzip(file_prefixes=None):
    """
    Extract downloaded zip files into their own folder under /feeds/
    """
    log = logging.getLogger('targets_unzip')
    stime = t_time()
    if not file_prefixes:
        file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
        file_prefixes = file_fields.keys()
    todo = []
    for ftname in file_prefixes:
        if ftname.lower().endswith('.zip'):
            zipfilename = ftname
            ftname = ftname[:-4]
        else:
            lv = get_latest_version(ftname)
            zipfilename = ftname + lv + '.ZIP'
        zpath = os.path.join(get_download_dir(), zipfilename)
        if False:
            # flip to True to unzip serially (no Pool), e.g. when debugging
            unzip_single(zpath, ftname)
        else:
            todo.append((zpath, ftname))
    if todo:
        with Pool() as pool:
            for _ in pool.imap_unordered(unzip_single_tup, todo):
                pass
    log.debug('unzip: %ds total time' % (t_time()-stime))
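
pool.imap_unordered passes a single argument to each worker, so unzip_single_tup (like the file_to_csv_tup and csv_to_table_tup helpers used in the later examples) is presumably a thin tuple-unpacking wrapper along these lines (a sketch, not the actual definition):

def unzip_single_tup(args):
    # unpack the (zpath, ftname) tuple queued in todo above
    return unzip_single(*args)
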
Example #3
def csv(file_prefixes=None):
    """
    Transform each file in downloaded ZIP to csv format under a /csv/
    directory specified in local.cfg.
    Files with multiple record types are output to multiple csv files
    e.g. /RJFAF123.TOC becomes
        /csv/RJFA-TOC-T.CSV (main train operating company ids and names)
        /csv/RJFA-TOC-F.CSV (additional toc fare ids)
    """
    log = logging.getLogger('targets_csv')
    stime = t_time()
    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()

    versions = {}
    done = []
    todo = []
    for fprefix, filename, file_path, file_fields in iterate_unzipped(
            file_prefixes):
        if fprefix not in versions:
            version = get_latest_version(fprefix).lstrip('F')
            versions[fprefix] = version
        if False:
            # flip to True to convert serially (no Pool), e.g. when debugging
            _, _, csv_files = file_to_csv(fprefix, filename, file_path,
                                          file_fields)
            done.extend(csv_files)
            if csv_files:
                log.info('Finished processing %s/%s %s csv file(s)' %
                         (fprefix, filename, len(csv_files)))
        else:
            todo.append((fprefix, filename, file_path, file_fields))
    if todo:
        n = 1
        with Pool() as pool:
            for fprefix, filename, csv_files in pool.imap_unordered(
                    file_to_csv_tup, todo):
                csv_msg = ''
                if len(csv_files) > 1:
                    csv_msg = '- %d csv files' % (len(csv_files))
                if len(csv_files) > 0:
                    log.info('Finished processing %s/%s (%d of %d) %s' %
                             (fprefix, filename, n, len(todo), csv_msg))
                n += 1
                done.extend(csv_files)

    # remove old versions of files
    csv_dir = get_csv_dir()
    for fname in os.listdir(csv_dir):
        if fname.endswith('.csv') and fname not in done and fname.split(
                '-')[0] in file_prefixes:
            os.unlink(os.path.join(csv_dir, fname))

    for fprefix in versions:
        version_file = os.path.join(csv_dir, '.version.' + fprefix)
        with open(version_file, 'w') as vf:
            vf.write(versions[fprefix] + '\n')

    log.debug('csv: %ds total time' % (t_time() - stime))
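
A hypothetical invocation for a single feed prefix; the output names follow the <prefix>-<filename>[-<record type>].<version>.csv pattern described in the docstring above:

csv(['RJFA'])
for name in sorted(os.listdir(get_csv_dir())):
    if name.startswith('RJFA-'):
        print(name)  # e.g. RJFA-TOC-T.123.csv
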
Example #4
def csv_to_table(engine,
                 metadata,
                 fprefix,
                 filename,
                 record_type,
                 fields,
                 pks=[],
                 csv_path=None):
    """
    WARNING: this drops and recreates tables
    table schema is as specified in file-fields.json and field-pks.json
    """
    log = logging.getLogger('targets_postgresql_csv_to_table')

    version = get_latest_version(fprefix).lstrip('F')
    if csv_path is None:
        if record_type:
            csv_name = '%s-%s-%s.%s.csv' % (fprefix, filename, record_type,
                                            version)
        else:
            csv_name = '%s-%s.%s.csv' % (fprefix, filename, version)
        csv_path = os.path.join(get_remote_csv_dir(), csv_name)

    table = table_from_fields(engine, metadata, fprefix, filename, record_type,
                              fields, pks)

    inspector = Inspector.from_engine(engine)
    # True when the table already exists and is about to be dropped and recreated
    creating = table.name in inspector.get_table_names()

    connection = engine.connect()
    trans = connection.begin()
    try:
        drop_create_table(connection, table)

        # data insert using COPY method
        force_not_null = ''
        if pks and pks != ['invalid']:
            force_not_null = ', FORCE_NOT_NULL ("%s")' % ('", "'.join(
                [p.lower() for p in pks]))
        connection.execute("""COPY "%s"
FROM '%s'
WITH (FORMAT CSV, HEADER%s);
        """ % (table.name, csv_path, force_not_null))

        trans.commit()
    except OperationalError as oe:
        trans.rollback()
        if 'No such file or directory' in str(oe):
            if creating:
                # the rollback restores the pre-existing table
                log.warning('%s not found, table kept as-is' % (csv_path))
            else:
                log.warning('%s not found, no table created' % (csv_path))
            table.name = None
        else:
            raise

    return table.name, creating
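
A worked example of the FORCE_NOT_NULL clause built above, with illustrative primary-key column names (composite key columns hold blanks rather than NULLs, per example #6, so COPY must be told not to treat empty strings as NULL):

pks = ['TOC_ID', 'FARE_ID']  # hypothetical composite key
force_not_null = ', FORCE_NOT_NULL ("%s")' % ('", "'.join([p.lower() for p in pks]))
print(force_not_null)  # -> , FORCE_NOT_NULL ("toc_id", "fare_id")
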
Example #5
def file_to_csv(fprefix, filename, file_path=None, file_fields=None):
    """
    A file can have multiple record types. Output separate
    CSV files for each one
    """
    version = get_latest_version(fprefix).lstrip('F')
    if file_path is None:
        file_path = os.path.join(get_download_dir(), fprefix, filename)
    if file_fields is None:
        file_fields = json_comment_filter(
            json.load(open('file-fields.json', 'r')))
    csv_files = {}
    csv_writers = {}
    try:
        fields = file_fields[fprefix][filename]
        for record in iterate_fields(file_path, fields):
            k = record.get('RECORD_TYPE', '')
            if k not in csv_writers:
                fsuffix = ''
                if k != '':
                    fsuffix = '-' + k
                csv_filename = fprefix + '-' + filename + fsuffix + '.' + version + '.csv'
                csv_path = os.path.join(get_csv_dir(), csv_filename)
                csv_files[csv_filename] = open(csv_path, 'w')
                fieldnames = [
                    f[0] if not isinstance(f, str) else f for f in fields[k]
                ]
                fieldnames = [
                    f for f in fieldnames
                    if f not in ['RECORD_TYPE', 'UPDATE_MARKER']
                ]
                csv_writers[k] = csv_module.DictWriter(csv_files[csv_filename],
                                                       fieldnames=fieldnames)
                csv_writers[k].writeheader()
            if k:
                del record['RECORD_TYPE']  # contained in filename
            csv_writers[k].writerow(record)
    finally:
        for f in csv_files.values():
            f.close()
    return fprefix, filename, list(csv_files.keys())
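
The field specs in file-fields.json are evidently either bare column names or tuples whose first element is the name; a small illustration of the fieldname extraction above (the field names here are made up, not taken from the real spec):

fields_k = ['RECORD_TYPE', ('TOC_ID', 2), ('TOC_NAME', 30)]  # hypothetical spec entries
fieldnames = [f[0] if not isinstance(f, str) else f for f in fields_k]
fieldnames = [f for f in fieldnames if f not in ['RECORD_TYPE', 'UPDATE_MARKER']]
print(fieldnames)  # ['TOC_ID', 'TOC_NAME']
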
Example #6
def postgresql(file_prefixes=None):
    """
    Move CSV files into corresponding postgresql tables
    using bulk postgresql COPY command.
    Table names and columns are lowercased
    for ease of working in SQL.
    Types conversion:
        Date: Applicable columns ending in '_DATE'
        Time: Applicable columns ending in '_TIME'
    The CSV files must be on the same server as the postgres
    db and readable by the postgres process.
    Composite primary keys have blanks (rather than null) in columns.
    """
    log = logging.getLogger('targets_postgresql')
    stime = t_time()
    dburi = get_dburi()
    engine = create_engine(dburi)
    # connect now to surface connection-related exceptions, e.g. if the db doesn't exist
    connection = engine.connect()
    metadata = MetaData()

    file_fields = json_comment_filter(json.load(open('file-fields.json', 'r')))
    field_pks = json_comment_filter(json.load(open('field-pks.json', 'r')))
    if not file_prefixes:
        file_prefixes = file_fields.keys()
    todo = []
    for fprefix in sorted(file_prefixes):
        csv_dir = get_remote_csv_dir()
        if not os.path.exists(csv_dir) and csv_dir != get_csv_dir():
            # The remote directory we'll COPY from isn't visible here, so we
            # can't check its version. Versions are embedded in the CSV
            # filenames, so the db server will simply fail to COPY from a
            # stale file.
            pass
        else:
            if not os.path.exists(csv_dir):
                csv([fprefix])
            else:
                with open(os.path.join(csv_dir, '.version.' + fprefix),
                          'r') as f:
                    csv_version = f.read().strip()
                if csv_version != get_latest_version(fprefix).lstrip('F'):
                    log.warning(
                        '%s: Newer version available, converting to CSV again'
                        % (fprefix))
                    csv([fprefix])

        for filename in file_fields[fprefix]:
            for record_type, fields in file_fields[fprefix][filename].items():
                pks = field_pks.get(fprefix, {}).get(filename,
                                                     {}).get(record_type, [])
                if not fields:
                    log.warning('%s: Missing spec for %s %s' %
                                (fprefix, filename, record_type))
                    continue
                if False:
                    # flip to True to load serially (no Pool), e.g. when debugging
                    table_name, creating = csv_to_table(
                        engine, metadata, fprefix, filename, record_type,
                        fields, pks)
                    if table_name and creating:
                        log.info('Finished recreating %s' % (table_name))
                    elif table_name:
                        log.info('Finished creating %s' % (table_name))
                else:
                    todo.append((fprefix, filename, record_type, fields, pks))
    if todo:
        n = 1
        with Pool() as pool:
            for table_name, creating in pool.imap_unordered(
                    csv_to_table_tup, todo):
                if table_name and creating:
                    log.info('Finished recreating %s (%d of %d)' %
                             (table_name, n, len(todo)))
                elif table_name:
                    log.info('Finished creating %s (%d of %d)' %
                             (table_name, n, len(todo)))
                n += 1

    # full_view_refresh is presumably a flag defined elsewhere in the module (not shown here)
    if full_view_refresh:
        create_views(connection)

    log.debug('csv to postgresql: %ds total time' % (t_time() - stime))
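
An end-to-end sketch tying the examples together, assuming the feed ZIPs are already in the configured download directory, local.cfg points at a reachable postgres database, and 'RJFA' is the feed of interest:

if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    unzip(['RJFA'])        # example #2: extract the downloaded ZIP
    csv(['RJFA'])          # example #3: convert the fixed-width files to CSV
    postgresql(['RJFA'])   # example #6: COPY the CSVs into postgres tables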