Example #1
def restore_view(tbldef, fromdir, todataset):
    """
    Re-create a BigQuery view in the target dataset from its saved table definition
    :param tbldef: Table definition dict
    :param fromdir: GCS input directory gs://..../dataset/tablename/
    :param todataset: BigQuery dataset name
    """

    query = tbldef['view']['query']
    view_name = tbldef['tableReference']['tableId']
    legacy_sql = tbldef['view']['useLegacySql']

    exec_shell_command([
        'bq', 'mk', '--view', query,
        # bq negates boolean flags with a 'no' prefix, so this emits
        # either --use_legacy_sql or --nouse_legacy_sql
        '--{}use_legacy_sql'.format('no' if not legacy_sql else ''),
        '{}.{}'.format(todataset, view_name)
    ])
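All of the examples call a shared helper, exec_shell_command, whose definition is not shown. A minimal sketch of what it is assumed to do, namely run the bq command and hand back its stdout as text, is below; only the name and call shape come from the examples, the body is an assumption.

import logging
import subprocess

def exec_shell_command(command):
    """Run a command given as an argument list and return its stdout as a string."""
    # Assumed behaviour: log the command, raise on a non-zero exit status
    # (check_output does this), and return the captured stdout decoded as UTF-8.
    logging.info('Running: %s', ' '.join(command))
    return subprocess.check_output(command).decode('utf-8')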
Example #2
def backup_table(dataset, tablename, todir, schemaonly):
    """
    Store schema & data in table to GCS
    :param dataset: BigQuery dataset name
    :param tablename: BigQuery table name
    :param todir: GCS output prefix
    :param schemaonly: don't export data, just the schema
    :return: None
    """

    full_table_name = '{}.{}'.format(dataset, tablename)

    # write schema to GCS
    schema = exec_shell_command(['bq', 'show', '--schema', full_table_name])
    write_json_string(schema,
                      os.path.join(todir, dataset, tablename, 'schema.json'))

    if not schemaonly:
        # back up the table definition
        tbldef = exec_shell_command(
            ['bq', '--format=json', 'show', full_table_name])
        write_json_string(
            tbldef, os.path.join(todir, dataset, tablename, 'tbldef.json'))

        tbldef = json.loads(tbldef)  # a single dict describing the table
        if tbldef['type'] == 'VIEW':
            return  # no need to extract data

        # read the data
        output_data_name = os.path.join(todir, dataset, tablename,
                                        'data_*.avro')
        _ = exec_shell_command([
            'bq',
            'extract',
            '--destination_format=AVRO',
            '--use_avro_logical_types',  # for DATE, TIME, NUMERIC
            '{}.{}'.format(dataset, tablename),
            output_data_name
        ])
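backup_table also depends on a write_json_string helper that stores a text blob at a gs:// path. A plausible sketch using the google-cloud-storage client library follows; the helper name and signature are taken from the call sites above, the implementation itself is an assumption.

from urllib.parse import urlparse
from google.cloud import storage

def write_json_string(json_string, gcs_path):
    """Write the given JSON text to the object named by a gs://bucket/object path."""
    parsed = urlparse(gcs_path)                  # netloc is the bucket, path is the object name
    bucket = storage.Client().bucket(parsed.netloc)
    blob = bucket.blob(parsed.path.lstrip('/'))
    blob.upload_from_string(json_string, content_type='application/json')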
Example #3
def restore_table(fromdir, todataset):
    """
    Restore schema & data from GCS to BigQuery table
    :param fromdir: GCS input directory gs://..../dataset/tablename/
    :param todataset: BigQuery dataset name
    """

    # start to create load command
    load_command = [
        'bq',
        'load',
        '--source_format',
        'AVRO',
        '--use_avro_logical_types',  # for DATE, TIME, NUMERIC
    ]

    # get table definition
    tbldef = read_json_string(os.path.join(fromdir, 'tbldef.json'))

    if tbldef['type'] == 'VIEW':
        restore_view(tbldef, fromdir, todataset)
        return

    if 'timePartitioning' in tbldef:
        load_command += [
            '--time_partitioning_expiration',
            tbldef['timePartitioning']['expirationMs'],
            '--time_partitioning_field',
            tbldef['timePartitioning']['field'],
            '--time_partitioning_type',
            tbldef['timePartitioning']['type'],
        ]

    if 'rangePartitioning' in tbldef:
        load_command += [
            '--range_partitioning', '{},{},{},{}'.format(
                tbldef['rangePartitioning']['field'],
                tbldef['rangePartitioning']['range']['start'],
                tbldef['rangePartitioning']['range']['end'],
                tbldef['rangePartitioning']['range']['interval'])
        ]

    if 'clustering' in tbldef:
        load_command += [
            '--clustering_fields', ','.join(tbldef['clustering']['fields'])
        ]

    # write schema to a temporary file
    schema = tbldef['schema']['fields']  # array of dicts
    fd, schema_file = tempfile.mkstemp()
    with os.fdopen(fd, 'w') as ofp:  # wrap the mkstemp fd so it is closed exactly once
        json.dump(schema, ofp, sort_keys=False, indent=2)
    load_command += [
        '--schema',
        schema_file,
    ]

    # load the data into BigQuery
    table_name = tbldef['tableReference']['tableId']
    load_command += [
        '{}.{}'.format(todataset, table_name),
        os.path.join(fromdir, 'data_*.avro')
    ]

    exec_shell_command(load_command)
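restore_table reads the saved definition back with read_json_string, which is likewise not shown. A matching sketch, under the same assumptions as the writer above (google-cloud-storage client, gs:// path), would be:

import json
from urllib.parse import urlparse
from google.cloud import storage

def read_json_string(gcs_path):
    """Download the object named by a gs://bucket/object path and parse it as JSON."""
    parsed = urlparse(gcs_path)
    blob = storage.Client().bucket(parsed.netloc).blob(parsed.path.lstrip('/'))
    return json.loads(blob.download_as_text())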
Example #4
    parser.add_argument('--schema',
                        action='store_true',
                        help='Write out only the schema, no data')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Turn off verbose logging')

    args = parser.parse_args()
    if not args.quiet:
        logging.basicConfig(level=logging.INFO)

    if '.' in args.input:
        dataset, table = args.input.split('.')
        tables = [table]
    else:
        dataset = args.input
        dataset_contents = exec_shell_command(
            ['bq', '--format=json', 'ls', '--max_results', '10000', dataset])
        dataset_contents = json.loads(dataset_contents)  # array of dicts
        tables = []
        for entry in dataset_contents:
            if entry['type'] == 'TABLE' or entry['type'] == 'VIEW':
                tables.append(entry['tableReference']['tableId'])
            else:
                logging.warning('Not backing up {} because it is a {}'.format(
                    entry['id'], entry['type']))
        print(tables)

    for table in tables:
        backup_table(dataset, table, args.output, args.schema)
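This snippet begins mid-script, so the --input and --output arguments it reads are defined earlier in the original file. A hypothetical reconstruction of that missing parser setup, inferred only from the args.input and args.output references (the help text and required flags are assumptions):

import argparse

parser = argparse.ArgumentParser(description='Back up BigQuery datasets or tables to GCS')
parser.add_argument('--input', required=True,
                    help='BigQuery dataset, or dataset.table, to back up')
parser.add_argument('--output', required=True,
                    help='GCS prefix to write to, e.g. gs://bucket/some/dir')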
Example #5
    parser.add_argument('--schema',
                        action='store_true',
                        help='Write out only the schema, no data')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Turn off verbose logging')

    args = parser.parse_args()
    if not args.quiet:
        logging.basicConfig(level=logging.INFO)

    if '.' in args.input:
        dataset, table = args.input.split('.')
        tables = [table]
    else:
        dataset = args.input
        dataset_contents = exec_shell_command(
            ['bq', '--format=json', 'ls', dataset])
        dataset_contents = json.loads(dataset_contents)  # array of dicts
        tables = []
        for entry in dataset_contents:
            if entry['type'] == 'TABLE' or entry['type'] == 'VIEW':
                tables.append(entry['tableReference']['tableId'])
            else:
                logging.warning('Not backing up {} because it is a {}'.format(
                    entry['id'], entry['type']))
        print(tables)

    for table in tables:
        backup_table(dataset, table, args.output, args.schema)