예제 #1
0
def upload_tweets():
    """Create the ``Tweets`` dataset and load ``tweets.csv`` into ``tabletweet``.

    Authenticates with the service-account key file
    ``my-beam-project-b2834963a4ae.json``, creates the ``Tweets`` dataset,
    then loads the local file ``tweets.csv`` into the table ``tabletweet``
    using a single nullable STRING column named ``Tweets``.

    The function blocks until the load job has finished, so the success
    message is only printed once the data is actually in BigQuery; a failed
    load surfaces as an exception from ``load_job.result()``.

    NOTE(review): ``create_dataset`` is called unconditionally and presumably
    fails when the dataset already exists -- confirm whether re-runs are
    expected.
    """
    big_query_client = bigquery.Client.from_service_account_json(
        'my-beam-project-b2834963a4ae.json')

    dataset_ref = big_query_client.dataset('Tweets')
    dataset = Dataset(dataset_ref)
    dataset.description = 'This represents tweets of trending topics'
    big_query_client.create_dataset(dataset)

    # BigQuery's documented mode values are upper-case.
    schema = [
        SchemaField('Tweets', 'STRING', mode='NULLABLE'),
    ]
    table_ref = dataset_ref.table('tabletweet')

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 0  # the file has no header row
    load_config.schema = schema
    load_config.allow_quoted_newlines = True  # tweets may span several lines
    load_config.ignore_unknown_values = False
    load_config.max_bad_records = 200

    with open('tweets.csv', 'rb') as readable:
        load_job = big_query_client.load_table_from_file(
            readable, table_ref, job_config=load_config)

    # The call above only uploads the file and starts the job; wait for it so
    # that errors are raised here instead of being silently dropped.
    load_job.result()
    print('tweets file uploaded to big query')
예제 #2
0
def insertCSV(stock):
    """Load ``Data/<stock>.csv`` into the BigQuery table named after *stock*.

    The client project and the destination dataset come from the names
    ``project_id`` and ``dataset_id`` defined outside this function. The
    first row of the CSV is skipped as a header, and all seven columns are
    declared as required.

    Blocks until the load job has finished; a failed load raises from
    ``load_job.result()`` instead of going unnoticed.

    :param stock: symbol used both as the CSV file stem and the table name
    """
    client = bigquery.Client(project_id)

    # BigQuery's documented mode values are upper-case.
    schema = [
        SchemaField('symbol', 'STRING', mode='REQUIRED'),
        SchemaField('date', 'DATE', mode='REQUIRED'),
        SchemaField('close', 'FLOAT', mode='REQUIRED'),
        SchemaField('high', 'FLOAT', mode='REQUIRED'),
        SchemaField('low', 'FLOAT', mode='REQUIRED'),
        SchemaField('open', 'FLOAT', mode='REQUIRED'),
        SchemaField('volume', 'INTEGER', mode='REQUIRED'),
    ]

    table_ref = client.dataset(dataset_id).table(stock)

    load_config = LoadJobConfig()
    load_config.skip_leading_rows = 1
    load_config.schema = schema

    # The raw bytes are handed straight to BigQuery, which parses the CSV
    # itself; no local csv.reader is needed.
    with open('Data/%s.csv' % stock, 'rb') as readable:
        load_job = client.load_table_from_file(readable,
                                               table_ref,
                                               job_config=load_config)

    # Wait for the server-side job so that load errors are reported.
    load_job.result()
예제 #3
0
        def load_task():
            """Load the exported files for ``task`` from GCS into the raw dataset.

            Reads the table schema from
            ``resources/stages/raw/schemas/<task>.json`` under ``dags_folder``,
            builds a truncating load job for every
            ``gs://<output_bucket>/export/<task>/*.<file_format>`` object and
            submits it through ``submit_bigquery_job``.

            ``task``, ``file_format``, ``dags_folder``, ``output_bucket``,
            ``dataset_name_raw`` and ``allow_quoted_newlines`` are taken from
            the enclosing scope.
            """
            bq_client = Client()
            load_config = LoadJobConfig()

            schema_file = os.path.join(
                dags_folder,
                'resources/stages/raw/schemas/{task}.json'.format(task=task))
            load_config.schema = read_bigquery_schema_from_file(schema_file)

            # Anything that is not CSV is treated as newline-delimited JSON;
            # only CSV exports carry a header row to skip.
            if file_format == 'csv':
                load_config.source_format = SourceFormat.CSV
                load_config.skip_leading_rows = 1
            else:
                load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            load_config.write_disposition = 'WRITE_TRUNCATE'
            load_config.allow_quoted_newlines = allow_quoted_newlines
            load_config.ignore_unknown_values = True

            export_root = 'gs://{bucket}/export'.format(bucket=output_bucket)
            source_pattern = '{export_location_uri}/{task}/*.{file_format}'.format(
                export_location_uri=export_root,
                task=task,
                file_format=file_format)

            destination = bq_client.dataset(dataset_name_raw).table(task)
            load_job = bq_client.load_table_from_uri(source_pattern,
                                                     destination,
                                                     job_config=load_config)
            submit_bigquery_job(load_job, load_config)
            assert load_job.state == 'DONE'
예제 #4
0
 def __create_load_job_config(
         self, ems_load_job_config: EmsLoadJobConfig) -> LoadJobConfig:
     """Build a BigQuery ``LoadJobConfig`` from an ``EmsLoadJobConfig``.

     Copies the labels and the number of leading rows to skip as they are,
     unwraps the create/write dispositions through their ``.value``, and
     converts the schema with ``_parse_schema_resource``.
     """
     source = ems_load_job_config

     job_config = LoadJobConfig()
     job_config.labels = source.labels
     # The dispositions are read through ``.value`` before being assigned.
     job_config.create_disposition = source.create_disposition.value
     job_config.write_disposition = source.write_disposition.value
     job_config.schema = _parse_schema_resource(source.schema)
     job_config.skip_leading_rows = source.skip_leading_rows
     return job_config
예제 #5
0
    def create_table_from_csv(self, dataset, table_name, file_path, schema):
        """Load a local CSV file into ``dataset.table_name`` and wait for it.

        The first row of the file is skipped as a header. The method blocks
        until the load job has finished, so when it returns the table has
        been loaded; a failed load raises from ``load_job.result()``.

        :param dataset: object whose ``table(name)`` yields the destination
            table reference
        :param table_name: name of the destination table
        :param file_path: path of the CSV file to upload
        :param schema: schema assigned to the load job
        :return: None
        """
        table_ref = dataset.table(table_name)

        load_config = LoadJobConfig()
        load_config.skip_leading_rows = 1
        load_config.schema = schema

        with open(file_path, 'rb') as readable:
            load_job = self.client.load_table_from_file(
                readable, table_ref, job_config=load_config)  # API request

        # The request above only starts the job; wait for completion so that
        # load errors are reported to the caller.
        load_job.result()
예제 #6
0
def DTSTableDefinition_to_BQLoadJobConfig(dts_tabledef):
    """
    Translate a BigQuery DTS table definition into a BigQuery load job config.

    Input shape:
    https://cloud.google.com/bigquery/docs/reference/data-transfer/partner/rpc/google.cloud.bigquery.datatransfer.v1#tabledefinition

    Output object:
    https://googlecloudplatform.github.io/google-cloud-python/latest/bigquery/reference.html#google.cloud.bigquery.job.LoadJob

    Only ``schema`` is mandatory; ``format``, ``max_bad_records``,
    ``encoding`` and ``csv_options`` are applied when present.

    :param dts_tabledef: mapping describing the DTS table definition
    :return: the populated ``LoadJobConfig``
    """
    from bq_dts import rest_client

    load_config = LoadJobConfig()
    load_config.schema = RPCRecordSchema_to_GCloudSchema(dts_tabledef['schema'])

    # DTS offers no knobs for these two, so they are fixed here.
    load_config.create_disposition = bigquery.CreateDisposition.CREATE_IF_NEEDED
    load_config.write_disposition = bigquery.WriteDisposition.WRITE_TRUNCATE

    if 'format' in dts_tabledef:
        bq_format = rest_client.BQ_DTS_FORMAT_TO_BQ_SOURCE_FORMAT_MAP[
            dts_tabledef['format']]
        assert bq_format is not None
        load_config.source_format = bq_format

    if 'max_bad_records' in dts_tabledef:
        load_config.max_bad_records = dts_tabledef['max_bad_records']

    if 'encoding' in dts_tabledef:
        load_config.encoding = rest_client.BQ_DTS_ENCODING_TO_BQ_ENCODING_MAP[
            dts_tabledef['encoding']]

    if 'csv_options' in dts_tabledef:
        csv_opts = dts_tabledef['csv_options']
        # (DTS csv option key, LoadJobConfig attribute); only ``quote_char``
        # goes by a different name on the BigQuery side.
        option_to_attribute = (
            ('field_delimiter', 'field_delimiter'),
            ('allow_quoted_newlines', 'allow_quoted_newlines'),
            ('quote_char', 'quote_character'),
            ('skip_leading_rows', 'skip_leading_rows'),
        )
        for option, attribute in option_to_attribute:
            if option in csv_opts:
                setattr(load_config, attribute, csv_opts[option])

    return load_config
예제 #7
0
def load_stage(dst_dataset: Dataset, bq_client: Client, bucket_name: str,
               gcs_client: storage.Client) -> List[LoadJob]:
    """
    Load the vocabulary files found in a bucket into tables of a dataset

    Every table listed in VOCABULARY_TABLES must have a file in the bucket;
    otherwise nothing is loaded and an error is raised. Files that do not map
    to a vocabulary table are skipped. Jobs run one after another: each load
    is awaited before the next one is started.

    :param dst_dataset: reference to destination dataset object
    :param bq_client: a BigQuery client object
    :param bucket_name: the location in GCS containing the vocabulary files
    :param gcs_client: a Cloud Storage client object
    :return: list of completed load jobs
    :raises RuntimeError: if the bucket lacks a file for any vocabulary table
    """
    blobs = list(gcs_client.list_blobs(bucket_name))

    # Check completeness up front so a partial vocabulary is never loaded.
    staged_tables = [_filename_to_table_name(blob.name) for blob in blobs]
    missing_blobs = [
        required for required in VOCABULARY_TABLES
        if required not in staged_tables
    ]
    if missing_blobs:
        raise RuntimeError(
            f'Bucket {bucket_name} is missing files for tables {missing_blobs}'
        )

    load_jobs = []
    for blob in blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in VOCABULARY_TABLES:
            # not a vocabulary file; leave it alone
            continue

        destination = dst_dataset.table(table_name)

        csv_config = LoadJobConfig()
        csv_config.schema = safe_schema_for(table_name)
        csv_config.source_format = 'CSV'
        csv_config.skip_leading_rows = 1
        csv_config.field_delimiter = FIELD_DELIMITER
        csv_config.quote_character = ''  # empty string: no quote character
        csv_config.max_bad_records = MAX_BAD_RECORDS

        load_job = bq_client.load_table_from_uri(
            f'gs://{bucket_name}/{blob.name}',
            destination,
            job_config=csv_config)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        # Block until this table is loaded before moving on to the next file.
        load_job.result()
    return load_jobs
예제 #8
0
    def create_table(self, path, table_from='uri'):
        """Load the table described by the definition at *path* and wait for it.

        The definition is read through ``BQParser``, which supplies the
        dataset name, table name, schema, number of header rows to skip and
        the ``inputPath`` property used as the load source.

        The method blocks until the load job has finished, so when it returns
        the table has been loaded; a failed load raises from
        ``load_job.result()``.

        :param path: location of the table definition handed to ``BQParser``
        :param table_from: kind of source to load from; only ``'uri'`` is
            supported
        :raises ValueError: if ``table_from`` is anything other than ``'uri'``
        """
        # Reject unsupported sources before any parsing or API work is done.
        if table_from != 'uri':
            raise ValueError('Not supported')

        bp = BQParser(path)
        table_ref = self.client.dataset(bp.dataset_name).table(bp.table_name)

        load_config = LoadJobConfig()
        load_config.skip_leading_rows = bp.skip_leading_rows
        load_config.schema = bp.schema

        load_job = self.client.load_table_from_uri(
            source_uris=bp.properties.get('inputPath'),
            destination=table_ref,
            job_config=load_config)

        # Starting the job is not enough: wait for it so that load errors are
        # raised here rather than silently lost.
        load_job.result()
def load_folder(dst_dataset: str, bq_client: BQClient, bucket_name: str,
                prefix: str, gcs_client: GCSClient,
                hpo_id: str) -> List[LoadJob]:
    """
    Load the required files under a bucket prefix into site-specific tables

    For every blob under ``prefix`` whose table name is in AOU_REQUIRED, a
    table named ``<hpo_id>_<table_name>`` is created in ``dst_dataset`` and
    the blob is loaded into it as CSV with the header row skipped. Other
    blobs are skipped. Each load is awaited before the next one is started.

    :param dst_dataset: Identifies the destination dataset
    :param bq_client: a BigQuery client object
    :param bucket_name: the bucket in GCS containing the archive files
    :param prefix: prefix of the filepath URI
    :param gcs_client: a Cloud Storage client object
    :param hpo_id: Identifies the HPO site
    :return: list of completed load jobs
    """
    archive_blobs = list(gcs_client.list_blobs(bucket_name, prefix=prefix))

    load_jobs = []
    for blob in archive_blobs:
        table_name = _filename_to_table_name(blob.name)
        if table_name not in AOU_REQUIRED:
            LOGGER.debug(f'Skipping file for {table_name}')
            continue

        table_schema = get_table_schema(table_name)

        # The destination table is created explicitly before loading into it.
        hpo_table_name = f'{hpo_id}_{table_name}'
        fq_hpo_table = f'{bq_client.project}.{dst_dataset}.{hpo_table_name}'
        destination = bq_client.create_table(
            Table(fq_hpo_table, schema=table_schema))

        csv_config = LoadJobConfig()
        csv_config.source_format = 'CSV'
        csv_config.skip_leading_rows = 1
        csv_config.schema = table_schema

        # Job ids are prefixed with this module's file name (without extension).
        job_id_prefix = f"{__file__.split('/')[-1].split('.')[0]}_"
        load_job = bq_client.load_table_from_uri(
            f'gs://{bucket_name}/{blob.name}',
            destination,
            job_config=csv_config,
            job_id_prefix=job_id_prefix)
        LOGGER.info(f'table:{destination} job_id:{load_job.job_id}')
        load_jobs.append(load_job)
        # Wait for this load to finish before starting the next one.
        load_job.result()
    return load_jobs
def add_load_job_csv_config(unhandled_hints: Set[str],
                            hints: ValidatedRecordsHints,
                            fail_if_cant_handle_hint: bool,
                            config: bigquery.LoadJobConfig) -> None:
    """Translate records hints into CSV settings on a BigQuery load job config.

    Each hint is examined in turn. Where BigQuery has a matching option it is
    set on ``config``; where the hint value cannot be expressed,
    ``cant_handle_hint`` is called. Every examined hint name is then passed
    to ``quiet_remove`` together with ``unhandled_hints``.

    :param unhandled_hints: names of hints not yet accounted for; presumably
        pruned in place by ``quiet_remove`` -- confirm against that helper
    :param hints: the validated records hints describing the CSV dialect
    :param fail_if_cant_handle_hint: forwarded to ``cant_handle_hint``;
        presumably selects between failing and merely reporting -- confirm
    :param config: the load job configuration to populate (mutated in place)
    """
    # source_format: File format of the data.
    config.source_format = 'CSV'

    # encoding: The character encoding of the data. BigQuery supports UTF-8
    # or ISO-8859-1; records hints currently don't support ISO-8859-1, so
    # anything other than UTF8 cannot be handled.
    if hints.encoding != 'UTF8':
        cant_handle_hint(fail_if_cant_handle_hint, 'encoding', hints)
    else:
        config.encoding = 'UTF-8'
    quiet_remove(unhandled_hints, 'encoding')

    # field_delimiter: The separator for fields in a CSV file.
    assert isinstance(hints.field_delimiter, str)
    config.field_delimiter = hints.field_delimiter
    quiet_remove(unhandled_hints, 'field-delimiter')

    # allow_jagged_rows: Allow missing trailing optional columns (CSV only).
    # Not set here.

    # null_marker: Represents a null value (CSV only).
    #
    # (The documentation is mangled for this one, but the default is assumed
    # to be '' or something sensible, so it is left alone.)

    # quote_character: Character used to quote data sections (CSV only).
    #
    # From the BigQuery documentation:
    #
    #   [Optional] The value that is used to quote data sections in a CSV
    #   file. BigQuery converts the string to ISO-8859-1 encoding, and then
    #   uses the first byte of the encoded string to split the data in its
    #   raw, binary state. The default value is a double-quote ('"'). If your
    #   data does not contain quoted sections, set the property value to an
    #   empty string. If your data contains quoted newline characters, you
    #   must also set the allowQuotedNewlines property to true.
    #
    #   @default "
    #
    # Experimentally, with quote_character left at its default:
    #
    # * Fields quoted with "" are loaded without the surrounding quotes in
    #   the string
    # * "" becomes " in a quoted field
    # * "" stays "" in a non-quoted field
    # * nonnumeric quoting works fine
    # * full quoting works fine
    if hints.quoting is None:
        config.quote_character = ''
    elif hints.quoting == 'all' or hints.quoting == 'minimal' or hints.quoting == 'nonnumeric':
        # allow_quoted_newlines: Allow quoted data containing newline
        # characters (CSV only).
        config.allow_quoted_newlines = True

        assert isinstance(hints.quotechar, str)
        config.quote_character = hints.quotechar
        # Doubling the quote character is the only handling accepted here.
        if not hints.doublequote:
            cant_handle_hint(fail_if_cant_handle_hint, 'doublequote', hints)
    else:
        _assert_never(hints.quoting)
    for quoting_hint in ('quoting', 'quotechar', 'doublequote'):
        quiet_remove(unhandled_hints, quoting_hint)

    # No mention of escaping in BigQuery documentation, and in practice
    # backslashes come through without being interpreted.
    if hints.escape is not None:
        cant_handle_hint(fail_if_cant_handle_hint, 'escape', hints)
    quiet_remove(unhandled_hints, 'escape')

    # skip_leading_rows: Number of rows to skip when reading data (CSV only).
    config.skip_leading_rows = 1 if hints.header_row else 0
    quiet_remove(unhandled_hints, 'header-row')

    # "When you load CSV or JSON data, values in DATE columns must
    #  use the dash (-) separator and the date must be in the
    #  following format: YYYY-MM-DD (year-month-day)."
    if hints.dateformat != 'YYYY-MM-DD':
        cant_handle_hint(fail_if_cant_handle_hint, 'dateformat', hints)
    quiet_remove(unhandled_hints, 'dateformat')

    # "When you load JSON or CSV data, values in TIMESTAMP columns
    #  must use a dash (-) separator for the date portion of the
    #  timestamp, and the date must be in the following format:
    #  YYYY-MM-DD (year-month-day). The hh:mm:ss
    #  (hour-minute-second) portion of the timestamp must use a
    #  colon (:) separator."
    #
    # To test, log into the BigQuery web console and try SQL like this
    # (the assumption is that the same timestamp parser is used during
    # CSV loads):
    #
    #      select TIMESTAMP("2000-01-02 16:34:56.789012US/Eastern") as a;
    #
    # Tests performed and result displayed on console query:
    #
    # DATE:
    # * 01-02-2019 (rejected):
    # * 01/02/19 (rejected):
    # * 2019-01-01 (accepted): 2019-01-01
    # DATETIME:
    # * 2019-01-01 1:00pm (rejected):
    # * 2019-01-01 1:00:00pm (rejected)
    # * 2019-01-01 1:00PM (rejected):
    # * 2019-01-01 13:00 (rejected):
    # * 2019-01-01 13:00:00 (accepted): 2019-01-01T13:00:00
    # * 2019-01-01 1:00pm US/Eastern (rejected):
    # * 2019-01-01 1:00:00pm US/Eastern (rejected):
    # * 2019-01-01 13:00:00 US/Eastern (rejected):
    # * 2019-01-01 13:00:00 EST (rejected):
    # * 1997-12-17 07:37:16-08 (rejected)
    # * 2019-01-01T13:00:00 (accepted): 2019-01-01T13:00:00
    #
    # TIME:
    # * 1:00pm (rejected):
    # * 1:00:00pm (rejected):
    # * 13:00 (rejected):
    # * 13:00:00 (accepted): 13:00:00
    # * 1:00pm US/Eastern (rejected):
    # * 1:00pm EST (rejected):
    # * 07:37:16-08 (rejected):
    #
    # TIMESTAMP ("Required format is YYYY-MM-DD
    # HH:MM[:SS[.SSSSSS]]", which is BS, as it doesn't specify the
    # timezone format):
    #
    # * 2019-01-01 1:00pm (rejected):
    # * 2019-01-01 1:00:00pm (rejected)
    # * 2019-01-01 1:00PM (rejected):
    # * 2019-01-01 13:00 (rejected):
    # * 2019-01-01 13:00:00 (accepted): 2019-01-01T13:00:00
    # * 2019-01-01 1:00pm US/Eastern (rejected):
    # * 2019-01-01 1:00:00pm US/Eastern (rejected):
    # * 2019-01-01 13:00:00 US/Eastern (rejected):
    # * 2019-01-01 13:00:00 EST (rejected):
    # * 1997-12-17 07:37:16-08 (accepted): 1997-12-17 15:37:16 UTC
    # * 2019-01-01T13:00:00-08 (accepted): 2019-01-01 21:00:00 UTC
    # * 2000-01-02 16:34:56.789012+0000 (rejected)
    # * 2000-01-02 16:34:56.789012+00:00 (accepted)
    # * 2000-01-02 16:34:56.789012EST (rejected)
    # * 2000-01-02 16:34:56.789012US/Eastern (rejected)
    # * 2000-01-02 16:34:56.789012UTC (accepted): 2000-01-02 16:34:56.789012 UTC
    # * 2000-01-02 16:34:56.789012 UTC (accepted): 2000-01-02 16:34:56.789012 UTC
    #
    # https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#examples
    #
    # https://stackoverflow.com/questions/47466296/bigquery-datetime-format-csv-to-bigquery-yyyy-mm-dd-hhmmss-ssssss
    #
    # BigQuery supports exactly one format for ingesting timestamps
    # with timezones (what they call 'TIMESTAMP'; they call timestamps
    # without timezones 'DATETIME').
    #
    # The format they accept is ISO 8601, which sounds all nice and
    # standardy. Usable timestamps look like 2000-01-02
    # 16:34:56.789012+00:00.
    # Cool cool. The only issue is that Python's strftime doesn't
    # actually provide a way to add the ':' in the timezone
    # offset. The only timezone offset code, %z, does not provide the
    # colon. Other implementations (GNU libc) offer the %:z option,
    # but that doesn't exist in Python and thus in Pandas.
    #
    # So if you're using Python to export timestamps with timezones,
    # you should probably use the `YYYY-MM-DD HH24:MI:SS` format and
    # express them in UTC.
    #
    # https://stackoverflow.com/questions/44836581/does-python-time-strftime-process-timezone-options-correctly-for-rfc-3339
    # https://stackoverflow.com/questions/28729212/pandas-save-date-in-iso-format
    #
    if hints.datetimeformat not in ['YYYY-MM-DD HH24:MI:SS', 'YYYY-MM-DD HH:MI:SS']:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformat', hints)
    quiet_remove(unhandled_hints, 'datetimeformat')

    if hints.datetimeformattz not in ['YYYY-MM-DD HH:MI:SSOF',
                                      'YYYY-MM-DD HH24:MI:SSOF',
                                      'YYYY-MM-DD HH:MI:SS']:
        cant_handle_hint(fail_if_cant_handle_hint, 'datetimeformattz', hints)
    quiet_remove(unhandled_hints, 'datetimeformattz')

    if hints.timeonlyformat not in ['HH24:MI:SS', 'HH:MI:SS']:
        cant_handle_hint(fail_if_cant_handle_hint, 'timeonlyformat', hints)
    quiet_remove(unhandled_hints, 'timeonlyformat')

    # No options to change this. Tested with unix newlines, dos
    # newlines and mac newlines and all were understood.
    if hints.record_terminator not in ['\n', '\r\n', '\r', None]:
        cant_handle_hint(fail_if_cant_handle_hint, 'record-terminator', hints)
    quiet_remove(unhandled_hints, 'record-terminator')

    # No way to flag compression, but tested uncompressed and with
    # gzip, and both work great. .bz2 gives "400 Unsupported
    # compression type". Not sure about .lzo, but pandas can't
    # handle it regardless, so doubt it's handled.
    if hints.compression is not None and hints.compression != 'GZIP':
        cant_handle_hint(fail_if_cant_handle_hint, 'compression', hints)
    quiet_remove(unhandled_hints, 'compression')
예제 #11
0
def create_bq_schema():
    """Build the BigQuery schema, one field per entry of ``bus_res_keys``.

    A field is named after the text following the last ``:`` of its key.
    Keys listed in ``date_cols`` become TIMESTAMP fields; all others are
    STRING fields.

    :return: list of ``SchemaField`` objects, in ``bus_res_keys`` order
    """
    return [
        SchemaField(
            key.split(':')[-1],
            "TIMESTAMP" if key in date_cols else "STRING",
        )
        for key in bus_res_keys
    ]


if __name__ == '__main__':
    # Script entry point: load every object in BUCKET into the bus.bus table,
    # one load job at a time.
    bq_client = bigquery.Client(project=PROJECT_ID)
    destination = bq_client.dataset('bus').table('bus')

    # A single job configuration is shared by all loads.
    job_config = LoadJobConfig()
    job_config.skip_leading_rows = 1
    job_config.schema = create_bq_schema()

    source_bucket = storage.Client(project=PROJECT_ID).bucket(BUCKET)
    for blob in source_bucket.list_blobs():
        print("Loading {}".format(blob.name))
        load_job = client_job = bq_client.load_table_from_uri(
            "gs://{bucket}/{filename}".format(bucket=BUCKET,
                                              filename=blob.name),
            destination,
            job_config=job_config)
        # Wait for this file to finish loading before starting the next one.
        load_job.result()