def test_flat_schema(self):
    """Check that a flat Singer schema maps to the expected column types.

    The SCHEMA message is parsed with ``singer.parse_message`` and converted
    with ``build_schema``; every property of the message must show up in the
    result with the expected (upper-cased) ``field_type``.
    """
    schema_message = (
        '{ "type": "SCHEMA", "stream": "simple_stream", "schema": { "properties": { '
        '"id": { "type": [ "null", "string" ] }, '
        '"name": { "type": [ "null", "string" ] }, '
        '"value": { "type": [ "null", "integer" ] }, '
        '"ratio": { "type": [ "null", "number" ] }, '
        '"timestamp": { "type": "string", "format": "date-time" }, '
        '"date": { "type": "string", "format": "date" } }, '
        '"type": [ "null", "object" ] }, '
        '"key_properties": [ "id" ], "bookmark_properties": [ "date" ] }'
    )
    expected_types = {
        "id": "STRING",
        "name": "STRING",
        "value": "INTEGER",
        "ratio": "FLOAT",
        "timestamp": "TIMESTAMP",
        "date": "DATE",
    }

    msg = singer.parse_message(schema_message)
    schema = build_schema(msg.schema,
                          key_properties=msg.key_properties,
                          add_metadata=True)

    # Index by name so that a missing column fails the test instead of being
    # silently skipped (add_metadata=True may add further columns; those are
    # deliberately not checked here).
    actual_types = {field.name: field.field_type.upper() for field in schema}
    for name, expected in expected_types.items():
        with self.subTest(field=name):
            self.assertIn(name, actual_types)
            self.assertEqual(actual_types[name], expected)
def _load_to_bq(self, client, dataset, table_name, table_schema, table_config,
                key_props, metadata_columns, truncate, rows):
    """Load newline-delimited JSON rows into a BigQuery table via a load job.

    Args:
        client: BigQuery client used to start the load job.
        dataset: Dataset object; ``dataset.table(table_name)`` is the target.
        table_name: Name of the destination table.
        table_schema: Stream schema handed to ``build_schema``.
        table_config: Per-table settings; the keys ``partition_field``,
            ``cluster_fields`` and ``force_fields`` are read.
        key_props: Key properties forwarded to ``build_schema``.
        metadata_columns: Forwarded to ``build_schema`` as ``add_metadata``.
        truncate: When truthy the table is overwritten (WRITE_TRUNCATE),
            otherwise rows are appended (WRITE_APPEND).
        rows: File-like object holding the rows; it is rewound before upload.

    Returns:
        The value returned by ``load_job.result()``.

    Raises:
        google_exceptions.BadRequest: Re-raised after logging when the load
            fails; when the job reports errors, ``message`` is replaced by a
            summary of the reason and the job's error messages.
    """
    logger = self.logger
    partition_field = table_config.get("partition_field", None)
    cluster_fields = table_config.get("cluster_fields", None)
    force_fields = table_config.get("force_fields", {})

    schema = build_schema(table_schema,
                          key_properties=key_props,
                          add_metadata=metadata_columns,
                          force_fields=force_fields)

    load_config = LoadJobConfig()
    load_config.ignore_unknown_values = True
    load_config.schema = schema
    if partition_field:
        load_config.time_partitioning = bigquery.table.TimePartitioning(
            type_=bigquery.table.TimePartitioningType.DAY,
            field=partition_field)
    if cluster_fields:
        load_config.clustering_fields = cluster_fields
    load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

    if truncate:
        logger.info(f"Load {table_name} by FULL_TABLE (truncate)")
        load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE
    else:
        logger.info(f"Appending to {table_name}")
        load_config.write_disposition = WriteDisposition.WRITE_APPEND

    logger.info("loading {} to BigQuery".format(table_name))

    # Stays None when load_table_from_file itself raises, so the handler
    # below can tell "job never started" from "job failed".
    load_job = None
    try:
        load_job = client.load_table_from_file(rows,
                                               dataset.table(table_name),
                                               job_config=load_config,
                                               rewind=True)
        logger.info("loading job {}".format(load_job.job_id))
        job = load_job.result()
        logger.info(job._properties)
        return job
    except google_exceptions.BadRequest as err:
        logger.error("failed to load table {} from file: {}".format(
            table_name, str(err)))
        if load_job and load_job.errors:
            # Extract the reason defensively: a failure here would replace
            # the BadRequest we are trying to report.
            try:
                reason = err.errors[0]["reason"]
            except (IndexError, KeyError, TypeError):
                reason = "unknown"
            messages = [
                str(job_error.get("message", job_error))
                for job_error in load_job.errors
            ]
            logger.error("reason: {reason}, errors:\n{e}".format(
                reason=reason, e="\n".join(messages)))
            err.message = f"reason: {reason}, errors: {';'.join(messages)}"
        raise
def test_nested_schema_v3(self):
    """Build a schema for the nested ``orders`` stream and check scalar columns.

    The fixture mixes plain objects, ``anyOf`` array-or-null properties and
    ``multipleOf`` numbers.  Besides making sure ``build_schema`` does not
    raise, a few top-level scalar columns are checked so the test cannot
    pass on an empty result.

    NOTE(review): ``key_properties`` names ``"Id"`` while the property is
    spelled ``"id"`` -- kept as in the original fixture; confirm it is
    intentional.
    """
    schema_message = (
        '{ "type":"SCHEMA", "stream":"orders", "schema": { "properties": { '
        '"address_id": { "type": [ "null", "string" ] }, '
        '"address_is_active": { "type": [ "null", "boolean" ] }, '
        '"billing_address": { "properties": { '
        '"address1": { "type": [ "null", "string" ] }, '
        '"address2": { "type": [ "null", "string" ] }, '
        '"city": { "type": [ "null", "string" ] }, '
        '"company": { "type": [ "null", "string" ] }, '
        '"country": { "type": [ "null", "string" ] }, '
        '"first_name": { "type": [ "null", "string" ] }, '
        '"last_name": { "type": [ "null", "string" ] }, '
        '"phone": { "type": [ "null", "string" ] }, '
        '"province": { "type": [ "null", "string" ] }, '
        '"zip": { "type": [ "null", "string" ] } }, '
        '"type": [ "null", "object" ], "additionalProperties": false }, '
        '"charge_id": { "type": [ "null", "string" ] }, '
        '"charge_status": { "type": [ "null", "string" ] }, '
        '"created_at": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"customer_id": { "type": [ "null", "string" ] }, '
        '"discount_codes": { "anyOf": [ { "type": "array", "items": { "type": "object", "additionalProperties": false, "properties": { '
        '"amount": { "type": [ "null", "number" ] }, '
        '"code": { "type": [ "null", "string" ] }, '
        '"type": { "type": [ "null", "string" ] } } } }, { "type": "null" } ] }, '
        '"email": { "type": [ "null", "string" ] }, '
        '"first_name": { "type": [ "null", "string" ] }, '
        '"hash": { "type": [ "null", "string" ] }, '
        '"id": { "type": [ "null", "string" ] }, '
        '"is_prepaid": { "type": [ "null", "boolean" ] }, '
        '"last_name": { "type": [ "null", "string" ] }, '
        '"line_items": { "anyOf": [ { "type": "array", "items": { "type": "object", "additionalProperties": false, "properties": { '
        '"grams": { "type": [ "null", "integer" ] }, '
        '"images": { "type": [ "null", "object" ], "additionalProperties": false, "properties": { '
        '"large": { "type": [ "null", "string" ] }, '
        '"medium": { "type": [ "null", "string" ] }, '
        '"original": { "type": [ "null", "string" ] }, '
        '"small": { "type": [ "null", "string" ] } } }, '
        '"price": { "type": [ "null", "number" ], "multipleOf": 1e-08 }, '
        '"properties": { "anyOf": [ { "type": "array", "items": { "type": "object", "additionalProperties": false, "properties": { '
        '"name": { "type": [ "null", "string" ] }, '
        '"value": { "type": [ "null", "string" ] } } } }, { "type": "null" } ] }, '
        '"quantity": { "type": [ "null", "integer" ] }, '
        '"shopify_product_id": { "type": [ "null", "string" ] }, '
        '"shopify_variant_id": { "type": [ "null", "string" ] }, '
        '"sku": { "type": [ "null", "string" ] }, '
        '"subscription_id": { "type": [ "null", "string" ] }, '
        '"title": { "type": [ "null", "string" ] }, '
        '"variant_title": { "type": [ "null", "string" ] }, '
        '"vendor": { "type": [ "null", "string" ] } } } }, { "type": "null" } ] }, '
        '"note": { "type": [ "null", "string" ] }, '
        '"note_attributes": { "anyOf": [ { "type": "array", "items": { "type": "object", "additionalProperties": false, "properties": { '
        '"name": { "type": [ "null", "string" ] }, '
        '"value": { "type": [ "null", "string" ] } } } }, { "type": "null" } ] }, '
        '"payment_processor": { "type": [ "null", "string" ] }, '
        '"processed_at": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"scheduled_at": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"shipped_date": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"shipping_address": { "properties": { '
        '"address1": { "type": [ "null", "string" ] }, '
        '"address2": { "type": [ "null", "string" ] }, '
        '"city": { "type": [ "null", "string" ] }, '
        '"company": { "type": [ "null", "string" ] }, '
        '"country": { "type": [ "null", "string" ] }, '
        '"first_name": { "type": [ "null", "string" ] }, '
        '"last_name": { "type": [ "null", "string" ] }, '
        '"phone": { "type": [ "null", "string" ] }, '
        '"province": { "type": [ "null", "string" ] }, '
        '"zip": { "type": [ "null", "string" ] } }, '
        '"type": [ "null", "object" ], "additionalProperties": false }, '
        '"shipping_date": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"shipping_lines": { "anyOf": [ { "type": "array", "items": { "type": "object", "additionalProperties": false, "properties": { '
        '"code": { "type": [ "null", "string" ] }, '
        '"price": { "type": [ "null", "number" ] }, '
        '"title": { "type": [ "null", "string" ] } } } }, { "type": "null" } ] }, '
        '"shopify_cart_token": { "type": [ "null", "string" ] }, '
        '"shopify_customer_id": { "type": [ "null", "string" ] }, '
        '"shopify_id": { "type": [ "null", "string" ] }, '
        '"shopify_order_id": { "type": [ "null", "string" ] }, '
        '"shopify_order_number": { "type": [ "null", "string" ] }, '
        '"status": { "type": [ "null", "string" ] }, '
        '"subtotal_price": { "type": [ "null", "number" ] }, '
        '"tags": { "type": [ "null", "string" ] }, '
        '"tax_lines": { "anyOf": [ { "type": "array", "items": { "type": "object", "additionalProperties": false, "properties": { '
        '"code": { "type": [ "null", "string" ] }, '
        '"price": { "type": [ "null", "number" ] }, '
        '"title": { "type": [ "null", "string" ] } } } }, { "type": "null" } ] }, '
        '"total_discounts": { "multipleOf": 1e-08, "type": [ "null", "number" ] }, '
        '"total_line_items_price": { "multipleOf": 1e-08, "type": [ "null", "number" ] }, '
        '"total_price": { "type": [ "null", "number" ] }, '
        '"total_refunds": { "multipleOf": 1e-08, "type": [ "null", "number" ] }, '
        '"total_tax": { "multipleOf": 1e-08, "type": [ "null", "number" ] }, '
        '"total_weight": { "type": [ "null", "integer" ] }, '
        '"transaction_id": { "type": [ "null", "string" ] }, '
        '"type": { "type": [ "null", "string" ] }, '
        '"updated_at": { "format": "date-time", "type": [ "null", "string" ] } }, '
        '"type": "object", "additionalProperties": false }, "key_properties":[ "Id" ] }'
    )
    # Only scalar top-level columns are pinned; the expected names follow the
    # type mappings exercised by the other schema tests in this file.
    expected_types = {
        "id": "STRING",
        "created_at": "TIMESTAMP",
        "total_weight": "INTEGER",
    }

    msg = singer.parse_message(schema_message)
    schema = build_schema(msg.schema,
                          key_properties=msg.key_properties,
                          add_metadata=True)

    actual_types = {field.name: field.field_type.upper() for field in schema}
    for name, expected in expected_types.items():
        with self.subTest(field=name):
            self.assertIn(name, actual_types)
            self.assertEqual(actual_types[name], expected)
def test_nested_schema_v2(self):
    """Build a schema for the ``campaigns`` stream and check scalar columns.

    The fixture contains ``anyOf`` unions of objects (``BiddingScheme``) and
    arrays whose items are ``anyOf`` unions (``Settings.Setting``).  Besides
    making sure ``build_schema`` does not raise, a few top-level scalar
    columns are checked so the test cannot pass on an empty result.
    """
    schema_message = (
        '{"type": "SCHEMA", "stream": "campaigns", "schema": {"type": ["null", "object"], "additionalProperties": false, "properties": {'
        '"AudienceAdsBidAdjustment": {"type": ["null", "integer"]}, '
        '"BiddingScheme": {"anyOf": ['
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "InheritedBidStrategyType": {"type": ["null", "string"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}, "TargetCpa": {"type": ["null", "number"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "TargetRoas": {"type": ["null", "number"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}, "TargetRoas": {"type": ["null", "number"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "MaxCpc": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Amount": {"type": ["null", "number"]}}}, "TargetAdPosition": {"type": ["null", "string"]}, "TargetImpressionShare": {"type": ["null", "number"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}]}, '
        '"BudgetType": {"type": ["null", "string"]}, '
        '"DailyBudget": {"type": ["null", "number"]}, '
        '"ExperimentId": {"type": ["null", "integer"]}, '
        '"FinalUrlSuffix": {"type": ["null", "string"]}, '
        '"ForwardCompatibilityMap": {"type": ["null", "object"], "properties": {"KeyValuePairOfstringstring": {"type": ["null", "array"], "items": {"type": ["null", "object"], "additionalProperties": false, "properties": {"key": {"type": ["null", "string"]}, "value": {"type": ["null", "string"]}}}}}}, '
        '"Id": {"type": ["null", "integer"]}, '
        '"Name": {"type": ["null", "string"]}, '
        '"Status": {"type": ["null", "string"]}, '
        '"SubType": {"type": ["null", "string"]}, '
        '"TimeZone": {"type": ["null", "string"]}, '
        '"TrackingUrlTemplate": {"type": ["null", "string"]}, '
        '"UrlCustomParameters": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Parameters": {"type": ["null", "object"], "properties": {"CustomParameter": {"type": ["null", "array"], "items": {"type": ["null", "object"], "additionalProperties": false, "properties": {"Key": {"type": ["null", "string"]}, "Value": {"type": ["null", "string"]}}}}}}}}, '
        '"CampaignType": {"type": ["null", "string"]}, '
        '"Settings": {"type": ["null", "object"], "properties": {"Setting": {"type": ["null", "array"], "items": {"anyOf": ['
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "Details": {"type": ["null", "object"], "properties": {"TargetS' 'ettingDetail": {"type": ["null", "array"], "items": {"type": ["null", "object"], "additionalProperties": false, "properties": {"CriterionTypeGroup": {"type": ["null", "string"]}, "TargetAndBid": {"type": ["boolean"]}}}}}}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "LocalInventoryAdsEnabled": {"type": ["null", "boolean"]}, "Priority": {"type": ["null", "integer"]}, "SalesCountryCode": {"type": ["null", "string"]}, "StoreId": {"type": ["null", "integer"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "BidBoostValue": {"type": ["null", "number"]}, "BidMaxValue": {"type": ["null", "number"]}, "BidOption": {"type": ["null", "string"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}, "DomainName": {"type": ["null", "string"]}, "Language": {"type": ["null", "string"]}, "PageFeedIds": {"type": ["null", "object"], "properties": {"long": {"type": ["null", "array"], "items": {"type": "integer"}}}}, "Source": {"type": ["null", "string"]}}}, '
        '{"type": ["null", "object"], "additionalProperties": false, "properties": {"Type": {"type": ["null", "string"]}}}]}}}}, '
        '"BudgetId": {"type": ["null", "integer"]}, '
        '"Languages": {"type": ["null", "object"], "properties": {"string": {"type": ["null", "array"], "items": {"type": "string"}}}}, '
        '"AdScheduleUseSearcherTimeZone": {"type": ["null", "boolean"]}}}, '
        '"key_properties": ["Id"]}'
    )
    # Only scalar top-level columns are pinned; the expected names follow the
    # type mappings exercised by the other schema tests in this file.
    expected_types = {
        "Id": "INTEGER",
        "Name": "STRING",
        "DailyBudget": "FLOAT",
    }

    msg = singer.parse_message(schema_message)
    schema = build_schema(msg.schema,
                          key_properties=msg.key_properties,
                          add_metadata=True)

    actual_types = {field.name: field.field_type.upper() for field in schema}
    for name, expected in expected_types.items():
        with self.subTest(field=name):
            self.assertIn(name, actual_types)
            self.assertEqual(actual_types[name], expected)
def test_nested_schema(self):
    """Check that date-time columns of a nested schema become TIMESTAMP.

    The ``nested_stream`` fixture carries many repeated-record properties
    (arrays of objects).  After ``build_schema`` both ``date_start`` and
    ``date_stop`` must be present and typed ``TIMESTAMP``.
    """
    # Array-of-object fragment shared by the per-action breakdown properties;
    # spliced in below so the JSON text stays identical to the long form.
    action_stats = (
        '{ "items": { "properties": { '
        '"1d_click": { "type": [ "null", "number" ] }, '
        '"1d_view": { "type": [ "null", "number" ] }, '
        '"28d_click": { "type": [ "null", "number" ] }, '
        '"28d_view": { "type": [ "null", "number" ] }, '
        '"7d_click": { "type": [ "null", "number" ] }, '
        '"7d_view": { "type": [ "null", "number" ] }, '
        '"action_destination": { "type": [ "null", "string" ] }, '
        '"action_target_id": { "type": [ "null", "string" ] }, '
        '"action_type": { "type": [ "null", "string" ] }, '
        '"value": { "type": [ "null", "number" ] } }, '
        '"type": [ "null", "object" ] }, "type": [ "null", "array" ] }'
    )
    # Array-of-object fragment used by the two cost_per_*_action_type properties.
    action_cost = (
        '{ "items": { "properties": { '
        '"action_type": { "type": [ "null", "string" ] }, '
        '"value": { "type": [ "null", "string" ] } }, '
        '"type": [ "null", "object" ] }, "type": [ "null", "array" ] }'
    )
    schema_message = (
        '{ "type": "SCHEMA", "stream": "nested_stream", "schema": { "properties": { '
        '"account_id": { "type": [ "null", "string" ] }, '
        '"account_name": { "type": [ "null", "string" ] }, '
        '"action_values": ' + action_stats + ', '
        '"actions": ' + action_stats + ', '
        '"ad_id": { "type": [ "null", "string" ] }, '
        '"ad_name": { "type": [ "null", "string" ] }, '
        '"adset_id": { "type": [ "null", "string" ] }, '
        '"adset_name": { "type": [ "null", "string" ] }, '
        '"age": { "type": [ "null", "integer", "string" ] }, '
        '"campaign_id": { "type": [ "null", "string" ] }, '
        '"campaign_name": { "type": [ "null", "string" ] }, '
        '"canvas_avg_view_percent": { "type": [ "null", "number" ] }, '
        '"canvas_avg_view_time": { "type": [ "null", "number" ] }, '
        '"clicks": { "type": [ "null", "integer" ] }, '
        '"conversion_rate_ranking": { "type": [ "null", "string" ] }, '
        '"cost_per_action_type": ' + action_cost + ', '
        '"cost_per_inline_link_click": { "type": [ "null", "number" ] }, '
        '"cost_per_inline_post_engagement": { "type": [ "null", "number" ] }, '
        '"cost_per_unique_action_type": ' + action_cost + ', '
        '"cost_per_unique_click": { "type": [ "null", "number" ] }, '
        '"cost_per_unique_inline_link_click": { "type": [ "null", "number" ] }, '
        '"cpc": { "type": [ "null", "number" ] }, '
        '"cpm": { "type": [ "null", "number" ] }, '
        '"cpp": { "type": [ "null", "number" ] }, '
        '"ctr": { "type": [ "null", "number" ] }, '
        '"date_start": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"date_stop": { "format": "date-time", "type": [ "null", "string" ] }, '
        '"engagement_rate_ranking": { "type": [ "null", "string" ] }, '
        '"frequency": { "type": [ "null", "number" ] }, '
        '"gender": { "type": [ "null", "string" ] }, '
        '"impressions": { "type": [ "null", "integer" ] }, '
        '"inline_link_click_ctr": { "type": [ "null", "number" ] }, '
        '"inline_link_clicks": { "type": [ "null", "integer" ] }, '
        '"inline_post_engagement": { "type": [ "null", "integer" ] }, '
        '"objective": { "type": [ "null", "string" ] }, '
        '"outbound_clicks": ' + action_stats + ', '
        '"quality_ranking": { "type": [ "null", "string" ] }, '
        '"reach": { "type": [ "null", "integer" ] }, '
        '"social_spend": { "type": [ "null", "number" ] }, '
        '"spend": { "type": [ "null", "number" ] }, '
        '"unique_actions": ' + action_stats + ', '
        '"unique_clicks": { "type": [ "null", "integer" ] }, '
        '"unique_ctr": { "type": [ "null", "number" ] }, '
        '"unique_inline_link_click_ctr": { "type": [ "null", "number" ] }, '
        '"unique_inline_link_clicks": { "type": [ "null", "integer" ] }, '
        '"unique_link_clicks_ctr": { "type": [ "null", "number" ] }, '
        '"video_30_sec_watched_actions": ' + action_stats + ', '
        '"video_p100_watched_actions": ' + action_stats + ', '
        '"video_p25_watched_actions": ' + action_stats + ', '
        '"video_p50_watched_actions": ' + action_stats + ', '
        '"video_p75_watched_actions": ' + action_stats + ', '
        '"video_play_curve_actions": { "items": { "properties": { '
        '"action_type": { "type": [ "null", "string" ] }, '
        '"value": { "items": { "type": [ "null", "integer" ] }, "type": [ "null", "array" ] } }, '
        '"type": [ "null", "object" ] }, "type": [ "null", "array" ] }, '
        '"website_ctr": { "items": { "properties": { '
        '"action_destination": { "type": [ "null", "string" ] }, '
        '"action_target_id": { "type": [ "null", "string" ] }, '
        '"action_type": { "type": [ "null", "string" ] }, '
        '"value": { "type": [ "null", "number" ] } }, '
        '"type": [ "null", "object" ] }, "type": [ "null", "array" ] } }, '
        '"type": [ "null", "object" ] }, '
        '"key_properties": [ "campaign_id", "adset_id", "ad_id", "date_start", "age", "gender" ], '
        '"bookmark_properties": [ "date_start" ] }'
    )

    msg = singer.parse_message(schema_message)
    schema = build_schema(msg.schema,
                          key_properties=msg.key_properties,
                          add_metadata=True)

    # Index by name so that a missing date column fails the test instead of
    # being silently skipped.
    actual_types = {field.name: field.field_type.upper() for field in schema}
    for name in ("date_start", "date_stop"):
        with self.subTest(field=name):
            self.assertIn(name, actual_types)
            self.assertEqual(actual_types[name], "TIMESTAMP")
def persist_lines_job(
    client,
    dataset,
    lines=None,
    truncate=False,
    forced_fulltables=None,
    validate_records=True,
    table_suffix=None,
):
    """Buffer Singer records per stream and load them with BigQuery load jobs.

    Records are written as newline-delimited JSON into one temporary file per
    table; after the input is exhausted each file is uploaded with
    ``client.load_table_from_file``.

    Args:
        client: BigQuery client used to start the load jobs.
        dataset: Dataset object; ``dataset.table(name)`` is the load target.
        lines: Iterable of Singer messages (one JSON document per item).
            ``None`` is treated as an empty input.
        truncate: When truthy every table is loaded with WRITE_TRUNCATE.
        forced_fulltables: Table names that are always loaded with
            WRITE_TRUNCATE.  ``None`` means no table is forced.
        validate_records: When truthy each record is passed to ``validate``
            together with its stream schema.
        table_suffix: Optional suffix appended to the stream name to form the
            table name.

    Yields:
        The value of the last STATE message if no RECORD message followed it,
        otherwise ``None`` (the state is cleared after every record).

    Raises:
        json.decoder.JSONDecodeError: A line could not be parsed.
        Exception: A record arrived before its schema, or the message type is
            not recognised.
        google_exceptions.BadRequest: A load job was rejected.
    """
    state = None
    schemas = {}
    key_properties = {}
    rows = {}
    table_suffix = table_suffix or ""
    # None instead of a shared mutable default list.
    forced_fulltables = forced_fulltables or ()
    if lines is None:
        lines = ()

    try:
        for line in lines:
            try:
                msg = singer.parse_message(line)
            except json.decoder.JSONDecodeError:
                logger.error("Unable to parse:\n{}".format(line))
                raise

            if isinstance(msg, singer.RecordMessage):
                table_name = msg.stream + table_suffix
                if table_name not in schemas:
                    raise Exception(
                        "A record for stream {} was encountered before a corresponding schema"
                        .format(table_name))

                schema = schemas[table_name]
                if validate_records:
                    validate(msg.record, schema)

                new_rec = filter(schema, msg.record)

                # NEWLINE_DELIMITED_JSON expects literal JSON formatted data,
                # with a newline character splitting each row.
                data = bytes(
                    json.dumps(new_rec, cls=DecimalEncoder) + "\n", "UTF-8")
                rows[table_name].write(data)

                # A state seen before this record no longer describes the
                # end of the buffered data.
                state = None

            elif isinstance(msg, singer.StateMessage):
                logger.debug("Setting state to {}".format(msg.value))
                state = msg.value

            elif isinstance(msg, singer.SchemaMessage):
                table_name = msg.stream + table_suffix
                # Keep the first schema (and its buffer) for a table.
                if table_name in rows:
                    continue
                schemas[table_name] = msg.schema
                key_properties[table_name] = msg.key_properties
                rows[table_name] = TemporaryFile(mode="w+b")

            elif isinstance(msg, singer.ActivateVersionMessage):
                # This is experimental and won't be used yet
                pass

            else:
                raise Exception("Unrecognized message {}".format(msg))

        for table, buffered_rows in rows.items():
            table_schema = build_schema(schemas[table],
                                        key_properties=key_properties[table])
            load_config = LoadJobConfig()
            load_config.schema = table_schema
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            if truncate or (table in forced_fulltables):
                logger.info(f"Load {table} by FULL_TABLE")
                load_config.write_disposition = WriteDisposition.WRITE_TRUNCATE

            logger.info("loading {} to Bigquery.\n".format(table))

            # Reset per table so the handler never inspects a job that
            # belongs to a previous table (or an unbound name).
            load_job = None
            try:
                load_job = client.load_table_from_file(buffered_rows,
                                                       dataset.table(table),
                                                       job_config=load_config,
                                                       rewind=True)
                logger.info("loading job {}".format(load_job.job_id))
                logger.info(load_job.result())
            except google_exceptions.BadRequest as err:
                logger.error("failed to load table {} from file: {}".format(
                    table, str(err)))
                if load_job is not None and load_job.errors:
                    messages = [
                        f"reason: {job_error['reason']}, message: {job_error['message']}"
                        for job_error in load_job.errors
                    ]
                    logger.error("errors:\n{}".format("\n".join(messages)))
                raise
    finally:
        # Release the temporary buffers whether or not loading succeeded.
        for buffered_rows in rows.values():
            buffered_rows.close()

    yield state
def persist_lines_stream(project_id, dataset_id, lines=None, validate_records=True):
    """Stream Singer records into BigQuery one row at a time.

    A BigQuery client is created for ``project_id`` and the dataset is
    created if it does not exist yet.  Every SCHEMA message creates the table
    (an already existing table is left alone); every RECORD message is sent
    with ``insert_rows_json``.

    Args:
        project_id: Project passed to ``bigquery.Client``.
        dataset_id: Dataset that receives the tables.
        lines: Iterable of Singer messages (one JSON document per item).
            ``None`` is treated as an empty input.
        validate_records: When truthy each record is passed to ``validate``
            together with its stream schema.

    Returns:
        The value of the last STATE message if no RECORD message followed it,
        otherwise ``None`` (the state is cleared after every record).

    Raises:
        json.decoder.JSONDecodeError: A line could not be parsed.
        Exception: A record arrived before its schema, the message type is
            not recognised, or ``insert_rows_json`` raised.
    """
    state = None
    schemas = {}
    key_properties = {}
    tables = {}
    rows = {}
    errors = {}

    bigquery_client = bigquery.Client(project=project_id)

    dataset_ref = bigquery_client.dataset(dataset_id)
    dataset = Dataset(dataset_ref)
    try:
        dataset = bigquery_client.create_dataset(
            Dataset(dataset_ref)) or Dataset(dataset_ref)
    except exceptions.Conflict:
        # Dataset already exists; keep using the local reference.
        pass

    if lines is None:
        lines = ()

    for line in lines:
        try:
            msg = singer.parse_message(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if isinstance(msg, singer.RecordMessage):
            if msg.stream not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(msg.stream))

            schema = schemas[msg.stream]

            if validate_records:
                validate(msg.record, schema)

            try:
                insert_errors = bigquery_client.insert_rows_json(
                    tables[msg.stream], [msg.record])
            except Exception as exc:
                logger.error(
                    f"failed to insert rows for {tables[msg.stream]}: {str(exc)}\n{msg.record}"
                )
                raise

            # Accumulate instead of overwrite: a later successful insert
            # must not hide the errors reported for an earlier row.
            if insert_errors:
                errors[msg.stream].extend(insert_errors)
            rows[msg.stream] += 1

            state = None

        elif isinstance(msg, singer.StateMessage):
            logger.debug("Setting state to {}".format(msg.value))
            state = msg.value

        elif isinstance(msg, singer.SchemaMessage):
            table = msg.stream
            schemas[table] = msg.schema
            key_properties[table] = msg.key_properties
            tables[table] = bigquery.Table(dataset.table(table),
                                           schema=build_schema(schemas[table]))
            # setdefault keeps the running row count and collected errors
            # when the tap repeats the SCHEMA message for a stream.
            rows.setdefault(table, 0)
            errors.setdefault(table, [])
            try:
                tables[table] = bigquery_client.create_table(tables[table])
            except exceptions.Conflict:
                # Table already exists; stream into it as is.
                pass

        elif isinstance(msg, singer.ActivateVersionMessage):
            # This is experimental and won't be used yet
            pass

        else:
            raise Exception("Unrecognized message {}".format(msg))

    for table, table_errors in errors.items():
        if not table_errors:
            logging.info("Loaded {} row(s) from {} into {}:{}".format(
                rows[table], dataset_id, table, tables[table].path))
            emit_state(state)
        else:
            logging.error("Errors: %s", table_errors)

    return state
def persist_lines_stream(  # noqa: 211
    client: Client,
    project_id,
    dataset: Dataset,
    lines: TextIO,
    truncate: bool,
    forced_fulltables: list,
    validate_records: bool = True,
    table_suffix: Optional[str] = None,
    table_prefix: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """Stream data into BigQuery.

    Arguments:
        client {Client} -- BigQuery client
        project_id -- Project passed to ``table_exists`` when checking tables
        dataset {Dataset} -- BigQuery dataset
        lines {TextIO} -- Tap stream

    Keyword Arguments:
        truncate {bool} -- Whether to truncate the table
        forced_fulltables {list} -- List of tables to truncate
        validate_records {bool} -- Whether to validate records
            (default: {True})
        table_suffix {Optional[str]} -- Suffix for tables (default: {None})
        table_prefix {Optional[str]} -- Prefix for tables (default: {None})

    Raises:
        SchemaNotFoundException: If the schema message was not received yet
        InvalidSingerMessage: Invalid Singer message

    Yields:
        Iterator[Optional[str]] -- State
    """
    # Create the variables in which we save data in the upcoming loop.
    # Every dict below is keyed by the full table name (prefix + stream +
    # suffix), never by the bare stream name.
    state: Optional[str] = None
    schemas: dict = {}
    key_properties: dict = {}
    tables: dict = {}
    rows: dict = {}
    errors: dict = {}
    table_suffix = table_suffix or ''
    table_prefix = table_prefix or ''

    # For every Singer input message
    for line in lines:
        # Parse the message
        try:
            msg: Union[SchemaMessage, StateMessage, RecordMessage] = (
                parse_message(line)
            )
        except json.decoder.JSONDecodeError:
            LOGGER.error(f'Unable to parse Singer Message:\n{line}')
            raise

        # There can be several kind of messages. When inserting data, the
        # schema message comes first
        if isinstance(msg, SchemaMessage):
            # Schema message, create the table
            table_name: str = table_prefix + msg.stream + table_suffix

            # Skip a schema that was already handled. Without this guard a
            # repeated SCHEMA message would reset the counters and, with
            # truncation enabled, delete the table that is being streamed to.
            if table_name in tables:
                continue

            # Save the schema, key_properties and message to use in the
            # record messages that are following
            schemas[table_name] = msg.schema
            key_properties[table_name] = msg.key_properties
            tables[table_name] = bigquery.Table(
                dataset.table(table_name),
                schema=build_schema(schemas[table_name]),
            )
            rows[table_name] = 0
            errors[table_name] = []

            dataset_id: str = dataset.dataset_id
            if not table_exists(client, project_id, dataset_id, table_name):
                # Create the table
                client.create_table(tables[table_name])
            elif truncate or table_name in forced_fulltables:
                LOGGER.info(f'Load {table_name} by FULL_TABLE')

                # When truncating is enabled and the table exists, the table
                # has to be recreated. Because of this, we have to wait
                # otherwise data can be lost, see:
                # https://stackoverflow.com/questions/36846571/
                # bigquery-table-truncation-before-streaming-not-working
                LOGGER.info(f'Deleting table {table_name} because it exists')
                client.delete_table(tables[table_name])
                LOGGER.info(f'Recreating table {table_name}')
                client.create_table(tables[table_name])
                LOGGER.info(
                    'Sleeping for 5 minutes before streaming data, '
                    f'to avoid streaming data loss in {table_name}',
                )
                time.sleep(FIVE_MINUTES)

        elif isinstance(msg, RecordMessage):
            # Record message
            table_name = table_prefix + msg.stream + table_suffix

            if table_name not in schemas:
                raise SchemaNotFoundException(
                    f'A record for stream {table_name} was encountered before '
                    'a corresponding schema',
                )

            # Retrieve schema
            schema: dict = schemas[table_name]

            # Retrieve table
            table_ref: TableReference = tables[table_name]

            # Validate the record
            if validate_records:
                # Raises ValidationError if the record has invalid schema
                validate(msg.record, schema)

            # Filter the record
            record_input: Optional[Union[dict, str, list]] = filter_schema(
                schema,
                msg.record,
            )

            # Somewhere in the process, the input record can have decimal
            # values e.g. "value": Decimal('10.25'). These are not JSON
            # serializable. Therefore, we dump the JSON here with
            # DecimalEncoder. Thereafter, we load the dumped JSON so we get a
            # dictionary again, which we can insert to BigQuery
            record_json: str = json.dumps(record_input, cls=DecimalEncoder)
            record: dict = json.loads(record_json)

            # Errors reported by BigQuery for this single row
            err: Optional[list] = None

            try:
                # Insert record
                err = client.insert_rows(table_ref, [record])
            except Exception as exc:
                LOGGER.error(
                    f'Failed to insert rows for {table_name}: {exc}\n'
                    f'{record}\n{err}',
                )
                raise

            # Collect the errors of the table and increase the inserted rows.
            # Keyed by table_name (not msg.stream) so a prefix/suffix does not
            # cause a KeyError; errors accumulate so that a later successful
            # insert cannot hide an earlier failure.
            if err:
                errors[table_name].extend(err)
            rows[table_name] += 1

            state = None

        elif isinstance(msg, StateMessage):
            # State messages
            LOGGER.debug(f'Setting state to {msg.value}')
            state = msg.value

        else:
            raise InvalidSingerMessage(f'Unrecognized Singer Message:\n {msg}')

    for table, table_errors in errors.items():
        if table_errors:
            logging.error(f'Errors: {table_errors}')
        else:
            logging.info(
                'Loaded {rows} row(s) from {source} into {tab}:{path}'.format(
                    rows=rows[table],
                    source=dataset.dataset_id,
                    tab=table,
                    path=tables[table].path,
                ),
            )

    yield state
def persist_lines_job(  # noqa: WPS210, WPS211, WPS213, WPS231, WPS238
    client: Client,
    dataset: Dataset,
    lines: TextIO,
    truncate: bool,
    forced_fulltables: list,
    validate_records: bool = True,
    table_suffix: Optional[str] = None,
    table_prefix: Optional[str] = None,
) -> Iterator[Optional[str]]:
    """Perform a load job into BigQuery.

    Records are buffered per table in a temporary file as newline-delimited
    JSON. Once all input lines are consumed, one load job per table is
    started from its buffer. The temporary files are always closed before
    the function finishes, also when an error is raised.

    Arguments:
        client {Client} -- BigQuery client
        dataset {Dataset} -- BigQuery dataset
        lines {TextIO} -- Tap stream
        truncate {bool} -- Whether to truncate the tables
        forced_fulltables {list} -- List of tables to truncate

    Keyword Arguments:
        validate_records {bool} -- Whether to validate records
            (default: {True})
        table_suffix {Optional[str]} -- Suffix for tables (default: {None})
        table_prefix {Optional[str]} -- Prefix for tables (default: {None})

    Raises:
        SchemaNotFoundException: If the schema message was not received yet
        InvalidSingerMessage: Invalid Singer message
        json.decoder.JSONDecodeError: If an input line cannot be parsed
        google_exceptions.GoogleAPICallError: If a load job fails

    Yields:
        Iterator[Optional[str]] -- State; the value of the last state
            message, or None when a record was received after it
    """
    # Create variables in which we save data in the upcoming loop
    state: Optional[str] = None
    schemas: dict = {}
    key_properties: dict = {}
    # Maps a table name to the temporary file that buffers its records
    rows: dict = {}

    table_suffix = table_suffix or ''
    table_prefix = table_prefix or ''

    try:
        # For every Singer input message
        for line in lines:
            # Parse the message
            try:
                msg: Union[SchemaMessage, StateMessage, RecordMessage] = (
                    parse_message(line)
                )
            except json.decoder.JSONDecodeError:
                LOGGER.error(f'Unable to parse Singer Message:\n{line}')
                raise

            # There can be several kind of messages. When inserting data,
            # the schema message comes first
            if isinstance(msg, SchemaMessage):
                # Schema message, save schema
                table_name: str = table_prefix + msg.stream + table_suffix

                # Skip schema if already created
                if table_name in rows:
                    continue

                # Save schema and setup a temp file for data storage
                schemas[table_name] = msg.schema
                key_properties[table_name] = msg.key_properties
                rows[table_name] = TemporaryFile(mode='w+b')

            elif isinstance(msg, RecordMessage):
                # Record message
                table_name = table_prefix + msg.stream + table_suffix

                if table_name not in schemas:
                    raise SchemaNotFoundException(
                        f'A record for stream {table_name} was encountered '
                        'before a corresponding schema',
                    )

                # Retrieve schema
                schema: dict = schemas[table_name]

                # Validate the record
                if validate_records:
                    # Raises ValidationError if the record has invalid schema
                    validate(msg.record, schema)

                record_input: Optional[Union[dict, str, list]] = (
                    filter_schema(schema, msg.record)
                )

                # Somewhere in the process, the input record can have
                # decimal values e.g. "value": Decimal('10.25'). These are
                # not JSON serializable by default, therefore the record is
                # dumped with the DecimalEncoder. Every record is written as
                # one line, as the load job expects newline-delimited JSON.
                record_str: str = '{rec}\n'.format(
                    rec=json.dumps(record_input, cls=DecimalEncoder),
                )

                # Save data to load later
                rows[table_name].write(bytes(record_str, 'UTF-8'))

                # A record resets the state; only a state message that
                # arrives after the last record is yielded
                state = None

            elif isinstance(msg, StateMessage):
                # State messages
                LOGGER.debug(f'Setting state to {msg.value}')
                state = msg.value

            else:
                raise InvalidSingerMessage(
                    f'Unrecognized Singer Message:\n {msg}',
                )

        # After all records are received, setup a load job per stream
        for table, buffer in rows.items():
            # Prepare load job
            key_props: list = key_properties[table]
            load_config: LoadJobConfig = LoadJobConfig()
            load_config.schema = build_schema(
                schemas[table],
                key_properties=key_props,
            )
            load_config.source_format = SourceFormat.NEWLINE_DELIMITED_JSON

            # Overwrite the table if truncate is enabled
            if truncate or table in forced_fulltables:
                LOGGER.info(f'Load {table} by FULL_TABLE')
                load_config.write_disposition = (
                    WriteDisposition.WRITE_TRUNCATE
                )

            LOGGER.info(f'loading {table} to Bigquery.')

            # Setup load job; rewind makes the client read the buffer from
            # its start instead of from the current write position
            load_job: LoadJob = client.load_table_from_file(
                buffer,
                dataset.table(table),
                job_config=load_config,
                rewind=True,
            )
            LOGGER.info(f'loading job {load_job.job_id}')

            # Run load job
            try:
                load_job.result()
            except google_exceptions.GoogleAPICallError as err:
                # Parse errors
                LOGGER.error(f'failed to load table {table} from file: {err}')
                if load_job.errors:
                    messages_str: str = '\n'.join(
                        'reason: {reason}, message: {message}'.format(
                            reason=job_error['reason'],
                            message=job_error['message'],
                        )
                        for job_error in load_job.errors
                    )
                    LOGGER.error(f'errors:\n{messages_str}')
                raise

            LOGGER.info(
                f'Loaded {load_job.output_rows} row(s) in '
                f'{load_job.destination}',
            )
    finally:
        # Release the temporary files, whether or not loading succeeded
        for buffer_file in rows.values():
            buffer_file.close()

    yield state