def test_execute(self, mock_hook):
    mock_instance = mock_hook.return_value
    operator = LocalFilesystemToGCSOperator(
        task_id='gcs_to_file_sensor', dag=self.dag, **self._config
    )
    operator.execute(None)
    mock_instance.upload.assert_called_once_with(
        bucket_name=self._config['bucket'],
        filename=self._config['src'],
        gzip=self._config['gzip'],
        mime_type=self._config['mime_type'],
        object_name=self._config['dst'],
    )
def test_execute_negative(self, mock_hook):
    mock_instance = mock_hook.return_value
    operator = LocalFilesystemToGCSOperator(
        task_id='gcs_to_file_sensor',
        dag=self.dag,
        src='/tmp/fake*.csv',
        dst='test/test1.csv',
        **self._config,
    )
    print(glob('/tmp/fake*.csv'))
    with pytest.raises(ValueError):
        operator.execute(None)
    mock_instance.assert_not_called()
def test_init(self):
    operator = LocalFilesystemToGCSOperator(
        task_id='file_to_gcs_operator', dag=self.dag, **self._config
    )
    self.assertEqual(operator.src, self._config['src'])
    self.assertEqual(operator.dst, self._config['dst'])
    self.assertEqual(operator.bucket, self._config['bucket'])
    self.assertEqual(operator.mime_type, self._config['mime_type'])
    self.assertEqual(operator.gzip, self._config['gzip'])
def test_execute_wildcard(self, mock_hook):
    mock_instance = mock_hook.return_value
    operator = LocalFilesystemToGCSOperator(
        task_id='gcs_to_file_sensor', dag=self.dag, src='/tmp/fake*.csv', dst='test/', **self._config
    )
    operator.execute(None)
    object_names = ['test/' + os.path.basename(fp) for fp in glob('/tmp/fake*.csv')]
    files_objects = zip(glob('/tmp/fake*.csv'), object_names)
    calls = [
        mock.call(
            bucket_name=self._config['bucket'],
            filename=filepath,
            gzip=self._config['gzip'],
            mime_type=self._config['mime_type'],
            object_name=object_name,
        )
        for filepath, object_name in files_objects
    ]
    mock_instance.upload.assert_has_calls(calls)
def bs_customer_invoice_chinook_dag():
    @task()
    def extract_transform():
        conn = sqlite3.connect(f"{DATA_PATH}/chinook.db")
        with open(f"{BASE_PATH}/sql/chinook.sql", "r") as query:
            df = pd.read_sql(query.read(), conn)
        # skip the index column and the header row so the CSV matches the BigQuery schema
        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    extracted_transformed_data = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME,
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'customer_id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'full_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'company', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'address', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'city', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'state', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'postal_code', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'phone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'fax', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'email', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'invoice_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'invoice_date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'billing_address', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_city', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_state', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_country', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'billing_postal_code', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'total', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extracted_transformed_data
    extracted_transformed_data >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
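# bs_customer_invoice_chinook_dag() above (like the other TaskFlow-style DAG
# functions in this collection) relies on module-level imports, constants, and
# an @dag decorator that are not shown. A minimal sketch of that scaffolding;
# every concrete value below (paths, bucket, dataset, table, schedule) is an
# assumption for illustration only.
import os
import sqlite3  # used inside extract_transform above
from datetime import datetime

import pandas as pd  # used inside extract_transform above
from airflow.decorators import dag, task
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import GCSToBigQueryOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator

GOOGLE_CLOUD_CONN_ID = 'google_cloud_default'                        # assumed
BUCKET_NAME = 'example-staging-bucket'                               # assumed
DATASET_ID = 'example_dataset'                                       # assumed
BIGQUERY_TABLE_NAME = 'customer_invoice_chinook'                     # assumed
BASE_PATH = os.path.dirname(os.path.abspath(__file__))
DATA_PATH = f"{BASE_PATH}/data"                                      # assumed
OUT_PATH = f"{BASE_PATH}/output/customer_invoice_chinook.csv"        # assumed
GCS_OBJECT_NAME = 'extract_transform_customer_invoice_chinook.csv'   # assumed

# In the original module the function shown above would sit here, decorated and
# instantiated once at module level so the scheduler picks it up, e.g.:
#
# @dag(schedule_interval='@daily', start_date=datetime(2021, 1, 1), catchup=False)
# def bs_customer_invoice_chinook_dag():
#     ...
#
# bs_customer_invoice_chinook_etl = bs_customer_invoice_chinook_dag()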
def test_execute_multiple(self, mock_hook):
    mock_instance = mock_hook.return_value
    operator = LocalFilesystemToGCSOperator(
        task_id='gcs_to_file_sensor', dag=self.dag, src=self.testfiles, dst='test/', **self._config
    )
    operator.execute(None)
    files_objects = zip(
        self.testfiles, ['test/' + os.path.basename(testfile) for testfile in self.testfiles]
    )
    calls = [
        mock.call(
            bucket_name=self._config['bucket'],
            filename=filepath,
            gzip=self._config['gzip'],
            mime_type=self._config['mime_type'],
            object_name=object_name,
        )
        for filepath, object_name in files_objects
    ]
    mock_instance.upload.assert_has_calls(calls)
def test_init(self):
    operator = LocalFilesystemToGCSOperator(
        task_id='file_to_gcs_operator',
        dag=self.dag,
        src=self.testfile1,
        dst='test/test1.csv',
        **self._config,
    )
    self.assertEqual(operator.src, self.testfile1)
    self.assertEqual(operator.dst, 'test/test1.csv')
    self.assertEqual(operator.bucket, self._config['bucket'])
    self.assertEqual(operator.mime_type, self._config['mime_type'])
    self.assertEqual(operator.gzip, self._config['gzip'])
def test_init(self):
    operator = LocalFilesystemToGCSOperator(
        task_id='file_to_gcs_operator',
        dag=self.dag,
        src=self.testfile1,
        dst='test/test1.csv',
        **self._config,
    )
    assert operator.src == self.testfile1
    assert operator.dst == 'test/test1.csv'
    assert operator.bucket == self._config['bucket']
    assert operator.mime_type == self._config['mime_type']
    assert operator.gzip == self._config['gzip']
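# The test methods above assume a unittest-style test class: a class-level
# mock.patch on the GCS hook (which supplies the mock_hook argument), a shared
# _config dict, and setUp/tearDown that build the DAG and temp files. A minimal
# sketch of that scaffolding; the class name, dates, file paths, and file
# contents are assumptions, not taken from the snippets above.
import datetime
import os
import unittest
from glob import glob            # used by the wildcard/negative tests above
from unittest import mock

import pytest                    # used by test_execute_negative above
from airflow.models.dag import DAG
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator


@mock.patch('airflow.providers.google.cloud.transfers.local_to_gcs.GCSHook')
class TestFileToGcsOperator(unittest.TestCase):
    # assumed shared config; some variants above also put 'src' and 'dst' here
    _config = {'bucket': 'dummy', 'mime_type': 'application/octet-stream', 'gzip': False}

    def setUp(self):
        args = {'owner': 'airflow', 'start_date': datetime.datetime(2020, 1, 1)}
        self.dag = DAG('test_dag_id', default_args=args)
        self.testfile1 = '/tmp/fake1.csv'
        self.testfile2 = '/tmp/fake2.csv'
        self.testfiles = [self.testfile1, self.testfile2]
        for testfile in self.testfiles:
            with open(testfile, 'wb') as f:
                f.write(b'x' * 393216)  # arbitrary file content

    def tearDown(self):
        for testfile in self.testfiles:
            os.remove(testfile)

    # ... the test methods shown above would live here ...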
def bs_database_sqlite_dag():
    @task()
    def extract_transform():
        conn = sqlite3.connect(f"{DATA_PATH}/database.sqlite")
        with open(f"{BASE_PATH}/sql/database_sqlite.sql", "r") as query:
            df = pd.read_sql(query.read(), conn)
        # skip the index column and the header row so the CSV matches the BigQuery schema
        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    extracted_transformed_data = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst='extract_transform_database_sqlite.csv',
        bucket=BUCKET_NAME,
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'reviewid', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'title', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'artist', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'score', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'best_new_music', 'type': 'INTEGER', 'mode': 'NULLABLE'},
            {'name': 'author', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'author_type', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'pub_date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'pub_weekday', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'pub_day', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'pub_month', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'pub_year', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'concat_genre', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'concat_label', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'concat_year', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extracted_transformed_data
    extracted_transformed_data >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
fork = DummyOperator(task_id='fork', trigger_rule='one_success', dag=dag)
join = DummyOperator(task_id='join', trigger_rule='one_success', dag=dag)

t_downloadlogtocloud = BashOperator(
    task_id="download_state_file",
    bash_command=downloadlogscript,
    dag=dag,
)

t_removefile = BashOperator(
    task_id='remove_temp_file',
    bash_command=removetempfile,
    dag=dag,
)

## change to suit your setting
t_analytics = LocalFilesystemToGCSOperator(
    task_id="uploadtostorage",
    src=destination_file,
    dst=gcsdir,
    bucket=GCS_BUCKET,
    gcp_conn_id=GCS_CONN_ID,
    dag=dag,
)

## change to suit your setting
t_sendresult = SimpleHttpOperator(
    task_id='sendnotification',
    method='POST',
    http_conn_id='notificationserver',
    endpoint='api/logUpdate',
    data=json.dumps({"source_file": source_file}),
    headers={"Content-Type": "application/json"},
    dag=dag,
)

# the dependencies among tasks
t_downloadlogtocloud >> t_analytics
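# The fragment above assumes a classic (non-TaskFlow) DAG object plus
# module-level settings for the download command, temp file, and GCS target.
# A minimal sketch of that scaffolding; all concrete values are assumptions.
import json
from datetime import datetime

from airflow import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.providers.google.cloud.transfers.local_to_gcs import LocalFilesystemToGCSOperator
from airflow.providers.http.operators.http import SimpleHttpOperator

GCS_BUCKET = 'my-log-bucket'                               # assumed
GCS_CONN_ID = 'google_cloud_default'                       # assumed
source_file = 'state_20210101.log'                         # assumed
destination_file = f'/tmp/{source_file}'                   # assumed local download path
gcsdir = f'logs/{source_file}'                             # assumed GCS object name
downloadlogscript = f'curl -sf -o {destination_file} https://example.com/logs/{source_file}'  # assumed
removetempfile = f'rm -f {destination_file}'               # assumed

dag = DAG(
    dag_id='upload_log_to_gcs',                            # assumed dag_id
    start_date=datetime(2021, 1, 1),
    schedule_interval='@daily',
    catchup=False,
)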
check_dataset = BranchPythonOperator(
    task_id='check_dataset',
    python_callable=check_dataset,
    trigger_rule='none_failed_or_skipped',
    dag=dag,
)

extract_mysql_to_local_pq = PythonOperator(
    task_id='extract_mysql_to_local_pq',
    python_callable=mysql_to_pq,
    op_kwargs=func_param,
    trigger_rule='all_done',
    dag=dag,
)

local_pq_to_gcs = LocalFilesystemToGCSOperator(
    task_id='local_pq_to_gcs',
    src=func_param['source_transform'],
    dst=func_param['dest_blob_transform'],
    bucket=func_param['bucket_name'],
    gcp_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag,
)

load_gcs_pq_to_bq = GCSToBigQueryOperator(
    task_id='load_gcs_pq_to_bq',
    bucket=func_param['bucket_name'],
    source_objects=[func_param['dest_blob_transform']],  # source_objects expects a list of object names
    destination_project_dataset_table='project_four_airflow.sales',
    source_format='PARQUET',
    write_disposition='WRITE_APPEND',
    google_cloud_storage_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag,
)
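# The operators above all read from a shared func_param dict (also passed to
# the Python callables via op_kwargs). A minimal sketch of the shape it is
# assumed to have; the keys come from the lookups above, the values are
# invented for illustration.
func_param = {
    'bucket_name': 'project-four-staging',                        # assumed GCS bucket
    'source_transform': '/tmp/sales_transformed.parquet',         # assumed local Parquet written by mysql_to_pq
    'dest_blob_transform': 'staging/sales_transformed.parquet',   # assumed target object name in GCS
}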
def bs_file1000_dag():
    @task()
    def extract_transform():
        df = pd.read_excel(f"{DATA_PATH}/file_1000.xls", index_col=0).reset_index(drop=True)
        df = df.drop(columns='First Name.1')
        df['full_name'] = df['First Name'] + " " + df['Last Name']
        df['gender'] = df['Gender'].apply(lambda row: 'M' if row == 'Male' else 'F')
        df['date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y', errors='coerce')
        df = df.drop(columns=['Date', 'Gender'])
        df.columns = [
            'first_name', 'last_name', 'country', 'age', 'id', 'full_name', 'gender', 'date'
        ]
        df = df.reindex(columns=[
            'id', 'first_name', 'last_name', 'full_name', 'date', 'age', 'gender', 'country'
        ]).reset_index(drop=True)
        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    extract_task = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME,
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'first_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'last_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'full_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'age', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'gender', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'country', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extract_task
    extract_task >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
def bs_disaster_dag():
    @task()
    def extract_transform():
        df = pd.read_csv(f"{DATA_PATH}/disaster_data.csv")
        columns = ['text', 'location']
        for column in columns:
            df[column] = df[column].str.replace(r'\s{2,}', ' ', regex=True)
            df[column] = df[column].str.replace(r"[^a-zA-Z0-9\,]", ' ', regex=True)
        df.to_csv(OUT_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    extract_transform_task = extract_transform()

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME,
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'keyword', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'location', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'text', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'target', 'type': 'INT64', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extract_transform_task
    extract_transform_task >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery
    loaded_data_bigquery >> end
create_bucket1 = GCSCreateBucketOperator(
    task_id="create_bucket1", bucket_name=BUCKET_1, project_id=PROJECT_ID
)

create_bucket2 = GCSCreateBucketOperator(
    task_id="create_bucket2", bucket_name=BUCKET_2, project_id=PROJECT_ID
)

list_buckets = GCSListObjectsOperator(task_id="list_buckets", bucket=BUCKET_1)

list_buckets_result = BashOperator(
    task_id="list_buckets_result",
    bash_command="echo \"{{ task_instance.xcom_pull('list_buckets') }}\"",
)

upload_file = LocalFilesystemToGCSOperator(
    task_id="upload_file",
    src=PATH_TO_UPLOAD_FILE,
    dst=BUCKET_FILE_LOCATION,
    bucket=BUCKET_1,
)

transform_file = GCSFileTransformOperator(
    task_id="transform_file",
    source_bucket=BUCKET_1,
    source_object=BUCKET_FILE_LOCATION,
    transform_script=["python", PATH_TO_TRANSFORM_SCRIPT],
)

# [START howto_operator_gcs_bucket_create_acl_entry_task]
gcs_bucket_create_acl_entry_task = GCSBucketCreateAclEntryOperator(
    bucket=BUCKET_1,
    entity=GCS_ACL_ENTITY,
    role=GCS_ACL_BUCKET_ROLE,
    task_id="gcs_bucket_create_acl_entry_task",
)
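# The block above mirrors Airflow's example GCS DAG, where bucket names, paths,
# and ACL settings are usually read from environment variables. A sketch of
# plausible values; every default below is an assumption.
import os

PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project')
BUCKET_1 = os.environ.get('GCP_GCS_BUCKET_1', 'example-gcs-bucket-1')
BUCKET_2 = os.environ.get('GCP_GCS_BUCKET_2', 'example-gcs-bucket-2')
PATH_TO_UPLOAD_FILE = os.environ.get('GCP_GCS_PATH_TO_UPLOAD_FILE', '/tmp/example-upload.txt')
BUCKET_FILE_LOCATION = os.path.basename(PATH_TO_UPLOAD_FILE)   # object name inside BUCKET_1
PATH_TO_TRANSFORM_SCRIPT = os.environ.get('GCP_GCS_PATH_TO_TRANSFORM_SCRIPT', '/tmp/transform_script.py')
GCS_ACL_ENTITY = os.environ.get('GCS_ACL_ENTITY', 'allUsers')
GCS_ACL_BUCKET_ROLE = 'OWNER'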
def bs_reviews_dag():
    @task()
    def merge_reviews(reviews: list):
        df_merge = pd.concat([pd.read_json(review) for review in reviews], ignore_index=True)
        print(df_merge)
        df_merge.to_csv(OUT_PATH, index=False, header=False)

    @task()
    def extract_reviews(filename):
        print(filename)
        file_path = f"{DATA_PATH}/{filename}"
        if 'csv' in filename:
            df = pd.read_csv(file_path)
        else:
            df = pd.read_excel(file_path)
        print(df)
        return df.to_json()

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    filenames = os.listdir(DATA_PATH)
    filtered_filename = list(
        filter(lambda filename: re.match(r"(^reviews)", filename), filenames)
    )

    extracted_list = []
    for filename in filtered_filename:
        extracted = extract_reviews(filename)
        extracted_list.append(extracted)

    merged = merge_reviews(extracted_list)

    stored_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_PATH,
        dst=GCS_OBJECT_NAME,
        bucket=BUCKET_NAME,
    )

    loaded_data_bigquery = GCSToBigQueryOperator(
        task_id='load_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'listing_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'date', 'type': 'DATE', 'mode': 'NULLABLE'},
            {'name': 'reviewer_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'reviewer_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'comments', 'type': 'STRING', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        allow_quoted_newlines=True,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    start >> extracted_list >> merged
    merged >> stored_data_gcs
    stored_data_gcs >> loaded_data_bigquery >> end
def bs_tweets_dag():
    @task()
    def extract_transform_tweets():
        df = pd.read_json(f"{DATA_PATH}/tweet_data.json", lines=True)
        df['created_at'] = df['created_at'].dt.tz_convert(None)
        columns = ['text', 'source']
        for column in columns:
            df[column] = df[column].str.replace(r"[^a-zA-Z0-9\,#@]", ' ', regex=True)
            df[column] = df[column].str.replace(r"\s{2,}", ' ', regex=True)
        # drop nested/unneeded columns before writing the CSV
        excluded_columns = {
            'extended_entities', 'contributors', 'entities', 'retweeted_status',
            'user', 'in_reply_to_user_id_str', 'in_reply_to_status_id_str',
        }
        filtered_columns = [col for col in df.columns if col not in excluded_columns]
        df_filtered = df[filtered_columns]
        df_filtered.to_csv(OUT_TWEETS_PATH, index=False, header=False)

    @task()
    def extract_transform_tweets_user():
        df = pd.read_json(f"{DATA_PATH}/tweet_data.json", lines=True)
        users = [{**row['user'], 'tweet_id': row['id']} for _, row in df.iterrows()]
        df_users = pd.DataFrame(users)
        df_users['created_at'] = pd.to_datetime(
            df_users['created_at'], format='%a %b %d %H:%M:%S %z %Y'
        ).dt.tz_convert(None)
        filtered_column = list(
            filter(lambda col: col != 'id' and col != 'tweet_id', list(df_users.columns))
        )
        df_users = df_users.reindex(columns=['id', 'tweet_id', *filtered_column])
        df_users.to_csv(OUT_TWEETS_USER_PATH, index=False, header=False)

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end')

    et_tweets = extract_transform_tweets()
    et_tweets_user = extract_transform_tweets_user()

    stored_tweets_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_tweets_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_TWEETS_PATH,
        dst=GCS_OBJECT_TWEETS_NAME,
        bucket=BUCKET_NAME,
    )

    stored_tweets_user_data_gcs = LocalFilesystemToGCSOperator(
        task_id="store_tweets_user_to_gcs",
        gcp_conn_id=GOOGLE_CLOUD_CONN_ID,
        src=OUT_TWEETS_USER_PATH,
        dst=GCS_OBJECT_TWEETS_USER_NAME,
        bucket=BUCKET_NAME,
    )

    loaded_tweets_data_bigquery = GCSToBigQueryOperator(
        task_id='load_tweets_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_TWEETS_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TWEETS_TABLE_NAME}",
        schema_fields=[  # based on https://cloud.google.com/bigquery/docs/schemas
            {'name': 'truncated', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'text', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'is_quote_status', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_status_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_user_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'favorite_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'retweeted', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'coordinates', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'source', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'in_reply_to_screen_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'id_str', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'retweet_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'metadata', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'favorited', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'geo', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lang', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'created_at', 'type': 'DATETIME', 'mode': 'NULLABLE'},
            {'name': 'place', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'quoted_status_id', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'quoted_status', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'possibly_sensitive', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'quoted_status_id_str', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        write_disposition='WRITE_TRUNCATE',  # overwrite the table data if the table already exists
    )

    loaded_tweets_user_data_bigquery = GCSToBigQueryOperator(
        task_id='load_tweets_user_to_bigquery',
        bigquery_conn_id=GOOGLE_CLOUD_CONN_ID,
        bucket=BUCKET_NAME,
        source_objects=[GCS_OBJECT_TWEETS_USER_NAME],
        destination_project_dataset_table=f"{DATASET_ID}.{BIGQUERY_TWEETS_USER_TABLE_NAME}",
        schema_fields=[
            {'name': 'id', 'type': 'INT64', 'mode': 'REQUIRED'},
            {'name': 'tweet_id', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'follow_request_sent', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'has_extended_profile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_use_background_image', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'verified', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'translator_type', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_text_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_image_url_https', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_sidebar_fill_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'entities', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'followers_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'protected', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'location', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'default_profile_image', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'id_str', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'lang', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'utc_offset', 'type': 'FLOAT64', 'mode': 'NULLABLE'},
            {'name': 'statuses_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'description', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'friends_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'profile_background_image_url_https', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_link_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_image_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'following', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'geo_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_background_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_banner_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_background_image_url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'screen_name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'is_translation_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'profile_background_tile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'favourites_count', 'type': 'INT64', 'mode': 'NULLABLE'},
            {'name': 'name', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'notifications', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'url', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'created_at', 'type': 'DATETIME', 'mode': 'NULLABLE'},
            {'name': 'contributors_enabled', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'time_zone', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'profile_sidebar_border_color', 'type': 'STRING', 'mode': 'NULLABLE'},
            {'name': 'default_profile', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'is_translator', 'type': 'BOOL', 'mode': 'NULLABLE'},
            {'name': 'listed_count', 'type': 'INT64', 'mode': 'NULLABLE'},
        ],
        autodetect=False,
        allow_quoted_newlines=True,
        write_disposition='WRITE_TRUNCATE',
    )

    start >> [et_tweets, et_tweets_user]
    et_tweets >> stored_tweets_data_gcs
    et_tweets_user >> stored_tweets_user_data_gcs
    stored_tweets_data_gcs >> loaded_tweets_data_bigquery >> end
    stored_tweets_user_data_gcs >> loaded_tweets_user_data_bigquery >> end
})

pg_poc_pull = PythonOperator(
    task_id='PG_TO_FILE',
    python_callable=postgres_to_file,
    # the connection id is passed to the callable via op_kwargs;
    # PythonOperator itself does not take a postgres_conn_id argument
    op_kwargs={
        'conn_id': 'spacecadets_postgres',
        'sql': pg_movies_dirs,
        'filename': pg_csv_filename,
    },
)

upload_pg_file = LocalFilesystemToGCSOperator(
    task_id="PG_UPLOAD_FILE",
    src=pg_csv_filename,
    dst=GCS_FILENAME.format('movies_directors', pg_base_filename),
    bucket=BUCKET,
)

upload_mysql_file = LocalFilesystemToGCSOperator(
    task_id="MYSQL_UPLOAD_FILE",
    src=mysql_csv_filename,
    dst=GCS_FILENAME.format('movies_directors', mysql_base_filename),
    bucket=BUCKET,
)

# t1, t2 and t3 are examples of tasks created by instantiating operators
print_date = BashOperator(
    task_id='print_date',
    bash_command='date',
)
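# pg_poc_pull above hands conn_id, sql, and filename to postgres_to_file via
# op_kwargs. A minimal sketch of what such a callable might look like (an
# assumption, not the original implementation), using PostgresHook's
# get_pandas_df to dump the query result to a local CSV.
from airflow.providers.postgres.hooks.postgres import PostgresHook


def postgres_to_file(conn_id, sql, filename, **_):
    """Run a SQL query against Postgres and write the result to a local CSV."""
    hook = PostgresHook(postgres_conn_id=conn_id)
    df = hook.get_pandas_df(sql)
    df.to_csv(filename, index=False)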
    python_callable=get_initial_id,
    op_kwargs=func_param,
    dag=dag,
)

check_data = BranchPythonOperator(
    task_id='check_data',
    python_callable=check_data,
    dag=dag,
)

get_data_from_mongodb = PythonOperator(
    task_id='get_data_from_mongodb',
    python_callable=extract_mongodb,
    op_kwargs=func_param,
    dag=dag,
)

load_to_staging = LocalFilesystemToGCSOperator(
    task_id='load_to_staging',
    src=func_param['source'],
    dst=func_param['dest_blob'],
    bucket=func_param['bucket_name'],
    gcp_conn_id='google_cloud_default',
    dag=dag,
)

# update_bigquery_fact_table = BranchPythonOperator(
#     task_id='update_bigquery_fact_table',
#     python_callable=table_existence,
#     dag=dag,
# )

transform_tripdata = PythonOperator(
    task_id='transform_tripdata',
    python_callable=transform_tripdata,  # pulls XCom from extract_json
    op_kwargs=func_param,
    dag=dag,
)

local_parquet_to_gcs = LocalFilesystemToGCSOperator(
    task_id='local_parquet_to_gcs',
    src=func_param['source_transform'],
    dst=func_param['dest_blob_transform'],
        autodetect=True,
    )
except GoogleAPIError:
    print('{} table already exists, skipping table load'.format(table_name))

get_weather_json = PythonOperator(
    task_id='get_weather_json',
    python_callable=get_weather_json,
    op_kwargs=func_param,
    trigger_rule='all_done',
    dag=dag,
)

load_to_staging = LocalFilesystemToGCSOperator(
    task_id='load_to_staging',
    src=func_param['source_weather'],
    dst=func_param['dest_blob'],
    bucket=func_param['bucket_name'],
    gcp_conn_id='google_cloud_default',
    trigger_rule='all_done',
    dag=dag,
)

check_dataset = BranchPythonOperator(
    task_id='check_dataset',
    python_callable=table_existence,
    trigger_rule='all_done',
    dag=dag,
)

transform_raw_json = PythonOperator(
    task_id='transform_raw_json',
    python_callable=transform_json_data,
    op_kwargs=func_param,
    trigger_rule='none_failed_or_skipped',
    dag=dag,
)