class FileToGCS(luigi.Task):
    """Uploads a file from local disk to Google Cloud Storage.

    Parameters
    ----------
    client: `luigi.contrib.gcs.GCSClient()` instance, optional (default is a
        new instance)
    source: str
        E.g. "./path/to/my/file.csv"
    destination: str
        E.g. "gs://bucket/my/file.csv"

    References
    ----------
    https://luigi.readthedocs.io/en/stable/api/luigi.contrib.gcs.html
    """

    # NOTE(review): this default is instantiated once at class-definition time
    # and shared by every task instance -- confirm that is intended.
    client = luigi.Parameter(default=gcs.GCSClient())
    source = luigi.Parameter()
    destination = luigi.Parameter()  # e.g. "gs://bi_poc/my-test.txt"

    def output(self):
        """Return the GCS target the task produces."""
        return gcs.GCSTarget(self.destination, client=self.client)

    def run(self):
        """Copy `source` from local disk into the GCS destination.

        Reuses ``self.output()`` rather than constructing a second,
        potentially divergent ``GCSTarget`` for the same destination
        (the original built the target twice).
        """
        # Text mode ('r'/'w') matches the original behavior; binary files
        # would need 'rb'/'wb' -- out of scope for this fix.
        with open(self.source, 'r') as infile:
            with self.output().open(mode='w') as outfile:
                outfile.write(infile.read())
def setUp(self):
    """Prepare clients and fixtures for the Avro -> BigQuery tests.

    Registers cleanup hooks (run in LIFO order) that drop the staged GCS
    directory and the BigQuery dataset, then stages the test input.
    """
    self.gcs_client = gcs.GCSClient(CREDENTIALS)
    self.bq_client = bigquery.BigQueryClient(CREDENTIALS)

    self.table_id = "avro_bq_table"
    self.gcs_dir_url = 'gs://{}/foo'.format(BUCKET_NAME)

    # Registration order matters: cleanups run last-in, first-out.
    self.addCleanup(self.gcs_client.remove, self.gcs_dir_url)
    dataset = bigquery.BQDataset(PROJECT_ID, DATASET_ID, EU_LOCATION)
    self.addCleanup(self.bq_client.delete_dataset, dataset)

    self._produce_test_input()
def setUp(self):
    """Provision the GCS bucket and BigQuery datasets used by these tests.

    Stages newline-delimited JSON input in GCS, builds one US and one EU
    table descriptor, registers cleanup hooks, and recreates both
    datasets from scratch so every test starts clean.
    """
    self.bq_client = bigquery.BigQueryClient(CREDENTIALS)
    self.gcs_client = gcs.GCSClient(CREDENTIALS)

    # Ensure the input bucket exists; a 409 means it is already there.
    try:
        self.gcs_client.client.buckets().insert(
            project=PROJECT_ID,
            body={
                'name': BUCKET_NAME,
                'location': EU_LOCATION,
            },
        ).execute()
    except googleapiclient.errors.HttpError as ex:
        # TODO: verify that a pre-existing bucket is not in the US region.
        if ex.resp.status != 409:  # 409 == bucket already exists
            raise

    # Start from an empty bucket root.
    self.gcs_client.remove(bucket_url(''), recursive=True)
    self.gcs_client.mkdir(bucket_url(''))

    # Newline-delimited JSON test input, keyed by the test's unittest id.
    rows = [
        {'field1': 'hi', 'field2': 1},
        {'field1': 'bye', 'field2': 2},
    ]
    text = '\n'.join(json.dumps(row) for row in rows)
    self.gcs_file = bucket_url(self.id())
    self.gcs_client.put_string(text, self.gcs_file)

    # One table in the default (US) dataset, one in the EU dataset.
    short_id = self.id().split('.')[-1]
    self.table = bigquery.BQTable(
        project_id=PROJECT_ID,
        dataset_id=DATASET_ID,
        table_id=short_id,
        location=None,
    )
    self.table_eu = bigquery.BQTable(
        project_id=PROJECT_ID,
        dataset_id=EU_DATASET_ID,
        table_id=short_id + '_eu',
        location=EU_LOCATION,
    )

    self.addCleanup(self.gcs_client.remove, bucket_url(''), recursive=True)
    self.addCleanup(self.bq_client.delete_dataset, self.table.dataset)
    self.addCleanup(self.bq_client.delete_dataset, self.table_eu.dataset)

    # Drop-and-recreate so no state leaks in from a previous run.
    self.bq_client.delete_dataset(self.table.dataset)
    self.bq_client.delete_dataset(self.table_eu.dataset)
    self.bq_client.make_dataset(self.table.dataset, body={})
    self.bq_client.make_dataset(self.table_eu.dataset, body={})
def setUp(self):
    """Create a GCS client and guarantee a fresh, empty test bucket.

    Bucket creation is attempted at most once per process (tracked by the
    module-level ATTEMPTED_BUCKET_CREATE flag); a 409 response from the
    API means the bucket already exists and is not an error.
    """
    self.client = gcs.GCSClient(CREDENTIALS)

    global ATTEMPTED_BUCKET_CREATE
    if not ATTEMPTED_BUCKET_CREATE:
        try:
            self.client.client.buckets().insert(
                project=PROJECT_ID,
                body={'name': BUCKET_NAME},
            ).execute()
        except googleapiclient.errors.HttpError as http_err:
            if http_err.resp.status != 409:  # 409: bucket already exists
                raise
        ATTEMPTED_BUCKET_CREATE = True

    # Wipe and recreate the bucket root so each test starts empty.
    self.client.remove(bucket_url(''), recursive=True)
    self.client.mkdir(bucket_url(''))
def __init__(self, path, client=None):
    """Create an atomic GCS file wrapper for *path*.

    When *client* is not supplied, fall back to the process-wide default
    client and build a GCSClient from its OAuth credentials.
    """
    if not client:
        client = get_default_client()
    self.gcs_client = gcs.GCSClient(client.oauth())
    super(AtomicGCSFile, self).__init__(path)
import logging
from datetime import datetime, timedelta

import google.auth
import luigi
from google.cloud import storage, bigquery
from luigi.contrib import gcs as luigi_gcs
from luigi.contrib import bigquery as luigi_bigquery
from luigi.contrib import external_program

# Google Cloud configuration.
PROJECT_ID = 'senpai-io'
BUCKET_NAME = 'senpai-io.appspot.com'
BUCKET_PATH = 'gs://{}'.format(BUCKET_NAME)
BUCKET_SUBDIR = 'quandl-stage'
CREDENTIALS, _ = google.auth.default()
GCS_CLIENT = luigi_gcs.GCSClient(CREDENTIALS)
GCS_BUCKET = storage.Client().get_bucket(BUCKET_NAME)
# BQ_CLIENT = luigi_bigquery.BigQueryClient(CREDENTIALS)

# Dates used to stamp daily artifacts; yesterday names the log file.
TODAY = datetime.today()
YESTERDAY = TODAY - timedelta(days=1)

# Logging: fix for the missing `logging` / `datetime` imports above -- the
# original block used both names without importing them.
# NOTE(review): the handler path assumes a 'logs/' directory already exists;
# FileHandler will raise otherwise -- confirm the directory is provisioned.
logger = logging.getLogger('luigi-interface')
logger.setLevel(logging.INFO)
fh = logging.FileHandler(
    'logs/{date:%Y-%m-%d}-luigi.log'.format(date=YESTERDAY))
fh.setLevel(logging.INFO)
logger.addHandler(fh)
def run(self):
    """Upload the upstream target's file to this task's output GCS path."""
    gcs_client = gcs.GCSClient(oauth_credentials=self.credentials)
    source_path = self.input().path
    destination_path = self.output().path
    gcs_client.put(source_path, destination_path)