def __init__(self, bucket_name, path, config=None):
    """Target for an object stored inside a bucket.

    :param bucket_name: name of the bucket holding the object.
    :param path: object path within the bucket; a single leading ``/``
        is stripped so the stored path is always bucket-relative.
    :param config: optional configuration object; any falsy value
        falls back to ``get_config()``.
    """
    self.bucket_name = bucket_name
    # str.startswith is safe for an empty path, unlike the original
    # path[0] indexing which raised IndexError when path == ''.
    if path.startswith('/'):
        self.path = path[1:]
    else:
        self.path = path
    self.config = config or get_config()
def __init__(self, dataset_id, table_id, schema=None, empty=False,
             config=None, append=False):
    """Target describing a BigQuery table.

    :param dataset_id: id of the dataset containing the table.
    :param table_id: id of the table.
    :param schema: optional table schema; falsy values become ``[]``.
    :param empty: whether the table is expected to be empty.
    :param config: optional configuration object; any falsy value
        falls back to ``get_config()``.
    :param append: whether writes should append to existing data.
    """
    self.dataset_id = dataset_id
    self.table_id = table_id
    self.schema = schema if schema else []
    self.empty = empty
    self.config = config if config else get_config()
    self.append = append
class Query(luigi.Task):
    """Base task that renders and runs a BigQuery SQL query.

    Subclasses either override :meth:`query` to return SQL directly, or
    set :attr:`source` to the name of a Jinja2 template resolved
    relative to the subclass's module.
    """

    config = get_config()
    # When True, run() prints the result as a pandas DataFrame.
    debug = False
    # Seconds to wait for job completion; None/0 disables the timeout.
    timeout = 3600
    # Optional Jinja2 template name providing the query text.
    source = None
    # Extra variables passed into the template context.
    # NOTE(review): class-level mutable dict is shared across instances;
    # subclasses should override it rather than mutate it in place.
    variables = {}

    def query(self):
        """Return the SQL to execute; override unless ``source`` is set."""
        # Fix: the original did `return NotImplemented()` -- the
        # NotImplemented constant is not callable, so calling it raised
        # TypeError. Raise the intended NotImplementedError instead.
        raise NotImplementedError

    def load_query(self, source):
        """Render the Jinja2 template ``source`` with the task variables."""
        env = jinja2.Environment(
            loader=jinja2.PackageLoader(self.__module__, '.'))
        template = env.get_template(source)
        return template.render(task=self, **self.variables)

    def run_query(self, query):
        """Submit ``query``, poll until completion, return a ResultProxy.

        :param query: SQL text to execute.
        :raises QueryTimeout: if the job does not complete within
            ``self.timeout`` seconds.
        """
        client = self.config.get_client()
        logger.info("%s: query: %s", self, query)
        job_id, _ = client.query(query)
        logger.info("%s: bigquery.job.id: %s", self, job_id)
        complete, result_size = client.check_job(job_id)
        # Compute the absolute deadline once, before polling; avoids
        # shadowing self.timeout with a local of the same name.
        deadline = time.time() + self.timeout if self.timeout else None
        # The original wrapped this loop in `try: ... except: raise`,
        # which is a behavior-free bare re-raise; removed.
        while not complete:
            if deadline and time.time() > deadline:
                raise QueryTimeout('{0} timed out'.format(self))
            time.sleep(5)
            complete, result_size = client.check_job(job_id)
        logger.info("%s: bigquery.job.result: job_id=%s result_size=%d",
                    self, job_id, result_size)
        return ResultProxy(Job(client, job_id))

    def run(self):
        """Build the query, run it, and record the result state."""
        query = self.load_query(self.source) if self.source else self.query()
        result = self.run_query(query)
        target = self.output()
        if target and isinstance(target, ResultTarget):
            target.save_result_state(result)
        if self.debug:
            import pandas as pd
            TERMINAL_WIDTH = 120
            pd.options.display.width = TERMINAL_WIDTH
            six.print_('-' * TERMINAL_WIDTH)
            six.print_('Query result:')
            six.print_(result.to_dataframe())
            six.print_('-' * TERMINAL_WIDTH)
class TableTask(luigi.Task):
    """Task that creates a BigQuery table in an existing dataset."""

    config = get_config()
    dataset_id = luigi.Parameter()
    table_id = luigi.Parameter()
    schema = luigi.Parameter(default=[], significant=False)
    empty = luigi.BooleanParameter(default=False, significant=False)

    def requires(self):
        # The containing dataset must exist before the table is created.
        return DatasetTask(self.dataset_id)

    def output(self):
        return TableTarget(self.dataset_id, self.table_id, empty=self.empty)

    def run(self):
        """Create the table via the configured BigQuery client."""
        client = self.config.get_client()
        # Fix: was `self.datasset_id` (typo), which raised
        # AttributeError whenever this task actually ran.
        logger.info('%s: creating table: %s.%s', self,
                    self.dataset_id, self.table_id)
        client.create_table(self.dataset_id, self.table_id, self.schema)
class DatasetTask(luigi.Task):
    """Task that creates a BigQuery dataset and waits until it exists."""

    config = get_config()
    dataset_id = luigi.Parameter()

    def output(self):
        return DatasetTarget(self.dataset_id)

    def run(self):
        """Create the dataset, then poll until it becomes visible.

        Dataset creation is checked at 5-second intervals; after
        ``max_retry + 1`` failed checks the task gives up and raises.
        """
        client = self.config.get_client()
        logger.info('%s: creating dataset: %s', self, self.dataset_id)
        client.create_dataset(self.dataset_id)
        max_retry = 30
        for _attempt in range(max_retry + 1):
            time.sleep(5.0)
            if client.check_dataset(self.dataset_id):
                return
        msg = "DatasetTask(dataset_id={0}) max retry error.".format(
            self.dataset_id)
        logger.error(msg)
        raise Exception(msg)
def __init__(self, bucket_name, config=None):
    """Target for a storage bucket.

    :param bucket_name: name of the bucket.
    :param config: optional configuration object; any falsy value
        falls back to ``get_config()``.
    """
    self.bucket_name = bucket_name
    self.config = config if config else get_config()
def __init__(self, path, config=None):
    """Target identified by a path.

    :param path: path identifying the target; stored as given.
    :param config: optional configuration object; any falsy value
        falls back to ``get_config()``.
    """
    self.path = path
    self.config = config if config else get_config()
def __init__(self, dataset_id, table_id, empty=False, config=None,
             append=False):
    """Target for a BigQuery table.

    :param dataset_id: id of the dataset containing the table.
    :param table_id: id of the table.
    :param empty: whether the table is expected to be empty.
    :param config: optional configuration object; any falsy value
        falls back to ``get_config()``.
    :param append: whether writes should append to existing data.
    """
    self.dataset_id = dataset_id
    self.table_id = table_id
    self.empty = empty
    self.config = config if config else get_config()
    self.append = append
def __init__(self, dataset_id, config=None):
    """Target for a BigQuery dataset.

    :param dataset_id: id of the dataset.
    :param config: optional configuration object; any falsy value
        falls back to ``get_config()``.
    """
    self.dataset_id = dataset_id
    self.config = config if config else get_config()