예제 #1
0
 def __init__(self, bucket_name, path, config=None):
     """Store the bucket name, the normalized path and the configuration.

     Args:
         bucket_name: Stored unchanged as ``self.bucket_name``.
         path: Stored as ``self.path`` with a single leading ``/`` removed
             when present. An empty string is accepted and stored as-is.
         config: Stored as ``self.config``; when falsy, the value returned
             by ``get_config()`` is used instead.
     """
     self.bucket_name = bucket_name
     # startswith() rather than path[0] so an empty path does not raise
     # IndexError; only one leading slash is stripped, as before.
     self.path = path[1:] if path.startswith('/') else path
     self.config = config or get_config()
예제 #2
0
 def __init__(self, dataset_id, table_id, schema=None, empty=False, config=None, append=False):
     """Record the table coordinates and options on the instance.

     Args:
         dataset_id: Stored as ``self.dataset_id``.
         table_id: Stored as ``self.table_id``.
         schema: Stored as ``self.schema``; a falsy value becomes a new
             empty list.
         empty: Stored as ``self.empty``.
         config: Stored as ``self.config``; a falsy value is replaced by
             the result of ``get_config()``.
         append: Stored as ``self.append``.
     """
     # Resolve the two optional arguments that have fallbacks first.
     if not schema:
         schema = []
     if not config:
         config = get_config()

     self.dataset_id, self.table_id = dataset_id, table_id
     self.schema = schema
     self.empty = empty
     self.config = config
     self.append = append
예제 #3
0
class Query(luigi.Task):
    """Luigi task that submits a query and polls until the job completes.

    The query text comes either from a Jinja2 template named by ``source``
    or, when ``source`` is falsy, from ``query()``, which subclasses are
    expected to override.
    """

    config = get_config()
    # When true, run() prints the result as a pandas DataFrame.
    debug = False
    # Seconds to wait for the job before raising QueryTimeout; a falsy
    # value disables the timeout.
    timeout = 3600
    # Template name handed to load_query(); falsy means "use query()".
    source = None
    # Extra keyword arguments passed to the template when rendering.
    # NOTE(review): this dict is a class attribute, so it is shared by every
    # subclass that does not rebind it; mutate with care.
    variables = {}

    def query(self):
        """Return the query text to run when ``source`` is not set.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # The original returned ``NotImplemented()``, which fails with a
        # confusing TypeError because NotImplemented is not callable.
        raise NotImplementedError(
            '{0} must set `source` or override query()'.format(
                type(self).__name__))

    def load_query(self, source):
        """Render the Jinja2 template ``source`` and return the text.

        The template is looked up with a ``PackageLoader`` rooted at this
        task's module, and is rendered with ``task=self`` plus every entry
        of ``self.variables`` as keyword arguments.
        """
        env = jinja2.Environment(
            loader=jinja2.PackageLoader(self.__module__, '.'))
        template = env.get_template(source)
        return template.render(task=self, **self.variables)

    def run_query(self, query):
        """Submit ``query``, wait for the job, and return a ``ResultProxy``.

        The job is polled with ``client.check_job`` every 5 seconds until
        it reports completion.

        Raises:
            QueryTimeout: if ``self.timeout`` is truthy and that many
                seconds elapse before the job completes.
        """
        client = self.config.get_client()

        logger.info("%s: query: %s", self, query)
        job_id, _ = client.query(query)
        logger.info("%s: bigquery.job.id: %s", self, job_id)

        complete, result_size = client.check_job(job_id)

        # The deadline is measured from after the first status check.
        deadline = time.time() + self.timeout if self.timeout else None

        while not complete:
            if deadline and time.time() > deadline:
                raise QueryTimeout('{0} timed out'.format(self))
            time.sleep(5)
            complete, result_size = client.check_job(job_id)

        logger.info("%s: bigquery.job.result: job_id=%s result_size=%d", self,
                    job_id, result_size)

        return ResultProxy(Job(client, job_id))

    def run(self):
        """Build the query, execute it, and record the result on the output.

        When ``self.output()`` is a ``ResultTarget`` the result state is
        saved on it. With ``debug`` enabled the result is also printed as a
        DataFrame.
        """
        query = self.load_query(self.source) if self.source else self.query()
        result = self.run_query(query)
        target = self.output()

        if target and isinstance(target, ResultTarget):
            target.save_result_state(result)

        if self.debug:
            # pandas is imported lazily so it is only needed in debug mode.
            import pandas as pd
            TERMINAL_WIDTH = 120
            pd.options.display.width = TERMINAL_WIDTH
            six.print_('-' * TERMINAL_WIDTH)
            six.print_('Query result:')
            six.print_(result.to_dataframe())
            six.print_('-' * TERMINAL_WIDTH)
예제 #4
0
class TableTask(luigi.Task):
    """Luigi task that creates a table, after its dataset task has run."""

    config = get_config()
    dataset_id = luigi.Parameter()
    table_id = luigi.Parameter()
    # Passed straight through to client.create_table().
    schema = luigi.Parameter(default=[], significant=False)
    # Forwarded to the TableTarget returned by output().
    empty = luigi.BooleanParameter(default=False, significant=False)

    def requires(self):
        """Depend on the ``DatasetTask`` for this table's dataset."""
        return DatasetTask(self.dataset_id)

    def output(self):
        """Return the ``TableTarget`` for ``dataset_id``/``table_id``."""
        return TableTarget(self.dataset_id, self.table_id, empty=self.empty)

    def run(self):
        """Create the table through the configured client."""
        client = self.config.get_client()
        # Fixed attribute typo (``datasset_id``) that raised AttributeError
        # here, before create_table() could be reached.
        logger.info('%s: creating table: %s.%s', self, self.dataset_id,
                    self.table_id)
        client.create_table(self.dataset_id, self.table_id, self.schema)
예제 #5
0
class DatasetTask(luigi.Task):
    """Luigi task that creates a dataset and waits until it is visible."""

    config = get_config()
    dataset_id = luigi.Parameter()

    def output(self):
        """Return the ``DatasetTarget`` for ``dataset_id``."""
        target = DatasetTarget(self.dataset_id)
        return target

    def run(self):
        """Create the dataset, then poll ``check_dataset`` until it succeeds.

        Each poll is preceded by a 5 second sleep. After 31 unsuccessful
        polls the failure is logged and a plain ``Exception`` is raised.
        """
        bq = self.config.get_client()
        logger.info('%s: creating dataset: %s', self, self.dataset_id)
        bq.create_dataset(self.dataset_id)

        max_retry = 30
        # max_retry + 1 attempts: the first check plus max_retry retries.
        for _attempt in range(max_retry + 1):
            time.sleep(5.0)
            if bq.check_dataset(self.dataset_id):
                break
        else:
            # Loop ran to completion without the dataset ever being found.
            msg = "DatasetTask(dataset_id={0}) max retry error.".format(
                self.dataset_id)
            logger.error(msg)
            raise Exception(msg)
예제 #6
0
 def __init__(self, bucket_name, config=None):
     """Store the bucket name and the configuration.

     Args:
         bucket_name: Stored as ``self.bucket_name``.
         config: Stored as ``self.config``; a falsy value is replaced by
             the result of ``get_config()``.
     """
     if not config:
         config = get_config()
     self.bucket_name = bucket_name
     self.config = config
예제 #7
0
 def __init__(self, path, config=None):
     """Store the path and the configuration.

     Args:
         path: Stored unchanged as ``self.path``.
         config: Stored as ``self.config``; a falsy value is replaced by
             the result of ``get_config()``.
     """
     if not config:
         config = get_config()
     self.path = path
     self.config = config
예제 #8
0
 def __init__(self, dataset_id, table_id, empty=False, config=None, append=False):
     """Record the table coordinates and flags on the instance.

     Args:
         dataset_id: Stored as ``self.dataset_id``.
         table_id: Stored as ``self.table_id``.
         empty: Stored as ``self.empty``.
         config: Stored as ``self.config``; a falsy value is replaced by
             the result of ``get_config()``.
         append: Stored as ``self.append``.
     """
     if not config:
         config = get_config()

     self.dataset_id, self.table_id = dataset_id, table_id
     self.empty = empty
     self.config = config
     self.append = append
예제 #9
0
 def __init__(self, dataset_id, config=None):
     """Store the dataset id and the configuration.

     Args:
         dataset_id: Stored as ``self.dataset_id``.
         config: Stored as ``self.config``; a falsy value is replaced by
             the result of ``get_config()``.
     """
     if not config:
         config = get_config()
     self.dataset_id = dataset_id
     self.config = config