def create_schema(self, check_exists=False):
    super().create_schema(check_exists)
    if check_exists and self.does_table_exist():
        return
    if "init_query" in self.config:
        qstring = read_string("sql/{}.sql".format(self.config["init_query"]))
        qparams = self.get_query_params(self.date)
        qstring = qstring.format(**qparams)
        self.run_query(self.date, qstring)
    else:
        # Run an empty query (LIMIT 0) to create the schema.
        start_date = "1970-01-01"
        qstring = read_string("sql/{}.sql".format(self.config["query"]))
        qparams = self.get_query_params(start_date)
        qstring = qstring.format(**qparams)
        LIMIT_REGEX = r"(.*)(LIMIT\s+[0-9]+)(.*)"
        if re.match(LIMIT_REGEX, qstring, re.IGNORECASE):
            qstring = re.sub(
                LIMIT_REGEX, r"\1 LIMIT 0 \3", qstring, flags=re.IGNORECASE
            )
        else:
            qstring += " LIMIT 0"
        self.run_query(start_date, qstring)
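# Minimal standalone sketch of the LIMIT rewrite used above: if the rendered
# template already carries a numeric LIMIT it is replaced with LIMIT 0, so the
# query returns no rows but still materializes the destination schema. The
# sample query string is illustrative only.
import re

LIMIT_REGEX = r"(.*)(LIMIT\s+[0-9]+)(.*)"
sample = "SELECT * FROM `project.dataset.events` LIMIT 1000"
if re.match(LIMIT_REGEX, sample, re.IGNORECASE):
    sample = re.sub(LIMIT_REGEX, r"\1 LIMIT 0 \3", sample, flags=re.IGNORECASE)
else:
    sample += " LIMIT 0"
# sample now ends with "LIMIT 0" instead of "LIMIT 1000"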
def build_query(config: Dict[str, Any], start_date: str, end_date: str) -> str:
    """Build query based on configs and args.

    :rtype: str
    :param config: the config of the query
    :param start_date: the start date string for the query
    :param end_date: the end date string for the query
    :return: the composed query string
    """
    query = ""
    if "udf" in config:
        for udf in config["udf"]:
            query += read_string("udf/{}.sql".format(udf))
    if "udf_js" in config:
        for udf_js in config["udf_js"]:
            query += read_string("udf_js/{}.sql".format(udf_js))
    if "query" in config:
        query += read_string("sql/{}.sql".format(config["query"])).format(
            project=config["project"],
            dataset=config["dataset"],
            table=config["table"],
            start_date=start_date,
            end_date=end_date,
        )
    return query
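# A usage sketch for build_query. The config keys mirror what the function
# reads; the values and file names are hypothetical and assume that
# sql/daily_events.sql and udf/normalize_channel.sql exist relative to the
# working directory.
example_config = {
    "project": "my-project",       # hypothetical GCP project id
    "dataset": "analytics",        # hypothetical dataset
    "table": "daily_events",       # hypothetical destination table
    "query": "daily_events",       # -> sql/daily_events.sql
    "udf": ["normalize_channel"],  # -> udf/normalize_channel.sql
}
# query_string = build_query(example_config, "2021-01-01", "2021-01-02")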
def daily_cleanup(self, d):
    if self.is_write_append():
        if "cleanup_query" in self.config:
            qstring = read_string(
                "sql/{}.sql".format(self.config["cleanup_query"]))
            qparams = self.get_query_params(d)
            query_job = self.client.query(qstring.format(**qparams))
            query_job.result()
            log.info("Done custom cleaning up.")
        elif "execution_date_field" in self.config["params"]:
            qstring = read_string("sql/cleanup_generic.sql")
            qparams = self.get_query_params(d)
            query_job = self.client.query(qstring.format(**qparams))
            query_job.result()
            log.info("Done generic cleaning up.")
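# Illustrative shape of a cleanup template such as sql/cleanup_generic.sql
# (the placeholder and column names below are assumptions, not taken from the
# real file): it deletes rows already written for the execution date so a
# WRITE_APPEND re-run does not duplicate them.
EXAMPLE_CLEANUP_TEMPLATE = """
DELETE FROM `{project}.{dataset}.{dest}`
WHERE {execution_date_field} = DATE('{start_date}')
"""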
def run_query(self, date, qstring=None):
    if qstring is None:
        qstring = read_string("sql/{}.sql".format(self.config["query"]))
    qparams = self.get_query_params(date)
    qstring = qstring.format(**qparams)
    table_ref = self.client.dataset(self.config["params"]["dataset"]).table(
        self.config["params"]["dest"])
    job_config = bigquery.QueryJobConfig()
    job_config.write_disposition = (
        bigquery.WriteDisposition.WRITE_APPEND
        if self.is_write_append()
        else bigquery.WriteDisposition.WRITE_TRUNCATE)
    if ("allow_field_addition" in self.config
            and self.config["allow_field_addition"]):
        job_config.schema_update_options = [
            bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION
        ]
    job_config.destination = table_ref
    if "partition_field" in self.config:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=self.config["partition_field"],
        )
    query = self.client.query(qstring, job_config=job_config)
    query.result()
    if "create_view_alt" in self.config and self.config["create_view_alt"]:
        self.create_view("_view")
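# Standalone sketch of the job configuration run_query assembles, using only
# google-cloud-bigquery types; the project, dataset, table, and partition
# field names are placeholders.
from google.cloud import bigquery

dataset_ref = bigquery.DatasetReference("my-project", "analytics")
job_config = bigquery.QueryJobConfig(
    destination=bigquery.TableReference(dataset_ref, "daily_events_v1"),
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    time_partitioning=bigquery.TimePartitioning(
        type_=bigquery.TimePartitioningType.DAY, field="submission_date"
    ),
)
# bigquery.Client().query("SELECT ...", job_config=job_config).result()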
def test_read_string():
    STR = "test1"
    fname = _create_temp_file()
    with open(fname, "w") as f:
        f.write(STR)
    data = read_string(fname)
    assert data == STR
    os.remove(fname)
def create_schema(self, check_exists=False):
    udfs = []
    if "udf" in self.config:
        udfs += [("udf_%s" % x, read_string("udf/{}.sql".format(x)))
                 for x in self.config["udf"]]
    if "udf_js" in self.config:
        udfs += [("udf_js_%s" % x, read_string("udf_js/{}.sql".format(x)))
                 for x in self.config["udf_js"]]
    for udf, qstring in udfs:
        if check_exists and self.does_routine_exist(udf):
            continue
        qstring = qstring % (
            self.config["params"]["project"],
            self.config["params"]["dataset"],
        )
        # Initiate the API request to create the routine and wait for it
        # to complete.
        query_job = self.client.query(qstring)
        query_job.result()
        log.info("Created routine {}".format(query_job.ddl_target_routine))
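# Illustrative shape of a udf/*.sql template consumed above (the function body
# is made up): the two "%s" placeholders are filled with the project and
# dataset, yielding a fully qualified CREATE FUNCTION statement.
EXAMPLE_UDF_TEMPLATE = """
CREATE OR REPLACE FUNCTION `%s.%s.udf_example`(x INT64)
RETURNS INT64 AS (x + 1);
"""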
def create_view(self, postfix=None):
    qstring = read_string("sql/{}.sql".format(self.config["query"]))
    shared_dataset_ref = self.client.dataset(self.config["params"]["dataset"])
    view_ref = shared_dataset_ref.table(
        self.config["params"]["dest"] + (postfix if postfix else ""))
    view = bigquery.Table(view_ref)
    qparams = self.get_query_params(self.date)
    view.view_query = qstring.format(**qparams)
    if self.does_table_exist(postfix):
        view = self.client.update_table(view, ["view_query"])  # API request
    else:
        view = self.client.create_table(view)  # API request
    log.info("Successfully created view at {}".format(view.full_table_id))
def create_schema(self, check_exists=False):
    super().create_schema(check_exists)
    if check_exists and self.does_table_exist():
        return
    qstring = read_string("sql/{}.sql".format(self.config["query"]))
    shared_dataset_ref = self.client.dataset(self.config["params"]["dataset"])
    view_ref = shared_dataset_ref.table(self.config["params"]["dest"])
    view = bigquery.Table(view_ref)
    qparams = self.get_query_params(self.date)
    view.view_query = qstring.format(**qparams)
    if self.does_table_exist():
        view = self.client.update_table(view, ["view_query"])  # API request
    else:
        view = self.client.create_table(view)  # API request
    log.info("Successfully created view at {}".format(view.full_table_id))
def run_query(self, date, qstring=None):
    if qstring is None:
        qstring = read_string("sql/{}.sql".format(self.config["query"]))
    qparams = self.get_query_params(date)
    qstring = qstring.format(**qparams)
    table_ref = self.client.dataset(self.config["params"]["dataset"]).table(
        self.config["params"]["dest"]
    )
    job_config = bigquery.QueryJobConfig()
    job_config.write_disposition = (
        bigquery.WriteDisposition.WRITE_APPEND
        if self.is_write_append()
        else bigquery.WriteDisposition.WRITE_TRUNCATE
    )
    job_config.destination = table_ref
    if "partition_field" in self.config:
        job_config.time_partitioning = bigquery.TimePartitioning(
            type_=bigquery.TimePartitioningType.DAY,
            field=self.config["partition_field"],
        )
    query = self.client.query(qstring, job_config=job_config)
    query.result()
def extract_via_fs(
    self,
    source: str,
    config: Dict[str, Any],
    stage: str = "raw",
    date: datetime.datetime = None,
) -> DataFrame:
    """Extract data from file system and convert into DataFrame.

    The logic is based on task config, see `configs/*.py`.

    :rtype: DataFrame
    :param source: name of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param config: config of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param stage: the stage of the loaded data, could be raw/staging/production
    :param date: the date part of the data file name,
        will use `self.current_date` if not specified
    :return: the extracted DataFrame
    """
    # Resolve the (possibly paged) raw file paths for this source.
    if "paths" in config:
        fpaths = config["paths"]
    elif stage == "raw":
        fpaths = self.get_filepaths(source, config, stage, "fs", date)
    else:
        fpaths = [self.get_filepath(source, config, stage, "fs", date)]
    if "iterator" in config:
        # One DataFrame per iterator, keyed by file extension.
        extracted = dict()
        for fpath in fpaths:
            raw = read_string(fpath)
            it = get_file_ext(fpath)
            self.raw[it] = raw
            extracted[it] = convert_df(raw, config)
        log.info("%s-%s-%s/%s x %d iterators extracted from file system" % (
            stage,
            self.task,
            source,
            (self.current_date if date is None else date).date(),
            len(fpaths),
        ))
    else:
        # Concatenate all pages into a single DataFrame.
        extracted = None
        for fpath in fpaths:
            raw = read_string(fpath)
            if extracted is None:
                self.raw[source] = [raw]
                extracted = convert_df(raw, config)
            else:
                self.raw[source] += [raw]
                extracted = extracted.append(convert_df(raw, config))
        extracted = extracted.reset_index(drop=True)
        log.info("%s-%s-%s/%s x %d pages extracted from file system" % (
            stage,
            self.task,
            source,
            (self.current_date if date is None else date).date(),
            len(fpaths),
        ))
    return extracted
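# Hypothetical source config consumed by extract_via_fs; only the keys the
# function itself reads are shown, and the values are illustrative.
example_source_config = {
    # Explicit file list; when omitted, get_filepaths()/get_filepath() is used.
    "paths": ["raw/example_source/2021-01-01.json"],
    # Present when each file maps to its own iterator keyed by file extension.
    # "iterator": [...],
}
# df = self.extract_via_fs("example_source", example_source_config)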