def __init__(
    self,
    args: Namespace,
    sources: Dict[str, Any],
    schema: List[Tuple[str, np.generic]],
    destinations: Dict[str, Any],
    stage: str,
    task: str,
):
    """Initialize parameters and client libraries for the ETL task.

    :param args: args passed from command line, see `get_arg_parser()`
    :param sources: data sources to be extracted, specified in task config,
        see `configs/*.py`
    :param schema: the target schema to load to.
    :param destinations: destinations to load data to, specified in task config,
        see `configs/*.py`
    :param stage: the stage of the loaded data, could be staging/production.
    :param task: the name of the task.
    """
    # Clear cached files
    if args.rm:
        for source in sources:
            files = []
            files += glob.glob(
                get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage="raw",
                    task=args.task,
                    source=source,
                ))
            files += glob.glob(
                get_path_format(True).format(
                    prefix=destinations["fs"]["prefix"],
                    stage=stage,
                    task=args.task,
                    source=source,
                ))
            for f in files:
                log.info("Removing cached file: %s" % f)
                os.remove(f)
    self.task = task
    self.stage = stage
    self.args = args
    self.period = args.period
    self.current_date = args.date
    self.last_month = lookback_dates(args.date, args.period)
    self.sources = sources
    coltypes = []
    for coltype in schema:
        coltypes += [Column(coltype[0], [IsDtypeValidation(coltype[1])])]
    self.schema = Schema(coltypes)
    self.raw_schema = schema
    self.destinations = destinations
    self.raw = dict()
    self.extracted_base = dict()
    self.extracted = dict()
    self.transformed = dict()
    self.gcs = storage.Client()
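# A minimal sketch of the `schema` argument consumed above; the column names
# and dtypes are assumed for illustration only (real schemas live in the task
# configs, see `configs/*.py`). Each (name, dtype) pair becomes a `Column`
# with an `IsDtypeValidation` in `__init__`.
import numpy as np

EXAMPLE_SCHEMA = [
    ("country", np.object_),
    ("installs", np.int64),
    ("revenue", np.float64),
]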
def extract(self):
    """Inherit from the super class and extract the latest `fb_index` for later use."""
    super().extract()
    source = "bukalapak"
    if not self.args.source or source in self.args.source.split(","):
        config = self.sources[source]
        yesterday = lookback_dates(self.current_date, 1)
        if self.args.dest != "fs":
            self.extracted[source + "_base"] = self.extract_via_gcs(
                source, config, "raw", yesterday)
        else:
            self.extracted[source + "_base"] = self.extract_via_fs(
                source, config, "raw", yesterday)
def get_backfill_dates(self):
    """Compute the dates to backfill based on `backfill_days` in config.

    :return: a list of date strings in `DEFAULT_DATE_FORMAT`,
        or None if no backfill is configured.
    """
    if "backfill_days" in self.config:
        bf_dates = []
        for bf_day in self.config["backfill_days"]:
            bf_dates += [
                lookback_dates(
                    datetime.datetime.strptime(
                        self.date, utils.config.DEFAULT_DATE_FORMAT),
                    bf_day,
                ).strftime(utils.config.DEFAULT_DATE_FORMAT)
            ]
        return bf_dates
    return None
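# A hypothetical task config fragment showing the `backfill_days` key read by
# `get_backfill_dates` above; the values are assumed for illustration. Each
# entry is a number of days to look back from `self.date`, and the resulting
# dates are returned as strings in `DEFAULT_DATE_FORMAT`.
EXAMPLE_BACKFILL_CONFIG = {
    "backfill_days": [1, 7, 28],
}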
def extract_via_api(
    self,
    source: str,
    config: Dict[str, Any],
    stage: str = "raw",
    date: datetime.datetime = None,
) -> Union[DataFrame, Dict[str, DataFrame]]:
    """Extract data from an API and convert it into DataFrame(s).

    The paging/iteration logic is based on task config, see `configs/*.py`

    :param source: name of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param config: config of the data source to be extracted,
        specified in task config, see `configs/*.py`
    :param stage: the stage of the loaded data, could be raw/staging/production.
    :param date: the date part of the data file name,
        will use `self.current_date` if not specified
    :return: the extracted `DataFrame`, or a dict of `DataFrame`s keyed by
        iterator when the config specifies an `iterator`
    """
    # API paging
    start_date = (self.last_month.strftime(config["date_format"])
                  if date is None else lookback_dates(date, self.period))
    end_date = (self.current_date.strftime(config["date_format"])
                if date is None else date)
    request_interval = config.get("request_interval", 1)
    if "iterator" in config:
        raw = dict()
        extracted = dict()
        for it in config["iterator"]:
            log.debug("waiting for %s iterator %s" % (source, it))
            time.sleep(request_interval)
            it = str(it)
            url = config["url"].format(
                api_key=config["api_key"],
                start_date=start_date,
                end_date=end_date,
                iterator=it,
            )
            r = requests.get(url, allow_redirects=True)
            raw[it] = r.text
            extracted[it] = convert_df(raw[it], config)
        self.raw[source] = raw
        log.info("%s-%s-%s/%s x %d iterators extracted from API" %
                 (stage, self.task, source, self.current_date.date(),
                  len(extracted)))
        return extracted
    elif "page_size" in config:
        limit = config["page_size"]
        url = config["url"].format(
            api_key=config["api_key"],
            start_date=start_date,
            end_date=end_date,
            page=1,
            limit=limit,
        )
        r = requests.get(url, allow_redirects=True)
        raw = [r.text]
        extracted = convert_df(raw[0], config)
        count = json_extract(raw[0], config["json_path_page_count"])
        if count is None or int(count) <= 1:
            self.raw[source] = raw
            log.info("%s-%s-%s/%s x 1 page extracted from API" %
                     (stage, self.task, source, self.current_date.date()))
            return extracted
        count = int(count)
        # Pages are 1-based and page 1 is already fetched above,
        # so fetch pages 2..count inclusive.
        for page in range(2, count + 1):
            log.debug("waiting for %s page %d" % (source, page))
            time.sleep(request_interval)
            url = config["url"].format(
                api_key=config["api_key"],
                start_date=start_date,
                end_date=end_date,
                page=page,
                limit=limit,
            )
            r = requests.get(url, allow_redirects=True)
            raw += [r.text]
            extracted = extracted.append(convert_df(raw[page - 1], config))
        extracted = extracted.reset_index(drop=True)
        self.raw[source] = raw
        log.info(
            "%s-%s-%s/%s x %d pages extracted from API" %
            (stage, self.task, source, self.current_date.date(), count))
        return extracted
    else:
        url = config["url"].format(api_key=config["api_key"],
                                   start_date=start_date,
                                   end_date=end_date)
        r = requests.get(url, allow_redirects=True)
        raw = r.text
        self.raw[source] = raw
        log.info("%s-%s-%s/%s extracted from API" %
                 (stage, self.task, source, self.current_date.date()))
        return convert_df(raw, config)
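# Hypothetical source configs illustrating the three request modes handled by
# `extract_via_api` above. The keys (`url`, `api_key`, `date_format`,
# `iterator`, `page_size`, `json_path_page_count`, `request_interval`) come
# from the code; the URLs, values, and the JSON path syntax are assumptions
# for illustration only.
EXAMPLE_API_SOURCES = {
    # one request per iterator value, returning a dict of DataFrames
    "iterated_source": {
        "url": ("https://api.example.com/report?key={api_key}"
                "&from={start_date}&to={end_date}&country={iterator}"),
        "api_key": "<API_KEY>",
        "date_format": "%Y-%m-%d",
        "iterator": ["id", "my", "sg"],
        "request_interval": 1,
    },
    # paged requests, concatenated into a single DataFrame
    "paged_source": {
        "url": ("https://api.example.com/report?key={api_key}"
                "&from={start_date}&to={end_date}&page={page}&limit={limit}"),
        "api_key": "<API_KEY>",
        "date_format": "%Y-%m-%d",
        "page_size": 100,
        "json_path_page_count": "meta.page_count",
    },
    # single request, single DataFrame
    "simple_source": {
        "url": ("https://api.example.com/report?key={api_key}"
                "&from={start_date}&to={end_date}"),
        "api_key": "<API_KEY>",
        "date_format": "%Y-%m-%d",
    },
}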
def get_latest_date(self):
    """Return the latest date for which data is assumed to be available."""
    # assuming the latest date passed is one day behind
    lookback_period = (
        1 if "days_behind" not in self.config else self.config["days_behind"] + 1
    )
    return lookback_dates(datetime.datetime.utcnow(), lookback_period).date()
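# Hypothetical config fragment for `get_latest_date` above; the value is
# assumed. With `days_behind` set to 2, the latest available date is treated
# as three days (2 + 1) before the current UTC date.
EXAMPLE_LATENCY_CONFIG = {"days_behind": 2}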
"""Adjust ETL task.""" import datetime from argparse import Namespace from typing import Dict, Any, List, Tuple from tasks import base import numpy as np from utils.config import get_configs, get_arg_parser import logging from utils.marshalling import lookback_dates log = logging.getLogger(__name__) DEFAULTS = {"date": lookback_dates(datetime.datetime.utcnow(), 1)} class AdjustEtlTask(base.EtlTask): """ETL task to compute Adjust from events.""" def __init__( self, args: Namespace, sources: Dict[str, Any], schema: List[Tuple[str, np.generic]], destinations: Dict[str, Any], ): """Initialize Adjust ETL task. :param args: args passed from command line, see `get_arg_parser()` :param sources: data source to be extracted, specified in task config, see `configs/*.py`