from pyspark.sql.functions import udf, array
from pyspark.sql.types import StringType, IntegerType
from pyspark.sql.functions import col
from yaetos.etl_utils import ETL_Base, Commandliner  # fix: these names were used below but never imported


class Job(ETL_Base):
    """Clean wiki event data: parse compact timestamps and drop bad rows."""

    def transform(self, some_events):
        """Return `some_events` with a parsed 'timestamp_obj' column added.

        Rows whose raw 'timestamp' contains "2.016" are dropped
        (presumably malformed values — TODO confirm against source data).
        """
        udf_format_datetime = udf(self.format_datetime, StringType())
        events_cleaned = some_events \
            .withColumn('timestamp_obj', udf_format_datetime(some_events.timestamp).cast("timestamp")) \
            .where(~col('timestamp').like("%2.016%"))  # idiomatic negation instead of '== False'
        return events_cleaned

    @staticmethod
    def format_datetime(wiki_dt):
        """Convert a compact 'YYYYMMDDhhmmss' string to 'YYYY-MM-DD hh:mm:ss'."""
        dt = {}
        dt['year'] = wiki_dt[:4]
        dt['month'] = wiki_dt[4:6]
        dt['day'] = wiki_dt[6:8]
        dt['hour'] = wiki_dt[8:10]
        dt['minute'] = wiki_dt[10:12]
        dt['sec'] = wiki_dt[12:14]
        return '{year}-{month}-{day} {hour}:{minute}:{sec}'.format(**dt)


if __name__ == "__main__":
    args = {'job_param_file': 'conf/jobs_metadata.yml'}
    Commandliner(Job, **args)
from yaetos.etl_utils import Commandliner

# Entry point stub: no Job class is supplied; Commandliner resolves the job
# through the generic launcher script instead.
launch_kwargs = {'Job': None, 'launcher_file': 'jobs/generic/launcher.py'}
Commandliner(**launch_kwargs)
from yaetos.etl_utils import Commandliner

# Entry point stub: no Job class here; Commandliner determines the job
# from command-line arguments.
job_class = None
Commandliner(Job=job_class)
class Job(ETL_Base):
    """To run/deploy sql jobs, using --sql_file arg."""

    def set_jargs(self, pre_jargs, loaded_inputs={}):
        """Build job args from the --sql_file command-line argument.

        Called only when running the job directly, i.e.
        "python yaetos/sql_job.py --sql_file=jobs/some_job.sql"; ignored when
        running from "python jobs/generic/launcher.py --job_name=some_job.sql".
        """
        # NOTE(review): mutable default kept for interface stability; it is only
        # passed through here — confirm Job_Args_Parser does not mutate it.
        sql_file = pre_jargs['cmd_args']['sql_file']
        job_name = Job_Yml_Parser.set_job_name_from_file(sql_file)
        pre_jargs['job_args']['job_name'] = job_name
        return Job_Args_Parser(
            defaults_args=pre_jargs['defaults_args'],
            yml_args=None,
            job_args=pre_jargs['job_args'],
            cmd_args=pre_jargs['cmd_args'],
            loaded_inputs=loaded_inputs)

    def transform(self, **ignored):
        """Run the configured SQL file and optionally repartition the result."""
        sql = self.read_sql_file(self.jargs.sql_file)
        df = self.query(sql)
        if self.jargs.merged_args.get('repartition'):
            df = df.repartition(self.jargs.merged_args['repartition'])
        return df

    @staticmethod
    def read_sql_file(fname):
        """Return the full text of the SQL file at `fname`.

        Fix: use a context manager so the handle is closed even if read()
        raises (the original open/read/close leaked it on error).
        """
        with open(fname, 'r') as fh:
            return fh.read()


if __name__ == "__main__":
    Commandliner(Job)
def define_commandline_args():
    """Extend Commandliner's base CLI parser with the --sql_file option."""
    cli_parser = Commandliner.define_commandline_args()
    cli_parser.add_argument("-q", "--sql_file", help="path of sql file to run")
    return cli_parser