def requires(self):
    """Declare the upstream Hive tables this task reads from.

    Yields a single tuple so the three external tables are treated as one
    dependency group; all are produced by other workflows, hence
    ExternalHiveTask.
    """
    source_tables = ('student_courseenrollment', 'auth_user', 'last_country_of_user')
    yield tuple(
        ExternalHiveTask(table=name, database=hive_database_name())
        for name in source_tables
    )
def query(self):
    """Build the Hive query counting distinct active usernames per course,
    activity category, and ISO week inside the task interval.

    The result is written into the partition described by self.partition.
    """
    start = self.interval.date_a.isoformat()
    end = self.interval.date_b.isoformat()
    return """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec})
        SELECT
            act.course_id as course_id,
            CONCAT(cal.iso_week_start, ' 00:00:00') as interval_start,
            CONCAT(cal.iso_week_end, ' 00:00:00') as interval_end,
            act.category as label,
            COUNT(DISTINCT username) as count
        FROM user_activity act
        JOIN calendar cal
            ON act.`date` = cal.`date`
            AND act.dt >= "{interval_start}" AND act.dt < "{interval_end}"
        WHERE "{interval_start}" <= cal.`date` AND cal.`date` < "{interval_end}"
        GROUP BY
            act.course_id,
            cal.iso_week_start,
            cal.iso_week_end,
            act.category;
    """.format(
        database_name=hive_database_name(),
        table=self.hive_table_task.table,
        partition=self.partition,
        interval_start=start,
        interval_end=end,
    )
def query(self):
    """Build the Hive statements that (re)create the external
    course-enrollment-by-country table and populate it.

    `count` sums currently-active enrollments; `cumulative_count` counts all
    enrollment rows regardless of is_active.
    """
    # `date` is backtick-quoted because it is a Hive reserved word.
    statement = textwrap.dedent("""
        USE {database_name};
        DROP TABLE IF EXISTS {table_name};
        CREATE EXTERNAL TABLE {table_name} (
            `date` STRING,
            course_id STRING,
            country_code STRING,
            count INT,
            cumulative_count INT
        )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        LOCATION '{location}';

        INSERT OVERWRITE TABLE {table_name}
        SELECT
            sce.dt,
            sce.course_id,
            uc.country_code,
            sum(if(sce.is_active, 1, 0)),
            count(sce.user_id)
        FROM student_courseenrollment sce
        LEFT OUTER JOIN last_country_of_user_id uc on sce.user_id = uc.user_id
        GROUP BY sce.dt, sce.course_id, uc.country_code;
    """).format(
        database_name=hive_database_name(),
        location=self.table_location,
        table_name=self.table,
    )
    log.debug('Executing hive query: %s', statement)
    return statement
def query(self):
    """Build the Hive statements that (re)create the external
    course_enrollment_location_current table and populate it with the
    current active-enrollment count per date, course, and country.

    Fixes:
    - The INSERT previously hard-coded `course_enrollment_location_current`
      instead of using the `{table_name}` placeholder that is already
      supplied via .format(); a future rename of the table would silently
      insert into the wrong table.
    - The `date` column is now backtick-quoted, consistent with the sibling
      table definitions in this file (`date` is a Hive reserved word).
    """
    query_format = textwrap.dedent("""
        USE {database_name};
        DROP TABLE IF EXISTS {table_name};
        CREATE EXTERNAL TABLE {table_name} (
            `date` STRING,
            course_id STRING,
            country_code STRING,
            count INT
        )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        LOCATION '{location}';

        INSERT OVERWRITE TABLE {table_name}
        SELECT
            sce.dt,
            sce.course_id,
            uc.country_code,
            count(sce.user_id)
        FROM student_courseenrollment sce
        LEFT OUTER JOIN auth_user au on sce.user_id = au.id
        LEFT OUTER JOIN last_country_of_user uc on au.username = uc.username
        WHERE sce.is_active > 0
        GROUP BY sce.dt, sce.course_id, uc.country_code;
    """)
    query = query_format.format(
        database_name=hive_database_name(),
        location=self.output().path,
        table_name='course_enrollment_location_current',
    )
    log.debug('Executing hive query: %s', query)
    return query
def query(self):
    """Build the Hive query counting distinct active user_ids per course,
    activity category, and ISO week inside the task interval.

    Same shape as the username-based variant, but reads the
    user_activity_by_user table keyed by user_id.
    """
    start = self.interval.date_a.isoformat()
    end = self.interval.date_b.isoformat()
    return """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec})
        SELECT
            act.course_id as course_id,
            CONCAT(cal.iso_week_start, ' 00:00:00') as interval_start,
            CONCAT(cal.iso_week_end, ' 00:00:00') as interval_end,
            act.category as label,
            COUNT(DISTINCT user_id) as count
        FROM user_activity_by_user act
        JOIN calendar cal
            ON act.`date` = cal.`date`
            AND act.dt >= "{interval_start}" AND act.dt < "{interval_end}"
        WHERE "{interval_start}" <= cal.`date` AND cal.`date` < "{interval_end}"
        GROUP BY
            act.course_id,
            cal.iso_week_start,
            cal.iso_week_end,
            act.category;
    """.format(
        database_name=hive_database_name(),
        table=self.hive_table_task.table,
        partition=self.partition,
        interval_start=start,
        interval_end=end,
    )
def query(self):  # pragma: no cover
    """Compose the full Hive INSERT for this partition by wrapping the
    subclass-provided insert_query in USE/INSERT INTO boilerplate.

    NOTE: dedent runs on the already-formatted text, so the indentation of
    the interpolated insert_query participates in the common-prefix
    computation — keep that ordering.
    """
    formatted = """
    USE {database_name};
    INSERT INTO TABLE {table} PARTITION ({partition.query_spec})
    {insert_query};
    """.format(
        database_name=hive_database_name(),
        table=self.partition_task.hive_table_task.table,
        partition=self.partition,
        insert_query=self.insert_query.strip(),  # pylint: disable=no-member
    )
    return textwrap.dedent(formatted)
def query(self):
    """Build the Hive query joining problem responses with course-block
    metadata to attach a display location and sort index to each response.

    Responses whose block was deleted fall back to '{deleted_blocks_path}'
    and sort_idx -1. IF NOT EXISTS is emitted unless self.overwrite is set.
    """
    pr_task = self.problem_response_partition.hive_table_task
    cb_task = self.course_blocks_partition.hive_table_task
    pr_partition_clause = "{}='{}'".format(
        pr_task.partition_by, self.problem_response_partition.partition_value)
    cb_partition_clause = "{}='{}'".format(
        cb_task.partition_by, self.course_blocks_partition.partition_value)
    query = textwrap.dedent("""
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec}) {if_not_exists}
        SELECT
            pr.course_id,
            pr.answer_id,
            pr.problem_id,
            pr.problem,
            pr.username,
            pr.question,
            pr.score,
            pr.max_score,
            pr.correct,
            pr.answer,
            pr.total_attempts,
            pr.first_attempt_date,
            pr.last_attempt_date,
            CONCAT(COALESCE(cb.course_path, '{deleted_blocks_path}'),
                   '{path_delimiter}',
                   pr.problem) as location,
            COALESCE(cb.sort_idx, -1) as sort_idx
        FROM {problem_response_table} pr
        LEFT OUTER JOIN {course_blocks_table} cb
            ON (cb.block_id=pr.problem_id and cb.{course_blocks_partition})
        WHERE pr.{problem_response_partition}
        ORDER BY pr.course_id, sort_idx, pr.first_attempt_date
    """.format(
        database_name=hive_database_name(),
        table=self.hive_table_task.table,
        partition=self.partition,
        path_delimiter=self.path_delimiter,
        deleted_blocks_path=self.deleted_blocks_path,
        if_not_exists='' if self.overwrite else 'IF NOT EXISTS',
        problem_response_table=pr_task.table,
        problem_response_partition=pr_partition_clause,
        course_blocks_table=cb_task.table,
        course_blocks_partition=cb_partition_clause,
    ))
    log.debug('query: %s', query)
    return query
def query(self):
    """Build the Hive statements that drop and recreate the snapshot table
    with exactly one partition for self.partition_date.

    TODO: Figure out how to clean up old data. This only cleans out old
    metastore info; the underlying table data is not removed.

    Exactly one partition is kept: snapshots are taken ad hoc rather than
    on a schedule, so historical partitions are dropped. Table snapshots
    should *not* be used for analyzing trends; use events or tables that
    retain history instead.
    """
    # Backtick-quote identifiers so reserved words are safe as names.
    column_spec = ','.join(
        '`{}` {}'.format(name, col_type) for name, col_type in self.columns
    )
    query = textwrap.dedent("""
        USE {database_name};
        DROP TABLE IF EXISTS `{table_name}`;
        CREATE EXTERNAL TABLE `{table_name}` (
            {col_spec}
        )
        PARTITIONED BY (dt STRING)
        {table_format}
        LOCATION '{location}';
        ALTER TABLE `{table_name}` ADD PARTITION (dt = '{partition_date}');
    """).format(
        database_name=hive_database_name(),
        table_name=self.table_name,
        col_spec=column_spec,
        location=self.table_location,
        table_format=self.table_format,
        partition_date=self.partition_date,
    )
    log.debug('Executing hive query: %s', query)
    # Mark the output as removed even though that only actually happens
    # once the query (in particular the DROP TABLE) executes.
    log.info("Marking existing output as having been removed for task %s", str(self))
    self.attempted_removal = True
    return query
def query(self):
    """Build the Hive statements that drop and recreate the snapshot table
    with exactly one partition for self.partition_date.

    TODO: Figure out how to clean up old data. This only cleans out old
    metastore info; the underlying table data is not removed.

    Exactly one partition is kept: snapshots are taken ad hoc rather than
    on a schedule, so historical partitions are dropped. Table snapshots
    should *not* be used for analyzing trends; use events or tables that
    retain history instead.
    """
    # Each entry of self.columns is a (name, type) pair; render as "name type".
    column_spec = ",".join(map(" ".join, self.columns))
    query = textwrap.dedent(
        """
        USE {database_name};
        DROP TABLE IF EXISTS {table_name};
        CREATE EXTERNAL TABLE {table_name} (
            {col_spec}
        )
        PARTITIONED BY (dt STRING)
        {table_format}
        LOCATION '{location}';
        ALTER TABLE {table_name} ADD PARTITION (dt = '{partition_date}');
        """
    ).format(
        database_name=hive_database_name(),
        table_name=self.table_name,
        col_spec=column_spec,
        location=self.table_location,
        table_format=self.table_format,
        partition_date=self.partition_date,
    )
    log.debug("Executing hive query: %s", query)
    # Mark the output as removed even though that only actually happens
    # once the query (in particular the DROP TABLE) executes.
    log.info("Marking existing output as having been removed for task %s", str(self))
    self.attempted_removal = True
    return query
def query(self):
    """Build the Hive query joining problem responses with course-block
    metadata to attach a display location and sort index to each response.

    Responses whose block was deleted fall back to '{deleted_blocks_path}'
    and sort_idx -1. IF NOT EXISTS is emitted unless self.overwrite is set.
    """
    response_table_task = self.problem_response_partition.hive_table_task
    blocks_table_task = self.course_blocks_partition.hive_table_task
    template = """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec}) {if_not_exists}
        SELECT
            pr.course_id,
            pr.answer_id,
            pr.problem_id,
            pr.problem,
            pr.username,
            pr.question,
            pr.score,
            pr.max_score,
            pr.correct,
            pr.answer,
            pr.total_attempts,
            pr.first_attempt_date,
            pr.last_attempt_date,
            CONCAT(COALESCE(cb.course_path, '{deleted_blocks_path}'),
                   '{path_delimiter}',
                   pr.problem) as location,
            COALESCE(cb.sort_idx, -1) as sort_idx
        FROM {problem_response_table} pr
        LEFT OUTER JOIN {course_blocks_table} cb
            ON (cb.block_id=pr.problem_id and cb.{course_blocks_partition})
        WHERE pr.{problem_response_partition}
        ORDER BY pr.course_id, sort_idx, pr.first_attempt_date
    """
    query = textwrap.dedent(template.format(
        database_name=hive_database_name(),
        table=self.hive_table_task.table,
        partition=self.partition,
        path_delimiter=self.path_delimiter,
        deleted_blocks_path=self.deleted_blocks_path,
        if_not_exists='' if self.overwrite else 'IF NOT EXISTS',
        problem_response_table=response_table_task.table,
        problem_response_partition="{}='{}'".format(
            response_table_task.partition_by,
            self.problem_response_partition.partition_value,
        ),
        course_blocks_table=blocks_table_task.table,
        course_blocks_partition="{}='{}'".format(
            blocks_table_task.partition_by,
            self.course_blocks_partition.partition_value,
        ),
    ))
    log.debug('query: %s', query)
    return query
def query(self):
    """Build the Hive query that rekeys user_activity rows from username to
    the numeric auth_user id and writes them into the target partition."""
    return """
        USE {database_name};
        INSERT OVERWRITE TABLE {table} PARTITION ({partition.query_spec})
        SELECT
            au.id,
            ua.course_id,
            ua.`date`,
            ua.category,
            ua.count
        FROM auth_user au
        JOIN user_activity ua ON au.username = ua.username;
    """.format(
        database_name=hive_database_name(),
        table=self.hive_table_task.table,
        partition=self.partition,
    )
def query(self):
    """Build the Hive statements that (re)create the program/course mapping
    table and populate it, attaching each course's slot number within its
    program via a left join on program_course_order."""
    statement = """
        USE {database_name};
        DROP TABLE IF EXISTS {table_name};
        CREATE EXTERNAL TABLE {table_name} (
            program_id STRING,
            program_type STRING,
            program_title STRING,
            catalog_course STRING,
            catalog_course_title STRING,
            course_id STRING,
            org_id STRING,
            partner_short_code STRING,
            program_slot_number INT
        )
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        LOCATION '{location}';

        INSERT OVERWRITE TABLE {table_name}
        SELECT
            p.program_id,
            p.program_type,
            p.program_title,
            p.catalog_course,
            p.catalog_course_title,
            p.course_id,
            p.org_id,
            p.partner_short_code,
            o.program_slot_number
        FROM program_course p
        LEFT JOIN program_course_order o
            ON p.program_id = o.program_id
            AND p.catalog_course = o.catalog_course;
    """.format(
        database_name=hive_database_name(),
        location=self.table_location,
        table_name=self.table,
    )
    log.debug('Executing hive query: %s', statement)
    return statement
def output(self):
    """Target for this task's Hive partition.

    fail_missing_table=False: a missing table is tolerated rather than
    treated as an error when checking the target.
    """
    target = HivePartitionTarget(
        self.table_name,
        self.partition,
        database=hive_database_name(),
        fail_missing_table=False,
    )
    return target
def output(self):
    """Target for this task's Hive partition; tolerates a missing table
    (fail_missing_table=False)."""
    return HivePartitionTarget(
        self.table_name,
        self.partition,
        database=hive_database_name(),
        fail_missing_table=False,
    )