# Imports assumed for these operator methods (Airflow 1.x-era module paths;
# adjust to match the Airflow version actually in use):
import logging
import os
import tempfile

from airflow import configuration as conf
from airflow.exceptions import AirflowException
from airflow.hooks.hive_hooks import HiveCliHook, HiveMetastoreHook
from jinja2 import BaseLoader, Environment, FileSystemLoader


def execute(self, context=None):
    self.hql = None  # set up front so the error message below can always reference it
    try:
        metastore_hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table_metadata = metastore_hook.get_table(self.table, db=self.schema)
        is_partitioned = len(table_metadata.partitionKeys) > 0
        column_string = ', '.join([col.name for col in table_metadata.sd.cols])
        where_clause = 'WHERE {}'.format(self.partition) if is_partitioned else ''
        # Group by every column; any group with more than one row is a duplicate.
        self.hql = ("SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} "
                    "GROUP BY {}) t2 WHERE t2.col2sum > 1"
                    .format(self.schema, self.table, where_clause, column_string))
        hook = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id,
                           mapred_queue=self.mapred_queue)
        hook.hive_cli_params = '-S'  # suppress hive junk output
        output = hook.run_cli(hql=self.hql, schema=self.schema)
        output_row = int(output.strip())
    except Exception as e:
        raise AirflowException('An error occurred with the following duplicate check query:'
                               '\n\t{}\n{}'.format(self.hql, e))
    # Raised outside the try block so it is not re-wrapped by the handler above.
    if output_row > self.max_duplicates:
        raise AirflowException('There are {} duplicate records found whereas the max number'
                               ' of duplicates can be {}'.format(output_row, self.max_duplicates))
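
# A minimal standalone sketch of the query execute() builds above, using
# illustrative schema/table/column values (assumptions, not taken from any
# real table):
#
#     schema, table = 'analytics', 'fact_events'
#     columns = ['event_id', 'user_id', 'ds']      # fetched from the metastore in execute()
#     where_clause = "WHERE ds='2016-01-01'"       # empty string for unpartitioned tables
#     hql = ("SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} "
#            "GROUP BY {}) t2 WHERE t2.col2sum > 1"
#            .format(schema, table, where_clause, ', '.join(columns)))
#     print(hql)
#
# Each inner-query row is one distinct combination of all column values; the
# outer COUNT returns how many combinations occur more than once.
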
def execute(self, context):
    ti = context['ti']
    host = ti.hostname.split('.')[0]
    dagid = ti.dag_id
    taskid = ti.task_id
    exectime = ti.execution_date.isoformat()
    hook = HiveCliHook(
        hive_cli_conn_id=self.hive_cli_conn_id,
        mapred_queue=self.mapred_queue,
        mapred_job_name='Airflow HiveEmailOperator task for {}.{}.{}.{}'.format(
            host, dagid, taskid, exectime))
    hook.hive_cli_params = '-S'  # suppress hive junk output
    output = hook.run_cli(hql=self.hql, schema=self.schema,
                          hive_conf={'hive.cli.print.header': 'true'})
    output_rows = [line for line in output.split('\n') if line]
    col_names = output_rows[0].split('\t')
    output_rows = output_rows[1:]
    if len(output_rows) > self.cutoff:
        msg = 'The query returned > {} rows. Adding tsv as an attachment.'.format(self.cutoff)
        logging.warning(msg)
        # text mode so the CLI output (a str) can be written directly
        f = tempfile.NamedTemporaryFile(mode='w', delete=False)
        f.write(output)
        f.close()
        self.files = [f.name]
        self.html_content = '{}<br>Dag id: {}<br>Task id: {}<br>Execution Time: {}'.format(
            msg, dagid, taskid, exectime)
    else:
        context.update({
            'hql': self.hql,
            'rows': output_rows,
            'col_names': col_names,
        })
        if not self.html_content:
            # fall back to the default template shipped alongside this operator
            check_path = os.path.join(os.path.dirname(__file__), '..',
                                      'templates', 'hive_email_default.html')
        else:
            # treat html_content as a template path relative to the DAG file
            dag_path = conf.get('core', 'dags_folder')
            check_path = os.path.join(dag_path,
                                      os.path.dirname(context['dag'].filepath),
                                      self.html_content)
        if os.path.exists(check_path):
            path, filename = os.path.split(os.path.abspath(check_path))
            template = Environment(loader=FileSystemLoader(path)).get_template(filename)
            logging.info('Using templated file located at: {}'.format(check_path))
        else:
            # not a file on disk, so render html_content itself as the template
            template = Environment(loader=BaseLoader()).from_string(self.html_content)
        self.html_content = template.render(**context)
    super(HiveEmailOperator, self).execute(context)
    # delete the temp file once it has been attached and the email sent
    if len(output_rows) > self.cutoff:
        os.unlink(f.name)
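
# A minimal sketch of how a custom template sees the variables execute() adds
# to the context (hql, col_names, rows). The template string below is an
# illustrative assumption, not the shipped hive_email_default.html; rows arrive
# as tab-separated strings straight from the Hive CLI output, so the template
# splits each one itself.
#
#     from jinja2 import BaseLoader, Environment
#
#     sketch_template = Environment(loader=BaseLoader()).from_string(
#         '<p>Query: {{ hql }}</p>'
#         '<table>'
#         '<tr>{% for name in col_names %}<th>{{ name }}</th>{% endfor %}</tr>'
#         '{% for row in rows %}'
#         '<tr>{% for cell in row.split("\\t") %}<td>{{ cell }}</td>{% endfor %}</tr>'
#         '{% endfor %}'
#         '</table>')
#
#     print(sketch_template.render(hql='SELECT 1 AS a, 2 AS b',
#                                  col_names=['a', 'b'],
#                                  rows=['1\t2']))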