Example #1: an execute() method that checks a Hive table for duplicate rows
    def execute(self, context=None):
        self.hql = None  # set up-front so the except block can reference it safely
        try:
            # Pull the table definition from the Hive metastore to learn the
            # column names and whether the table is partitioned.
            metastore_hook = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
            table_metadata = metastore_hook.get_table(self.table, db=self.schema)
            is_partitioned = len(table_metadata.partitionKeys) > 0
            column_string = ', '.join([col.name for col in table_metadata.sd.cols])

            # Group by every column and count how many full-row groups occur
            # more than once; the outer query returns the number of duplicated groups.
            where_clause = 'WHERE {}'.format(self.partition) if is_partitioned else ''
            self.hql = "SELECT COUNT(col2sum) FROM (SELECT COUNT(1) AS col2sum FROM {}.{} {} GROUP BY {}) t2 " \
                       "WHERE t2.col2sum > 1".format(self.schema, self.table, where_clause, column_string)

            hook = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id, mapred_queue=self.mapred_queue)
            hook.hive_cli_params = '-S'  # silent mode: suppress Hive's log chatter
            output = hook.run_cli(hql=self.hql, schema=self.schema)
            output_row = int(output.strip())

            if output_row > self.max_duplicates:
                raise AirflowException('Found {} duplicate records, but at most {} are allowed'
                                       .format(output_row, self.max_duplicates))

        except Exception as e:
            raise AirflowException('An error occurred with the following duplicate check query:\n\t{}\n{}'
                                   .format(self.hql, e))
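
For context, here is a minimal sketch of how an operator built around this execute() method might be wired into a DAG. The class name HiveDuplicateCheckOperator and its import path are assumptions (the snippet above only shows the execute() body), and the connection ids, table names, and partition spec are placeholders.

# from my_plugins.hive_checks import HiveDuplicateCheckOperator  # hypothetical import path
from datetime import datetime

from airflow import DAG

with DAG('hive_duplicate_check',
         start_date=datetime(2024, 1, 1),
         schedule_interval='@daily') as dag:
    check_duplicates = HiveDuplicateCheckOperator(  # hypothetical class name
        task_id='check_duplicates',
        metastore_conn_id='metastore_default',
        hive_cli_conn_id='hive_cli_default',
        schema='analytics',
        table='events',
        partition="ds = '{{ ds }}'",  # only applied when the table is partitioned
        max_duplicates=0,             # fail the task on any duplicate row
        mapred_queue='default',
    )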
Example #2: a HiveEmailOperator execute() method that emails Hive query results
    def execute(self, context):
        # Identify this task run for the mapred job name (short hostname,
        # dag id, task id, and execution timestamp).
        ti = context['ti']
        host = ti.hostname.split('.')[0]
        dagid = ti.dag_id
        taskid = ti.task_id
        exectime = ti.execution_date.isoformat()

        hook = HiveCliHook(
            hive_cli_conn_id=self.hive_cli_conn_id,
            mapred_queue=self.mapred_queue,
            mapred_job_name='Airflow HiveEmailOperator task for {}.{}.{}.{}'.format(
                host, dagid, taskid, exectime))
        hook.hive_cli_params = '-S'  # silent mode: suppress Hive's log chatter
        # Ask Hive to print the header row so column names can be recovered below.
        output = hook.run_cli(hql=self.hql,
                              schema=self.schema,
                              hive_conf={'hive.cli.print.header': 'true'})

        # First non-empty line is the header (column names); the rest are data rows.
        output_rows = [line for line in output.split('\n') if line]
        col_names = output_rows[0].split('\t')
        output_rows = output_rows[1:]

        if len(output_rows) > self.cutoff:
            # Too many rows to inline in the email body: attach the raw
            # tab-separated output as a file instead.
            msg = 'The query returned more than {} rows. Attaching the results as a TSV file.'.format(
                self.cutoff)
            logging.warning(msg)
            # Open in text mode so the str output can be written (binary is the default).
            f = tempfile.NamedTemporaryFile(mode='w', delete=False)
            f.write(output)
            f.close()
            self.files = [f.name]
            self.html_content = '{}<br>Dag id: {}<br>Task id: {}<br>Execution Time: {}'.format(
                msg, dagid, taskid, exectime)
        else:
            # Few enough rows: render them into the HTML body via a Jinja2 template.
            context.update({
                'hql': self.hql,
                'rows': output_rows,
                'col_names': col_names
            })

            if not self.html_content:
                # No template supplied: fall back to the default one shipped
                # alongside this operator.
                check_path = os.path.join(os.path.dirname(__file__), '..',
                                          'templates',
                                          'hive_email_default.html')
            else:
                # Treat html_content as a template path relative to the DAG file.
                dag_path = conf.get('core', 'dags_folder')
                check_path = os.path.join(
                    dag_path, os.path.dirname(context['dag'].filepath),
                    self.html_content)

            if os.path.exists(check_path):
                # Render the template file with Jinja2.
                path, filename = os.path.split(os.path.abspath(check_path))
                template = Environment(
                    loader=FileSystemLoader(path)).get_template(filename)
                logging.info("Using templated file located at: {path}".format(
                    path=check_path))
            else:
                # html_content was an inline template string rather than a file path.
                template = Environment(loader=BaseLoader()).from_string(
                    self.html_content)

            self.html_content = template.render(**context)

        # The parent operator's execute() sends the email with the rendered
        # body and any attachment set above.
        super(HiveEmailOperator, self).execute(context)

        # Delete the temp file once the email has been sent.
        if len(output_rows) > self.cutoff:
            os.unlink(f.name)
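
A minimal usage sketch follows. It assumes HiveEmailOperator subclasses Airflow's EmailOperator (the super().execute() call above is what actually sends the message) and that the constructor accepts the attributes the method reads (hql, schema, cutoff, html_content, and the connection/queue settings). The import path, connection ids, addresses, and the query are illustrative placeholders.

# from my_plugins.hive_email import HiveEmailOperator  # hypothetical import path
from datetime import datetime

from airflow import DAG

with DAG('hive_email_report',
         start_date=datetime(2024, 1, 1),
         schedule_interval='@daily') as dag:
    daily_report = HiveEmailOperator(
        task_id='daily_report',
        to='data-team@example.com',
        subject='Daily event counts for {{ ds }}',
        hive_cli_conn_id='hive_cli_default',
        mapred_queue='default',
        schema='analytics',
        hql="SELECT dt, COUNT(*) AS events FROM analytics.events "
            "WHERE dt = '{{ ds }}' GROUP BY dt",
        cutoff=100,         # above this many rows the results are attached as a TSV
        html_content=None,  # use the bundled hive_email_default.html template
    )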