Exemplo n.º 1
0
 def execute(self, context=None):
     """Run ``self.sql`` and compare the first result row to ``pass_value``.

     :param context: Airflow task context (unused).
     :raises AirflowException: when the query returns no row, when a value
         cannot be cast to float in numeric mode, or when any value fails
         the pass_value / tolerance comparison.
     """
     logging.info('Executing SQL check: ' + self.sql)
     hook = PrestoHook(presto_conn_id=self.presto_conn_id)
     records = hook.get_first(hql=self.sql)
     if not records:
         raise AirflowException("The query returned None")
     # Template is rendered with locals(), so {self...} and {records}
     # resolve against this frame's variables.
     except_temp = ("Test failed.\nPass value:{self.pass_value}\n"
                    "Query:\n{self.sql}\nResults:\n{records!s}")
     if not self.is_numeric_value_check:
         # String mode: every returned value must match pass_value exactly.
         tests = [str(r) == self.pass_value for r in records]
     else:
         try:
             num_rec = [float(r) for r in records]
         except (ValueError, TypeError):
             cvestr = "Converting a result to float failed.\n"
             raise AirflowException(cvestr + except_temp.format(**locals()))
         if self.has_tolerance:
             # NOTE(review): window divides by (1 +/- tol); assumes
             # 0 <= tol < 1 — confirm against the operator's contract.
             tests = [
                 r / (1 + self.tol) <= self.pass_value <= r / (1 - self.tol)
                 for r in num_rec]
         else:
             tests = [r == self.pass_value for r in num_rec]
     if not all(tests):
         raise AirflowException(except_temp.format(**locals()))
Exemplo n.º 2
0
 def data(self):
     """Render up to 1000 rows of the requested Presto table as HTML.

     The table name arrives via the untrusted ``table`` query argument and
     is interpolated into the SQL text (identifiers cannot be bound as
     query parameters), so it is first validated as a plain dotted
     identifier to prevent SQL injection.
     """
     import re
     table = request.args.get("table")
     # Reject anything that is not catalog.schema.table-style characters.
     if not table or not re.match(r"^[A-Za-z0-9_.]+$", table):
         raise ValueError("Invalid table name: {!r}".format(table))
     sql = "SELECT * FROM {table} LIMIT 1000;".format(table=table)
     h = PrestoHook(PRESTO_CONN_ID)
     df = h.get_pandas_df(sql)
     return df.to_html(
         classes="table table-striped table-bordered table-hover",
         index=False,
         na_rep='',)
Exemplo n.º 3
0
    def __init__(self,
                 sql,
                 presto_conn_id=conf.get('hooks', 'PRESTO_DEFAULT_CONN_ID'),
                 *args,
                 **kwargs):
        """Remember the SQL to check and bind a PrestoHook to the given
        connection id (default taken from the 'hooks' config section)."""
        super(PrestoCheckOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.presto_conn_id = presto_conn_id
        self.hook = PrestoHook(presto_conn_id=presto_conn_id)
Exemplo n.º 4
0
 def data(self):
     """Render up to 1000 rows of the requested Presto table as HTML.

     The table name arrives via the untrusted ``table`` query argument and
     is interpolated into the SQL text (identifiers cannot be bound as
     query parameters), so it is first validated as a plain dotted
     identifier to prevent SQL injection.
     """
     import re
     table = request.args.get("table")
     # Reject anything that is not catalog.schema.table-style characters.
     if not table or not re.match(r"^[A-Za-z0-9_.]+$", table):
         raise ValueError("Invalid table name: {!r}".format(table))
     sql = "SELECT * FROM {table} LIMIT 1000;".format(table=table)
     h = PrestoHook(PRESTO_CONN_ID)
     df = h.get_pandas_df(sql)
     return df.to_html(
         classes="table table-striped table-bordered table-hover",
         index=False,
         na_rep='',
     )
Exemplo n.º 5
0
 def execute(self, context=None):
     """Run ``self.sql`` and fail unless the first result row exists and
     every value in it is truthy.

     :raises AirflowException: if the query returns no row, or if any
         value in the first row is falsy.
     """
     hook = PrestoHook(presto_conn_id=self.presto_conn_id)
     logging.info('Executing SQL check: ' + self.sql)
     records = hook.get_first(hql=self.sql)
     logging.info("Record: " + str(records))
     if not records:
         raise AirflowException("The query returned None")
     if not all(bool(cell) for cell in records):
         exceptstr = "Test failed.\nQuery:\n{q}\nResults:\n{r!s}"
         raise AirflowException(exceptstr.format(q=self.sql, r=records))
     logging.info("Success.")
Exemplo n.º 6
0
    def execute(self, context):
        """Copy the result set of ``self.sql`` from Presto into the target
        MySQL table, optionally running ``mysql_preoperator`` first."""
        source = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info("Extracting data from Presto")
        logging.info(self.sql)
        rows = source.get_records(self.sql)

        target = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            # e.g. a DELETE/TRUNCATE making the load idempotent.
            logging.info("Running MySQL preoperator")
            logging.info(self.mysql_preoperator)
            target.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")
        target.insert_rows(table=self.mysql_table, rows=rows)
Exemplo n.º 7
0
    def execute(self, context):
        """Transfer rows returned by ``self.sql`` on Presto into
        ``self.mysql_table``; an optional preoperator runs on MySQL first."""
        presto_hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info("Extracting data from Presto")
        logging.info(self.sql)
        extracted = presto_hook.get_records(self.sql)

        mysql_hook = MySqlHook(mysql_conn_id=self.mysql_conn_id)
        if self.mysql_preoperator:
            # Typically clears previously loaded data for idempotent reruns.
            logging.info("Running MySQL preoperator")
            logging.info(self.mysql_preoperator)
            mysql_hook.run(self.mysql_preoperator)

        logging.info("Inserting rows into MySQL")
        mysql_hook.insert_rows(table=self.mysql_table, rows=extracted)
Exemplo n.º 8
0
class PrestoCheckOperator(BaseOperator):
    """
    Performs a simple check using sql code in a specific Presto database.

    :param sql: the sql to be executed
    :type sql: string
    :param presto_conn_id: reference to the Presto database connection
    :type presto_conn_id: string
    """

    # SQLAlchemy single-table-inheritance discriminator for this operator.
    __mapper_args__ = {
        'polymorphic_identity': 'PrestoCheckOperator'
    }
    # 'sql' is jinja-templated; .hql/.sql files are loaded and templated too.
    template_fields = ('sql',)
    template_ext = ('.hql', '.sql',)

    @apply_defaults
    def __init__(
            self, sql,
            presto_conn_id=conf.get('hooks', 'PRESTO_DEFAULT_CONN_ID'),
            *args, **kwargs):
        super(PrestoCheckOperator, self).__init__(*args, **kwargs)

        self.presto_conn_id = presto_conn_id
        self.hook = PrestoHook(presto_conn_id=presto_conn_id)
        self.sql = sql

    def execute(self, execution_date=None):
        """Run the check query and return a boolean verdict.

        NOTE(review): unlike later check operators this does not raise —
        it returns False when no rows come back, and True only when every
        value in the first row is falsy; confirm callers expect this
        inverted semantic before changing it.
        """
        logging.info('Executing SQL check: ' + self.sql)
        records = self.hook.get_records(hql=self.sql)
        if not records:
            return False
        else:
            return not any([ bool(r) for r in records[0] ])
Exemplo n.º 9
0
    def __init__(
            self, sql,
            presto_conn_id=conf.get('hooks', 'PRESTO_DEFAULT_CONN_ID'),
            *args, **kwargs):
        """Keep the check SQL and connection id, and eagerly construct the
        PrestoHook the operator will run the check through."""
        super(PrestoCheckOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        self.presto_conn_id = presto_conn_id
        self.hook = PrestoHook(presto_conn_id=presto_conn_id)
Exemplo n.º 10
0
 def execute(self, context=None):
     """Compare metrics from ``sql1`` (current) against ``sql2`` (reference).

     For each metric in ``self.metrics_sorted`` the ratio
     max(cur, ref) / min(cur, ref) must stay below the configured
     threshold in ``self.metrics_thresholds``.

     :raises AirflowException: if either query returns no row, or any
         metric breaches its threshold.
     """
     hook = PrestoHook(presto_conn_id=self.presto_conn_id)
     logging.info('Executing SQL check: ' + self.sql2)
     row2 = hook.get_first(hql=self.sql2)
     logging.info('Executing SQL check: ' + self.sql1)
     row1 = hook.get_first(hql=self.sql1)
     # BUG FIX: .format() must be applied to the message string, not to the
     # exception instance (the old code raised AttributeError instead).
     if not row2:
         raise AirflowException(
             "The query {q} returned None".format(q=self.sql2))
     if not row1:
         raise AirflowException(
             "The query {q} returned None".format(q=self.sql1))
     current = dict(zip(self.metrics_sorted, row1))
     reference = dict(zip(self.metrics_sorted, row2))
     ratios = {}
     test_results = {}
     rlog = "Ratio for {0}: {1} \n Ratio threshold : {2}"
     fstr = "'{k}' check failed. {r} is above {tr}"
     estr = "The following tests have failed:\n {0}"
     countstr = "The following {j} tests out of {n} failed:"
     for m in self.metrics_sorted:
         if current[m] == 0 or reference[m] == 0:
             # A zero on either side makes the ratio undefined.
             ratio = None
         else:
             ratio = float(max(current[m], reference[m])) / \
                 min(current[m], reference[m])
         logging.info(rlog.format(m, ratio, self.metrics_thresholds[m]))
         ratios[m] = ratio
         # BUG FIX: `None < threshold` is a TypeError on Python 3; an
         # undefined ratio now counts as a failed test.
         test_results[m] = (ratio is not None
                            and ratio < self.metrics_thresholds[m])
     if not all(test_results.values()):
         failed_tests = [it[0] for it in test_results.items() if not it[1]]
         j = len(failed_tests)
         n = len(self.metrics_sorted)
         logging.warning(countstr.format(**locals()))
         for k in failed_tests:
             logging.warning(
                 fstr.format(k=k,
                             r=ratios[k],
                             tr=self.metrics_thresholds[k]))
         raise AirflowException(estr.format(", ".join(failed_tests)))
     logging.info("All tests have passed")
Exemplo n.º 11
0
 def execute(self, context=None):
     """Compare metrics from ``sql1`` (current) against ``sql2`` (reference).

     For each metric in ``self.metrics_sorted`` the ratio
     max(cur, ref) / min(cur, ref) must stay below the configured
     threshold in ``self.metrics_thresholds``.

     :raises AirflowException: if either query returns no row, or any
         metric breaches its threshold.
     """
     hook = PrestoHook(presto_conn_id=self.presto_conn_id)
     logging.info('Executing SQL check: ' + self.sql2)
     row2 = hook.get_first(hql=self.sql2)
     logging.info('Executing SQL check: ' + self.sql1)
     row1 = hook.get_first(hql=self.sql1)
     # BUG FIX: .format() must be applied to the message string, not to the
     # exception instance (the old code raised AttributeError instead).
     if not row2:
         raise AirflowException("The query {q} returned None".format(q=self.sql2))
     if not row1:
         raise AirflowException("The query {q} returned None".format(q=self.sql1))
     current = dict(zip(self.metrics_sorted, row1))
     reference = dict(zip(self.metrics_sorted, row2))
     ratios = {}
     test_results = {}
     rlog = "Ratio for {0}: {1} \n Ratio threshold : {2}"
     fstr = "'{k}' check failed. {r} is above {tr}"
     estr = "The following tests have failed:\n {0}"
     countstr = "The following {j} tests out of {n} failed:"
     for m in self.metrics_sorted:
         if current[m] == 0 or reference[m] == 0:
             # A zero on either side makes the ratio undefined.
             ratio = None
         else:
             ratio = float(max(current[m], reference[m])) / \
                 min(current[m], reference[m])
         logging.info(rlog.format(m, ratio, self.metrics_thresholds[m]))
         ratios[m] = ratio
         # BUG FIX: `None < threshold` is a TypeError on Python 3; an
         # undefined ratio now counts as a failed test.
         test_results[m] = (ratio is not None
                            and ratio < self.metrics_thresholds[m])
     if not all(test_results.values()):
         failed_tests = [it[0] for it in test_results.items() if not it[1]]
         j = len(failed_tests)
         n = len(self.metrics_sorted)
         logging.warning(countstr.format(**locals()))
         for k in failed_tests:
             logging.warning(fstr.format(k=k, r=ratios[k],
                             tr=self.metrics_thresholds[k]))
         raise AirflowException(estr.format(", ".join(failed_tests)))
     logging.info("All tests have passed")
Exemplo n.º 12
0
class PrestoCheckOperator(BaseOperator):
    """
    Performs a simple check using sql code in a specific Presto database.

    :param sql: the sql to be executed
    :type sql: string
    :param presto_conn_id: reference to the Presto database connection
    :type presto_conn_id: string
    """

    # SQLAlchemy single-table-inheritance discriminator for this operator.
    __mapper_args__ = {'polymorphic_identity': 'PrestoCheckOperator'}
    # 'sql' is jinja-templated; .hql/.sql files are loaded and templated too.
    template_fields = ('sql', )
    template_ext = (
        '.hql',
        '.sql',
    )

    @apply_defaults
    def __init__(self,
                 sql,
                 presto_conn_id=conf.get('hooks', 'PRESTO_DEFAULT_CONN_ID'),
                 *args,
                 **kwargs):
        super(PrestoCheckOperator, self).__init__(*args, **kwargs)

        self.presto_conn_id = presto_conn_id
        self.hook = PrestoHook(presto_conn_id=presto_conn_id)
        self.sql = sql

    def execute(self, execution_date=None):
        """Run the check query and return a boolean verdict.

        NOTE(review): unlike later check operators this does not raise —
        it returns False when no rows come back, and True only when every
        value in the first row is falsy; confirm callers expect this
        inverted semantic before changing it.
        """
        logging.info('Executing SQL check: ' + self.sql)
        records = self.hook.get_records(hql=self.sql)
        if not records:
            return False
        else:
            return not any([bool(r) for r in records[0]])
Exemplo n.º 13
0
    def execute(self, context=None):
        """Compute per-column stats for one Hive partition via Presto and
        load them, pivoted long, into the ``hive_stats`` MySQL table.

        Steps: read the column list from the Hive metastore, build one
        SELECT of aggregate expressions, run it on Presto, delete any
        previous rows for this (table, partition, dttm), then insert one
        row per (col, metric, value) cell.
        """
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        # exprs maps (column, metric_name) -> SQL aggregate expression;
        # ('', 'count') is the table-wide row count.
        exprs = {
            ('', 'count'): 'COUNT(*)'
        }
        for col, col_type in field_types.items():
            d = {}
            if self.assignment_func:
                # Custom expr generator; None means "fall back to defaults".
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        # Freeze iteration order so exprs and the result row zip up below.
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join([
            v + " AS " + k[0] + '__' + k[1]
            for k, v in exprs.items()])

        # NOTE(review): SQL assembled via format(**locals()) from partition
        # values and table name — assumes trusted, internally-sourced
        # values; confirm before exposing to user input.
        where_clause = [
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items()]
        where_clause = " AND\n        ".join(where_clause)
        sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(**locals())

        hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + sql)
        row = hook.get_first(hql=sql)
        logging.info("Record: " + str(row))
        if not row:
            # NOTE(review): sibling implementation raises AirflowException
            # here; generic Exception looks inconsistent — confirm.
            raise Exception("The query returned None")

        # Canonical partition representation used as part of the row key.
        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**locals())
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**locals())
            mysql.run(sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        # zip relies on OrderedDict preserving the SELECT column order:
        # each (col, metric) key pairs with its value in the result row.
        rows = [
            (self.ds, self.dttm, self.table, part_json) +
            (r[0][0], r[0][1], r[1])
            for r in zip(exprs, row)]
        mysql.insert_rows(
            table='hive_stats',
            rows=rows,
            target_fields=[
                'ds',
                'dttm',
                'table_name',
                'partition_repr',
                'col',
                'metric',
                'value',
            ]
        )
Exemplo n.º 14
0
    def execute(self, context=None):
        """Compute per-column stats for one Hive partition via Presto and
        load them, pivoted long, into the ``hive_stats`` MySQL table.

        Steps: read the column list from the Hive metastore, build one
        SELECT of aggregate expressions, run it on Presto, delete any
        previous rows for this (table, partition, dttm), then insert one
        row per (col, metric, value) cell.
        """
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        # exprs maps (column, metric_name) -> SQL aggregate expression;
        # ('', 'count') is the table-wide row count.
        exprs = {('', 'count'): 'COUNT(*)'}
        for col, col_type in field_types.items():
            d = {}
            if self.assignment_func:
                # Custom expr generator; None means "fall back to defaults".
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        # Freeze iteration order so exprs and the result row zip up below.
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join(
            [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

        # NOTE(review): SQL assembled via format(**locals()) from partition
        # values and table name — assumes trusted, internally-sourced
        # values; confirm before exposing to user input.
        where_clause = [
            "{0} = '{1}'".format(k, v) for k, v in self.partition.items()
        ]
        where_clause = " AND\n        ".join(where_clause)
        sql = """
        SELECT
            {exprs_str}
        FROM {self.table}
        WHERE
            {where_clause};
        """.format(**locals())

        hook = PrestoHook(presto_conn_id=self.presto_conn_id)
        logging.info('Executing SQL check: ' + sql)
        row = hook.get_first(hql=sql)
        logging.info("Record: " + str(row))
        if not row:
            raise AirflowException("The query returned None")

        # Canonical partition representation used as part of the row key.
        part_json = json.dumps(self.partition, sort_keys=True)

        logging.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{self.table}' AND
            partition_repr='{part_json}' AND
            dttm='{self.dttm}'
        LIMIT 1;
        """.format(**locals())
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{self.table}' AND
                partition_repr='{part_json}' AND
                dttm='{self.dttm}';
            """.format(**locals())
            mysql.run(sql)

        logging.info("Pivoting and loading cells into the Airflow db")
        # zip relies on OrderedDict preserving the SELECT column order:
        # each (col, metric) key pairs with its value in the result row.
        rows = [(self.ds, self.dttm, self.table, part_json) +
                (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)]
        mysql.insert_rows(table='hive_stats',
                          rows=rows,
                          target_fields=[
                              'ds',
                              'dttm',
                              'table_name',
                              'partition_repr',
                              'col',
                              'metric',
                              'value',
                          ])
Exemplo n.º 15
0
 def get_db_hook(self):
     """Build and return the PrestoHook bound to this operator's
     ``presto_conn_id``."""
     hook = PrestoHook(presto_conn_id=self.presto_conn_id)
     return hook