Example #1
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        logging.info("Extracting data from Hive")
        hive_table = "druid." + context["task_instance_key_str"].replace(".", "_")
        sql = self.sql.strip().strip(";")
        hql = """\
        set mapred.output.compress=false;
        set hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY  '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = '')
        AS
        {sql}
        """.format(
            **locals()
        )
        logging.info("Running command:\n {}".format(hql))
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)
        t = m.get_table(hive_table)

        columns = [col.name for col in t.sd.cols]

        hdfs_uri = m.get_table(hive_table).sd.location
        pos = hdfs_uri.find("/user")
        static_path = hdfs_uri[pos:]

        schema, table = hive_table.split(".")

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)
        logging.info("Inserting rows into Druid")
        logging.info("HDFS path: " + static_path)

        try:
            druid.load_from_hdfs(
                datasource=self.druid_datasource,
                intervals=self.intervals,
                static_path=static_path,
                ts_dim=self.ts_dim,
                columns=columns,
                num_shards=self.num_shards,
                target_partition_size=self.target_partition_size,
                query_granularity=self.query_granularity,
                segment_granularity=self.segment_granularity,
                metric_spec=self.metric_spec,
                hadoop_dependency_coordinates=self.hadoop_dependency_coordinates,
            )
            logging.info("Load seems to have succeeded!")
        finally:
            logging.info("Cleaning up by dropping the temp " "Hive table {}".format(hive_table))
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
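For context, this execute method reads everything from attributes set on the operator instance; the operator shipping with Airflow for this transfer is HiveToDruidTransfer. Below is a minimal, hypothetical instantiation sketch, assuming a DAG object named dag and that the attributes referenced above map one-to-one onto constructor keyword arguments:

from airflow.operators.hive_to_druid import HiveToDruidTransfer  # import path assumed (Airflow 1.x)

# Hypothetical task: every keyword below mirrors a self.<attr> read by execute() above.
hive_to_druid = HiveToDruidTransfer(
    task_id='hive_to_druid_example',
    sql="SELECT ds, state, name, num FROM airflow.static_babynames_partitioned WHERE ds = '2015-01-01'",
    druid_datasource='babynames',
    ts_dim='ds',
    intervals=['2015-01-01/2015-01-02'],
    metric_spec=[{'type': 'count', 'name': 'count'}],
    hive_cli_conn_id='hive_cli_default',
    druid_ingest_conn_id='druid_ingest_default',
    metastore_conn_id='metastore_default',
    dag=dag,
)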
Example #2
 def test_get_max_partition_from_valid_part_specs_and_invalid_filter_map(self):
     with self.assertRaises(AirflowException):
         HiveMetastoreHook._get_max_partition_from_part_specs(
             [{'key1': 'value1', 'key2': 'value2'},
              {'key1': 'value3', 'key2': 'value4'}],
             'key1',
             {'key3': 'value5'})
Example #3
def max_partition(
        table, schema="default", field=None, filter=None,
        metastore_conn_id='metastore_default'):
    '''
    Gets the max partition for a table.

    :param schema: The hive schema the table lives in
    :type schema: string
    :param table: The hive table you are interested in, supports the dot
        notation as in "my_database.my_table", if a dot is found,
        the schema param is disregarded
    :type table: string
    :param metastore_conn_id: The hive connection you are interested in.
        If your default is set you don't need to use this parameter.
    :type metastore_conn_id: string
    :param filter: filter on a subset of partition as in
        `sub_part='specific_value'`
    :type filter: string
    :param field: the field to get the max value from. If there's only
        one partition field, this will be inferred

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    '''
    from airflow.hooks.hive_hooks import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    return hh.max_partition(
        schema=schema, table_name=table, field=field, filter=filter)
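In practice max_partition is usually reached through templating: Airflow exposes it to Jinja as macros.hive.max_partition. A small sketch (assuming a DAG object named dag), using the same table as the doctest above:

from airflow.operators.bash_operator import BashOperator

# The macro call is rendered at task-run time, so it reflects the partitions present then.
latest = BashOperator(
    task_id='echo_latest_partition',
    bash_command=(
        "echo latest: "
        "{{ macros.hive.max_partition('airflow.static_babynames_partitioned') }}"
    ),
    dag=dag,
)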
Example #4
 def test_get_max_partition_from_valid_part_specs_and_none_partition_key(self):
     with self.assertRaises(AirflowException):
         HiveMetastoreHook._get_max_partition_from_part_specs(
             [{'key1': 'value1', 'key2': 'value2'},
              {'key1': 'value3', 'key2': 'value4'}],
             None,
             self.VALID_FILTER_MAP)
Example #5
 def table(self):
     table_name = request.args.get("table")
     m = HiveMetastoreHook(METASTORE_CONN_ID)
     table = m.get_table(table_name)
     return self.render(
         "metastore_browser/table.html",
         table=table, table_name=table_name, datetime=datetime, int=int)
Example #6
def max_partition(
        table, schema="default", field=None, filter_map=None,
        metastore_conn_id='metastore_default'):
    """
    Gets the max partition for a table.

    :param schema: The hive schema the table lives in
    :type schema: str
    :param table: The hive table you are interested in, supports the dot
        notation as in "my_database.my_table", if a dot is found,
        the schema param is disregarded
    :type table: str
    :param metastore_conn_id: The hive connection you are interested in.
        If your default is set you don't need to use this parameter.
    :type metastore_conn_id: str
    :param filter_map: partition_key:partition_value map used for partition filtering,
                       e.g. {'key1': 'value1', 'key2': 'value2'}.
                       Only partitions matching all partition_key:partition_value
                       pairs will be considered as candidates of max partition.
    :type filter_map: map
    :param field: the field to get the max value from. If there's only
        one partition field, this will be inferred
    :type field: str

    >>> max_partition('airflow.static_babynames_partitioned')
    '2015-01-01'
    """
    from airflow.hooks.hive_hooks import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    return hh.max_partition(
        schema=schema, table_name=table, field=field, filter_map=filter_map)
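Called directly, the filter_map variant first narrows the candidate partitions to those matching every key/value pair, then returns the maximum value of the requested field. A short sketch, assuming a hypothetical table analytics.events partitioned by ds and country:

from airflow.macros.hive import max_partition  # module path assumed for the function above

# Only partitions with country='US' are considered; the max 'ds' among them is returned.
latest_us_ds = max_partition(
    table='analytics.events',
    field='ds',
    filter_map={'country': 'US'},
    metastore_conn_id='metastore_default',
)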
Example #7
    def execute(self, context):
        hive = HiveCliHook(hive_cli_conn_id=self.hive_cli_conn_id)
        self.log.info("Extracting data from Hive")
        hive_table = 'druid.' + context['task_instance_key_str'].replace('.', '_')
        sql = self.sql.strip().strip(';')
        tblproperties = ''.join([", '{}' = '{}'"
                                .format(k, v)
                                 for k, v in self.hive_tblproperties.items()])
        hql = """\
        SET mapred.output.compress=false;
        SET hive.exec.compress.output=false;
        DROP TABLE IF EXISTS {hive_table};
        CREATE TABLE {hive_table}
        ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
        STORED AS TEXTFILE
        TBLPROPERTIES ('serialization.null.format' = ''{tblproperties})
        AS
        {sql}
        """.format(hive_table=hive_table, tblproperties=tblproperties, sql=sql)
        self.log.info("Running command:\n %s", hql)
        hive.run_cli(hql)

        m = HiveMetastoreHook(self.metastore_conn_id)

        # Get the Hive table and extract the columns
        t = m.get_table(hive_table)
        columns = [col.name for col in t.sd.cols]

        # Get the path on hdfs
        static_path = m.get_table(hive_table).sd.location

        schema, table = hive_table.split('.')

        druid = DruidHook(druid_ingest_conn_id=self.druid_ingest_conn_id)

        try:
            index_spec = self.construct_ingest_query(
                static_path=static_path,
                columns=columns,
            )

            self.log.info("Inserting rows into Druid, hdfs path: %s", static_path)

            druid.submit_indexing_job(index_spec)

            self.log.info("Load seems to have succeeded!")
        finally:
            self.log.info(
                "Cleaning up by dropping the temp Hive table %s",
                hive_table
            )
            hql = "DROP TABLE IF EXISTS {}".format(hive_table)
            hive.run_cli(hql)
Example #8
 def test_get_max_partition_from_valid_part_names(self):
     max_partition = \
         HiveMetastoreHook._get_max_partition_from_part_names(['some_key=value1',
                                                               'some_key=value2',
                                                               'some_key=value3'],
                                                              'some_key')
     self.assertEqual(max_partition, 'value3')
Example #9
 def test_get_max_partition_from_valid_part_specs(self):
     max_partition = \
         HiveMetastoreHook._get_max_partition_from_part_specs(
             [{'key1': 'value1', 'key2': 'value2'},
              {'key1': 'value3', 'key2': 'value4'}],
             'key1',
             self.VALID_FILTER_MAP)
     self.assertEqual(max_partition, b'value1')
Example #10
    def test_get_max_partition_from_valid_part_specs_and_none_filter_map(self):
        max_partition = \
            HiveMetastoreHook._get_max_partition_from_part_specs(
                [{'key1': 'value1', 'key2': 'value2'},
                 {'key1': 'value3', 'key2': 'value4'}],
                'key1',
                None)

        # No partition will be filtered out.
        self.assertEqual(max_partition, b'value3')
Example #11
    def poke_partition(self, partition):
        if not self.hook:
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        schema, table, partition = self.parse_partition_name(partition)

        self.log.info('Poking for %s.%s/%s', schema, table, partition)
        return self.hook.check_for_named_partition(
            schema, table, partition)
Example #12
 def poke(self, context):
     if '.' in self.table:
         self.schema, self.table = self.table.split('.')
     self.log.info(
         'Poking for table {self.schema}.{self.table}, '
         'partition {self.partition}'.format(**locals()))
     if not hasattr(self, 'hook'):
         from airflow.hooks.hive_hooks import HiveMetastoreHook
         self.hook = HiveMetastoreHook(
             metastore_conn_id=self.metastore_conn_id)
     return self.hook.check_for_partition(
         self.schema, self.table, self.partition)
Example #13
 def poke(self, context):
     if '.' in self.table:
         self.schema, self.table = self.table.split('.')
     self.log.info(
         'Poking for table %s.%s, partition %s', self.schema, self.table, self.partition
     )
     if not hasattr(self, 'hook'):
         from airflow.hooks.hive_hooks import HiveMetastoreHook
         self.hook = HiveMetastoreHook(
             metastore_conn_id=self.metastore_conn_id)
     return self.hook.check_for_partition(
         self.schema, self.table, self.partition)
Example #14
class HivePartitionSensor(BaseSensorOperator):
    """
    Waits for a partition to show up in Hive.

    Note: Because ``partition`` supports general logical operators, it
    can be inefficient. Consider using NamedHivePartitionSensor instead if
    you don't need the full flexibility of HivePartitionSensor.

    :param table: The name of the table to wait for, supports the dot
        notation (my_database.my_table)
    :type table: string
    :param partition: The partition clause to wait for. This is passed as
        is to the metastore Thrift client ``get_partitions_by_filter`` method,
        and supports SQL-like notation as in ``ds='2015-01-01'
        AND type='value'`` and comparison operators as in ``"ds>=2015-01-01"``
    :type partition: string
    :param metastore_conn_id: reference to the metastore thrift service
        connection id
    :type metastore_conn_id: str
    """
    template_fields = ('schema', 'table', 'partition',)
    ui_color = '#C5CAE9'

    @apply_defaults
    def __init__(
            self,
            table, partition="ds='{{ ds }}'",
            metastore_conn_id='metastore_default',
            schema='default',
            poke_interval=60*3,
            *args, **kwargs):
        super(HivePartitionSensor, self).__init__(
            poke_interval=poke_interval, *args, **kwargs)
        if not partition:
            partition = "ds='{{ ds }}'"
        self.metastore_conn_id = metastore_conn_id
        self.table = table
        self.partition = partition
        self.schema = schema

    def poke(self, context):
        if '.' in self.table:
            self.schema, self.table = self.table.split('.')
        self.log.info(
            'Poking for table {self.schema}.{self.table}, '
            'partition {self.partition}'.format(**locals()))
        if not hasattr(self, 'hook'):
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)
        return self.hook.check_for_partition(
            self.schema, self.table, self.partition)
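A minimal usage sketch for the sensor above, assuming a DAG object named dag; the default partition clause already targets ds='{{ ds }}', and any filter accepted by the metastore's get_partitions_by_filter can be substituted:

wait_for_partition = HivePartitionSensor(
    task_id='wait_for_babynames_partition',
    table='airflow.static_babynames_partitioned',
    partition="ds='{{ ds }}'",           # the default, shown explicitly
    metastore_conn_id='metastore_default',
    poke_interval=5 * 60,                # re-check every five minutes
    dag=dag,
)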
Example #15
    def poke_partition(self, partition):
        if not self.hook:
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        schema, table, partition = self.parse_partition_name(partition)

        self.log.info(
            'Poking for {schema}.{table}/{partition}'.format(**locals())
        )
        return self.hook.check_for_named_partition(
            schema, table, partition)
Example #16
def closest_ds_partition(
        table, ds, before=True, schema="default",
        metastore_conn_id='metastore_default'):
    """
    This function finds the date in a list closest to the target date.
    An optional parameter can be given to get the closest before or after.

    :param table: A hive table name
    :type table: str
    :param ds: A datestamp ``%Y-%m-%d`` e.g. ``2015-01-01``
    :type ds: str
    :param before: closest before (True), after (False) or either side of ds
    :type before: bool or None
    :returns: The closest date
    :rtype: str or None

    >>> tbl = 'airflow.static_babynames_partitioned'
    >>> closest_ds_partition(tbl, '2015-01-02')
    '2015-01-01'
    """
    from airflow.hooks.hive_hooks import HiveMetastoreHook
    if '.' in table:
        schema, table = table.split('.')
    hh = HiveMetastoreHook(metastore_conn_id=metastore_conn_id)
    partitions = hh.get_partitions(schema=schema, table_name=table)
    if not partitions:
        return None
    part_vals = [list(p.values())[0] for p in partitions]
    if ds in part_vals:
        return ds
    else:
        parts = [datetime.datetime.strptime(pv, '%Y-%m-%d')
                 for pv in part_vals]
        target_dt = datetime.datetime.strptime(ds, '%Y-%m-%d')
        closest_ds = _closest_date(target_dt, parts, before_target=before)
        return closest_ds.isoformat()
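Like max_partition, this helper is also available to templates as macros.hive.closest_ds_partition, so it can pick the nearest existing partition for the current run date. A small sketch, assuming a DAG object named dag:

from airflow.operators.bash_operator import BashOperator

# Falls back to the closest earlier partition if the run date itself is missing.
use_closest = BashOperator(
    task_id='echo_closest_partition',
    bash_command=(
        "echo closest: "
        "{{ macros.hive.closest_ds_partition('airflow.static_babynames_partitioned', ds) }}"
    ),
    dag=dag,
)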
Example #17
    def poke(self, context):
        if not hasattr(self, 'hook'):
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        def poke_partition(partition):

            schema, table, partition = self.parse_partition_name(partition)

            self.log.info(
                'Poking for {schema}.{table}/{partition}'.format(**locals())
            )
            return self.hook.check_for_named_partition(
                schema, table, partition)

        while self.next_poke_idx < len(self.partition_names):
            if poke_partition(self.partition_names[self.next_poke_idx]):
                self.next_poke_idx += 1
            else:
                return False

        return True
Example #18
 def setUp(self):
     configuration.load_test_config()
     args = {'owner': 'airflow', 'start_date': DEFAULT_DATE}
     self.dag = DAG('test_dag_id', default_args=args)
     self.next_day = (DEFAULT_DATE +
                      datetime.timedelta(days=1)).isoformat()[:10]
     self.database = 'airflow'
     self.partition_by = 'ds'
     self.table = 'static_babynames_partitioned'
     self.hql = """
     CREATE DATABASE IF NOT EXISTS {{ params.database }};
     USE {{ params.database }};
     DROP TABLE IF EXISTS {{ params.table }};
     CREATE TABLE IF NOT EXISTS {{ params.table }} (
         state string,
         year string,
         name string,
         gender string,
         num int)
     PARTITIONED BY ({{ params.partition_by }} string);
     ALTER TABLE {{ params.table }}
     ADD PARTITION({{ params.partition_by }}='{{ ds }}');
     """
     self.hook = HiveMetastoreHook()
     t = operators.hive_operator.HiveOperator(
         task_id='HiveHook_' + str(random.randint(1, 10000)),
         params={
             'database': self.database,
             'table': self.table,
             'partition_by': self.partition_by
         },
         hive_cli_conn_id='beeline_default',
         hql=self.hql,
         dag=self.dag)
     t.run(start_date=DEFAULT_DATE,
           end_date=DEFAULT_DATE,
           ignore_ti_state=True)
Example #19
 def tearDown(self):
     hook = HiveMetastoreHook()
     with hook.get_conn() as metastore:
         metastore.drop_table(self.database, self.table, deleteData=True)
Example #20
class NamedHivePartitionSensor(BaseSensorOperator):
    """
    Waits for a set of partitions to show up in Hive.

    :param partition_names: List of fully qualified names of the
        partitions to wait for. A fully qualified name is of the
        form ``schema.table/pk1=pv1/pk2=pv2``, for example,
        default.users/ds=2016-01-01. This is passed as is to the metastore
        Thrift client ``get_partitions_by_name`` method. Note that
        you cannot use logical or comparison operators as in
        HivePartitionSensor.
    :type partition_names: list[str]
    :param metastore_conn_id: reference to the metastore thrift service
        connection id
    :type metastore_conn_id: str
    """

    template_fields = ('partition_names',)
    ui_color = '#8d99ae'

    @apply_defaults
    def __init__(self,
                 partition_names,
                 metastore_conn_id='metastore_default',
                 poke_interval=60 * 3,
                 hook=None,
                 *args,
                 **kwargs):
        super().__init__(
            poke_interval=poke_interval, *args, **kwargs)

        if isinstance(partition_names, str):
            raise TypeError('partition_names must be an array of strings')

        self.metastore_conn_id = metastore_conn_id
        self.partition_names = partition_names
        self.hook = hook
        if self.hook and metastore_conn_id != 'metastore_default':
            self.log.warning(
                'A hook was passed but a non-default metastore_conn_id=%s was used', metastore_conn_id
            )

    @staticmethod
    def parse_partition_name(partition):
        first_split = partition.split('.', 1)
        if len(first_split) == 1:
            schema = 'default'
            table_partition = max(first_split)  # poor man first
        else:
            schema, table_partition = first_split
        second_split = table_partition.split('/', 1)
        if len(second_split) == 1:
            raise ValueError('Could not parse ' + partition +
                             ' into table, partition')
        else:
            table, partition = second_split
        return schema, table, partition

    def poke_partition(self, partition):
        if not self.hook:
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        schema, table, partition = self.parse_partition_name(partition)

        self.log.info('Poking for %s.%s/%s', schema, table, partition)
        return self.hook.check_for_named_partition(
            schema, table, partition)

    def poke(self, context):

        self.partition_names = [
            partition_name for partition_name in self.partition_names
            if not self.poke_partition(partition_name)
        ]
        return not self.partition_names
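Usage mirrors HivePartitionSensor, except that a list of fully qualified partition names is given and no filter expressions are allowed; partition_names is a template field, so Jinja works inside each name. A sketch, assuming a DAG object named dag:

wait_for_partitions = NamedHivePartitionSensor(
    task_id='wait_for_named_partitions',
    partition_names=[
        'default.users/ds={{ ds }}',
        'default.events/ds={{ ds }}/country=US',
    ],
    metastore_conn_id='metastore_default',
    dag=dag,
)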
Example #21
 def test_get_max_partition_from_empty_part_names(self):
     max_partition = \
         HiveMetastoreHook._get_max_partition_from_part_names([], 'some_key')
     self.assertIsNone(max_partition)
Example #22
 def test_get_max_partition_from_mal_formatted_part_names(self):
     with self.assertRaises(AirflowException):
         HiveMetastoreHook._get_max_partition_from_part_names(
             ['bad_partition_name'], 'some_key')
Example #23
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {
            ('', 'count'): 'COUNT(*)'
        }
        for col, col_type in list(field_types.items()):
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join([
            v + " AS " + k[0] + '__' + k[1]
            for k, v in exprs.items()])

        where_clause = ["{} = '{}'".format(k, v) for k, v in self.partition.items()]
        where_clause = " AND\n        ".join(where_clause)
        sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format(
            exprs_str=exprs_str, table=self.table, where_clause=where_clause)

        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        self.log.info('Executing SQL check: %s', sql)
        row = presto.get_first(hql=sql)
        self.log.info("Record: %s", row)
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        self.log.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{table}' AND
            partition_repr='{part_json}' AND
            dttm='{dttm}'
        LIMIT 1;
        """.format(table=self.table, part_json=part_json, dttm=self.dttm)
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{table}' AND
                partition_repr='{part_json}' AND
                dttm='{dttm}';
            """.format(table=self.table, part_json=part_json, dttm=self.dttm)
            mysql.run(sql)

        self.log.info("Pivoting and loading cells into the Airflow db")
        rows = [(self.ds, self.dttm, self.table, part_json) + (r[0][0], r[0][1], r[1])
                for r in zip(exprs, row)]
        mysql.insert_rows(
            table='hive_stats',
            rows=rows,
            target_fields=[
                'ds',
                'dttm',
                'table_name',
                'partition_repr',
                'col',
                'metric',
                'value',
            ]
        )
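The method above gathers COUNT(*) plus per-column expressions for a single partition through Presto and replaces any earlier rows for that partition in a MySQL hive_stats table. A hedged instantiation sketch using only the attributes the method reads (the operator is HiveStatsCollectionOperator in Airflow 1.x; the exact constructor signature may vary by version), assuming a DAG object named dag:

from airflow.operators.hive_stats_operator import HiveStatsCollectionOperator  # path assumed (Airflow 1.x)

collect_stats = HiveStatsCollectionOperator(
    task_id='collect_babynames_stats',
    table='airflow.static_babynames_partitioned',
    partition={'ds': '2015-01-01'},      # one partition per run
    metastore_conn_id='metastore_default',
    presto_conn_id='presto_default',
    mysql_conn_id='airflow_db',
    dag=dag,
)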
Example #24
class NamedHivePartitionSensor(BaseSensorOperator):
    """
    Waits for a set of partitions to show up in Hive.

    :param partition_names: List of fully qualified names of the
        partitions to wait for. A fully qualified name is of the
        form ``schema.table/pk1=pv1/pk2=pv2``, for example,
        default.users/ds=2016-01-01. This is passed as is to the metastore
        Thrift client ``get_partitions_by_name`` method. Note that
        you cannot use logical or comparison operators as in
        HivePartitionSensor.
    :type partition_names: list of strings
    :param metastore_conn_id: reference to the metastore thrift service
        connection id
    :type metastore_conn_id: str
    """

    template_fields = ('partition_names', )
    ui_color = '#8d99ae'

    @apply_defaults
    def __init__(
            self,
            partition_names,
            metastore_conn_id='metastore_default',
            poke_interval=60 * 3,
            *args,
            **kwargs):
        super(NamedHivePartitionSensor, self).__init__(
            poke_interval=poke_interval, *args, **kwargs)

        if isinstance(partition_names, basestring):
            raise TypeError('partition_names must be an array of strings')

        self.metastore_conn_id = metastore_conn_id
        self.partition_names = partition_names
        self.next_poke_idx = 0

    @classmethod
    def parse_partition_name(self, partition):
        try:
            schema, table_partition = partition.split('.', 1)
            table, partition = table_partition.split('/', 1)
            return schema, table, partition
        except ValueError as e:
            raise ValueError('Could not parse ' + partition)

    def poke(self, context):
        if not hasattr(self, 'hook'):
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        def poke_partition(partition):

            schema, table, partition = self.parse_partition_name(partition)

            self.log.info(
                'Poking for {schema}.{table}/{partition}'.format(**locals())
            )
            return self.hook.check_for_named_partition(
                schema, table, partition)

        while self.next_poke_idx < len(self.partition_names):
            if poke_partition(self.partition_names[self.next_poke_idx]):
                self.next_poke_idx += 1
            else:
                return False

        return True
Example #25
class NamedHivePartitionSensor(BaseSensorOperator):
    """
    Waits for a set of partitions to show up in Hive.

    :param partition_names: List of fully qualified names of the
        partitions to wait for. A fully qualified name is of the
        form ``schema.table/pk1=pv1/pk2=pv2``, for example,
        default.users/ds=2016-01-01. This is passed as is to the metastore
        Thrift client ``get_partitions_by_name`` method. Note that
        you cannot use logical or comparison operators as in
        HivePartitionSensor.
    :type partition_names: list[str]
    :param metastore_conn_id: reference to the metastore thrift service
        connection id
    :type metastore_conn_id: str
    """

    template_fields = ('partition_names',)
    ui_color = '#8d99ae'

    @apply_defaults
    def __init__(self,
                 partition_names,
                 metastore_conn_id='metastore_default',
                 poke_interval=60 * 3,
                 hook=None,
                 *args,
                 **kwargs):
        super().__init__(
            poke_interval=poke_interval, *args, **kwargs)

        if isinstance(partition_names, str):
            raise TypeError('partition_names must be an array of strings')

        self.metastore_conn_id = metastore_conn_id
        self.partition_names = partition_names
        self.hook = hook
        if self.hook and metastore_conn_id != 'metastore_default':
            self.log.warning(
                'A hook was passed but a non-default metastore_conn_id=%s was used', metastore_conn_id
            )

    @staticmethod
    def parse_partition_name(partition):
        first_split = partition.split('.', 1)
        if len(first_split) == 1:
            schema = 'default'
            table_partition = max(first_split)  # poor man first
        else:
            schema, table_partition = first_split
        second_split = table_partition.split('/', 1)
        if len(second_split) == 1:
            raise ValueError('Could not parse ' + partition +
                             ' into table, partition')
        else:
            table, partition = second_split
        return schema, table, partition

    def poke_partition(self, partition):
        if not self.hook:
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        schema, table, partition = self.parse_partition_name(partition)

        self.log.info('Poking for %s.%s/%s', schema, table, partition)
        return self.hook.check_for_named_partition(
            schema, table, partition)

    def poke(self, context):

        self.partition_names = [
            partition_name for partition_name in self.partition_names
            if not self.poke_partition(partition_name)
        ]
        return not self.partition_names
Example #26
 def tearDown(self):
     hook = HiveMetastoreHook()
     with hook.get_conn() as metastore:
         metastore.drop_table(self.database, self.table, deleteData=True)
Example #27
    def execute(self, context=None):
        metastore = HiveMetastoreHook(metastore_conn_id=self.metastore_conn_id)
        table = metastore.get_table(table_name=self.table)
        field_types = {col.name: col.type for col in table.sd.cols}

        exprs = {('', 'count'): 'COUNT(*)'}
        for col, col_type in list(field_types.items()):
            d = {}
            if self.assignment_func:
                d = self.assignment_func(col, col_type)
                if d is None:
                    d = self.get_default_exprs(col, col_type)
            else:
                d = self.get_default_exprs(col, col_type)
            exprs.update(d)
        exprs.update(self.extra_exprs)
        exprs = OrderedDict(exprs)
        exprs_str = ",\n        ".join(
            [v + " AS " + k[0] + '__' + k[1] for k, v in exprs.items()])

        where_clause = [
            "{} = '{}'".format(k, v) for k, v in self.partition.items()
        ]
        where_clause = " AND\n        ".join(where_clause)
        sql = "SELECT {exprs_str} FROM {table} WHERE {where_clause};".format(
            exprs_str=exprs_str, table=self.table, where_clause=where_clause)

        presto = PrestoHook(presto_conn_id=self.presto_conn_id)
        self.log.info('Executing SQL check: %s', sql)
        row = presto.get_first(hql=sql)
        self.log.info("Record: %s", row)
        if not row:
            raise AirflowException("The query returned None")

        part_json = json.dumps(self.partition, sort_keys=True)

        self.log.info("Deleting rows from previous runs if they exist")
        mysql = MySqlHook(self.mysql_conn_id)
        sql = """
        SELECT 1 FROM hive_stats
        WHERE
            table_name='{table}' AND
            partition_repr='{part_json}' AND
            dttm='{dttm}'
        LIMIT 1;
        """.format(table=self.table, part_json=part_json, dttm=self.dttm)
        if mysql.get_records(sql):
            sql = """
            DELETE FROM hive_stats
            WHERE
                table_name='{table}' AND
                partition_repr='{part_json}' AND
                dttm='{dttm}';
            """.format(table=self.table, part_json=part_json, dttm=self.dttm)
            mysql.run(sql)

        self.log.info("Pivoting and loading cells into the Airflow db")
        rows = [(self.ds, self.dttm, self.table, part_json) +
                (r[0][0], r[0][1], r[1]) for r in zip(exprs, row)]
        mysql.insert_rows(table='hive_stats',
                          rows=rows,
                          target_fields=[
                              'ds',
                              'dttm',
                              'table_name',
                              'partition_repr',
                              'col',
                              'metric',
                              'value',
                          ])
Example #28
 def db(self):
     db = request.args.get("db")
     m = HiveMetastoreHook(METASTORE_CONN_ID)
     tables = sorted(m.get_tables(db=db), key=lambda x: x.tableName)
     return self.render(
         "metastore_browser/db.html", tables=tables, db=db)
Example #29
 def test_get_max_partition_from_empty_part_specs(self):
     max_partition = \
         HiveMetastoreHook._get_max_partition_from_part_specs([],
                                                              'key1',
                                                              self.VALID_FILTER_MAP)
     self.assertIsNone(max_partition)
Example #30
 def test_get_max_partition_from_empty_part_specs(self):
     max_partition = \
         HiveMetastoreHook._get_max_partition_from_part_specs([],
                                                              'key1',
                                                              self.VALID_FILTER_MAP)
     self.assertIsNone(max_partition)
Example #31
 def test_get_metastore_databases(self):
     if six.PY2:
         from airflow.hooks.hive_hooks import HiveMetastoreHook
         hook = HiveMetastoreHook()
         hook.get_databases()
Example #32
class NamedHivePartitionSensor(BaseSensorOperator):
    """
    Waits for a set of partitions to show up in Hive.

    :param partition_names: List of fully qualified names of the
        partitions to wait for. A fully qualified name is of the
        form ``schema.table/pk1=pv1/pk2=pv2``, for example,
        default.users/ds=2016-01-01. This is passed as is to the metastore
        Thrift client ``get_partitions_by_name`` method. Note that
        you cannot use logical or comparison operators as in
        HivePartitionSensor.
    :type partition_names: list of strings
    :param metastore_conn_id: reference to the metastore thrift service
        connection id
    :type metastore_conn_id: str
    """

    template_fields = ('partition_names', )
    ui_color = '#8d99ae'

    @apply_defaults
    def __init__(self,
                 partition_names,
                 metastore_conn_id='metastore_default',
                 poke_interval=60 * 3,
                 *args,
                 **kwargs):
        super(NamedHivePartitionSensor,
              self).__init__(poke_interval=poke_interval, *args, **kwargs)

        if isinstance(partition_names, basestring):
            raise TypeError('partition_names must be an array of strings')

        self.metastore_conn_id = metastore_conn_id
        self.partition_names = partition_names
        self.next_poke_idx = 0

    @classmethod
    def parse_partition_name(self, partition):
        try:
            schema, table_partition = partition.split('.', 1)
            table, partition = table_partition.split('/', 1)
            return schema, table, partition
        except ValueError as e:
            raise ValueError('Could not parse ' + partition)

    def poke(self, context):
        if not hasattr(self, 'hook'):
            from airflow.hooks.hive_hooks import HiveMetastoreHook
            self.hook = HiveMetastoreHook(
                metastore_conn_id=self.metastore_conn_id)

        def poke_partition(partition):

            schema, table, partition = self.parse_partition_name(partition)

            self.log.info(
                'Poking for {schema}.{table}/{partition}'.format(**locals()))
            return self.hook.check_for_named_partition(schema, table,
                                                       partition)

        while self.next_poke_idx < len(self.partition_names):
            if poke_partition(self.partition_names[self.next_poke_idx]):
                self.next_poke_idx += 1
            else:
                return False

        return True
Example #33
 def test_get_metastore_databases(self):
     if six.PY2:
         from airflow.hooks.hive_hooks import HiveMetastoreHook
         hook = HiveMetastoreHook()
         hook.get_databases()
Example #34
 def db(self):
     db = request.args.get("db")
     m = HiveMetastoreHook(METASTORE_CONN_ID)
     tables = sorted(m.get_tables(db=db), key=lambda x: x.tableName)
     return self.render("metastore_browser/db.html", tables=tables, db=db)
Example #35
 def test_check_hms_clients_load_balance(self, mock_client, mock_get_conn, mock_shuffle):
     HiveMetastoreHook()._find_valid_server()
     mock_shuffle.assert_called_once_with(mock_get_conn.return_value)