Exemplo n.º 1
0
    def execute(self, context):
        """
        Executes the transfer operation from hdfs to Clickhouse.

        The hdfs paths are listed first and the insert is skipped when they
        hold no data. Otherwise every item yielded by ``hdfs_client.cat`` is
        passed to ``ClickhouseHook.insert_rows`` and the ``written_rows``
        values reported in the ``X-ClickHouse-Summary`` response header are
        summed.

        :param context: The context that is being provided when executing.
        :type context: dict
        :raises AirflowException: if the hdfs client cannot be obtained, the
            paths cannot be listed, an insert fails, or no rows were written.
        """
        self.log.info("Connecting to clickhouse using %s connection.", self.clickhouse_conn_id)
        clickhouse = ClickhouseHook(clickhouse_conn_id=self.clickhouse_conn_id, auth=self.auth)

        self.log.info("Connecting to hdfs using %s connection.", self.hdfs_conn_id)
        try:
            hdfs = HDFSHook(hdfs_conn_id=self.hdfs_conn_id)
            hdfs_client = hdfs.get_conn()
        except Exception as e:
            raise AirflowException("Failed to retrieve hdfs client.", e)

        self.log.info("Checking hdfs paths %s", ', '.join(self.hdfs_paths))
        try:
            ls = list(hdfs_client.ls(self.hdfs_paths))
            self.log.info("Total files: %s", len(ls))
            # 'length' is the number of bytes stored; 'blocksize' is only the
            # configured block size and is non-zero even for an empty file.
            total_size = sum(f.get('length', 0) for f in ls)
        except Exception as e:
            raise AirflowException("Error checking hdfs paths.", e)

        if total_size == 0:
            self.log.warning("Files are empty, skipping insert.")
            return

        self.log.info("Reading from hdfs.")
        total_rows = 0
        try:
            hdfs_stdout_gen = hdfs_client.cat(self.hdfs_paths)
            self.log.info("Loading into clickhouse table %s...", self.clickhouse_table)
            for hdfs_stdout in hdfs_stdout_gen:
                r = clickhouse.insert_rows(table=self.clickhouse_table, data=hdfs_stdout,
                                           row_format=self.row_format, timeout=self.timeout,
                                           params={'send_progress_in_http_headers': 1})
                written_rows = json.loads(r.headers['X-ClickHouse-Summary']).get('written_rows')
                if written_rows:
                    total_rows += int(written_rows)
                else:
                    self.log.warning('Failed to retrieve row count.')
        except Exception as e:
            raise AirflowException("Error inserting into clickhouse", e)

        # Checked outside the try block so this error is reported as-is rather
        # than being re-wrapped by the generic insert-failure handler above.
        if total_rows == 0:
            raise AirflowException("Inserted rows: 0")
        self.log.info("Successfully inserted %s rows into %s.", total_rows, self.clickhouse_table)
Exemplo n.º 2
0
 def test_get_autoconfig_client(self, mock_get_connections, mock_client):
     """
     A connection whose extra enables ``autoconfig`` must result in exactly
     one call to ``mock_client``, with the connection login as the effective
     user and SASL disabled.
     """
     extra_json = json.dumps({'autoconfig': True})
     autoconfig_conn = Connection(
         conn_id='hdfs', conn_type='hdfs',
         host='localhost', port=8020,
         login='******', extra=extra_json)
     mock_get_connections.return_value = [autoconfig_conn]

     hook = HDFSHook(hdfs_conn_id='hdfs')
     hook.get_conn()

     expected_kwargs = {'effective_user': '******', 'use_sasl': False}
     mock_client.assert_called_once_with(**expected_kwargs)
Exemplo n.º 3
0
 def test_get_ha_client(self, mock_get_connections):
     """
     When two connections are returned for the default connection id, the
     hook must hand back a snakebite ``HAClient``.
     """
     # Two connections sharing one conn_id but pointing at different hosts.
     namenode_conns = [
         Connection(conn_id='hdfs_default', conn_type='hdfs', host=hostname, port=8020)
         for hostname in ('localhost', 'localhost2')
     ]
     mock_get_connections.return_value = namenode_conns

     hook = HDFSHook()
     ha_client = hook.get_conn()
     self.assertIsInstance(ha_client, snakebite.client.HAClient)
Exemplo n.º 4
0
 def test_get_autoconfig_client_no_conn(self, mock_client):
     """
     With ``autoconfig=True`` and the connection id ``hdfs_missing``,
     ``mock_client`` must be called exactly once with no effective user and
     SASL disabled.
     """
     hook = HDFSHook(hdfs_conn_id='hdfs_missing', autoconfig=True)
     hook.get_conn()

     expected_kwargs = {'effective_user': None, 'use_sasl': False}
     mock_client.assert_called_once_with(**expected_kwargs)
Exemplo n.º 5
0
 def test_get_client(self):
     """
     The hook built with default settings must return a plain snakebite
     ``Client`` for localhost:8020 whose channel uses the given proxy user as
     its effective user.
     """
     # The proxy user must be the same value the last assertion expects;
     # the previous masked placeholder ('******') could never equal 'foo'.
     client = HDFSHook(proxy_user='foo').get_conn()
     self.assertIsInstance(client, snakebite.client.Client)
     self.assertEqual('localhost', client.host)
     self.assertEqual(8020, client.port)
     self.assertEqual('foo', client.service.channel.effective_user)
Exemplo n.º 6
0
 def test_get_client(self):
     """
     The hook built with default settings must return a plain snakebite
     ``Client`` for localhost:8020 whose channel uses the given proxy user as
     its effective user.
     """
     # The proxy user must be the same value the last assertion expects;
     # the previous masked placeholder ('******') could never equal 'foo'.
     client = HDFSHook(proxy_user='foo').get_conn()
     assert isinstance(client, snakebite.client.Client)
     assert client.host == 'localhost'
     assert client.port == 8020
     assert client.service.channel.effective_user == 'foo'