def get_completed_job(job_id):
    # HiveCommand.find() is a class method; instantiating HiveCommand()
    # without attributes would raise, so look the command up directly.
    cmd = HiveCommand.find(job_id)
    print('Retrieving job:', job_id)
    if cmd.attributes.get('status', None) != 'done':
        raise ValueError('Job not yet completed')
    print('Job data successfully retrieved from Qubole')
    return cmd
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)
    template = Template("""
        CREATE TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.tco_table (
            node_id BIGINT,
            cluster_inst_id INT,
            cluster_id INT,
            instance_type STRING,
            availability_zone STRING,
            region STRING,
            hour BIGINT,
            price DOUBLE,
            node_run_time INT,
            approx_per_hour_price DOUBLE,
            ondemand_price DOUBLE,
            up_time STRING,
            down_time STRING,
            node_type STRING
        )
        PARTITIONED BY (account_id INT, event_date STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/tco_table/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
def drop_temp_hive_table(self):
    command = None
    drop_table_stmt = None
    try:
        drop_table_stmt = "DROP TABLE IF EXISTS %s_airflow_temp;" % self.table_name
        command = HiveCommand.run(query=drop_table_stmt, label='default')
        self.monitor_command(command, drop_table_stmt)
    except Exception as e:
        if command is None:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. No command Id available.\n%s'
                % (drop_table_stmt, e))
        else:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (drop_table_stmt, command.id, e))

    # Also remove the temporary table's backing data from S3.
    command = None
    try:
        stmt = "s3cmd -c /usr/lib/hustler/s3cfg rm -rf s3://%s/result_cache/%s_airflow_temp/;" % \
            (self.s3_bucket, self.table_name)
        command = Command.run(command_type='ShellCommand', inline=stmt, label='default')
        self.monitor_command(command, stmt)
    except Exception as e:
        if command is None:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. No command Id available.\n%s'
                % (stmt, e))
        else:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (stmt, command.id, e))
def downgrade(context):
    logging.debug("Execute downgrade of `%d`" % version)
    template = Template("""
        DROP TABLE IF EXISTS qubole_bi_${env}_${account_id}.clusters_metadata2;
    """)
    Migrations.downgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'])))
def read_data_from_hive(self, query, cluster, verbose=False, **kwargs):
    print('Running Hive query')
    cmd = HiveCommand.create(query=query, print_logs_live=True, label=cluster, **kwargs)
    # Poll until the command reaches the 'done' state; the helpers below
    # re-fetch the command and raise on error/cancellation.
    while cmd.attributes.get('status', None) != 'done':
        if verbose:
            cmd = self._get_logs(cmd)
        else:
            cmd = self._get_status(cmd)
    return cmd
def _get_logs(self, cmd):
    # Print only the log lines that are new since the previous poll.
    new_std_out = (cmd.get_log_partial()[0]).split('\n')
    for line in new_std_out[len(self.old_std_out):]:
        print(line)
    cmd = HiveCommand.find(cmd.id)
    self.old_std_out = new_std_out
    time.sleep(5)
    if cmd.attributes['status'] == 'error':
        raise RuntimeError('Job Failed')
    if cmd.attributes['status'] == 'cancelled':
        raise RuntimeError('Job Cancelled')
    return cmd
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)
    template = Template("""
        CREATE DATABASE IF NOT EXISTS qubole_bi_${env}_${account_id};
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(query=template.render_unicode(
            env=context["env"], account_id=context['account_id'])))
    print(context)
def get_job_result(self, qubole_jid):
    """Finds and retrieves results for an existing Qubole job.

    Args:
        qubole_jid: Qubole job id.

    Returns:
        Job stdout output.
    """
    self._configure_qubole()
    qubole_jid = str(qubole_jid)
    return self._get_qubole_command_output(HiveCommand.find(qubole_jid))
def run_query(self, query, user):
    qbol.configure(api_token=self.configuration.get('token'),
                   api_url='%s/api' % self.configuration.get('endpoint'))

    try:
        query_type = self.configuration.get('query_type', 'hive')

        if query_type == 'quantum':
            cmd = SqlCommand.create(query=query)
        elif query_type == 'hive':
            cmd = HiveCommand.create(query=query, label=self.configuration.get('cluster'))
        elif query_type == 'presto':
            cmd = PrestoCommand.create(query=query, label=self.configuration.get('cluster'))
        else:
            raise Exception("Invalid Query Type: %s. It must be: hive / presto / quantum."
                            % self.configuration.get('query_type'))

        logging.info("Qubole command created with Id: %s and Status: %s", cmd.id, cmd.status)

        # Poll until the command reaches a terminal state.
        while not Command.is_done(cmd.status):
            time.sleep(qbol.poll_interval)
            cmd = Command.find(cmd.id)
            logging.info("Qubole command Id: %s and Status: %s", cmd.id, cmd.status)

        rows = []
        columns = []
        error = None

        if cmd.status == 'done':
            fp = StringIO()
            cmd.get_results(fp=fp, inline=True, delim='\t', fetch=False,
                            qlog=None, arguments=['true'])

            results = fp.getvalue()
            fp.close()

            # The first row of the tab-delimited output carries the column names.
            data = results.split('\r\n')
            columns = self.fetch_columns([(i, TYPE_STRING) for i in data.pop(0).split('\t')])
            rows = [dict(zip((c['name'] for c in columns), row.split('\t')))
                    for row in data]

        json_data = json_dumps({'columns': columns, 'rows': rows})
    except KeyboardInterrupt:
        logging.info('Sending KILL signal to Qubole Command Id: %s', cmd.id)
        cmd.cancel()
        error = "Query cancelled by user."
        json_data = None

    return json_data, error
def qubole(api_token, sql, replacements, filename):
    Qubole.configure(api_token=api_token)
    with open(sql, 'r') as f:
        query = f.read()
    label = 'Trading-spark'
    query = find_replace_multi(query, replacements)
    hc = HiveCommand.run(query=query, label=label)
    cmd = Command.find(hc.id)
    out_file = filename + '.csv'
    with open(out_file, 'wb') as writer:
        cmd.get_results(writer)
    # Qubole returns tab-delimited results despite the .csv extension.
    df = pd.read_csv(out_file, delimiter='\t')
    return df
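# Hypothetical usage of qubole() above; the token, file paths, and the
# replacements mapping consumed by find_replace_multi are placeholders:
#
#   df = qubole(api_token='<API_TOKEN>',
#               sql='queries/daily_summary.sql',
#               replacements={'{{run_date}}': '2024-01-01'},
#               filename='daily_summary')
#   print(df.head())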
def copy_to_temp_hive(self, columns):
    command = None
    copy_table_stmt = None
    try:
        copy_table_stmt = '''SET hive.exec.compress.output=false;
            INSERT INTO %s_airflow_temp select * from %s''' % (
            self.table_name, self.table_name)
        command = HiveCommand.run(query=copy_table_stmt, label='default')
        self.monitor_command(command, copy_table_stmt)
    except Exception as e:
        if command is None:
            raise AirflowException(
                'RS_copy_to_temp_hive call for %s failed. No command Id available.\n%s'
                % (copy_table_stmt, e))
        else:
            raise AirflowException(
                'RS_copy_to_temp_hive call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (copy_table_stmt, command.id, e))
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)
    template = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.clusters_metadata2 (
            cluster_id STRING,
            cluster_inst_id STRING,
            tag STRING,
            ec2_tag STRING
        )
        PARTITIONED BY (account_id INT, event_date STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/clusters_metadata2/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
def test_execute_assert_query_fail(self, mock_get_hook):
    mock_cmd = mock.Mock()
    mock_cmd.status = 'error'
    mock_cmd.id = 123
    mock_cmd.is_success = mock.Mock(
        return_value=HiveCommand.is_success(mock_cmd.status))

    mock_hook = mock.Mock()
    mock_hook.get_first.return_value = [11]
    mock_hook.cmd = mock_cmd
    mock_get_hook.return_value = mock_hook

    operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

    with self.assertRaises(AirflowException) as cm:
        operator.execute()

    # A failed command should not leak the command id into the message.
    self.assertNotIn('Qubole Command Id: ', str(cm.exception))
    mock_cmd.is_success.assert_called_with(mock_cmd.status)
def test_execute_assertion_fail(self, mock_get_hook):
    mock_cmd = mock.Mock()
    mock_cmd.status = 'done'
    mock_cmd.id = 123
    mock_cmd.is_success = mock.Mock(
        return_value=HiveCommand.is_success(mock_cmd.status))

    mock_hook = mock.Mock()
    mock_hook.get_first.return_value = [11]
    mock_hook.cmd = mock_cmd
    mock_get_hook.return_value = mock_hook

    operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

    with self.assertRaisesRegexp(AirflowException,
                                 'Qubole Command Id: ' + str(mock_cmd.id)):
        operator.execute()

    mock_cmd.is_success.assert_called_with(mock_cmd.status)
def test_execute_assertion_fail(self, mock_get_hook):
    mock_cmd = mock.Mock()
    mock_cmd.status = 'done'
    mock_cmd.id = 123
    mock_cmd.is_success = mock.Mock(
        return_value=HiveCommand.is_success(mock_cmd.status))

    mock_hook = mock.Mock()
    mock_hook.get_first.return_value = [11]
    mock_hook.cmd = mock_cmd
    mock_get_hook.return_value = mock_hook

    operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

    with self.assertRaisesRegex(AirflowException,
                                'Qubole Command Id: ' + str(mock_cmd.id)):
        operator.execute()

    mock_cmd.is_success.assert_called_once_with(mock_cmd.status)
def test_execute_assert_query_fail(self, mock_get_hook):
    mock_cmd = mock.Mock()
    mock_cmd.status = 'error'
    mock_cmd.id = 123
    mock_cmd.is_success = mock.Mock(
        return_value=HiveCommand.is_success(mock_cmd.status))

    mock_hook = mock.Mock()
    mock_hook.get_first.return_value = [11]
    mock_hook.cmd = mock_cmd
    mock_get_hook.return_value = mock_hook

    operator = self.__construct_operator('select value from tab1 limit 1;', 5, 1)

    with self.assertRaises(AirflowException) as cm:
        operator.execute()

    self.assertNotIn('Qubole Command Id: ', str(cm.exception))
    mock_cmd.is_success.assert_called_once_with(mock_cmd.status)
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)
    template = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.queries_info (
            command_id STRING,
            engine STRING,
            similarity_identifier STRING,
            similarity_payload MAP<STRING, STRING>,
            counters MAP<STRING, STRING>,
            cluster_id INT)
        PARTITIONED BY (account_id INT, event_date STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/queries_info/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
def create_temp_hive_table(self, columns):
    command = None
    create_table_stmt = None
    try:
        # Build an external, uncompressed, comma-delimited table backed by S3
        # from the (name, type) pairs in `columns`.
        create_table_stmt = """
            SET hive.exec.compress.output=false;
            DROP TABLE IF EXISTS %s_airflow_temp;
            CREATE EXTERNAL TABLE %s_airflow_temp(
            %s)
            COMMENT 'The table %s_airflow_temp was generated by Hadoop Result Caching'
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            LINES TERMINATED BY '\\n'
            STORED AS TEXTFILE
            LOCATION 's3n://%s/result_cache/%s_airflow_temp/'
            TBLPROPERTIES("orc.compress"="NONE");""" % \
            (self.table_name,
             self.table_name,
             ',\n'.join(['%s %s' % (c[0], c[1]) for c in columns]),
             self.table_name,
             self.s3_bucket,
             self.table_name)
        command = HiveCommand.run(query=create_table_stmt, label='default')
        self.monitor_command(command, create_table_stmt)
    except Exception as e:
        if command is None:
            raise AirflowException(
                'RS_create_temp_hive_table call for %s failed. No command Id available.\n%s'
                % (create_table_stmt, e))
        else:
            raise AirflowException(
                'RS_create_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (create_table_stmt, command.id, e))
def run_sql(self):
    command = None
    sql_stmt = None
    try:
        template = Template(self.raw_sql)
        dwd = DataWarehouseDates()
        sql_stmt = template.render(
            DS=self.ds,
            DS_TODAY=dwd.ds_today(self.ds),
            DS_DATE_ID=dwd.date_id_from_date_str(self.ds),
            DS_DATE_ADD=lambda days: dwd.date_add(self.ds, days),
            DS_TODAY_DATE_ADD=lambda days: dwd.date_add(dwd.ds_today(self.ds), days))
        # Prepend a tracking comment so the query can be traced back to its DAG run.
        sql_stmt_with_tracking = '-- %s %s %s\n%s' % (
            self.dag_id, self.task_id, self.ds, sql_stmt)
        qubole_name = '%s_%s_%s' % (self.dag_id, self.task_id, self.ds)
        if 'PRESTO' in self.db_type.upper():
            command = PrestoCommand.create(query=sql_stmt_with_tracking,
                                           label=self.label, name=qubole_name)
        elif 'HIVE' in self.db_type.upper():
            command = HiveCommand.run(query=sql_stmt_with_tracking,
                                      label=self.label, name=qubole_name)
        else:
            raise AirflowException('Invalid db_type specified.')
        self.monitor_command(command, sql_stmt_with_tracking)
    except Exception as e:
        if command is None:
            raise AirflowException(
                'run_sql call for %s failed. No command Id available.\n%s'
                % (sql_stmt, e))
        else:
            raise AirflowException(
                'run_sql call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (sql_stmt, command.id, e))
def get_table_columns(self):
    command = None
    sql = None
    try:
        sql = "show create table %s" % self.table_name
        command = HiveCommand.run(query=sql, label='default')
        buffer = io.BytesIO()
        command.get_results(fp=buffer)
        buffer.seek(0)
        buffer_list = [t for t in buffer]
        buffer.close()
        buffer = None
        column_names_types = []
        # Parse "name type," lines out of the SHOW CREATE TABLE output,
        # keeping only tokens that look like a valid column type.
        for line in buffer_list:
            decoded = line.decode("utf-8")
            column_type = decoded.split()
            if len(column_type) > 1:
                if self.is_type(column_type[1].strip().strip(',').strip('(').strip(')')):
                    column_names_types.append(
                        (column_type[0].strip(),
                         column_type[1].strip(',').strip('(').strip(')')))
        return column_names_types
    except Exception as e:
        if command is None:
            raise AirflowException(
                'RS_get_table_columns call for %s failed. No command Id available.\n%s'
                % (sql, e))
        else:
            raise AirflowException(
                'RS_get_table_columns call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (sql, command.id, e))
def upgrade(context):
    logging.debug("Execute upgrade of `%d`" % version)
    template = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.`usage` (
            `cluster_id` INT,
            `cluster_inst_id` INT,
            `node_type` STRING,
            `cluster_type` STRING,
            `compute_usage` DECIMAL(30,8))
        PARTITIONED BY (
            `account_id` INT,
            `event_date` STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/qcuh/'
    """)
    context["revisions.upgraded"].append(version)
    Migrations.upgrade(
        migration_number=version,
        command=HiveCommand.run(
            query=template.render_unicode(env=context["env"],
                                          account_id=context['account_id'],
                                          defloc=context['defloc'])))
    print(context)
def setup():
    """Execute a test query and return the HiveCommand object."""
    hc = HiveCommand.run(query=TEST_QUERY)
    return hc
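# A minimal polling sketch for the command returned by setup(), assuming
# Qubole.configure() has already been called; it mirrors the
# Command.is_done() loop used in run_query above:
def wait_for_command(cmd):
    # Re-fetch the command until it reaches a terminal state.
    while not Command.is_done(cmd.status):
        time.sleep(Qubole.poll_interval)
        cmd = Command.find(cmd.id)
    return cmd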
def run_query(self, query, user):
    qbol.configure(
        api_token=self.configuration.get("token"),
        api_url="%s/api" % self.configuration.get("endpoint"),
    )

    try:
        query_type = self.configuration.get("query_type", "hive")

        if query_type == "quantum":
            cmd = SqlCommand.create(query=query)
        elif query_type == "hive":
            cmd = HiveCommand.create(
                query=query, label=self.configuration.get("cluster"))
        elif query_type == "presto":
            cmd = PrestoCommand.create(
                query=query, label=self.configuration.get("cluster"))
        else:
            raise Exception("Invalid Query Type: %s. It must be: hive / presto / quantum."
                            % self.configuration.get("query_type"))

        logging.info("Qubole command created with Id: %s and Status: %s",
                     cmd.id, cmd.status)

        # Poll until the command reaches a terminal state.
        while not Command.is_done(cmd.status):
            time.sleep(qbol.poll_interval)
            cmd = Command.find(cmd.id)
            logging.info("Qubole command Id: %s and Status: %s", cmd.id, cmd.status)

        rows = []
        columns = []
        error = None

        if cmd.status == "done":
            fp = StringIO()
            cmd.get_results(
                fp=fp,
                inline=True,
                delim="\t",
                fetch=False,
                qlog=None,
                arguments=["true"],
            )

            results = fp.getvalue()
            fp.close()

            # The first row of the tab-delimited output carries the column names.
            data = results.split("\r\n")
            columns = self.fetch_columns([
                (i, TYPE_STRING) for i in data.pop(0).split("\t")
            ])
            rows = [
                dict(zip((column["name"] for column in columns), row.split("\t")))
                for row in data
            ]

        json_data = json_dumps({"columns": columns, "rows": rows})
    except KeyboardInterrupt:
        logging.info("Sending KILL signal to Qubole Command Id: %s", cmd.id)
        cmd.cancel()
        error = "Query cancelled by user."
        json_data = None

    return json_data, error
def run_hive_query_asynchronous(cluster_label, query_filename, **query_kwargs):
    # Render the query template and submit it without waiting for completion.
    template_loader = TemplateLoader(QUERY_DIR)
    query = template_loader.load_from_file(query_filename, **query_kwargs)
    return HiveCommand.create(query=query, label=cluster_label)
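# Hypothetical usage of run_hive_query_asynchronous(); the cluster label,
# query file, and template kwargs are placeholders. The returned HiveCommand
# can be refreshed later with HiveCommand.find(), as in the polling loops above:
#
#   cmd = run_hive_query_asynchronous('default', 'daily_rollup.sql',
#                                     event_date='2024-01-01')
#   # ... do other work ...
#   cmd = HiveCommand.find(cmd.id)  # refresh status when ready to check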