def drop_temp_hive_table(self):
    """Drop the `<table>_airflow_temp` Hive table, then delete its backing
    S3 result-cache directory via a shell command on the cluster.

    Raises:
        AirflowException: if either the Hive DROP or the s3cmd cleanup fails;
            the message includes the Qubole analyze URL when a command id exists.
    """
    command = None
    drop_stmt = None
    try:
        drop_stmt = "DROP TABLE IF EXISTS %s_airflow_temp;" % self.table_name
        command = HiveCommand.run(query=drop_stmt, label='default')
        self.monitor_command(command, drop_stmt)
    except Exception as e:
        # Fixed: messages previously said 'create_temp_hive_table' (copy-paste).
        if command is None:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. No command Id available.\n%s'
                % (drop_stmt, e))
        else:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (drop_stmt, command.id, e))

    # Reset so a failure before Command.run does not report the *previous*
    # command's id in the error message below.
    command = None
    stmt = None
    try:
        stmt = "s3cmd -c /usr/lib/hustler/s3cfg rm -rf s3://%s/result_cache/%s_airflow_temp/;" % \
            (self.s3_bucket, self.table_name)
        command = Command.run(command_type='ShellCommand', inline=stmt, label='default')
        self.monitor_command(command, stmt)
    except Exception as e:
        # Fixed: previously interpolated the Hive DROP statement here instead
        # of the shell command that actually failed.
        if command is None:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. No command Id available.\n%s'
                % (stmt, e))
        else:
            raise AirflowException(
                'drop_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (stmt, command.id, e))
def upgrade(context):
    """Create the per-account `tco_table` in the qubole_bi schema (idempotent)
    and record this migration as applied."""
    logging.debug("Execute upgrade of `%d`" % version)
    ddl = Template("""
        CREATE TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.tco_table (
            node_id BIGINT,
            cluster_inst_id INT,
            cluster_id INT,
            instance_type STRING,
            availability_zone STRING,
            region STRING,
            hour BIGINT,
            price DOUBLE,
            node_run_time INT,
            approx_per_hour_price DOUBLE,
            ondemand_price DOUBLE,
            up_time STRING,
            down_time STRING,
            node_type STRING
        )
        PARTITIONED BY (account_id INT, event_date STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/tco_table/'
    """)
    context["revisions.upgraded"].append(version)
    rendered = ddl.render_unicode(env=context["env"],
                                  account_id=context['account_id'],
                                  defloc=context['defloc'])
    Migrations.upgrade(migration_number=version,
                       command=HiveCommand.run(query=rendered))
    print(context)
def downgrade(context):
    """Reverse this migration by dropping the per-account
    `clusters_metadata2` table (if present)."""
    logging.debug("Execute downgrade of `%d`" % version)
    ddl = Template("""
        DROP TABLE IF EXISTS qubole_bi_${env}_${account_id}.clusters_metadata2;
    """)
    rendered = ddl.render_unicode(env=context["env"],
                                  account_id=context['account_id'])
    Migrations.downgrade(migration_number=version,
                         command=HiveCommand.run(query=rendered))
def upgrade(context):
    """Create the per-account/env qubole_bi database if it does not already
    exist, and record this migration as applied."""
    logging.debug("Execute upgrade of `%d`" % version)
    ddl = Template("""
        CREATE DATABASE IF NOT EXISTS qubole_bi_${env}_${account_id};
    """)
    context["revisions.upgraded"].append(version)
    rendered = ddl.render_unicode(env=context["env"],
                                  account_id=context['account_id'])
    Migrations.upgrade(migration_number=version,
                       command=HiveCommand.run(query=rendered))
    print(context)
def qubole(api_token, sql, replacements, filename, label='Trading-spark',
           delimiter='\t'):
    """Run a Hive query from a SQL file on Qubole and load the results into
    a pandas DataFrame.

    Args:
        api_token: Qubole API token used to configure the SDK.
        sql: path to a file containing the query text.
        replacements: mapping passed to find_replace_multi to substitute
            placeholders in the query.
        filename: base name for the local results file ('.csv' is appended).
        label: Qubole cluster label to run on (default preserves the
            previously hard-coded 'Trading-spark').
        delimiter: field delimiter used when parsing the results file
            (default preserves the previously hard-coded tab).

    Returns:
        pandas.DataFrame with the query results.
    """
    Qubole.configure(api_token=api_token)
    with open(sql, 'r') as f:
        query = f.read()
    query = find_replace_multi(query, replacements)
    hc = HiveCommand.run(query=query, label=label)
    cmd = Command.find(hc.id)
    out_file = filename + '.csv'
    # Results are streamed as bytes, hence binary mode.
    with open(out_file, 'wb') as writer:
        cmd.get_results(writer)
    return pd.read_csv(out_file, delimiter=delimiter)
def copy_to_temp_hive(self, columns):
    """Copy every row of the source table into its `<table>_airflow_temp`
    counterpart via an uncompressed Hive INSERT.

    Note: `columns` is accepted for interface compatibility but is not used
    by this implementation.

    Raises:
        AirflowException: if the Hive command fails; the message carries the
            Qubole analyze URL when a command id is available.
    """
    command = None
    copy_table_stmt = None
    try:
        copy_table_stmt = '''SET hive.exec.compress.output=false;
            INSERT INTO %s_airflow_temp select * from %s''' % (
            self.table_name, self.table_name)
        command = HiveCommand.run(query=copy_table_stmt, label='default')
        self.monitor_command(command, copy_table_stmt)
    except Exception as exc:
        if command is not None:
            raise AirflowException(
                'RS_copy_to_temp_hive call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (copy_table_stmt, command.id, exc))
        raise AirflowException(
            'RS_copy_to_temp_hive call for %s failed. No command Id available. \n%s'
            % (copy_table_stmt, exc))
def upgrade(context):
    """Create the per-account external `clusters_metadata2` table (idempotent)
    and record this migration as applied."""
    logging.debug("Execute upgrade of `%d`" % version)
    ddl = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.clusters_metadata2 (
            cluster_id STRING,
            cluster_inst_id STRING,
            tag STRING,
            ec2_tag STRING
        )
        PARTITIONED BY (account_id INT, event_date STRING)
        STORED AS ORC
        Location '${defloc}/qubole_bi/clusters_metadata2/'
    """)
    context["revisions.upgraded"].append(version)
    rendered = ddl.render_unicode(env=context["env"],
                                  account_id=context['account_id'],
                                  defloc=context['defloc'])
    Migrations.upgrade(migration_number=version,
                       command=HiveCommand.run(query=rendered))
    print(context)
def upgrade(context):
    """Create the per-account external `queries_info` table (idempotent)
    and record this migration as applied."""
    logging.debug("Execute upgrade of `%d`" % version)
    ddl = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.queries_info (
            command_id STRING,
            engine STRING,
            similarity_identifier STRING,
            similarity_payload MAP<STRING, STRING>,
            counters MAP<STRING, STRING>,
            cluster_id INT)
        PARTITIONED BY (account_id INT, event_date STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/queries_info/'
    """)
    context["revisions.upgraded"].append(version)
    rendered = ddl.render_unicode(env=context["env"],
                                  account_id=context['account_id'],
                                  defloc=context['defloc'])
    Migrations.upgrade(migration_number=version,
                       command=HiveCommand.run(query=rendered))
    print(context)
def create_temp_hive_table(self, columns):
    """(Re)create the uncompressed external text table
    `<table>_airflow_temp` over the S3 result-cache location.

    Args:
        columns: sequence of (name, type) pairs used to build the column
            definition list of the CREATE statement.

    Raises:
        AirflowException: if the Hive command fails; the message carries the
            Qubole analyze URL when a command id is available.
    """
    command = None
    create_table_stmt = None
    try:
        # One "name type" entry per column, comma-joined across lines.
        column_defs = ',\n'.join('%s %s' % (name, col_type)
                                 for name, col_type in columns)
        create_table_stmt = """
            SET hive.exec.compress.output=false;
            DROP TABLE IF EXISTS %s_airflow_temp;
            CREATE EXTERNAL TABLE %s_airflow_temp(
            %s)
            COMMENT 'The table %s_airflow_temp was generated by Hadoop Result Caching'
            ROW FORMAT DELIMITED
            FIELDS TERMINATED BY ','
            LINES TERMINATED BY '\\n'
            STORED AS TEXTFILE
            LOCATION 's3n://%s/result_cache/%s_airflow_temp/'
            TBLPROPERTIES("orc.compress"="NONE");""" % (
            self.table_name, self.table_name, column_defs,
            self.table_name, self.s3_bucket, self.table_name)
        command = HiveCommand.run(query=create_table_stmt, label='default')
        self.monitor_command(command, create_table_stmt)
    except Exception as exc:
        if command is not None:
            raise AirflowException(
                'RS_create_temp_hive_table call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s \n %s'
                % (create_table_stmt, command.id, exc))
        raise AirflowException(
            'RS_create_temp_hive_table call for %s failed. No command Id available. \n %s'
            % (create_table_stmt, exc))
def run_sql(self):
    """Render the task's raw SQL template with date helpers, prepend a
    tracking comment (dag/task/ds), and submit it to Qubole as either a
    Presto or Hive command depending on self.db_type.

    Raises:
        AirflowException: on an unrecognized db_type or any submission /
            monitoring failure; the message carries the Qubole analyze URL
            when a command id is available.
    """
    command = None
    sql_stmt = None
    try:
        dwd = DataWarehouseDates()
        sql_stmt = Template(self.raw_sql).render(
            DS=self.ds,
            DS_TODAY=dwd.ds_today(self.ds),
            DS_DATE_ID=dwd.date_id_from_date_str(self.ds),
            DS_DATE_ADD=lambda days: dwd.date_add(self.ds, days),
            DS_TODAY_DATE_ADD=lambda days: dwd.date_add(
                dwd.ds_today(self.ds), days))
        # Leading SQL comment lets the query be traced back to its DAG run.
        tracked_sql = '-- %s %s %s\n%s' % (
            self.dag_id, self.task_id, self.ds, sql_stmt)
        job_name = '%s_%s_%s' % (self.dag_id, self.task_id, self.ds)
        engine = self.db_type.upper()
        if 'PRESTO' in engine:
            # NOTE(review): Presto uses .create (submit) while Hive uses .run —
            # confirm this asymmetry is intentional; monitor_command appears to
            # handle the polling either way.
            command = PrestoCommand.create(query=tracked_sql,
                                           label=self.label,
                                           name=job_name)
        elif 'HIVE' in engine:
            command = HiveCommand.run(query=tracked_sql,
                                      label=self.label,
                                      name=job_name)
        else:
            raise AirflowException('Invalid db_type specified.')
        self.monitor_command(command, tracked_sql)
    except Exception as exc:
        if command is not None:
            raise AirflowException(
                'run_sql call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (sql_stmt, command.id, exc))
        raise AirflowException(
            'run_sql call for %s failed. No command Id available.\n%s'
            % (sql_stmt, exc))
def get_table_columns(self):
    """Run `show create table` on Qubole and parse (column, type) pairs out
    of the DDL text.

    A line is treated as a column definition when its second token — after
    stripping commas and parentheses — passes self.is_type.

    Returns:
        list of (column_name, column_type) tuples.

    Raises:
        AirflowException: if the Hive command fails; the message carries the
            Qubole analyze URL when a command id is available.
    """
    command = None
    sql = None
    try:
        sql = "show create table %s" % self.table_name
        command = HiveCommand.run(query=sql, label='default')
        # Results arrive as raw bytes; collect them line by line.
        result_buf = io.BytesIO()
        command.get_results(fp=result_buf)
        result_buf.seek(0)
        raw_lines = list(result_buf)
        result_buf.close()
        columns = []
        for raw in raw_lines:
            tokens = raw.decode("utf-8").split()
            if len(tokens) <= 1:
                continue
            # Chained strips (not strip(',()')) preserve the original
            # trimming order exactly.
            type_candidate = tokens[1].strip().strip(',').strip('(').strip(')')
            if self.is_type(type_candidate):
                columns.append((tokens[0].strip(),
                                tokens[1].strip(',').strip('(').strip(')')))
        return columns
    except Exception as exc:
        if command is not None:
            raise AirflowException(
                'RS_get_table_columns call for %s failed. https://api.qubole.com/v2/analyze?command_id=%s\n%s'
                % (sql, command.id, exc))
        raise AirflowException(
            'RS_get_table_columns call for %s failed. No command Id available.\n%s'
            % (sql, exc))
def upgrade(context):
    """Create the per-account external `usage` table (idempotent) and record
    this migration as applied."""
    logging.debug("Execute upgrade of `%d`" % version)
    ddl = Template("""
        CREATE EXTERNAL TABLE IF NOT EXISTS qubole_bi_${env}_${account_id}.`usage`(
            `cluster_id` INT,
            `cluster_inst_id` INT,
            `node_type` STRING,
            `cluster_type` STRING,
            `compute_usage` DECIMAL(30,8))
        PARTITIONED BY (
            `account_id` INT,
            `event_date` STRING)
        STORED AS ORC
        LOCATION '${defloc}/qubole_bi/qcuh/'
    """)
    context["revisions.upgraded"].append(version)
    rendered = ddl.render_unicode(env=context["env"],
                                  account_id=context['account_id'],
                                  defloc=context['defloc'])
    Migrations.upgrade(migration_number=version,
                       command=HiveCommand.run(query=rendered))
    print(context)
def setup():
    """Submit the module-level TEST_QUERY to Qubole and return the
    resulting HiveCommand object."""
    return HiveCommand.run(query=TEST_QUERY)