Example #1
    def insert_hive(self):
        """Insert buffered logs to Hive."""

        log_txt = '\n'.join(self.buffer_logs)
        _log_txt = self._escape(log_txt.strip())
        _now = datetime.datetime.now()
        _time = _now.strftime("%Y-%m-%d %H:%M:%S.%f")
        format_data = {
            'failed_table_name': self.source_table.strip(),
            'log': _log_txt,
            'log_time': _time,
            'status': self.qa_status_log
        }
        with open(LOG_FILE, 'wb') as file_h:
            file_h.write(log_txt)

        # File-based insert operation
        self.pyhdfs.insert_update('',
                                  self.prepares_data(format_data),
                                  is_append=True)

        # We need to perform INVALIDATE METADATA to reflect Hive
        # file-based operations in Impala.
        ImpalaConnect.invalidate_metadata(self.host_name,
                                          'ibis.qa_export_results')

        msg = '\n' + '#' * 100
        msg += ("\nFor QA results: Run query in Impala: "
                "select status, * from ibis.qa_export_results where "
                "log_time='{0}'")
        msg += '\n' + '#' * 100
        msg = msg.format(_time)
        print msg
        logging.info('Inserted logs to Hive')
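The INVALIDATE METADATA step matters because Impala caches catalog metadata and cannot see rows that Hive-side file operations append behind its back. A minimal sketch of what a helper like ImpalaConnect.invalidate_metadata could look like, assuming impyla as the underlying client; the port and connection handling here are assumptions, not the project's actual implementation:

from impala.dbapi import connect  # impyla, assumed client library

def invalidate_metadata(host, table_name):
    # Files written directly to HDFS (e.g. by Hive) stay invisible to
    # Impala until its cached metadata for the table is invalidated.
    conn = connect(host=host, port=21050)  # 21050: Impala's default port
    try:
        cur = conn.cursor()
        # Scoped to one table; a bare INVALIDATE METADATA flushes the
        # entire catalog and is far more expensive.
        cur.execute('INVALIDATE METADATA {0}'.format(table_name))
    finally:
        conn.close()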
Example #2
    def get_record(self):
        """Get it-table record"""
        query = ("SELECT * FROM {0} WHERE "
                 "source_table_name='{1}' AND "
                 "source_database_name='{2}' AND "
                 "db_env='{3}';")
        query = query.format(self.it_table, self.table_name, self.database,
                             self.db_env)
        row = None
        try:
            row = ImpalaConnect.run_query(self.it_table_host,
                                          self.it_table,
                                          query,
                                          op='select')
        except socket.error as err:
            print "Socket error({0}): {1}".format(err.errno, err.strerror)
            print 'Trying again...'
            row = ImpalaConnect.run_query(self.it_table_host,
                                          self.it_table,
                                          query,
                                          op='select')

        if not row:
            err_msg = "Failed to fetch results for query: {0}"
            err_msg = err_msg.format(query)
            raise ValueError(err_msg)

        return row[0]
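The single manual retry above works, but the same pattern recurs for every ImpalaConnect call made over a flaky network. A hedged sketch of a generic wrapper; the helper name and defaults are illustrative, not part of the project:

import socket
import time

def retry_on_socket_error(func, attempts=2, delay=1):
    # Call func(); on socket.error retry up to `attempts` times total,
    # re-raising the last failure so the caller still sees the error.
    for attempt in range(attempts):
        try:
            return func()
        except socket.error:
            if attempt == attempts - 1:
                raise
            time.sleep(delay)

# usage sketch:
# row = retry_on_socket_error(
#     lambda: ImpalaConnect.run_query(self.it_table_host, self.it_table,
#                                     query, op='select'))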
Example #3
    def update_checks_balances(self, workflow, start_time=None):
        """Given a workflow app name, query oozie api for workflow job,
        update checks_balances table with records
        """
        if workflow:
            for record in self.get_records(workflow):
                # Update records in the checks_balances table.
                # Parquet size depends on the live location size in HDFS:
                # records are replaced in live only on success; otherwise
                # the last ingest's records are kept.
                self.update_old_cb_table(
                    record.domain, record.table_name, record)

            # We need to perform INVALIDATE METADATA to reflect Hive
            # file-based operations in Impala.
            ImpalaConnect.invalidate_metadata(self.host,
                                              'ibis.checks_balances_export')
Example #4
def main():
    global logger
    args = {
        'source_database_name': sys.argv[1],
        'source_table_name': sys.argv[2],
        'database': sys.argv[3],
        'target_table': sys.argv[4],
        'jars': sys.argv[5],
        'jdbc_url': sys.argv[6],
        'connection_factories': sys.argv[7],
        'user_name': sys.argv[8],
        'jceks': sys.argv[9],
        'password_alias': sys.argv[10],
        'host_name': os.environ['IMPALA_HOST'],
        'target_schema': sys.argv[12],
        'oozie_url': sys.argv[13],
        'workflow_name': sys.argv[14],
        'qa_exp_results_dir': sys.argv[16]
    }
    DEBUG = sys.argv[15] == 'true'

    cb_mgr = ChecksBalancesExportManager(args['host_name'], args['oozie_url'],
                                         args['qa_exp_results_dir'])
    actions = cb_mgr.check_if_workflow_actions(args['workflow_name'])
    new_action = None  # guard: stays None if no matching export action exists
    for action in actions:
        if sys.argv[2] + '_export' in action.get_name():
            new_action = action

    source_table = args['source_database_name'] + '.' + \
        args['source_table_name']
    logger = LogHandler(args['host_name'], source_table,
                        args['qa_exp_results_dir'])
    try:
        target_table = args['database'] + '.' + args['target_table']
        val_obj = TableValidation(source_table, target_table, args)
        bool_status = val_obj.start_data_sampling(new_action)
        logger.insert_hive()
        exit_code = 0 if bool_status else 1
    except Exception:
        exit_code = 1
        logger.error(traceback.format_exc())
        logger.insert_hive()

    ImpalaConnect.close_conn()
    sys.exit(exit_code)
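Because args is built from fixed sys.argv positions, a missing argument surfaces as a bare IndexError deep inside the dict construction. A small defensive sketch; the expected count of 17 (argv[0] plus 16 positionals, matching the highest index used above) is an assumption about how the Oozie workflow invokes this script:

import sys

EXPECTED_ARGC = 17  # assumption: script name plus 16 positional arguments

def check_argv(argv):
    # Fail fast with a readable message instead of an IndexError.
    if len(argv) < EXPECTED_ARGC:
        raise SystemExit('expected {0} arguments, got {1}'.format(
            EXPECTED_ARGC - 1, len(argv) - 1))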
Example #5
    def get_source_row_count(self):
        """Fetch the row count of the source table.

        Queries Impala for COUNT(*) on the source table and returns the
        count as an int.
        """
        count_query = "SELECT COUNT(*) FROM {0};".format(self.table)
        output = ImpalaConnect.run_query(self.host_name, self.table,
                                         count_query)
        # run_query returns rows as a list of tuples; COUNT(*) yields a
        # single row with a single column.
        return int(output[0][0])
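These per-table counts are typically aggregated into a mapping of table name to row count, which is the dictionary the surrounding code works with. A hedged sketch of that aggregation; the checkers iterable and its attributes are hypothetical:

def get_all_row_counts(checkers):
    # Hypothetical caller-side aggregation: {table_name: row_count},
    # built from per-table get_source_row_count() calls.
    return dict((chk.table, chk.get_source_row_count()) for chk in checkers)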
Example #6
    def get_schema(self):
        """Get the it-table schema."""
        query = "describe {0};".format(self.it_table)
        row = ImpalaConnect.run_query(self.it_table_host,
                                      self.it_table,
                                      query,
                                      op='select')
        if not row:
            err_msg = "Failed to fetch results for query: {0}"
            err_msg = err_msg.format(query)
            raise ValueError(err_msg)
        return row
Example #7
def main():
    global logger
    args = {
        'source_database_name': sys.argv[1],
        'source_table_name': sys.argv[2],
        'database': sys.argv[3],
        'target_table': sys.argv[4],
        'jars': sys.argv[5],
        'jdbc_url': sys.argv[6],
        'connection_factories': sys.argv[7],
        'user_name': sys.argv[8],
        'jceks': sys.argv[9],
        'password_alias': sys.argv[10],
        'host_name': os.environ['IMPALA_HOST'],
        'target_schema': sys.argv[11],
        'qa_exp_results_dir': sys.argv[13]
    }
    DEBUG = sys.argv[12] == 'true'

    source_table = args['source_database_name'] + '.' + \
        args['source_table_name']
    logger = LogHandler(source_table, args['host_name'],
                        args['qa_exp_results_dir'])
    try:
        target_table = args['database'] + '.' + args['target_table']
        val_obj = TableValidation(source_table, target_table, args)
        bool_status_list = val_obj.start_data_sampling()
        logger.insert_hive()
        exit_code = 0 if bool_status_list[0] else 1
        if 'db2' in args['jdbc_url']:
            # DB2 exports run three checks; all must pass.
            exit_code = 0 if all(bool_status_list[:3]) else 1
    except Exception:
        exit_code = 1
        logger.error(traceback.format_exc())
        logger.insert_hive()

    ImpalaConnect.close_conn()
    sys.exit(exit_code)
Example #8
    def get_ddl(self):
        """Query the Hive database for column name, data type, and
        column count.

        Returns a list of DDL objects holding each column's name and
        data type.
        """
        ddl_query = self.build_ddl_query()
        output = ImpalaConnect.run_query(self.host_name, self.table, ddl_query)
        ddl_list = self.set_column_props(output)
        # Sorting is left to the caller: source and target DDL output
        # are not ordered, so compare only after sorting, e.g.
        # ddl_list.sort(key=lambda ddl: ddl.column_name)
        return ddl_list
Example #9
                job_numbers[action.get_name()] = action.get_external_id()
        sub_actions = ""
        if normal_or_sub:
            sub_workflow_to_update = job_numbers[max(job_numbers.keys())]
            id_externals = self.ooz.get_job(sub_workflow_to_update)
            workflow = id_externals[1]
            sub_actions = id_externals[1].get_actions()
        else:
            workflow = workflow_job
        return workflow

    def check_if_workflow_actions(self, app_name):
        """Return the list of actions for the workflow named app_name."""
        workflow_job = self.get_workflow_job(app_name)
        actions = workflow_job.get_actions()
        print "actions", actions
        return actions

if __name__ == "__main__":
    # Application name
    app_name = sys.argv[1]
    HOST = os.environ['IMPALA_HOST']
    OOZIE_URL = sys.argv[2]
    CHK_BAL_EXP_DIR = sys.argv[3]
    export_cb_mgr = ChecksBalancesExportManager(
        HOST,
        OOZIE_URL,
        CHK_BAL_EXP_DIR)
    workflow = export_cb_mgr.check_if_workflow(app_name)
    export_cb_mgr.update_checks_balances(workflow)
    ImpalaConnect.close_conn()
Example #10
        def gen_view_hql(target_db, domain=None):
            """creates view hql"""
            if domain is not None:
                team_view_name = '{team_nm}.{team_db}_{team_table}'
                team_view_name = team_view_name.format(
                    team_nm=domain,
                    team_db=self.clean_database,
                    team_table=self.clean_table_name)

            view_full_name = '{view_name}.{database}_{table_name}'
            view_full_name = view_full_name.format(
                view_name=target_db,
                database=self.clean_database,
                table_name=self.clean_table_name)
            src_view = '{src_vw_name}.{src_db}_{src_tbl_name}'
            src_view = src_view.format(src_vw_name=self.domain,
                                       src_db=self.clean_database,
                                       src_tbl_name=self.clean_table_name)

            phi_domain = target_db.split('_non_phi')[0]
            # Domains configured for this environment that match the PHI
            # domain of the target database (empty when none match).
            only_domain = list(
                set(self.domain_list.split(",")).intersection([phi_domain]))

            if rpc_method == 'full_load':
                add_partition_hql = (
                    "ALTER TABLE `{view_full_name}` ADD PARTITION "
                    "(incr_ingest_timestamp='full_{ingest_date}');\n\n")
                add_partition_hql = add_partition_hql.format(
                    view_full_name=view_full_name,
                    ingest_date=self.partition_name)
            else:
                # incremental loads (and any other rpc_method) add no
                # partition here
                add_partition_hql = ''

            if self.ibis_env == 'PERF' and len(only_domain) < 1:
                try:
                    try:
                        ImpalaConnect.run_query(
                            self.host_name, view_full_name,
                            'USE {db}'.format(db=view_full_name.split(".")[0]),
                            'create')
                        is_table_available = ImpalaConnect.run_query(
                            self.host_name, view_full_name,
                            "SHOW TABLES LIKE '{table_name}'".format(
                                table_name=view_full_name.split(".")[1]))

                        if is_table_available is None or \
                                is_table_available == '':
                            raise ValueError("Table not found")
                    except Exception:
                        raise ValueError("Database/Table not found")

                    max_trgt_ingest = ('select max(ingest_timestamp) from '
                                       '{view_full_name}')
                    max_trgt_ingest = max_trgt_ingest.format(
                        view_full_name=view_full_name)
                    max_trgt_ingest = ImpalaConnect.run_query(
                        self.host_name, view_full_name, max_trgt_ingest)

                    team_run_freq = ("select * from {freq_ingest} where"
                                     " view_name='{view_nm_splt}' and "
                                     "full_table_name='{table_name}'")
                    team_run_freq = team_run_freq.format(
                        view_nm_splt=target_db,
                        table_name=src_view,
                        freq_ingest=self.freq_ingest)
                    team_run_freq = ImpalaConnect.run_query(
                        self.host_name, self.freq_ingest, team_run_freq)
                    frequency = team_run_freq[0][0]

                    # Map the configured ingest frequency onto an interval
                    # in days; unrecognized values fall back to 0 (never
                    # refresh) so team_freq is always bound.
                    freq_days = {'none': 0, 'daily': 1, 'weekly': 7,
                                 'biweekly': 14, 'fortnightly': 15,
                                 'monthly': 30, 'quarterly': 90,
                                 'yearly': 364}
                    team_freq = freq_days.get(frequency, 0)

                    max_ingest = datetime.datetime.strptime(
                        max_trgt_ingest[0][0], '%Y-%m-%d %H:%M:%S')
                    # Truncate "now" to whole seconds to match the
                    # resolution of the stored ingest timestamp.
                    curr_date = datetime.datetime.now().replace(microsecond=0)
                    last_ingest_day = (curr_date - max_ingest).days
                    if team_freq != 0 and last_ingest_day >= team_freq and \
                            team_run_freq[0][1].lower() == 'yes':
                        if domain:
                            select_query = "select * from {team_view_name}".\
                                format(team_view_name=team_view_name)
                            views_hql = (
                                'DROP VIEW IF EXISTS {view_full_name};\n'
                                'CREATE DATABASE IF NOT EXISTS {view_name};'
                                '\n'
                                'CREATE DATABASE IF NOT EXISTS {team_name};'
                                '\n'
                                'DROP VIEW IF EXISTS {team_view_name};\n'
                                'DROP TABLE IF EXISTS {team_view_name};\n'
                                'CREATE VIEW {team_view_name} AS '
                                'SELECT * FROM {src_view}; \n'
                                'CREATE TABLE IF NOT EXISTS {view_full_name} '
                                'like {src_view};\n\n')
                            views_hql = views_hql.format(
                                view_full_name=view_full_name,
                                view_name=target_db,
                                src_view=src_view,
                                team_view_name=team_view_name,
                                team_name=domain)

                        else:
                            select_query = "select * from {src_view}".format(
                                src_view=src_view)
                            views_hql = (
                                'DROP VIEW IF EXISTS {view_full_name};\n'
                                'CREATE DATABASE IF NOT EXISTS {view_name};'
                                '\n'
                                'CREATE TABLE IF NOT EXISTS {view_full_name} '
                                'like {src_view};\n\n'
                                'INSERT OVERWRITE TABLE {view_full_name} '
                                'PARTITION(incr_ingest_timestamp) '
                                'select * from {src_view} ;\n\n')

                            views_hql = views_hql.format(
                                view_full_name=view_full_name,
                                view_name=target_db,
                                src_view=src_view)

                        if rpc_method == 'incremental':
                            insert_statement = (
                                "INSERT OVERWRITE TABLE {view_full_name} "
                                "PARTITION(incr_ingest_timestamp) "
                                "{select_query} where "
                                "ingest_timestamp > '{maxtime}';\n\n")
                            insert_statement = insert_statement.format(
                                view_full_name=view_full_name,
                                maxtime=max_trgt_ingest[0][0],
                                select_query=select_query)
                        else:
                            insert_statement = (
                                'INSERT OVERWRITE TABLE {view_full_name} '
                                'PARTITION(incr_ingest_timestamp) '
                                '{select_query};\n\n')
                            insert_statement = insert_statement.format(
                                view_full_name=view_full_name,
                                select_query=select_query)

                            drop_hql = (
                                'DROP TABLE IF EXISTS {view_full_name};\n\n')

                            views_hql = drop_hql.format(
                                view_full_name=view_full_name) + views_hql
                        views_hql += insert_statement

                        views_hql += "msck repair table {0};\n\n".format(
                            view_full_name)

                    else:
                        print 'No views hql created'
                        views_hql = ''

                except ValueError as ex:
                    print str(ex)
                    if domain:
                        views_hql = (
                            'DROP VIEW IF EXISTS {view_full_name};\n'
                            'DROP TABLE IF EXISTS {view_full_name};\n\n'
                            'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                            'CREATE DATABASE IF NOT EXISTS {team_name};\n'
                            'DROP VIEW IF EXISTS {team_view_name};\n'
                            'DROP TABLE IF EXISTS {team_view_name};\n'
                            'CREATE VIEW {team_view_name} AS '
                            'SELECT * FROM {src_view}; \n'
                            'CREATE TABLE {view_full_name} '
                            'like {src_view};\n\n'
                            'INSERT OVERWRITE TABLE {view_full_name} '
                            'PARTITION(incr_ingest_timestamp) '
                            'select * from {team_view_name} ;\n\n')
                        views_hql = views_hql.format(
                            view_full_name=view_full_name,
                            view_name=target_db,
                            src_view=src_view,
                            team_view_name=team_view_name,
                            team_name=domain)
                    else:
                        views_hql = (
                            'DROP VIEW IF EXISTS {view_full_name};\n'
                            'DROP TABLE IF EXISTS {view_full_name};\n\n'
                            'CREATE DATABASE IF NOT EXISTS {view_name};\n'
                            'CREATE TABLE {view_full_name} '
                            'like {src_view};\n\n'
                            'INSERT OVERWRITE TABLE {view_full_name} '
                            'PARTITION(incr_ingest_timestamp) '
                            'select * from {src_view} ;\n\n')

                        views_hql = views_hql.format(
                            view_full_name=view_full_name,
                            view_name=target_db,
                            src_view=src_view)
                    views_hql += "msck repair table `{0}`;\n\n".format(
                        view_full_name)

            elif self.ibis_env == 'PERF' and len(only_domain) > 0:
                views_hql = ('DROP VIEW IF EXISTS {view_full_name};\n'
                             'DROP TABLE IF EXISTS {view_full_name};\n'
                             'CREATE VIEW {view_full_name} AS '
                             'SELECT * FROM {src_view}; \n')
                views_hql = views_hql.format(view_full_name=view_full_name,
                                             src_view=src_view)
            else:
                views_hql = (
                    'DROP VIEW IF EXISTS `{view_full_name}`;\n'
                    'DROP TABLE IF EXISTS `{view_full_name}`;\n\n'
                    'CREATE DATABASE IF NOT EXISTS `{view_name}`;\n'
                    'CREATE EXTERNAL TABLE `{view_full_name}` ('
                    '{create_column_hql},\n `ingest_timestamp` string)\n'
                    'partitioned by (incr_ingest_timestamp string)\n'
                    "stored as parquet location 'hdfs://"
                    "/user/data/{target_dir}/live/';\n\n")

                views_hql = views_hql.format(view_full_name=view_full_name,
                                             view_name=target_db,
                                             create_column_hql=create_hql,
                                             target_dir=self.target_dir)

                views_hql += add_partition_hql
                views_hql += "msck repair table `{0}`;\n\n".format(
                    view_full_name)
            return views_hql
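To make the default (non-PERF) branch concrete, this is roughly the HQL the templates above render for a full_load run; every name is invented for illustration (target_db 'analytics', clean database 'claims', table 'members', a single data column):

DROP VIEW IF EXISTS `analytics.claims_members`;
DROP TABLE IF EXISTS `analytics.claims_members`;

CREATE DATABASE IF NOT EXISTS `analytics`;
CREATE EXTERNAL TABLE `analytics.claims_members` (`member_id` string,
 `ingest_timestamp` string)
partitioned by (incr_ingest_timestamp string)
stored as parquet location 'hdfs:///user/data/claims/members/live/';

ALTER TABLE `analytics.claims_members` ADD PARTITION (incr_ingest_timestamp='full_20170101');

msck repair table `analytics.claims_members`;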