Example #1
def __init__(self, db, table):
    """
    :param db: Dataset's Thrive database name in Hive
    :param table: Dataset's Thrive table name in Hive
    :return:
    """
    self.database = db
    self.table = table
    self.shell_exec = ShellExecutor()
Example #2
    def __init__(self, config_file=None):
        """
        :param config_file: Path to the dataset's configuration file
        :return:
        """
        self.parser = SafeConfigParser()
        self.parser.read(config_file)

        self.topic = self.get_config("topic_name")
        self.database = self.get_config("database_name")
        self.table = self.get_config("table_name")
        self.connection_info = self.get_config("connection_info")
        self.metadata_mgr = MetadataManager(self.connection_info, self.table, self.topic)
        self.hdfs_mgr = hdfsManager(self.topic)
        self.hive_mng = HiveManager(self.database, self.table)
        self.shell_exec = ShellExecutor()
        self.loadts = datetime.now()

        self.hdfs_topic = "idea-flowview"
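The get_config helper and the config file layout are not shown in these examples; below is a minimal sketch of what they presumably look like, given how the constructor uses them (the section name and the file contents are assumptions):

    # Hypothetical helper; the real get_config is not part of these examples.
    # Assumes the config file has a flat "main" section, e.g.:
    #
    #   [main]
    #   topic_name = idea-flowview
    #   database_name = thrive
    #   table_name = events
    #   connection_info = host:port
    #
    def get_config(self, key, section="main"):
        """Reads a single value from the parsed configuration file."""
        return self.parser.get(section, key)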
Example #3
class HiveManager(object):
    def __init__(self, db, table):
        """
        :param db:
        :param table:
        :return:
        """
        self.database = db
        self.table = table
        self.shell_exec = ShellExecutor()

    def pull_hive_ts(self, partition, hive_hive_ts_path):
        """
        Pulls hive timestamp from the given list of partitions
        and write results into the corresponding directory under FlowView database
        :param partition: partition in the format of 'year=YYYY/month=MM/day=DD/hour=HH'
        :return:
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = utils.split_ptn(partition)

        pull_cmd = '''
        hive -e "use %s;
        insert overwrite directory '%s/%s/%s/%s/%s'
        select event_id, hive_timestamp from %s
        where year = %s and month = %s and day = %s and hour = %s;"''' \
                   % (self.database, hive_hive_ts_path, ptn_year, ptn_month, ptn_day, ptn_hour,
                      self.table, ptn_year, ptn_month, ptn_day, ptn_hour)

        try:
            self.shell_exec.safe_execute(pull_cmd,
                                         splitcmd=False,
                                         as_shell=True)
        except Exception:
            logger.error(
                "Error pulling data from %s hive table and writing to flowview database"
                % self.database)
            raise

        logger.info(
            "Wrote hive timestamp data into hive table flowview.db/%s_hive_ts"
            % self.table)

    def get_new_ptns(self, last_ptn):
        """
        Retrieves partitions in Hive whose timestamps have not been retrieved.
        :param last_ptn: Last processed partition in the format 'year=YYYY/month=MM/day=DD/hour=HH'
        :return: A list of partitions to process, each in the format 'year=YYYY/month=MM/day=DD/hour=HH'
        """
        ptn_cmd = '''
        hive -e "use %s;
        show partitions %s;"
        ''' % (self.database, self.table)
        all_ptns = self.shell_exec.safe_execute(
            ptn_cmd, splitcmd=False, as_shell=True).output.split()
        # If last partition is None, then the current load is the first load.
        # Process all existing partitions
        try:
            if last_ptn is None:
                new_ptns = all_ptns
            else:
                lastindex = all_ptns.index(last_ptn)
                # Omit the very last ptn since Thrive/ETL pipeline may still be writing to it
                new_ptns = all_ptns[lastindex + 1:-1]
        except ValueError:
            logger.error("Last processed partition %s not found in table %s" %
                         (last_ptn, self.table))
            raise
        logger.info("Retrieved processed partition %s" % last_ptn)
        logger.info("Pending partitions %s" % new_ptns)

        return new_ptns

    def create_hive_ts_ptn(self, partition, hive_hive_ts_path):
        """
        Creates a Hive partition pointing at the pulled hive-timestamp data
        :param partition: Partition to create in the format 'year=YYYY/month=MM/day=DD/hour=HH'
        :param hive_hive_ts_path: HDFS directory that backs the new partition
        :return:
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = utils.split_ptn(partition)

        create_ptn_cmd = '''
        hive -e "use flowview;
        alter table %s_hive add partition (year = %s, month = %s, day = %s, hour = %s)
        location '%s/%s/%s/%s/%s'";
        ''' % (self.table, ptn_year, ptn_month, ptn_day, ptn_hour,
               hive_hive_ts_path, ptn_year, ptn_month, ptn_day, ptn_hour)

        try:
            self.shell_exec.safe_execute(create_ptn_cmd,
                                         splitcmd=False,
                                         as_shell=True)
        except ShellException:
            logger.error(
                "Error creating hive partition to store hive timestamps")
            raise

        logger.info(
            "Created hive partition year = %s, month = %s, day = %s, hour = %s for %s"
            % (ptn_year, ptn_month, ptn_day, ptn_hour, self.table))

    def count_hdfs_ptn_rows(self, ptn):
        """
        Counts rows in the <table>_hdfs timestamp table for the given partition.
        :param ptn: Partition in the format 'YYYY/MM/DD/HH'
        :return: Row count as reported by Hive
        """
        (ptn_year, ptn_month, ptn_day, ptn_hour) = ptn.split("/")
        count_cmd = '''
        hive -e "use flowview;
        select count (*) from %s_hdfs
        where year = %s and month = %s and day = %s and hour = %s;"
        ''' % (self.table, ptn_year, ptn_month, ptn_day, ptn_hour)

        try:
            row_count = self.shell_exec.safe_execute(count_cmd,
                                                     splitcmd=False,
                                                     as_shell=True).output
            return row_count
        except ShellException:
            logger.error("Error getting row counts for hdfs_ts partition %s" %
                         ptn)
            raise

    def count_hive_ptn_rows(self, ptn):
        """
        Counts rows of the partition that reached Hive within the allowed
        one-hour load latency (the same hour or the following one).
        :param ptn: Partition in the format 'YYYY/MM/DD/HH'
        :return: Row count as reported by Hive
        """
        (ptn_year, ptn_month, ptn_day, ptn_hour) = ptn.split("/")
        # Rows may land in Hive up to one hour after their HDFS partition
        ptn_hour_latency = int(ptn_hour) + 1
        # Parenthesize the hour disjunction: without the parentheses the trailing
        # "or" would match every row of the later hour, regardless of date
        where_clause = "%s_hdfs.year = %s and %s_hdfs.month = %s and %s_hdfs.day = %s and %s_hdfs.hour = %s " \
                       "and %s_hive.year = %s and %s_hive.month = %s and %s_hive.day = %s " \
                       "and (%s_hive.hour = %s or %s_hive.hour = %s)" \
                       % (self.table, ptn_year, self.table, ptn_month, self.table, ptn_day, self.table, ptn_hour,
                          self.table, ptn_year, self.table, ptn_month, self.table, ptn_day,
                          self.table, ptn_hour, self.table, ptn_hour_latency)

        count_cmd = '''
        hive -e "use flowview;
        select count (*) from %s_hdfs join %s_hive on (%s_hdfs.event_id = %s_hive.event_id)
        where %s;"
        ''' % (self.table, self.table, self.table, self.table, where_clause)

        try:
            matching_row_cnt = self.shell_exec.safe_execute(
                count_cmd, splitcmd=False, as_shell=True).output
            return matching_row_cnt
        except ShellException:
            logger.error("Error getting row counts")
            raise

    def load_ptn_transmitted_ratio(self, ptn):
        """
        Computes the fraction of a partition's rows that made it from HDFS to Hive.
        :param ptn: Partition in the format 'YYYY/MM/DD/HH'
        :return: hive row count divided by hdfs row count, as a float
        """
        try:
            hdfs_ptn_row_cnt = self.count_hdfs_ptn_rows(ptn)
            hive_ptn_row_cnt = self.count_hive_ptn_rows(ptn)
            # Use float division: int/int truncates to 0 on Python 2
            # whenever the hive count is smaller than the hdfs count
            return float(hive_ptn_row_cnt) / int(hdfs_ptn_row_cnt)
        except ShellException:
            logger.error("Error retrieving row counts")
            raise

    def purge(self):
        """
        Purges all existing Hive tables for the dataset.
        :return:
        """
        purge_cmd = '''
        hive -e "use flowview;
        drop table %s_hive;
        drop table %s_hdfs;"
        ''' % (self.table, self.table)

        try:
            self.shell_exec.safe_execute(purge_cmd,
                                         splitcmd=False,
                                         as_shell=True)
            logger.info("Purged hive tables")
        except ShellException:
            logger.error("Purging hive tables failed")
            raise
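A minimal usage sketch of this class, assuming a Thrive database named thrive_db, a table named events, and a warehouse path for the timestamp data (all three names are hypothetical, and the flowview database with its events_hive table is assumed to already exist):

    # Hypothetical names and paths; not part of the examples above
    hive_mgr = HiveManager("thrive_db", "events")

    # Find partitions whose timestamps have not been pulled yet
    new_ptns = hive_mgr.get_new_ptns("year=2015/month=08/day=19/hour=09")

    for ptn in new_ptns:
        # Dump (event_id, hive_timestamp) pairs for the partition ...
        hive_mgr.pull_hive_ts(ptn, "/user/flowview/events_hive_ts")
        # ... and register the output directory as a partition of events_hive
        hive_mgr.create_hive_ts_ptn(ptn, "/user/flowview/events_hive_ts")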
Example #4
def hdfs_thread_execute(topic, table, hdfs_pending, ptn_list,
                        local_hdfs_ts_path, hive_hdfs_ts_path):
    """
    Main hdfs thread executor.
    :param topic: Dataset's Trinity topic name
    :param table: Dataset's Thrive table name in Hive
    :param hdfs_pending: HDFS directories pending processing.
                         Expected format is a list of tuples like
                         (2015-08-19 10:12, /data/ds_ctg/trinity/thrive_test/d_20150819-1710)
    :param ptn_list: Partitions covered by the current load, used to lay out the output directories
    :param local_hdfs_ts_path: Local staging directory for the extracted timestamp files
    :param hive_hdfs_ts_path: Hive warehouse location backing the <table>_hdfs timestamp table
    :return: The latest processed HDFS directory timestamp (e.g. 2015-08-19 10:12) after the current load
    """
    hdfs_mng = hdfsManager(topic)
    hdfs_new_last_dir = hdfs_pending[-1][0]
    threads = []
    workQueue = Queue.Queue()

    # remove previous local file, if it exists
    rmcmd = "rm -r -f %s" % local_hdfs_ts_path
    ShellExecutor.safe_execute(rmcmd)
    logger.info("Removed local file containing timestamps from previous load")

    threadNum = 11
    # Lock the hdfs_pending list so that only one thread can access it
    queueLock = threading.Lock()
    queueLock.acquire()
    for hdfs_dir in hdfs_pending:
        workQueue.put(hdfs_dir)
    queueLock.release()

    # range(1, threadNum) spawns threadNum - 1 worker threads
    for t in range(1, threadNum):
        thread = HDFS_ThreadManager("thread%s" % t, workQueue, topic, table,
                                    ptn_list, local_hdfs_ts_path)
        thread.start()
        threads.append(thread)

    # Busy-wait until the worker threads drain the queue
    while not workQueue.empty():
        pass

    # Signal the worker threads (which poll this flag) to exit
    global exitFlag
    exitFlag = 1

    logger.info("Retrieved server & hdfs timestamp info of %s" % topic)

    # Only exit the main thread after all threads finish
    for t in threads:
        t.join()

    # For all directories processed in the current load,
    # (1) transfer the local files that stores the messages' timestamps
    #     to the proper Hive warehouse location
    # (2) create a partition that points toward that location
    for ptn in ptn_list:
        local_dir_path = "%s/%s/hdfs_ts.txt" % (local_hdfs_ts_path, ptn)
        hdfs_tgt_path = "%s/%s" % (hive_hdfs_ts_path, ptn)

        hdfs_mng.makedir(hdfs_tgt_path)
        hdfs_mng.force_putfile(local_dir_path, hdfs_tgt_path)
        hdfs_mng.create_hdfs_ts_ptn(ptn, table, hive_hdfs_ts_path)

    logger.info("Copied hdfs & server timestamp from local to hive warehouse")
    logger.info("Created hive partition for hdfs & server timestamp")
    logger.info("Exiting main thread")

    return hdfs_new_last_dir
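The HDFS_ThreadManager class itself is not shown in these examples. Given how exitFlag and the work queue are used above, its run loop presumably looks something like the sketch below (everything here, including the placeholder processing step, is an assumption):

    import Queue
    import threading

    exitFlag = 0

    # Hypothetical worker thread; the real HDFS_ThreadManager is not part of
    # these examples, so the processing step is only a placeholder.
    class HDFS_ThreadManager(threading.Thread):
        def __init__(self, name, work_queue, topic, table, ptn_list, local_hdfs_ts_path):
            threading.Thread.__init__(self)
            self.name = name
            self.work_queue = work_queue
            self.topic = topic
            self.table = table
            self.ptn_list = ptn_list
            self.local_hdfs_ts_path = local_hdfs_ts_path

        def run(self):
            # Poll the shared queue until the main thread sets exitFlag
            while not exitFlag:
                try:
                    hdfs_dir = self.work_queue.get(block=False)
                except Queue.Empty:
                    continue
                # Placeholder for the real work: extract the hdfs/server
                # timestamps from hdfs_dir into local_hdfs_ts_path
                print("%s processing %s" % (self.name, hdfs_dir))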