def __init__(self, db, table):
    """
    Store the database/table pair and create the shell executor.

    :param db: value kept on ``self.database``
    :param table: value kept on ``self.table``
    :return: None
    """
    self.database, self.table = db, table
    # Every instance gets its own executor for running shell commands.
    self.shell_exec = ShellExecutor()
def __init__(self, db, table):
    """
    Remember which database and table this object works on and set up
    the executor used to run shell commands.

    :param db: assigned to ``self.database``
    :param table: assigned to ``self.table``
    :return: None
    """
    self.shell_exec = None  # placeholder so the attribute order is explicit
    self.database, self.table = db, table
    self.shell_exec = ShellExecutor()
def __init__(self, config_file=None):
    """
    Build the object from a configuration file.

    The file is parsed with ``SafeConfigParser``; the options
    ``topic_name``, ``database_name``, ``table_name`` and
    ``connection_info`` are then fetched through ``self.get_config`` and
    used to construct the metadata, HDFS and Hive managers.

    :param config_file: path handed to ``SafeConfigParser.read``.
        NOTE(review): the default is ``None``, which is passed to
        ``read`` unchanged -- confirm callers always supply a path.
    :return: None
    """
    self.parser = SafeConfigParser()
    self.parser.read(config_file)

    # (attribute name, config option) pairs, loaded in this order.
    config_fields = (
        ("topic", "topic_name"),
        ("database", "database_name"),
        ("table", "table_name"),
        ("connection_info", "connection_info"),
    )
    for attr_name, option in config_fields:
        setattr(self, attr_name, self.get_config(option))

    # Collaborators are wired up from the values read above.
    self.metadata_mgr = MetadataManager(self.connection_info, self.table, self.topic)
    self.hdfs_mgr = hdfsManager(self.topic)
    self.hive_mng = HiveManager(self.database, self.table)
    self.shell_exec = ShellExecutor()

    # Timestamp taken at construction time.
    self.loadts = datetime.now()
    # NOTE(review): hard-coded rather than read from the config file.
    self.hdfs_topic = "idea-flowview"
class HiveManager(object):
    """
    Runs ``hive -e`` shell commands against a dataset's source table and
    against the ``<table>_hive`` / ``<table>_hdfs`` tables in the
    ``flowview`` database.
    """

    def __init__(self, db, table):
        """
        :param db: database that holds the dataset's table (``use <db>``)
        :param table: the dataset's table name; also the prefix of the
            ``<table>_hive`` and ``<table>_hdfs`` flowview tables
        :return: None
        """
        self.database = db
        self.table = table
        self.shell_exec = ShellExecutor()

    def pull_hive_ts(self, partition, hive_hive_ts_path):
        """
        Select ``event_id, hive_timestamp`` for one partition of the
        dataset's table and write the result into
        ``<hive_hive_ts_path>/<year>/<month>/<day>/<hour>``.

        Failures are logged (with traceback) but NOT re-raised; the
        success message is only logged when the command succeeded.

        :param partition: partition in the format of
            'year=YYYY/month=MM/day=DD/hour=HH' (parsed by ``utils.split_ptn``)
        :param hive_hive_ts_path: base directory for the output
        :return: None
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = utils.split_ptn(partition)
        pull_cmd = (
            ''' hive -e "use %s; insert overwrite directory '%s/%s/%s/%s/%s' '''
            '''select event_id, hive_timestamp from %s '''
            '''where year = %s and month = %s and day = %s and hour = %s;"'''
        ) % (self.database, hive_hive_ts_path,
             ptn_year, ptn_month, ptn_day, ptn_hour,
             self.table, ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            self.shell_exec.safe_execute(pull_cmd, splitcmd=False, as_shell=True)
        except Exception:
            # Best-effort by design: keep the traceback in the log since
            # the exception itself is not propagated.
            logger.exception(
                "Error pulling data from %s hive table and writing to flowview database"
                % self.database)
        else:
            logger.info(
                "Wrote hive timestamp data into hive table flowview.db/%s_hive_ts"
                % self.table)

    def get_new_ptns(self, last_ptn):
        """
        Retrieves partitions in Hive whose timestamps have not been retrieved.

        :param last_ptn: Last processed partition in the format of
            'year=YYYY/month=MM/day=DD/hour=HH', or None on the first load
        :return: a list of partitions to process in the format of
            'year=YYYY/month=MM/day=DD/hour=HH'
        :raises ValueError: if ``last_ptn`` is not among the table's partitions
        """
        ptn_cmd = ''' hive -e "use %s; show partitions %s;" ''' % (self.database, self.table)
        all_ptns = self.shell_exec.safe_execute(
            ptn_cmd, splitcmd=False, as_shell=True).output.split()

        if last_ptn is None:
            # First load: process all existing partitions.
            # NOTE(review): unlike the incremental branch, this includes the
            # newest partition, which may still be being written -- confirm.
            new_ptns = all_ptns
        else:
            try:
                lastindex = all_ptns.index(last_ptn)
            except ValueError:
                logger.error("Last processed partition %s not found in table %s"
                             % (last_ptn, self.table))
                raise
            # Omit the very last ptn since Thrive/ETL pipeline may still be writing to it
            new_ptns = all_ptns[lastindex + 1:-1]

        logger.info("Retrieved processed partition %s" % last_ptn)
        logger.info("Pending partitions %s" % new_ptns)
        return new_ptns

    def create_hive_ts_ptn(self, partition, hive_hive_ts_path):
        """
        Add a partition to ``flowview.<table>_hive`` located at
        ``<hive_hive_ts_path>/<year>/<month>/<day>/<hour>``.

        :param partition: Partition to create in the format of
            'year=YYYY/month=MM/day=DD/hour=HH'
        :param hive_hive_ts_path: base directory of the partition data
        :return: None
        :raises ShellException: if the hive command fails
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = utils.split_ptn(partition)
        create_ptn_cmd = (
            ''' hive -e "use flowview; alter table %s_hive add partition '''
            '''(year = %s, month = %s, day = %s, hour = %s) '''
            '''location '%s/%s/%s/%s/%s'"; '''
        ) % (self.table, ptn_year, ptn_month, ptn_day, ptn_hour,
             hive_hive_ts_path, ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            self.shell_exec.safe_execute(create_ptn_cmd, splitcmd=False, as_shell=True)
        except ShellException:
            logger.error("Error in creating hive table to store hive timestamp")
            raise
        logger.info(
            "Created hive partition year = %s, month = %s, day = %s, hour = %s for %s"
            % (ptn_year, ptn_month, ptn_day, ptn_hour, self.table))

    def count_hdfs_ptn_rows(self, ptn):
        """
        Count the rows of one partition of ``flowview.<table>_hdfs``.

        :param ptn: partition as four '/'-separated components
            (year/month/day/hour) that are substituted into the query as-is
        :return: raw output of the hive command (the count as text)
        :raises ShellException: if the hive command fails
        """
        (ptn_year, ptn_month, ptn_day, ptn_hour) = ptn.split("/")
        count_cmd = (
            ''' hive -e "use flowview; select count (*) from %s_hdfs '''
            '''where year = %s and month = %s and day = %s and hour = %s;" '''
        ) % (self.table, ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            return self.shell_exec.safe_execute(
                count_cmd, splitcmd=False, as_shell=True).output
        except ShellException:
            logger.error("Error getting row counts for hdfs_ts partition %s" % ptn)
            raise

    def count_hive_ptn_rows(self, ptn):
        """
        Count rows of the given ``<table>_hdfs`` partition whose
        ``event_id`` also appears in ``<table>_hive`` for the same
        year/month/day and either the same hour or the following hour.

        :param ptn: partition as four '/'-separated components
            (year/month/day/hour); the hour component must parse as an int
        :return: raw output of the hive command (the count as text)
        :raises ShellException: if the hive command fails
        """
        (ptn_year, ptn_month, ptn_day, ptn_hour) = ptn.split("/")
        # Rows may land in Hive one hour after they landed in HDFS.
        # NOTE(review): hour 23 yields 24 here; rollover into the next
        # day's hour 0 is not handled.
        ptn_hour_latency = int(ptn_hour) + 1
        params = {
            "t": self.table,
            "year": ptn_year,
            "month": ptn_month,
            "day": ptn_day,
            "hour": ptn_hour,
            "next_hour": ptn_hour_latency,
        }
        # The hour alternative MUST be parenthesised: AND binds tighter than
        # OR, so without the parentheses every joined row with hour = H+1
        # would match regardless of the other filters.
        where_clause = (
            "%(t)s_hdfs.year = %(year)s and %(t)s_hdfs.month = %(month)s "
            "and %(t)s_hdfs.day = %(day)s and %(t)s_hdfs.hour = %(hour)s "
            "and %(t)s_hive.year = %(year)s and %(t)s_hive.month = %(month)s "
            "and %(t)s_hive.day = %(day)s "
            "and (%(t)s_hive.hour = %(hour)s or %(t)s_hive.hour = %(next_hour)s)"
        ) % params
        count_cmd = (
            ''' hive -e "use flowview; select count (*) from %s_hdfs join %s_hive '''
            '''on (%s_hdfs.event_id = %s_hive.event_id) where %s;" '''
        ) % (self.table, self.table, self.table, self.table, where_clause)
        try:
            return self.shell_exec.safe_execute(
                count_cmd, splitcmd=False, as_shell=True).output
        except ShellException:
            logger.error("Error getting row counts")
            raise

    def load_ptn_transmitted_ratio(self, ptn):
        """
        Fraction of the partition's HDFS rows that were matched in Hive.

        :param ptn: partition as accepted by ``count_hdfs_ptn_rows`` and
            ``count_hive_ptn_rows``
        :return: ``hive_count / hdfs_count`` as a float
        :raises ShellException: if either count query fails
        :raises ZeroDivisionError: if the HDFS partition has no rows
        """
        try:
            hdfs_ptn_row_cnt = int(self.count_hdfs_ptn_rows(ptn))
            hive_ptn_row_cnt = int(self.count_hive_ptn_rows(ptn))
        except ShellException:
            logger.error("Error retrieving row counts")
            raise
        # float() forces true division; int / int truncates on Python 2.
        return float(hive_ptn_row_cnt) / hdfs_ptn_row_cnt

    def purge(self):
        """
        Purges all existing hive table for the dataset
        (drops ``<table>_hive`` and ``<table>_hdfs`` in ``flowview``).

        :return: None
        :raises ShellException: if the hive command fails
        """
        purge_cmd = ''' hive -e "use flowview; drop table %s_hive; drop table %s_hdfs;" ''' % (
            self.table, self.table)
        try:
            self.shell_exec.safe_execute(purge_cmd, splitcmd=False, as_shell=True)
            logger.info("Purged hive tables")
        except ShellException:
            logger.error("Purging hive tables failed")
            raise
def hdfs_thread_execute(topic, table, hdfs_pending, ptn_list,
                        local_hdfs_ts_path, hive_hdfs_ts_path):
    """
    Main hdfs thread executor.

    Queues every pending HDFS directory, starts worker threads to process
    them, waits for the queue to drain, then uploads each partition's
    local ``hdfs_ts.txt`` file and creates the matching partition.

    :param topic: Dataset's Trinity topic name
    :param table: Dataset's Thrive table name in Hive
    :param hdfs_pending: HDFS directories pending processing. Expected format is a list of the following
                        (2015-08-19 10:12, /data/ds_ctg/trinity/thrive_test/d_20150819-1710)
                        NOTE(review): must be non-empty; ``hdfs_pending[-1]``
                        raises IndexError otherwise.
    :param ptn_list: partitions touched by this load; each is used as a
                        relative path under both ts paths
    :param local_hdfs_ts_path: local directory holding ``<ptn>/hdfs_ts.txt``
                        files; removed (``rm -r -f``) before the load starts
    :param hive_hdfs_ts_path: target base directory the files are copied to
    :return: The latest processed HDFS directory timestamp (e.g. 2015-08-19 10:12) after the current load
    """
    import time  # local import: only needed for the queue-drain poll below

    global exitFlag

    hdfs_mng = hdfsManager(topic)
    hdfs_new_last_dir = hdfs_pending[-1][0]
    threads = []
    workQueue = Queue.Queue()

    # remove previous local file, if it exists
    rmcmd = "rm -r -f %s" % local_hdfs_ts_path
    # NOTE(review): called on the class, not an instance -- this only works
    # if safe_execute is a static/class method; confirm.
    ShellExecutor.safe_execute(rmcmd)
    logger.info("Removed local file containing timestamps from previous load")

    threadNum = 11

    # Lock the hdfs_pending list so that only one thread can access it
    queueLock = threading.Lock()
    with queueLock:
        for pending_dir in hdfs_pending:
            workQueue.put(pending_dir)

    # range(1, threadNum) starts threadNum - 1 workers, named thread1..thread10
    for t in range(1, threadNum):
        thread = HDFS_ThreadManager("thread%s" % t, workQueue, topic, table,
                                    ptn_list, local_hdfs_ts_path)
        thread.start()
        threads.append(thread)

    # Wait for the workers to drain the queue. Sleep between polls instead
    # of spinning so this thread does not compete with the workers for CPU.
    while not workQueue.empty():
        time.sleep(0.1)

    # Module-level flag set once the queue is empty.
    exitFlag = 1
    logger.info("Retrieved server & hdfs timestamp info of %s" % topic)

    # Only exit the main thread after all threads finish
    for t in threads:
        t.join()

    # For all directories processed in the current load,
    # (1) transfer the local files that stores the messages' timestamps
    #     to the proper Hive warehouse location
    # (2) create a partition that points toward that location
    for ptn in ptn_list:
        local_dir_path = "%s/%s/hdfs_ts.txt" % (local_hdfs_ts_path, ptn)
        hdfs_tgt_path = "%s/%s" % (hive_hdfs_ts_path, ptn)
        hdfs_mng.makedir(hdfs_tgt_path)
        hdfs_mng.force_putfile(local_dir_path, hdfs_tgt_path)
        hdfs_mng.create_hdfs_ts_ptn(ptn, table, hive_hdfs_ts_path)

    logger.info("Copied hdfs & server timestamp from local to hive warehouse")
    logger.info("Created hive partition for hdfs & server timestamp")
    logger.info("Exiting main thread")
    return hdfs_new_last_dir
class HiveManager(object):
    """
    Runs ``hive -e`` shell commands against a dataset's source table and
    against the ``<table>_hive`` / ``<table>_hdfs`` tables in the
    ``flowview`` database.
    """

    def __init__(self, db, table):
        """
        :param db: database that holds the dataset's table (``use <db>``)
        :param table: the dataset's table name; also the prefix of the
            ``<table>_hive`` and ``<table>_hdfs`` flowview tables
        :return: None
        """
        self.database = db
        self.table = table
        self.shell_exec = ShellExecutor()

    def pull_hive_ts(self, partition, hive_hive_ts_path):
        """
        Select ``event_id, hive_timestamp`` for one partition of the
        dataset's table and write the result into
        ``<hive_hive_ts_path>/<year>/<month>/<day>/<hour>``.

        Failures are logged (with traceback) but NOT re-raised; the
        success message is only logged when the command succeeded.

        :param partition: partition in the format of
            'year=YYYY/month=MM/day=DD/hour=HH' (parsed by ``utils.split_ptn``)
        :param hive_hive_ts_path: base directory for the output
        :return: None
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = utils.split_ptn(partition)
        pull_cmd = (
            ''' hive -e "use %s; insert overwrite directory '%s/%s/%s/%s/%s' '''
            '''select event_id, hive_timestamp from %s '''
            '''where year = %s and month = %s and day = %s and hour = %s;"'''
        ) % (self.database, hive_hive_ts_path,
             ptn_year, ptn_month, ptn_day, ptn_hour,
             self.table, ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            self.shell_exec.safe_execute(pull_cmd, splitcmd=False, as_shell=True)
        except Exception:
            # Best-effort by design: keep the traceback in the log since
            # the exception itself is not propagated.
            logger.exception(
                "Error pulling data from %s hive table and writing to flowview database"
                % self.database)
        else:
            logger.info(
                "Wrote hive timestamp data into hive table flowview.db/%s_hive_ts"
                % self.table)

    def get_new_ptns(self, last_ptn):
        """
        Retrieves partitions in Hive whose timestamps have not been retrieved.

        :param last_ptn: Last processed partition in the format of
            'year=YYYY/month=MM/day=DD/hour=HH', or None on the first load
        :return: a list of partitions to process in the format of
            'year=YYYY/month=MM/day=DD/hour=HH'
        :raises ValueError: if ``last_ptn`` is not among the table's partitions
        """
        ptn_cmd = ''' hive -e "use %s; show partitions %s;" ''' % (self.database, self.table)
        all_ptns = self.shell_exec.safe_execute(
            ptn_cmd, splitcmd=False, as_shell=True).output.split()

        if last_ptn is None:
            # First load: process all existing partitions.
            # NOTE(review): unlike the incremental branch, this includes the
            # newest partition, which may still be being written -- confirm.
            new_ptns = all_ptns
        else:
            try:
                lastindex = all_ptns.index(last_ptn)
            except ValueError:
                logger.error("Last processed partition %s not found in table %s"
                             % (last_ptn, self.table))
                raise
            # Omit the very last ptn since Thrive/ETL pipeline may still be writing to it
            new_ptns = all_ptns[lastindex + 1:-1]

        logger.info("Retrieved processed partition %s" % last_ptn)
        logger.info("Pending partitions %s" % new_ptns)
        return new_ptns

    def create_hive_ts_ptn(self, partition, hive_hive_ts_path):
        """
        Add a partition to ``flowview.<table>_hive`` located at
        ``<hive_hive_ts_path>/<year>/<month>/<day>/<hour>``.

        :param partition: Partition to create in the format of
            'year=YYYY/month=MM/day=DD/hour=HH'
        :param hive_hive_ts_path: base directory of the partition data
        :return: None
        :raises ShellException: if the hive command fails
        """
        ptn_year, ptn_month, ptn_day, ptn_hour = utils.split_ptn(partition)
        create_ptn_cmd = (
            ''' hive -e "use flowview; alter table %s_hive add partition '''
            '''(year = %s, month = %s, day = %s, hour = %s) '''
            '''location '%s/%s/%s/%s/%s'"; '''
        ) % (self.table, ptn_year, ptn_month, ptn_day, ptn_hour,
             hive_hive_ts_path, ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            self.shell_exec.safe_execute(create_ptn_cmd, splitcmd=False, as_shell=True)
        except ShellException:
            logger.error("Error in creating hive table to store hive timestamp")
            raise
        logger.info(
            "Created hive partition year = %s, month = %s, day = %s, hour = %s for %s"
            % (ptn_year, ptn_month, ptn_day, ptn_hour, self.table))

    def count_hdfs_ptn_rows(self, ptn):
        """
        Count the rows of one partition of ``flowview.<table>_hdfs``.

        :param ptn: partition as four '/'-separated components
            (year/month/day/hour) that are substituted into the query as-is
        :return: raw output of the hive command (the count as text)
        :raises ShellException: if the hive command fails
        """
        (ptn_year, ptn_month, ptn_day, ptn_hour) = ptn.split("/")
        count_cmd = (
            ''' hive -e "use flowview; select count (*) from %s_hdfs '''
            '''where year = %s and month = %s and day = %s and hour = %s;" '''
        ) % (self.table, ptn_year, ptn_month, ptn_day, ptn_hour)
        try:
            return self.shell_exec.safe_execute(
                count_cmd, splitcmd=False, as_shell=True).output
        except ShellException:
            logger.error("Error getting row counts for hdfs_ts partition %s" % ptn)
            raise

    def count_hive_ptn_rows(self, ptn):
        """
        Count rows of the given ``<table>_hdfs`` partition whose
        ``event_id`` also appears in ``<table>_hive`` for the same
        year/month/day and either the same hour or the following hour.

        :param ptn: partition as four '/'-separated components
            (year/month/day/hour); the hour component must parse as an int
        :return: raw output of the hive command (the count as text)
        :raises ShellException: if the hive command fails
        """
        (ptn_year, ptn_month, ptn_day, ptn_hour) = ptn.split("/")
        # Rows may land in Hive one hour after they landed in HDFS.
        # NOTE(review): hour 23 yields 24 here; rollover into the next
        # day's hour 0 is not handled.
        ptn_hour_latency = int(ptn_hour) + 1
        params = {
            "t": self.table,
            "year": ptn_year,
            "month": ptn_month,
            "day": ptn_day,
            "hour": ptn_hour,
            "next_hour": ptn_hour_latency,
        }
        # The hour alternative MUST be parenthesised: AND binds tighter than
        # OR, so without the parentheses every joined row with hour = H+1
        # would match regardless of the other filters.
        where_clause = (
            "%(t)s_hdfs.year = %(year)s and %(t)s_hdfs.month = %(month)s "
            "and %(t)s_hdfs.day = %(day)s and %(t)s_hdfs.hour = %(hour)s "
            "and %(t)s_hive.year = %(year)s and %(t)s_hive.month = %(month)s "
            "and %(t)s_hive.day = %(day)s "
            "and (%(t)s_hive.hour = %(hour)s or %(t)s_hive.hour = %(next_hour)s)"
        ) % params
        count_cmd = (
            ''' hive -e "use flowview; select count (*) from %s_hdfs join %s_hive '''
            '''on (%s_hdfs.event_id = %s_hive.event_id) where %s;" '''
        ) % (self.table, self.table, self.table, self.table, where_clause)
        try:
            return self.shell_exec.safe_execute(
                count_cmd, splitcmd=False, as_shell=True).output
        except ShellException:
            logger.error("Error getting row counts")
            raise

    def load_ptn_transmitted_ratio(self, ptn):
        """
        Fraction of the partition's HDFS rows that were matched in Hive.

        :param ptn: partition as accepted by ``count_hdfs_ptn_rows`` and
            ``count_hive_ptn_rows``
        :return: ``hive_count / hdfs_count`` as a float
        :raises ShellException: if either count query fails
        :raises ZeroDivisionError: if the HDFS partition has no rows
        """
        try:
            hdfs_ptn_row_cnt = int(self.count_hdfs_ptn_rows(ptn))
            hive_ptn_row_cnt = int(self.count_hive_ptn_rows(ptn))
        except ShellException:
            logger.error("Error retrieving row counts")
            raise
        # float() forces true division; int / int truncates on Python 2.
        return float(hive_ptn_row_cnt) / hdfs_ptn_row_cnt

    def purge(self):
        """
        Purges all existing hive table for the dataset
        (drops ``<table>_hive`` and ``<table>_hdfs`` in ``flowview``).

        :return: None
        :raises ShellException: if the hive command fails
        """
        purge_cmd = ''' hive -e "use flowview; drop table %s_hive; drop table %s_hdfs;" ''' % (
            self.table, self.table)
        try:
            self.shell_exec.safe_execute(purge_cmd, splitcmd=False, as_shell=True)
            logger.info("Purged hive tables")
        except ShellException:
            logger.error("Purging hive tables failed")
            raise