def insert_ptn_ratio(self):
    """Insert per-partition data-transmission-ratio metadata for this load.

    Sorts the pending HDFS partitions by their numeric component, asks the
    Hive manager for the transmitted ratio of each partition, and writes one
    "ratio" metadata record per partition through the metadata manager.

    :raises Exception: re-raised after logging if the ratio lookup or the
        metadata insert fails for any partition.
    """
    # Sort by the digits embedded in the partition name (e.g. "ptn=20"),
    # not lexicographically, so records are inserted in partition order.
    ptn_list_sorted = sorted(self.hdfs_ptn_list,
                             key=lambda s: int(re.sub("[^0-9]", "", s)))
    try:
        for ptn in ptn_list_sorted:
            # NOTE(review): the sibling execute() method uses self.hive_mgr;
            # confirm self.hive_mng here is a distinct attribute, not a typo.
            transmitted_ratio = self.hive_mng.load_ptn_transmitted_ratio(ptn)
            load_success_data = {
                "topic_name": self.topic,
                "database_name": self.database,
                "table_name": self.table,
                "load_start_time": utils.iso_format(self.loadts),
                "load_end_time": utils.iso_format(datetime.now()),
                "hdfs_partition": ptn,
                "transmitted_ratio": transmitted_ratio
            }
            # Was a bare Python-2 `print` statement; route through the module
            # logger like the rest of this class (also Python-3 compatible).
            logger.info("Inserting ratio metadata %s" % load_success_data)
            self.metadata_mgr.insert(load_success_data, "ratio")
    except Exception:
        logger.error("Error creating transmission ratio metadata")
        raise
def execute(self):
    """Top level method for LoadHandler; manages the load workflow.

    The method
    (1) decides whether the current load should proceed at all,
    (2) loads pending HDFS directories (server & HDFS timestamps) and
        creates the corresponding Hive partitions,
    (3) creates a Hive-timestamp partition for each pending source Hive
        partition and pulls its data, and
    (4) records load metadata and per-partition transmission ratios in the
        SQL metadata store.

    :return: None
    :raises Exception: re-raised after logging if any stage fails.
    """
    # Determines if there exist hdfs directories and hive partitions to process
    self.hdfs_to_proceed()
    self.hive_to_proceed()
    if not self.hive_proceed and not self.hdfs_proceed:
        logger.info("No partitions or directories to proceed. Ending load")
        return
    logger.info("Proceeding with load")

    if self.hdfs_proceed:
        logger.info("Proceeding with HDFS load")
        try:
            # Calculate the latest last-processed hdfs directory after this load.
            # hdfs_thread_execute retrieves hdfs timestamps from pending
            # directories, writes them to the local file system, copies files
            # to the hive warehouse, and creates new partitions pointing at
            # the corresponding directories.
            self.hdfs_new_last_dir = hdfs_thread_execute(
                self.topic, self.table, self.hdfs_dir_pending,
                self.hdfs_ptn_list,
                self.get_config("local_hdfs_ts_path"),
                self.get_config("hive_hdfs_ts_path"))
        except Exception:
            logger.error("Error retrieving server and hdfs timestamp")
            raise
    else:
        logger.info("No new HDFS dir to process.")

    if self.hive_proceed:
        hive_hive_ts_path = self.get_config("hive_hive_ts_path")
        logger.info("Proceeding with Hive load")
        try:
            # create a FlowView partition for each Hive partition pending processing
            for partition in self.hive_ptn_pending:
                # create partition
                self.hive_mgr.create_hive_ts_ptn(partition, hive_hive_ts_path)
                # retrieve hive timestamp data and write into the corresponding directory
                self.hive_mgr.pull_hive_ts(partition, hive_hive_ts_path)
                # Track the last *successfully processed* partition.  The
                # previous code assigned hive_ptn_pending[-1] here, which
                # from the first iteration claimed the final pending
                # partition was done even if a later one failed.
                self.hive_new_last_ptn = partition
        except Exception:
            logger.error("Error creating hive partition")
            raise
        logger.info("Created hive partition for Hive timestamp")
    else:
        logger.info("No new Hive partition to process")

    try:
        # create metadata for current load
        load_metadata = {
            "topic_name": self.topic,
            "database_name": self.database,
            "table_name": self.table,
            "load_start_time": utils.iso_format(self.loadts),
            "load_end_time": utils.iso_format(datetime.now()),
            "last_load_hdfs_dir": self.hdfs_new_last_dir,
            "last_load_hive_partition": self.hive_new_last_ptn
        }
        logger.info("Created load_metadata %s" % load_metadata)
    except Exception:
        logger.error("Error creating metadata")
        raise
    # (A duplicated "Created hive partition for hive timestamp" log line was
    # removed here: it fired even when no Hive partition had been created.)

    try:
        # insert metadata for current load into SQL metadata database
        self.metadata_mgr.insert(load_metadata, "load")
    except MetadataException:
        logger.error("Error inserting metadata %s" % load_metadata)
        raise

    try:
        self.insert_ptn_ratio()
        logger.info("Calculated load partition data transmission ratio")
    except Exception:
        logger.error("Error in calculating load partition data transmission ratio")
        raise
    logger.info("Load complete")