def mrqos_join_cleanupv2(logger):
    """ when called, this function will delete all partitions of the mrqos_join2 table
        as long as they are older than the threshold """
    # get the lowest partition by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join2)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join2').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow - config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join2', 'ts<%s' % str(timenow - config.mrqos_join_delete))
            logger.info("drop hive partitions successful.")
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join2', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    logger.info('failed to remove HDFS folder for mrqos_join2 at partition folder %s' % str(partition_id))
            logger.info('remove HDFS successful.')
        except sp.CalledProcessError as e:
            logger.error('failed to drop partitions.')
    except sp.CalledProcessError as e:
        logger.error('failed to obtain retire partition list (HIVE)')
        logger.error('error message: %s' % e.message)

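# For reference, a minimal sketch of the retirement filter used above, assuming
# beeline.show_partitions returns one 'ts=<epoch>' spec per line (all values hypothetical).
# Note the string comparison against the threshold is only safe while the epoch timestamps
# all have the same number of digits.
_example_raw = "ts=1448900000\nts=1449000000\nts=1449100000"
_example_threshold = 1449000000   # stands in for timenow - config.mrqos_join_delete
_example_retired = [x for x in _example_raw.split('\n')
                    if '=' in x and x.split('=')[1] < str(_example_threshold)]
# _example_retired == ['ts=1448900000']; those specs become the HDFS folders removed above
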
def mrqos_table_cleanup():
    """ when called, this function will delete all partitions of the score / distance / in_country /
        in_continent / ra_load tables that are older than the threshold """
    # get the lowest partition by checking the HDFS folders
    score_partitions = hdfsutil.ls(config.hdfs_table_score)
    str_parts_list = [i.split('=', 1)[1] for i in score_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold, if not, drop in hive table and remove from hdfs
    timenow = int(time.time())
    mtype = ['score', 'distance', 'in_country', 'in_continent', 'ra_load']
    for item in mtype:
        # look up the HDFS folder for this table (config.hdfs_table_<item>)
        this_partitions = hdfsutil.ls(getattr(config, 'hdfs_table_%s' % item))
        str_parts_list = [i.split('=', 1)[1] for i in this_partitions]
        str_parts_list_int = map(int, str_parts_list)
        print " ## for table: %s" % item
        print " ## ",
        print str_parts_list_int
        for partition in str_parts_list_int:
            if partition < timenow - config.mrqos_table_delete:
                try:
                    print " ## handling table: %s with ts=%s" % (item, str(partition))
                    # drop partitions (ok even if partition does not exist)
                    hiveql_str = 'use mrqos; alter table ' + item + ' drop if exists partition(ts=%s)' % str(partition)
                    beeline.bln_e(hiveql_str)
                    # remove data from HDFS (ok even if folder in hdfs does not exist)
                    hdfs_d = os.path.join(config.hdfs_table, item, 'ts=%s' % partition)
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed in hive table clean up in table: %s." % item
                    print e.message

def main():
    """ get the date for the past day (yesterday). """
    timenow = int(time.time())
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    datenow = datenow[0:4] + datenow[5:7] + datenow[8:10]

    print "###################"
    print "# Start processing the data back in " + datenow + " (yesterday)"
    print "# starting processing time is " + str(timenow)
    print "###################"

    ts = calendar.timegm(time.gmtime())
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))

    # check if the daily summary has already been performed for this day
    print " **** checking day = %s." % (datestamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_day % (datestamp), '000000_0.deflate')):
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_day.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, datestamp, datestamp)
        f.close()
        print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        try:
            beeline.bln_e(strcmd_s)
        except:
            # delete the folder if summarization failed.
            print " **** summarization failed, removed hdfs folder."
            hdfsutil.rm(config.hdfs_qos_rg_day % (datestamp), r=True)
    else:
        print " file exists."

def mrqos_join_cleanup():
    """ when called, this function will delete all partitions of the mrqos_join table
        as long as they are older than the threshold """
    # get the lowest partition by checking the HDFS folders
    joined_partitions = hdfsutil.ls(config.hdfs_table_join)
    str_parts_list = [i.split('=', 1)[1] for i in joined_partitions]
    str_parts_list_int = map(int, str_parts_list)

    # check if "partitions" is within the threshold
    timenow = int(time.time())

    # get the list of retired data in HDFS using hive partitions
    try:
        hdfs_remove_list = [x for x in beeline.show_partitions('mrqos.mrqos_join').split('\n')
                            if '=' in x and x.split('=')[1] < str(timenow - config.mrqos_join_delete)]
        try:
            # drop the partitions in hive
            beeline.drop_partitions('mrqos.mrqos_join', 'ts<%s' % str(timenow - config.mrqos_join_delete))
            print " drop partitions successful. "
            # remove the hdfs folders
            for partition_id in hdfs_remove_list:
                try:
                    hdfs_d = os.path.join(config.hdfs_table, 'mrqos_join', '%s' % str(partition_id))
                    hdfsutil.rm(hdfs_d, r=True)
                except sp.CalledProcessError as e:
                    print ">> failed to remove HDFS folder for mrqos_join at partition folder %s" % str(partition_id)
            print " remove HDFS successful. "
        except sp.CalledProcessError as e:
            print ">> failed to drop partitions"
    except sp.CalledProcessError as e:
        print ">> failed to obtain retire partition list (HIVE)"
        print e.message

def upload_to_hive(listname, hdfs_d, partition, tablename, logger):
    """ this function will create a partition directory in hdfs with the requisite timestamp.
        It will then add the partition to the table "tablename" with the appropriate "partition" """
    # hdfs_d = config.hdfsclnspp % (ts)
    # create the partition
    try:
        # sp.check_call(['hadoop', 'fs', '-mkdir', hdfs_d])
        hdfs.mkdir(hdfs_d)
        logger.info('HDFS directory creation succeeded: %s' % hdfs_d)
        try:
            # sp.check_call(['hadoop', 'fs', '-put', listname, hdfs_d])
            hdfs.put(listname, hdfs_d)
            logger.info('HDFS upload succeeded: %s' % listname)
            try:
                hiveql_str = 'use mrqos; alter table ' + tablename + ' add partition(%s);' % (partition)
                bln_e(hiveql_str)
                logger.info('add partition (alter table) succeeded %s' % tablename)
            except sp.CalledProcessError as e:
                logger.error('add partition (alter table) failed.')
                logger.error('error: %s' % e.message)
                # sp.check_call(['hadoop', 'fs', '-rm', '-r', hdfs_d])
                hdfs.rm(hdfs_d, r=True)
        except sp.CalledProcessError as e:
            logger.error('HDFS upload failed.')
            logger.error('error: %s' % e.message)
            # sp.check_call(['hadoop', 'fs', '-rm', '-r', hdfs_d])
            hdfs.rm(hdfs_d, r=True)
    except sp.CalledProcessError as e:
        logger.error('HDFS directory creation failed.')
        logger.error('error: %s' % e.message)

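# Minimal usage sketch for upload_to_hive (illustrative only, not part of the original scripts):
# the list file, HDFS directory and partition spec below are hypothetical stand-ins for the
# values the callers normally derive from config.
import logging
import time

logging.basicConfig(level=logging.INFO)
_example_logger = logging.getLogger(__name__)
_example_ts = int(time.time())
upload_to_hive('/home/testgrp/MRQOS/clnspp_list.%s.tmp' % _example_ts,   # local list file (hypothetical)
               '/projects/mrqos/clnspp/ts=%s' % _example_ts,             # HDFS partition folder (hypothetical)
               'ts=%s' % _example_ts,                                    # Hive partition spec
               'clnspp',                                                 # target table
               _example_logger)
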
def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day,
        insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print " **** copy to local failed!"
                break
            except:
                # delete the folder if summarization failed.
                print " **** summarization failed upto #retrials=" + str(count_retrial)
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."

    # check if the summary has been performed since the beginning of the day, last check on day X is X+1/0:30:00
    for hour in hour_list:
        if hour < hourstamp:
            print " **** checking day = %s, hour = %s." % (datestamp, hour),
            if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hour), '000000_0.deflate')):
                f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
                strcmd = f.read()
                strcmd_s = strcmd % (datestamp, hour, datestamp, hour, datestamp, hour)
                f.close()
                print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hour)
                try:
                    beeline.bln_e(strcmd_s)
                except:
                    # delete the folder if summarization failed.
                    print " **** summarization failed, removed hdfs folder."
                    hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hour), r=True)
            else:
                print " file exists."

def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day,
        insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts)
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    hour_list = [str("%02d" % x) for x in range(24)]
    hour_list = [x for x in hour_list if x <= hourstamp]
    region_summary_retrial_max = 10

    # check if the summary has been performed on this particular hour (last hour)
    folders_day = '/'.join(str(config.hdfs_qos_rg_view_hour % (datestamp, '00')).split('/')[0:-1])
    # check if the summary folder for "this day" (datestamp) has been created or not, if not, create one
    if hdfsutil.test_dic(folders_day):
        hdfsutil.mkdir(folders_day)
    folders_in = [folders_day + '/hour=%s' % x for x in hour_list]
    folders_out = hdfsutil.ls(folders_day)
    folders_missing = [x for x in folders_in if x not in folders_out]
    folders_missing.sort(reverse=True)

    for item in folders_missing:
        hourstamp = item[-2:]
        print " **** missing data for day = %s, hour = %s." % (datestamp, hourstamp),
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " **** perform beeline for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                beeline.bln_e(strcmd_s)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except:
                    print " **** copy to local failed!"
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print " **** summarization failed upto #retrials=" + str(count_retrial)
                print " **** ",
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1

def main():
    # initialize the logger
    logging.basicConfig(filename=os.path.join('/home/testgrp/logs/', 'mapmon_summarize.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    timenow = int(time.time())
    datenow = str(datetime.date.today() - datetime.timedelta(1))
    date_idx = datenow[0:4] + datenow[5:7] + datenow[8:10]

    # get the latest barebone day_idx
    bb_day_idx = beeline.get_last_partitions('mapper.barebones').split('=')[1]
    logger.info("barebone index: day={}".format(bb_day_idx))

    # get the latest mpd yesterday
    uuid_list = [x.split('=')[-1] for x in hdfsutil.ls(os.path.join(os.path.dirname(config.hdfs_table),
                                                                    'mapper', 'mapmon',
                                                                    'day={}'.format(date_idx)))]

    for uuid_idx in uuid_list:
        logger.info("dealing with day={}, uuid={}".format(date_idx, uuid_idx))
        file_location = os.path.join(config.hdfs_table, 'mapmon_sum',
                                     'day={}'.format(date_idx),
                                     'mpd_uuid={}'.format(uuid_idx))
        if hdfsutil.test_dic(file_location):
            logger.info('creating folder: {}'.format(file_location))
            hdfsutil.mkdir(file_location)

        if hdfsutil.test_file(os.path.join(file_location, '000000_0.deflate')):
            f = open(os.path.join(config.mrqos_hive_query, 'mapmon_summarize.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (date_idx, uuid_idx, bb_day_idx, date_idx, uuid_idx, date_idx, uuid_idx)
            f.close()
            try:
                beeline.bln_e(strcmd_s)
            except:
                # delete the folder if summarization failed.
                logger.warn("summarization failed, removing hdfs folder.")
                hdfsutil.rm(file_location, r=True)
        else:
            logger.info(" file exists.")

def cleanup_mrqos_region_related_tables(datestamp, hour):
    tables = ['mrqos_region_hour', 'case_view_hour', 'region_view_hour']
    for table_item in tables:
        try:
            # drop partitions (ok even if partition does not exist)
            hiveql_str = 'use mrqos; alter table %s drop if exists partition(datestamp=%s, hour=%s)' % (table_item, str(datestamp), str(hour))
            beeline.bln_e(hiveql_str)
            # remove data from HDFS (ok even if folder in hdfs does not exist)
            hdfs_d = os.path.join(config.hdfs_table, table_item, 'datestamp=%s' % str(datestamp), 'hour=%s' % str(hour))
            hdfsutil.rm(hdfs_d, r=True)
        except sp.CalledProcessError:
            print ">> failed in hive table clean up in table: %s for partition datestamp=%s, hour=%s." % (table_item, str(datestamp), str(hour))
            pass

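# Usage sketch (hypothetical datestamp): the repair flow below drives this cleanup once per hour
# before rebuilding the partitions, e.g.
for _hour in ['%02d' % x for x in range(24)]:
    cleanup_mrqos_region_related_tables('20151207', _hour)
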
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'hive_table_cleanup.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    # ##############################
    ts = int(time.time())
    ts_timeout = ts - config.mrqos_table_delete * 24 * 3  # 3 days = (24*3) hours of time-out
    date_timeout = time.strftime('%Y%m%d', time.gmtime(float(ts_timeout)))
    # hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    # ##############################
    # target table: mrqos_region
    # ##############################
    list_to_clean = sorted(list(set([x.split('/')[0] for x in beeline.show_partitions('mrqos.mrqos_region').split('\n')])))
    list_to_clean = [x for x in list_to_clean if ('=' in x and x.split('=')[1] < date_timeout)]

    logger.info('handling table: mrqos_region')
    try:
        logger.info('removing the data in HDFS')
        # remove the hdfs folder
        for item in list_to_clean:
            hdfsutil.rm(os.path.join(config.hdfs_table, 'mrqos_region', '%s' % item), r=True)

        # alter the hive table: mrqos_region
        try:
            logger.info('drop partitions, condition: datestamp<%s' % str(date_timeout))
            beeline.drop_partitions(tablename='mrqos.mrqos_region', condition='datestamp<%s' % str(date_timeout))
        except sp.CalledProcessError as e:
            logger.error('drop partition failed')
            logger.error('error: %s' % e.message)
    except sp.CalledProcessError as e:
        logger.error('removed data from hdfs failed')
        logger.error('error: %s' % e.message)

    # ##############################
    # target table: maprule_info, mcm_machines
    # ##############################
    query_item = ['maprule_info', 'mcm_machines']
    for scan in query_item:
        logger.info('handling table: %s' % scan)
        list_to_clean = sorted(list(set([x.split('/')[0] for x in beeline.show_partitions('mrqos.%s' % scan).split('\n')])))
        list_to_clean = [x for x in list_to_clean if ('=' in x and int(x.split('=')[1]) < ts_timeout)]
        try:
            logger.info('removing the data in HDFS')
            # remove the hdfs folder
            for item in list_to_clean:
                hdfsutil.rm(os.path.join(config.hdfs_table, '%s' % scan, '%s' % item), r=True)

            # alter the hive table: maprule_info / mcm_machines
            try:
                logger.info('drop partitions, condition: ts<%s' % str(ts_timeout))
                beeline.drop_partitions(tablename='mrqos.%s' % scan, condition='ts<%s' % str(ts_timeout))
            except sp.CalledProcessError as e:
                logger.error('drop partition failed')
                logger.error('error: %s' % e.message)
        except sp.CalledProcessError as e:
            logger.error('removed data from hdfs failed')
            logger.error('error: %s' % e.message)

def main():
    # parameters
    # RAinput='/home/testgrp/RAAnalysis/ra_data/ra_msg/assignments_agg'
    # current time
    timenow = int(time.time())

    # #### RA PART ####
    for ra_concat_file in glob.glob(os.path.join(config.RAconcat, '*.txt')):
        infoitem = ra_concat_file.split('.')
        datestamp = infoitem[1]
        UUID = infoitem[2]
        STARTTIME = infoitem[3]
        ENDTIME = infoitem[4]
        print 'uuid=%s, starttime=%s, endtime=%s, datestamp=%s' % (UUID, STARTTIME, ENDTIME, datestamp)

        # upload ra_concat_file to HDFS
        print '*** uploading file to HDFS ' + ra_concat_file
        try:
            sp.check_call(['hadoop', 'fs', '-put', ra_concat_file, config.hdfs_ra_intermediate])
            sp.check_call(['rm', ra_concat_file])
            intermediate_file_name = ra_concat_file.split('/')[-1]
        except:
            print 'HDFS file upload error'
            # still remove the local file (keeps the concatenated files from accumulating)
            sp.check_call(['rm', ra_concat_file])
            continue  # check the next ra_concat_file

        # create corresponding HDFS directory
        # PIG will create the HDFS in the designated folder
        # run PIG script to utilize AVRO
        # example: HADOOP_USER_NAME=akamai; pig11 -p datestamp=20151201 -p uuid=0e0bda82-9823-11e5-b44e-300ed5c5f881 -p ts=1448980818 /home/testgrp/RAAnalysis/pig/csv_to_avro.pig
        print '*** pig serializes the data into HDFS for file ' + ra_concat_file
        cmd = '%s; %s -p datestamp=%s -p uuid=%s -p ts=%s %s; %s' % (config.cmd_hadoop_user_akamai,
                                                                     config.cmd_pig11,
                                                                     datestamp,
                                                                     UUID,
                                                                     STARTTIME,
                                                                     config.csv_to_avro_pig_script,
                                                                     config.cmd_hadoop_user_testgrp)
        # print cmd
        try:
            print 'try the pig script...'
            sp.check_call(cmd, shell=True)

            # pig log cleanup: _log directory and _SUCCESS file when successful
            this_ra_temp_hdfs_location = config.hdfs_ra_temp % (datestamp, UUID, STARTTIME)
            this_ra_map_hdfs_location = config.hdfs_ra_map % (datestamp, UUID, STARTTIME)

            # copy the file from ramap [PIG OUTPUT] to RA_map folder [HIVE]
            # (hdfs.cp is assumed to return a non-zero code on failure)
            print 'copy the file to RA_map folder'
            print 'HDFS copy RA-avro fail' if hdfs.cp(this_ra_temp_hdfs_location + '/part-r-00000.avro',
                                                      this_ra_map_hdfs_location) else 'HDFS copy RA-avro success'

            # remove the remainder in ramap [PIG output] folder (not fully clear yet)
            print 'remove the remainder in the ramap folder'
            cmd = '%s; hadoop fs -rm -r %s; %s' % (config.cmd_hadoop_user_akamai,
                                                   this_ra_temp_hdfs_location,
                                                   config.cmd_hadoop_user_testgrp)
            sp.check_call(cmd, shell=True)
            # cmd = '%s; hadoop fs -rm %s/_SUCCESS' % (config.cmd_hadoop_user_change,
            #                                          this_ra_map_hdfs_location)
            # sp.check_call(cmd, shell=True)

            # remove the remainder in the RA_pre_Avro folder
            print 'intermediate_file_name = ' + intermediate_file_name
            hdfs.rm(config.hdfs_ra_intermediate + '/' + intermediate_file_name)

            # update the HIVE table
            cmd = "hive -e 'use raana; MSCK REPAIR TABLE ra_map;'"
            sp.check_call(cmd, shell=True)
        except:
            print 'PIG script Error.'

def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day,
        insert when missing. """
    ts = calendar.timegm(time.gmtime())
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'cron_region_summary_hour.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # start the logging
    logger.info("###################")
    logger.info("# Performing the hourly mrqos_region summary")
    logger.info("# starting time: " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts)))
    logger.info("###################")

    # parameter: backfill length
    bf_length = config.region_summary_back_filling
    ts_last_couple_hour_list = [ts - (1 + x) * 3600 for x in range(bf_length)]

    for ts_last_hour in ts_last_couple_hour_list:
        datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
        hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
        region_summary_retrial_max = 10

        # ############################### #
        # The SUMMARY HOUR hive procedure #
        # ############################### #
        # logger.info(" **** summary hour tour: checking day = %s, hour = %s." % (datestamp, hourstamp))
        # check if the summary has been performed on this particular hour (last hour)
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')):
            logger.info("** region summary hour: checking day = %s, hour = %s, and file does not exist." % (datestamp, hourstamp))
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                tic = time.time()
                try:
                    beeline.bln_e(strcmd_s)
                    logger.info("BLN region summary hour success @ cost = %s sec." % str(time.time() - tic))
                    break
                except sp.CalledProcessError as e:
                    # delete the folder if summarization failed.
                    logger.info("BLN region summary hour failed @ cost = %s sec in retrial #%s" % (str(time.time() - tic), str(count_retrial)))
                    logger.exception("message")
                    hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1
        else:
            logger.info("** region summary hour: checking day = %s, hour = %s, and file exists." % (datestamp, hourstamp))

        # ############################ #
        # The CASE VIEW hive procedure #
        # ############################ #
        # check if the summary has been performed on this particular hour (last hour)
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
            logger.info("** case view hour: checking day = %s, hour = %s, and file does not exist." % (datestamp, hourstamp))
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result, 'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                try:
                    tic = time.time()
                    beeline.bln_e(strcmd_s)
                    logger.info("BLN case view hour success @ cost = %s sec." % str(time.time() - tic))
                    try:
                        beeline.bln_e_output(strcmd_g, query_result_file)
                    except sp.CalledProcessError as e:
                        logger.warning("copy to local failed, retrying...")
                        print e.message
                        try:
                            beeline.bln_e_output(strcmd_g, query_result_file)
                        except sp.CalledProcessError as e:
                            logger.error("copy to local failed again, abort.")
                            logger.exception("message")
                    break
                except sp.CalledProcessError as e:
                    # delete the folder if summarization failed.
                    logger.info("BLN case view hour failed @ cost = %s sec in retrial #%s" % (str(time.time() - tic), str(count_retrial)))
                    logger.exception("message")
                    hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1
        else:
            logger.info("** case view hour: checking day = %s, hour = %s, and file exists." % (datestamp, hourstamp))

        # ############################## #
        # The REGION VIEW hive procedure #
        # ############################## #
        # check if the summary has been performed on this particular hour (last hour)
        if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
            logger.info("** region view hour: checking day = %s, hour = %s, and file does not exist." % (datestamp, hourstamp))
            f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
            strcmd = f.read()
            strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
            f.close()
            strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
            query_result_file = os.path.join(config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
            count_retrial = 0
            while count_retrial < region_summary_retrial_max:
                try:
                    tic = time.time()
                    beeline.bln_e(strcmd_s)
                    logger.info("BLN region view hour success @ cost = %s sec." % str(time.time() - tic))
                    try:
                        beeline.bln_e_output(strcmd_g, query_result_file)
                    except sp.CalledProcessError as e:
                        logger.warning("copy to local failed, retrying...")
                        print e.message
                        try:
                            beeline.bln_e_output(strcmd_g, query_result_file)
                        except sp.CalledProcessError as e:
                            logger.error("copy to local failed again, abort.")
                            logger.exception("message")
                    break
                except sp.CalledProcessError as e:
                    # delete the folder if summarization failed.
                    logger.info("BLN region view hour failed @ cost = %s sec in retrial #%s" % (str(time.time() - tic), str(count_retrial)))
                    logger.exception("message")
                    hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                    count_retrial += 1
        else:
            logger.info("** region view hour: checking day = %s, hour = %s, and file exists." % (datestamp, hourstamp))

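# The try / clean-up-partial-output / retry loop above recurs in every hourly procedure and in
# the repair script further down. As an illustration only (this helper is not part of the
# original scripts), the pattern could be factored along these lines:
def run_hive_with_retry(hql, hdfs_partition_dir, logger, max_retrial=10):
    """Run a Hive statement via beeline; on failure remove the partial HDFS output
    and retry, up to max_retrial times. Returns True on success, False otherwise."""
    count_retrial = 0
    while count_retrial < max_retrial:
        tic = time.time()
        try:
            beeline.bln_e(hql)
            logger.info("BLN success @ cost = %s sec." % str(time.time() - tic))
            return True
        except sp.CalledProcessError:
            logger.info("BLN failed @ cost = %s sec in retrial #%s" % (str(time.time() - tic),
                                                                       str(count_retrial)))
            logger.exception("message")
            hdfsutil.rm(hdfs_partition_dir, r=True)
            count_retrial += 1
    return False
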
def main():
    # logging set-up
    logging.basicConfig(filename=os.path.join(config.mrqos_logging, 'io_ratio_join.log'),
                        level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logger = logging.getLogger(__name__)

    # ##############################
    # start the script
    # parameter setting
    ts = int(time.time())
    logger.info('########### ts=%s ###########' % str(ts))
    # datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts)))
    # hourstamp = time.strftime('%H', time.gmtime(float(ts)))

    # IO-Ratio Join:
    last_mrqos_region_partition = beeline.get_last_partitions('mrqos.mrqos_region')
    [datestamp, hourstamp, ts_region] = [x.split('=')[1] for x in last_mrqos_region_partition.split('/')]
    logger.info('MRQOS mrqos_region partition: datestamp=%s, hour=%s, ts_region=%s' % (datestamp, hourstamp, ts_region))

    mapruleinfo_partitions = [x for x in sorted(beeline.show_partitions('mrqos.maprule_info').split('\n'), reverse=True) if '=' in x]
    mapruleinfo_partitions = [x for x in mapruleinfo_partitions if x < 'ts=%s' % ts_region]
    ts_mapruleinfo = mapruleinfo_partitions[0].split('=')[1]
    logger.info('MRQOS maprule_info partition: ts_mapruleinfo=%s' % ts_mapruleinfo)

    region_summary_retrial_max = 10

    # ############################### #
    # The In-Out Ratio hive procedure #
    # ############################### #
    # check if the join has been performed on this particular hour (last hour)
    # print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_table, 'mrqos_ioratio',
                                       'datestamp=%s' % datestamp,
                                       'hour=%s' % hourstamp,
                                       'ts=%s' % ts_region,
                                       '000000_0.deflate')):
        logger.info(' Joined file does not exist.')
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_ioratio.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, ts_region, datestamp, hourstamp, ts_region, ts_mapruleinfo)
        print strcmd_s
        f.close()
        # strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        # query_result_file = os.path.join(config.mrqos_query_result, 'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                logger.info(' ****** success with time cost = %s.' % str(time.time() - tic))
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                logger.error(' ****** failed with time cost = %s upto # retrials=%s' % (str(time.time() - tic), str(count_retrial)))
                logger.error('error %s' % e.message)
                hdfsutil.rm(os.path.join(config.hdfs_table, 'mrqos_ioratio',
                                         'datestamp=%s' % datestamp,
                                         'hour=%s' % hourstamp,
                                         'ts=%s' % ts_region), r=True)
                count_retrial += 1
    else:
        logger.info(' Joined file exists.')

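# A small sketch of the partition bookkeeping used above, assuming beeline.get_last_partitions
# returns a spec like 'datestamp=.../hour=.../ts=...' and show_partitions returns one 'ts=...'
# line per partition (all values hypothetical):
_example_last = 'datestamp=20151207/hour=05/ts=1449464400'
_example_datestamp, _example_hour, _example_ts_region = [x.split('=')[1] for x in _example_last.split('/')]
# pick the newest maprule_info snapshot taken before ts_region
_example_maprule_parts = sorted(['ts=1449460000', 'ts=1449463000', 'ts=1449465000'], reverse=True)
_example_ts_mapruleinfo = [x for x in _example_maprule_parts
                           if x < 'ts=%s' % _example_ts_region][0].split('=')[1]
# _example_ts_mapruleinfo == '1449463000'
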
def rebuild_mrqos_region_related_tables(datestamp, hourstamp, region_summary_retrial_max=10):
    """ rebuild the summary hour / case view / region view partitions for the given (datestamp, hour) """
    # ############################### #
    # The SUMMARY HOUR hive procedure #
    # ############################### #
    print " **** summary hour tour:"
    # check if the summary has been performed on this particular hour
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')):
        print " file not exists,",
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                print " ****** success with time cost = %s." % str(time.time() - tic)
                # the repair does not copy the result to the SQLite DB
                break
            except:
                # delete the folder if summarization failed.
                print " ****** failed with time cost = %s upto # retrials=%s" % (str(time.time() - tic), str(count_retrial))
                hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."

    # ############################ #
    # The CASE VIEW hive procedure #
    # ############################ #
    print " **** case view tour:"
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        print " file not exists,",
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print " ****** success with time cost = %s." % str(time.time() - tic)
                # the repair does not copy the result to the SQLite DB
                break
            except:
                # delete the folder if summarization failed.
                print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time() - tic), str(count_retrial))
                hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."

    # ############################## #
    # The REGION VIEW hive procedure #
    # ############################## #
    print " **** region view tour:"
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        print " file not exists,",
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print " ****** success with time cost = %s." % str(time.time() - tic)
                # the repair does not copy the result to the SQLite DB
                break
            except:
                # delete the folder if summarization failed.
                print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time() - tic), str(count_retrial))
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."


def main(argv):
    """ get the date and hour for the specified day and hour. Clean (drop) and rebuild the table partition. """
    try:
        opts, args = getopt.getopt(argv, "qd:h:", ["datestamp=", "hour="])
    except getopt.GetoptError:
        print 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
        sys.exit(2)

    hour = ''
    datestamp = ''
    for opt, arg in opts:
        if opt == '-q':
            print 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
            sys.exit()
        elif opt in ("-d", "--datestamp"):
            datestamp = arg
        elif opt in ("-h", "--hour"):
            hour = arg

    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the repair of the mrqos_region summary"
    print "# starting processing time is " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts))
    print "###################"

    if not datestamp and not hour:
        print 'region_summary_hour_repair.py -d <datestamp> -h <hour>'
        sys.exit(2)

    print 'Fixing datestamp = %s' % datestamp
    if not hour:
        hour_list = [str("%02d" % x) for x in range(24)]
        print 'Fixing hour = %s' % hour_list
    else:
        print 'Fixing hour = %s' % hour

    # ts_last_hour = ts-3600
    # datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    # hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    # hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    print " #**** first perform table cleanups: "
    if not hour:
        for hourstamp in hour_list:
            cleanup_mrqos_region_related_tables(datestamp, hourstamp)
    else:
        hourstamp = hour
        cleanup_mrqos_region_related_tables(datestamp, hourstamp)

    print " #**** rebuild the db / table: "
    # the same summary-hour / case-view / region-view rebuild is applied to every hour being fixed
    if not hour:
        for hourstamp in hour_list:
            rebuild_mrqos_region_related_tables(datestamp, hourstamp, region_summary_retrial_max)
    else:
        rebuild_mrqos_region_related_tables(datestamp, hourstamp, region_summary_retrial_max)

def main():
    """ get the date and hour for the previous hour. Will check from the beginning of the day,
        insert when missing. """
    ts = calendar.timegm(time.gmtime())
    print "###################"
    print "# Performing the hourly mrqos_region summary"
    print "# starting processing time is " + str(ts) + " = " + time.strftime('GMT %Y-%m-%d %H:%M:%S', time.gmtime(ts))
    print "###################"
    ts_last_hour = ts - 3600
    datestamp = time.strftime('%Y%m%d', time.gmtime(float(ts_last_hour)))
    hourstamp = time.strftime('%H', time.gmtime(float(ts_last_hour)))
    # hour_list = [str("%02d" % x) for x in range(24)]
    region_summary_retrial_max = 10

    # ############################### #
    # The SUMMARY HOUR hive procedure #
    # ############################### #
    print " **** summary hour tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_hour % (datestamp, hourstamp), '000000_0.deflate')):
        print " file not exists,",
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_summarize_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "SELECT maprule, geoname, netname, region, avg_region_score, score_target, hourly_region_nsd_demand, hourly_region_eu_demand, hourly_region_ra_load, case_ra_load, case_nsd_demand, case_eu_demand, case_uniq_region, name, ecor, continent, country, city, latitude, longitude, provider, region_capacity, ecor_capacity, prp, numghosts, datestamp, hour FROM mrqos.mrqos_region_hour WHERE datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'region_summary_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary: day = %s, hour = %s. " % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            tic = time.time()
            try:
                beeline.bln_e(strcmd_s)
                print " ****** success with time cost = %s." % str(time.time() - tic)
                # try:
                #     beeline.bln_e_output(strcmd_g, query_result_file)
                # except:
                #     print " **** copy to local failed, retry!"
                #     beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print " ****** failed with time cost = %s upto # retrials=%s" % (str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."

    # ############################ #
    # The CASE VIEW hive procedure #
    # ############################ #
    print " **** case view tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        print " file not exists,",
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_case_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.case_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'case_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print " ****** success with time cost = %s." % str(time.time() - tic)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except sp.CalledProcessError as e:
                    print " **** copy to local failed, retry!"
                    print e.message
                    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_case_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."

    # ############################## #
    # The REGION VIEW hive procedure #
    # ############################## #
    print " **** region view tour:"
    # check if the summary has been performed on this particular hour (last hour)
    print " **** checking day = %s, hour = %s." % (datestamp, hourstamp),
    if hdfsutil.test_file(os.path.join(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), '000000_0.deflate')):
        print " file not exists,",
        f = open(os.path.join(config.mrqos_hive_query, 'mrqos_region_view_hour.hive'), 'r')
        strcmd = f.read()
        strcmd_s = strcmd % (datestamp, hourstamp, datestamp, hourstamp, datestamp, hourstamp)
        f.close()
        strcmd_g = "select * from mrqos.region_view_hour where datestamp=%s and hour=%s;" % (datestamp, hourstamp)
        query_result_file = os.path.join(config.mrqos_query_result, 'region_view_hour.%s.%s.csv' % (datestamp, hourstamp))
        print " BLN for hourly summary for day = %s, hour = %s." % (datestamp, hourstamp)
        count_retrial = 0
        while count_retrial < region_summary_retrial_max:
            try:
                tic = time.time()
                beeline.bln_e(strcmd_s)
                print " ****** success with time cost = %s." % str(time.time() - tic)
                try:
                    beeline.bln_e_output(strcmd_g, query_result_file)
                except sp.CalledProcessError as e:
                    print " **** copy to local failed, retry!"
                    print e.message
                    beeline.bln_e_output(strcmd_g, query_result_file)
                break
            except sp.CalledProcessError as e:
                # delete the folder if summarization failed.
                print " ****** failed with time cost = %s upto #retrials=%s" % (str(time.time() - tic), str(count_retrial))
                print e.message
                hdfsutil.rm(config.hdfs_qos_rg_view_hour % (datestamp, hourstamp), r=True)
                count_retrial += 1
    else:
        print " file exists."