def load_file(date_arg): #导文件模块,主要考虑:要导的文件存在不存在?需要有删除计算当天的分区步骤,文件没导成功怎么办 global WORKERS, FILE_HIVE_LIST, HIVE_DB, HIVE_TABLE # 5.1 设置全局变量 hive_status = check_hive_file() # 5.2 hive_status=True、False 检查是否存在文件 if not hive_status: log_str = 'click stream file parse fail: file count not equal WORKERS' print MyTime.get_local_time(), '-------------- ' + log_str log.error(log_str) #MyAlarm.send_mail_sms(log_str) # 5.3 假如文件不存在,生成告警日志,并发送文件 else: hive = MyHiveBin.HiveBin() # 调用hive模块 模块在 com/hive/bin dt = date_arg.replace('-', '') # "2016-11-30" 改变为 "20161130" hive.drop_partition( HIVE_DB, HIVE_TABLE, 'dt', dt ) # 5.4 假如文件存在,删掉计算当天的分区 HIVE_DB= 'to8to_rawdata' HIVE_TABLE='clickstream' partition_dict = {'dt': dt} for hive_file in FILE_HIVE_LIST: log_str = 'load file ' + hive_file + ' into hive begin' print MyTime.get_local_time(), '-------------- ' + log_str log.info(log_str) status = hive.load_file( HIVE_DB, HIVE_TABLE, hive_file, partition_dict) # status 注意,导数据成功后将 状态 True、False 赋值给 status if status is False: log_str = 'Load file ' + hive_file + ' into hive status:fail; Click stream parse exit' log.error(log_str) #MyAlarm.send_mail_sms(log_str) # 5.5 假如文件存在 但没导成功,将发送告警 return False
def load_file(date, today): global tar_src global tar_des global latest_time tar_src = tar_src.replace('xxxx-xx-xx', date) tar_des = tar_des.replace('xxxx-xx-xx', date) print tar_src latest_time_stamp = MyTime.datetime_timestamp( latest_time.replace('xxxx-xx-xx', today)) if not os.path.exists(tar_des): shell = "mkdir -p " + tar_des os.system(shell) while 1: if os.path.exists(tar_src): if file_modify_stat(tar_src): if MyTool.tar_file(tar_src, tar_des): for root, dirs, files in os.walk(tar_des): for d in dirs: print os.path.join(root, d) for f in files: file = os.path.join(root, f) shell = '/usr/bin/dos2unix ' + file os.popen(shell) hive = MyHiveBin.HiveBin() hive.load_file_single_overwrite( 'to8to_rawdata', file) log.info('Mysql actual kefu yuyue to hive status:ok') break else: log.info('Mysql actual tar file not exists') now_time_stamp = MyTime.datetime_timestamp(MyTime.get_local_time()) if now_time_stamp > latest_time_stamp: log.critical('not find ' + tar_src) MyAlarm.send_mail_sms('Get Mysql actual tar file status:fail!') return False time.sleep(time_rate) return True
# NOTE(review): this physical line is a collapsed/garbled extraction.  It fuses:
#   (1) the tail of an error branch from a function whose `def` is outside this
#       view (`log.error(...)` / commented-out alarm / `return False`),
#   (2) the whole of `main(date_arg)` (sets FILE_JSON_NAME/FILE_HIVE_PATH via
#       set_file_path, mkdirs FILE_HIVE_PATH, then on check_file() runs
#       get_file_size + click_stream),
#   (3) a module-level `log = MyLog.MyLog(...)` assignment whose true position
#       relative to main() cannot be determined from this collapsed form, and
#   (4) a truncated `if __name__ == '__main__':` guard with no body visible.
# Because the first `#` on the line syntactically turns the remainder into a
# comment, the code cannot be re-split here without guessing statement
# placement; it is left byte-identical.  TODO: recover the original line
# breaks from the source file before editing this block.
log.error(log_str) #MyAlarm.send_mail_sms(log_str) # 5.5 假如文件存在 但没导成功,将发送告警 return False def main(date_arg): global FILE_JSON_NAME, FILE_HIVE_PATH ## 3.1、首先设置全局变量,FILE_JSON_NAME = None FILE_HIVE_PATH = None set_file_path(date_arg) ## 3.2、设置文件路径 ,生成文件名列表 FILE_HIVE_LIST try: os.mkdir( FILE_HIVE_PATH ) ## 3.3 创建目录 FILE_HIVE_PATH :/data1/bi/platform/tar/2016-11-16/ClickStream/ except Exception, ex: print str(ex) pass print MyTime.get_local_time( ), '-------------- tar click stream file begin' # from cube import MyTime log.info( 'tar click stream file begin' ) # 写入日志 log = MyLog.MyLog(path='/data1/bi/platform/scripts/BI/ClickStream/log/', name='ClickStream', type='to8to', level='DEBUG') if check_file(): ## 3.4 检查json日志文件是否生成 print MyTime.get_local_time( ), '-------------- tar click stream file success, then process work' log.info('tar click stream file success') log.info('click stream to8to process work') get_file_size(FILE_JSON_NAME) ## 3.5 获得文件大小,这个有点多余 click_stream() ## 3.6 开始清洗 print MyTime.get_local_time(), '-------------- process success' log.info('click stream to8to process work success') if __name__ == '__main__':