def run(args): conf_file = args['conf'] print "---------", conf_file conf = conf_parser.ConfParser(conf_file) conf.load('DeCardUser') input_dir = conf.get('input') if not input_dir: input_dir = os.path.join(conf.get('root_dir'), 'SplitbyProv', conf.get('province'), conf.get('month')) output_dir = args['output'] if not output_dir: output_dir = os.path.join(conf.get('root_dir'), 'DecardUser', conf.get('province'), conf.get('month')) cluster = conf.load_to_dict('cluster') # De Card User print 'de card user start' de_card_user_input = input_dir print de_card_user_input de_card_user_output = output_dir cluster['input_path'] = de_card_user_input cluster['output_path'] = de_card_user_output cluster['month'] = conf.get('month') cluster['province'] = conf.get('province') de_card_user_params = list() de_card_user_params.append(conf.get('month')) de_card_user_params.append(conf.get('month')) true_user_dir = os.path.join(conf.get('root_dir'), 'User', conf.get('province'), conf.get('month')) # start_time = args['params'][0] ##未写 # end_time = args['params'][1] ##未写 start_time = conf.get('start_time') end_time = conf.get('end_time') day_threshold = conf.get('day_threshold') hour_threshold = conf.get('hour_threshold') de_card_user_params.append(true_user_dir) de_card_user_params.append(start_time) de_card_user_params.append(end_time) de_card_user_params.append(day_threshold) de_card_user_params.append(hour_threshold) cluster['params'] = de_card_user_params cluster['main_class'] = conf.get('main_class') cluster['driver'] = conf.get('driver') print "params:\n", cluster print "in_dir:", input_dir print "out_dir:", output_dir de_card_user_task = runner.SparkJob(**cluster) de_card_user_task.run() print 'de card user end'
def run(args): conf_file = args['conf'] print "---------", conf_file conf = conf_parser.ConfParser(conf_file) conf.load('DeAbnormalUser') input_dir = conf.get('input') if not input_dir: input_dir = os.path.join(conf.get('root_dir'), 'JoinCoordin', conf.get('province'), conf.get('month')) output_dir = args['output'] if not output_dir: output_dir = os.path.join(conf.get('root_dir'), 'DeAbnormalUser', conf.get('province'), conf.get('month')) cluster = conf.load_to_dict('cluster') ## DeAbnormal User print 'deAbnormal user start' de_abnormal_user_input = input_dir print de_abnormal_user_input de_abnormal_user_output = output_dir cluster['input_path'] = de_abnormal_user_input cluster['output_path'] = de_abnormal_user_output cluster['month'] = conf.get('month') cluster['province'] = conf.get('province') de_abnormal_user_params = list() normal_user_dir = os.path.join(conf.get('root_dir'), 'User', conf.get('province'), conf.get('month')) abnormal_user_dir = os.path.join(conf.get('root_dir'), 'User', conf.get('province'), conf.get('month')) de_abnormal_user_params.append(conf.get('month')) de_abnormal_user_params.append(conf.get('province')) de_abnormal_user_params.append(normal_user_dir) de_abnormal_user_params.append(abnormal_user_dir) de_abnormal_user_params.append(conf.get('cc_threshold')) # 是否传参 de_abnormal_user_params.append(conf.get('sm_threshold')) # 是否传参 de_abnormal_user_params.append(conf.get('mm_threshold')) # 是否传参 cluster['params'] = de_abnormal_user_params cluster['main_class'] = conf.get('main_class') cluster['driver'] = conf.get('driver') print "params:\n", cluster print "in_dir:", input_dir print "out_dir:", output_dir de_abnormal_user_task = runner.SparkJob(**cluster) de_abnormal_user_task.run() print 'de abnormal user end'
def run(args): conf_file = args['conf'] print "---------", conf_file conf = conf_parser.ConfParser(conf_file) conf.load('MergeData') input_dir = conf.get('input') if not input_dir: input_dir = os.path.join(conf.get('root_dir'), 'DeAbnormalUser', conf.get('province'), conf.get('month')) output_dir = args['output'] if not output_dir: output_dir = os.path.join(conf.get('root_dir'), 'MergeData', conf.get('province'), conf.get('month')) cluster = conf.load_to_dict('cluster') # Merge Data # inputDir outputDir month province abNormalUserDir print 'merge data start' merge_data_input = input_dir merge_data_output = output_dir cluster['input_path'] = merge_data_input cluster['output_path'] = merge_data_output cluster['month'] = conf.get('month') cluster['province'] = conf.get('province') # params merge_data_params = list() merge_data_params.append(conf.get('month')) merge_data_params.append(conf.get('province')) abnormal_user_dir = os.path.join(conf.get('root_dir'), 'User', conf.get('province'), conf.get('month')) merge_data_params.append(abnormal_user_dir) cluster['params'] = merge_data_params cluster['main_class'] = conf.get('main_class') cluster['driver'] = conf.get('driver') print "params:\n", cluster print "in_dir:", input_dir print "out_dir:", output_dir merge_data_task = runner.SparkJob(**cluster) merge_data_task.run() print 'merge data end'
def run(args): conf_file = args['conf'] print "---------", conf_file conf = conf_parser.ConfParser(conf_file) conf.load('JoinCoordinate') input_dir = conf.get('input') if not input_dir: input_dir = os.path.join(conf.get('root_dir'), 'SelectField', conf.get('province'), conf.get('month')) output_dir = args['output'] if not output_dir: output_dir = os.path.join(conf.get('root_dir'), 'JoinCoordin', conf.get('province'), conf.get('month')) cluster = conf.load_to_dict('cluster') # Join Coordinate # JoinUserLatLontAndProvince inputDir outputDir month province CellListDir TeleUserInfoDir print 'join coordinate start' join_coordinate_input = input_dir join_coordinate_output = output_dir cluster['input_path'] = join_coordinate_input cluster['output_path'] = join_coordinate_output # cluster['month'] = month # cluster['province'] = province join_coordinate_params = list() join_coordinate_params.append(conf.get('month')) join_coordinate_params.append(conf.get('province')) join_coordinate_params.append(conf.get('cell_info_dir')) join_coordinate_params.append( conf.get('tele_usmergeDistinctNormalUserLocer_info_dir')) # 是否传参 cluster['params'] = join_coordinate_params cluster['main_class'] = conf.get('main_class') cluster['driver'] = conf.get('driver') print "params:\n", cluster print "in_dir:", input_dir print "out_dir:", output_dir join_coordinate_task = runner.SparkJob(**cluster) join_coordinate_task.run() print 'join coordinate end'
def run(args): conf_file = args['conf'] print "---------", conf_file conf = conf_parser.ConfParser(conf_file) conf.load('SelectFields') input_dir = conf.get('input') if not input_dir: input_dir = os.path.join(conf.get('root_dir'), 'SplitData', conf.get('province'), conf.get('month')) output_dir = args['output'] if not output_dir: output_dir = os.path.join(conf.get('root_dir'), 'SelectField', conf.get('province'), conf.get('month')) cluster = conf.load_to_dict('cluster') # Select Fields # inputDir outputDir month province print 'select fields start' select_fields_input = input_dir print select_fields_input select_fields_output = output_dir cluster['input_path'] = select_fields_input cluster['output_path'] = select_fields_output # params select_fields_params = list() select_fields_params.append(conf.get('month')) select_fields_params.append(conf.get('province')) cluster['params'] = select_fields_params cluster['main_class'] = conf.get('main_class') cluster['driver'] = conf.get('driver') print "params:\n", cluster print "in_dir:", input_dir print "out_dir:", output_dir select_fields_task = runner.SparkJob(**cluster) select_fields_task.run() print 'select_fields end'
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('ScenicAnalysis')  # load the default configuration for this module

    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'
    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'

    cluster = conf.load_to_dict('cluster')

    scenicAnalysis_input = list()
    scenicAnalysis_params = list()

    print "Check input (file src is listed as follows)"
    # input 1
    OtherProvTrueUserInfo_input = os.path.join(conf.get('root_dir'),
                                               'DataClean', province, month,
                                               '%sOther' % month,
                                               '%sTrueOther.csv' % month)
    print "OtherProvTrueUserInfo:" + OtherProvTrueUserInfo_input
    scenicAnalysis_input.append(OtherProvTrueUserInfo_input)
    # input 2
    OtherProvStopPoint_input = os.path.join(conf.get('root_dir'),
                                            'Extraction', province, month,
                                            '%sOtherStop.csv' % month)
    print "OtherProvStopPoint:" + OtherProvStopPoint_input
    scenicAnalysis_input.append(OtherProvStopPoint_input)
    # input 3
    LocalTrueUserInfo_input = os.path.join(conf.get('root_dir'), 'DataClean',
                                           province, month, '%sLocal' % month,
                                           '%sTrueLocal.csv' % month)
    print "LocalTrueUserInfo:" + LocalTrueUserInfo_input
    scenicAnalysis_input.append(LocalTrueUserInfo_input)
    # input 4
    CDRData_input = os.path.join(conf.get('root_dir'), 'SplitData', province,
                                 month, '%sCC.csv' % month)
    print "CDRData:" + CDRData_input
    scenicAnalysis_input.append(CDRData_input)
    # input 5
    ScenicData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                    'ScenicSpot', '%s.csv' % province)
    print "ScenicData:" + ScenicData_input
    scenicAnalysis_input.append(ScenicData_input)
    # input 6
    CellData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                  'BaseStation', '%s.csv' % province)
    print "CellData:" + CellData_input
    scenicAnalysis_input.append(CellData_input)

    # make sure every input exists on HDFS
    for inputfiles in scenicAnalysis_input:
        if not hdfs_util.exist(inputfiles):
            print >> sys.stderr, 'the input does not exist!'
            sys.exit(-1)

    # first jar parameter
    scenicAnalysis_params.append(conf.get('root_dir'))
    # second and third jar parameters
    scenicAnalysis_params.append(province)
    scenicAnalysis_params.append(month)

    print 'scenic spot analysis start!'
    cluster['input_path'] = '/user/tele/trip'
    # NOTE: the output path is currently hardcoded to a single province/month
    cluster['output_path'] = '/user/tele/trip/BackEnd/ScenicSpotAnalysis/HaiNan/201512'
    cluster['params'] = scenicAnalysis_params
    cluster['main_class'] = conf.load_to_dict('ScenicAnalysis').get('main_class')
    cluster['driver'] = conf.load_to_dict('ScenicAnalysis').get('driver')

    ScenicAnalysis_task = runner.SparkJob(**cluster)
    ScenicAnalysis_task.run()
    print 'scenic spot analysis end!'
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('LocalUserFeature')  # load the default configuration for this module

    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'
    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'

    cluster = conf.load_to_dict('cluster')

    input_dir = args['input']
    if not input_dir:
        input_dir = '/user/tele/trip'
    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'BackEnd',
                                  'LocalAnalysis', province, month,
                                  'LocalUserFeatureTemp.csv')

    scenicAnalysis_input = list()
    scenicAnalysis_params = list()

    print "Check input (file src is listed as follows)"
    # input 1
    LocalStopPoint_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                        province, month,
                                        '%sLocalStop.csv' % month)
    print "LocalStopPoint:" + LocalStopPoint_input
    scenicAnalysis_input.append(LocalStopPoint_input)
    # input 2
    UserHome_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                  province, month, '%sHome.csv' % month)
    print "UserHome:" + UserHome_input
    scenicAnalysis_input.append(UserHome_input)
    # input 3
    UserWork_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                  province, month, '%sWork.csv' % month)
    print "UserWork:" + UserWork_input
    scenicAnalysis_input.append(UserWork_input)
    # input 4
    LocalTrueUserInfo_input = os.path.join(conf.get('root_dir'), 'DataClean',
                                           province, month, '%sLocal' % month,
                                           '%sTrueLocal.csv' % month)
    print "LocalTrueUserInfo:" + LocalTrueUserInfo_input
    scenicAnalysis_input.append(LocalTrueUserInfo_input)
    # input 5
    CDRData_input = os.path.join(conf.get('root_dir'), 'SplitData', province,
                                 month, '%sCC.csv' % month)
    print "CDRData:" + CDRData_input
    scenicAnalysis_input.append(CDRData_input)
    # input 6
    ScenicData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                    'ScenicSpot', '%s.csv' % province)
    print "ScenicData:" + ScenicData_input
    scenicAnalysis_input.append(ScenicData_input)

    # make sure every input exists on HDFS
    for inputfiles in scenicAnalysis_input:
        if not hdfs_util.exist(inputfiles):
            print >> sys.stderr, 'the input does not exist!'
            sys.exit(-1)

    # jar parameters
    scenicAnalysis_params.append(province)
    scenicAnalysis_params.append(month)

    print 'Extract Local User Feature start!'
    cluster['input_path'] = input_dir
    cluster['output_path'] = output_dir
    cluster['params'] = scenicAnalysis_params
    cluster['main_class'] = conf.load_to_dict('LocalUserFeature').get('main_class')
    cluster['driver'] = conf.load_to_dict('LocalUserFeature').get('driver')

    ScenicAnalysis_task = runner.SparkJob(**cluster)
    ScenicAnalysis_task.run()
    print 'Extract Local User Feature end!'
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('LongestPath')  # load the default configuration for this module

    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'
    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'

    cluster = conf.load_to_dict('cluster')

    input_dir = args['input']
    if not input_dir:
        input_dir = '/user/tele/trip'
    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'Extraction',
                                  province, month,
                                  '%sReprestRtFromHmToWk.csv' % month)

    scenicAnalysis_input = list()
    scenicAnalysis_params = list()

    print "Check input (file src is listed as follows)"
    # input 1
    LocalStopPoint_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                        province, month,
                                        '%sLocalStop.csv' % month)
    print "LocalStopPoint:" + LocalStopPoint_input
    scenicAnalysis_input.append(LocalStopPoint_input)
    # input 2
    UserHome_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                  province, month, '%sHome.csv' % month)
    print "UserHome:" + UserHome_input
    scenicAnalysis_input.append(UserHome_input)
    # input 3
    UserWork_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                  province, month, '%sWork.csv' % month)
    print "UserWork:" + UserWork_input
    scenicAnalysis_input.append(UserWork_input)
    # input 4
    CellData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                  'BaseStation', '%s.csv' % province)
    print "CellData:" + CellData_input
    scenicAnalysis_input.append(CellData_input)

    # make sure every input exists on HDFS
    for inputfiles in scenicAnalysis_input:
        if not hdfs_util.exist(inputfiles):
            print >> sys.stderr, 'the input does not exist!'
            sys.exit(-1)

    # jar parameters
    scenicAnalysis_params.append(province)
    scenicAnalysis_params.append(month)

    print 'Extract Local Users Representative (Longest) Path From Home to Work start!'
    cluster['input_path'] = input_dir
    cluster['output_path'] = output_dir
    cluster['params'] = scenicAnalysis_params
    cluster['main_class'] = conf.load_to_dict('LongestPath').get('main_class')
    cluster['driver'] = conf.load_to_dict('LongestPath').get('driver')

    ScenicAnalysis_task = runner.SparkJob(**cluster)
    ScenicAnalysis_task.run()
    print 'Extract Local Users Representative (Longest) Path From Home to Work end!'
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('DeOscillation')  # load the default de-oscillation configuration

    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'
    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'

    user_type = ''
    if conf.has('user_type'):
        user_type = conf.get('user_type')
    if args['params']:
        # if user_type is passed on the command line, it must be the first positional parameter
        user_type = args['params'][0]
    if not user_type:
        assert False, 'the user_type is not set'

    input_dir = args['input']
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'DataClean', month)
    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'DeOscillation', month)

    cluster = conf.load_to_dict('cluster')

    # Stable Point
    print 'stable point start'
    if user_type == 'Local':
        stable_input = os.path.join(input_dir,
                                    '%sTrue%s.csv' % (month, user_type))
    else:
        stable_input = os.path.join(input_dir,
                                    '%sTotal%s.csv' % (month, user_type))
    print stable_input
    stable_output = os.path.join(output_dir,
                                 '%sStable%s.csv' % (month, user_type))
    stable_params = list()
    stable_params.append(
        conf.load_to_dict('StablePoint').get(
            'oscillation.stable.point.time.threshold', '15'))
    cluster['input_path'] = stable_input
    cluster['output_path'] = stable_output
    cluster['params'] = stable_params
    cluster['main_class'] = conf.load_to_dict('StablePoint').get('main_class')
    cluster['driver'] = conf.load_to_dict('StablePoint').get('driver')
    stable_task = runner.SparkJob(**cluster)
    stable_task.run()
    print 'stable point end'

    # Rules 1, 2 and 3
    print 'rule1 2 3 start'
    rule123_input = stable_output
    rule123_output = os.path.join(output_dir,
                                  '%sRule123%s.csv' % (month, user_type))
    rule123_params = list()
    rule123_params.append(
        conf.load_to_dict('Rule1').get('oscillation.rule1.time.threshold', '2'))
    rule123_params.append(
        conf.load_to_dict('Rule2').get('oscillation.rule2.time.threshold', '1'))
    rule123_params.append(
        conf.load_to_dict('Rule2').get('oscillation.rule2.distance.threshold', '10'))
    rule123_params.append(
        conf.load_to_dict('Rule3').get('oscillation.rule3.speed.threshold', '250'))
    rule123_params.append(
        conf.load_to_dict('Rule3').get('oscillation.rule3.distance.threshold', '50'))
    cluster['input_path'] = rule123_input
    cluster['output_path'] = rule123_output
    cluster['params'] = rule123_params
    cluster['main_class'] = conf.load_to_dict('Rule1').get('main_class')
    cluster['driver'] = conf.load_to_dict('Rule1').get('driver')
    rule1_task = runner.SparkJob(**cluster)
    rule1_task.run()
    print 'rule123 end'

    # Rule 4
    print 'rule4 start'
    rule4_input = rule123_output
    rule4_output = os.path.join(output_dir,
                                '%sRule4%s.csv' % (month, user_type))
    rule4_params = list()
    rule4_params.append(
        conf.load_to_dict('Rule4').get('oscillation.rule4.time.threshold', '60'))
    rule4_params.append(
        conf.load_to_dict('Rule4').get('oscillation.rule4.count.threshold', '3'))
    rule4_params.append(
        conf.load_to_dict('Rule4').get('oscillation.rule4.uniq.count.threshold', '2'))
    cluster['input_path'] = rule4_input
    cluster['output_path'] = rule4_output
    cluster['params'] = rule4_params
    cluster['main_class'] = conf.load_to_dict('Rule4').get('main_class')
    cluster['driver'] = conf.load_to_dict('Rule4').get('driver')
    rule4_task = runner.SparkJob(**cluster)
    rule4_task.run()
    print 'rule4 end'
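# A minimal usage sketch for the de-oscillation stage above. The dict keys mirror
# what run() reads; the conf path and the concrete month/province/user_type values
# are illustrative only. When user_type is passed via 'params' it must be the
# first positional parameter, as noted in the code.
if __name__ == '__main__':
    example_args = {
        'conf': 'conf/DeOscillation.conf',  # hypothetical config file location
        'month': '201512',
        'province': 'HaiNan',
        'params': ['Local'],  # user_type, e.g. 'Local'
        'input': '',          # empty -> <root_dir>/DataClean/<month>
        'output': '',         # empty -> <root_dir>/DeOscillation/<month>
    }
    run(example_args)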