Example #1
def run(args):
    conf_file = args['conf']
    print "---------", conf_file
    conf = conf_parser.ConfParser(conf_file)

    conf.load('DeCardUser')
    input_dir = conf.get('input')
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'SplitbyProv',
                                 conf.get('province'), conf.get('month'))

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'DecardUser',
                                  conf.get('province'), conf.get('month'))

    cluster = conf.load_to_dict('cluster')

    # De Card User
    print 'de card user start'
    de_card_user_input = input_dir
    print de_card_user_input
    de_card_user_output = output_dir
    cluster['input_path'] = de_card_user_input
    cluster['output_path'] = de_card_user_output
    cluster['month'] = conf.get('month')
    cluster['province'] = conf.get('province')
    de_card_user_params = list()
    de_card_user_params.append(conf.get('month'))
    de_card_user_params.append(conf.get('province'))
    true_user_dir = os.path.join(conf.get('root_dir'), 'User',
                                 conf.get('province'), conf.get('month'))

    # start_time = args['params'][0]  # not implemented yet
    # end_time = args['params'][1]  # not implemented yet

    start_time = conf.get('start_time')
    end_time = conf.get('end_time')
    day_threshold = conf.get('day_threshold')
    hour_threshold = conf.get('hour_threshold')
    de_card_user_params.append(true_user_dir)
    de_card_user_params.append(start_time)
    de_card_user_params.append(end_time)
    de_card_user_params.append(day_threshold)
    de_card_user_params.append(hour_threshold)
    cluster['params'] = de_card_user_params
    cluster['main_class'] = conf.get('main_class')
    cluster['driver'] = conf.get('driver')

    print "params:\n", cluster
    print "in_dir:", input_dir
    print "out_dir:", output_dir

    de_card_user_task = runner.SparkJob(**cluster)
    de_card_user_task.run()

    print 'de card user end'
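
All of the examples on this page share the same entry point: a run(args) function that takes a dict of command-line options, and they assume os, sys, conf_parser, runner and (in the later examples) hdfs_util are already imported from the surrounding project. A minimal invocation sketch, with purely hypothetical values:

if __name__ == '__main__':
    # hypothetical args for illustration; the real keys come from the CLI parser
    args = {
        'conf': 'conf/pipeline.conf',  # path to the project's config file
        'output': '',  # falsy -> the output dir is derived from the conf
    }
    run(args)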
Example #2
def run(args):
    conf_file = args['conf']
    print "---------", conf_file
    conf = conf_parser.ConfParser(conf_file)

    conf.load('DeAbnormalUser')
    input_dir = conf.get('input')
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'JoinCoordin',
                                 conf.get('province'), conf.get('month'))

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'DeAbnormalUser',
                                  conf.get('province'), conf.get('month'))

    cluster = conf.load_to_dict('cluster')

    # De Abnormal User
    print 'de abnormal user start'
    de_abnormal_user_input = input_dir
    print de_abnormal_user_input
    de_abnormal_user_output = output_dir
    cluster['input_path'] = de_abnormal_user_input
    cluster['output_path'] = de_abnormal_user_output
    cluster['month'] = conf.get('month')
    cluster['province'] = conf.get('province')
    de_abnormal_user_params = list()
    normal_user_dir = os.path.join(conf.get('root_dir'), 'User',
                                   conf.get('province'), conf.get('month'))
    # note: this currently resolves to the same 'User' path as normal_user_dir
    abnormal_user_dir = os.path.join(conf.get('root_dir'), 'User',
                                     conf.get('province'), conf.get('month'))
    de_abnormal_user_params.append(conf.get('month'))
    de_abnormal_user_params.append(conf.get('province'))
    de_abnormal_user_params.append(normal_user_dir)
    de_abnormal_user_params.append(abnormal_user_dir)
    de_abnormal_user_params.append(conf.get('cc_threshold'))  # TBD: pass as an argument instead?
    de_abnormal_user_params.append(conf.get('sm_threshold'))  # TBD: pass as an argument instead?
    de_abnormal_user_params.append(conf.get('mm_threshold'))  # TBD: pass as an argument instead?
    cluster['params'] = de_abnormal_user_params
    cluster['main_class'] = conf.get('main_class')
    cluster['driver'] = conf.get('driver')

    print "params:\n", cluster
    print "in_dir:", input_dir
    print "out_dir:", output_dir

    de_abnormal_user_task = runner.SparkJob(**cluster)
    de_abnormal_user_task.run()
    print 'de abnormal user end'
Example #3
def run(args):
    conf_file = args['conf']
    print "---------", conf_file
    conf = conf_parser.ConfParser(conf_file)

    conf.load('MergeData')
    input_dir = conf.get('input')
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'DeAbnormalUser',
                                 conf.get('province'), conf.get('month'))

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'MergeData',
                                  conf.get('province'), conf.get('month'))

    cluster = conf.load_to_dict('cluster')

    # Merge Data
    # inputDir outputDir month province abNormalUserDir
    print 'merge data start'
    merge_data_input = input_dir
    merge_data_output = output_dir
    cluster['input_path'] = merge_data_input
    cluster['output_path'] = merge_data_output
    cluster['month'] = conf.get('month')
    cluster['province'] = conf.get('province')

    # params
    merge_data_params = list()
    merge_data_params.append(conf.get('month'))
    merge_data_params.append(conf.get('province'))

    abnormal_user_dir = os.path.join(conf.get('root_dir'), 'User',
                                     conf.get('province'), conf.get('month'))
    merge_data_params.append(abnormal_user_dir)
    cluster['params'] = merge_data_params
    cluster['main_class'] = conf.get('main_class')
    cluster['driver'] = conf.get('driver')

    print "params:\n", cluster
    print "in_dir:", input_dir
    print "out_dir:", output_dir

    merge_data_task = runner.SparkJob(**cluster)
    merge_data_task.run()
    print 'merge data end'
Example #4
def run(args):
    conf_file = args['conf']
    print "---------", conf_file
    conf = conf_parser.ConfParser(conf_file)

    conf.load('JoinCoordinate')
    input_dir = conf.get('input')
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'SelectField',
                                 conf.get('province'), conf.get('month'))

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'JoinCoordin',
                                  conf.get('province'), conf.get('month'))
    cluster = conf.load_to_dict('cluster')

    # Join Coordinate
    # JoinUserLatLontAndProvince inputDir outputDir month province CellListDir TeleUserInfoDir
    print 'join coordinate start'
    join_coordinate_input = input_dir
    join_coordinate_output = output_dir
    cluster['input_path'] = join_coordinate_input
    cluster['output_path'] = join_coordinate_output
    # cluster['month'] = month
    # cluster['province'] = province
    join_coordinate_params = list()
    join_coordinate_params.append(conf.get('month'))
    join_coordinate_params.append(conf.get('province'))
    join_coordinate_params.append(conf.get('cell_info_dir'))
    join_coordinate_params.append(
        conf.get('tele_user_info_dir'))  # TBD: pass as an argument instead?
    cluster['params'] = join_coordinate_params
    cluster['main_class'] = conf.get('main_class')
    cluster['driver'] = conf.get('driver')

    print "params:\n", cluster
    print "in_dir:", input_dir
    print "out_dir:", output_dir

    join_coordinate_task = runner.SparkJob(**cluster)
    join_coordinate_task.run()
    print 'join coordinate end'
Example #5
def run(args):
    conf_file = args['conf']
    print "---------", conf_file
    conf = conf_parser.ConfParser(conf_file)

    conf.load('SelectFields')
    input_dir = conf.get('input')
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'SplitData',
                                 conf.get('province'), conf.get('month'))

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'SelectField',
                                  conf.get('province'), conf.get('month'))

    cluster = conf.load_to_dict('cluster')
    # Select Fields
    # inputDir outputDir month province
    print 'select fields start'
    select_fields_input = input_dir
    print select_fields_input
    select_fields_output = output_dir
    cluster['input_path'] = select_fields_input
    cluster['output_path'] = select_fields_output

    # params
    select_fields_params = list()
    select_fields_params.append(conf.get('month'))
    select_fields_params.append(conf.get('province'))

    cluster['params'] = select_fields_params
    cluster['main_class'] = conf.get('main_class')
    cluster['driver'] = conf.get('driver')

    print "params:\n", cluster
    print "in_dir:", input_dir
    print "out_dir:", output_dir

    select_fields_task = runner.SparkJob(**cluster)
    select_fields_task.run()
    print 'select fields end'
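
Read together, the default directories in Examples #1-#5 describe one pipeline: SplitData -> SelectField -> JoinCoordin -> DeAbnormalUser -> MergeData, with DeCardUser running separately off SplitbyProv. Every step drives the same small ConfParser interface: load() selects a section, get()/has() read keys from it, and load_to_dict() returns a whole section as a dict. A minimal stand-in for that interface (hypothetical; the project's real conf_parser module is not shown here), built on Python 2's ConfigParser:

import ConfigParser  # Python 2 stdlib INI parser


class ConfParser(object):
    """Hypothetical stand-in for conf_parser.ConfParser as used above."""

    def __init__(self, conf_file):
        self._parser = ConfigParser.ConfigParser()
        self._parser.read(conf_file)
        self._section = None

    def load(self, section):
        # select the active section for subsequent get()/has() calls
        self._section = section

    def has(self, key):
        return self._parser.has_option(self._section, key)

    def get(self, key, default=''):
        if self.has(key):
            return self._parser.get(self._section, key)
        return default

    def load_to_dict(self, section):
        # return every key/value pair of one section as a plain dict
        return dict(self._parser.items(section))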
Example #6
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('ScenicAnalysis')  # load this module's default parameter configuration
    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'

    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'
    cluster = conf.load_to_dict('cluster')

    scenicAnalysis_input = list()
    scenicAnalysis_params = list()
    print "Check input (file src is listed as follows)"
    #input 1
    OtherProvTrueUserInfo_input = os.path.join(conf.get('root_dir'),
                                               'DataClean', province, month,
                                               '%sOther' % month,
                                               '%sTrueOther.csv' % month)
    print "OtherProvTrueUserInfo:" + OtherProvTrueUserInfo_input
    scenicAnalysis_input.append(OtherProvTrueUserInfo_input)
    #input 2
    OtherProvStopPoint_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                            province, month,
                                            '%sOtherStop.csv' % month)
    print "OtherProvStopPoint:" + OtherProvStopPoint_input
    scenicAnalysis_input.append(OtherProvStopPoint_input)
    #input 3
    LocalTrueUserInfo_input = os.path.join(conf.get('root_dir'), 'DataClean',
                                           province, month, '%sLocal' % month,
                                           '%sTrueLocal.csv' % month)
    print "LocalTrueUserInfo:" + LocalTrueUserInfo_input
    scenicAnalysis_input.append(LocalTrueUserInfo_input)
    #input 4
    CDRData_input = os.path.join(conf.get('root_dir'), 'SplitData', province,
                                 month, '%sCC.csv' % month)
    print "CDRData:" + CDRData_input
    scenicAnalysis_input.append(CDRData_input)
    #input 5
    ScenicData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                    'ScenicSpot', '%s.csv' % province)
    print "ScenicData:" + ScenicData_input
    scenicAnalysis_input.append(ScenicData_input)
    #input 6
    CellData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                  'BaseStation', '%s.csv' % province)
    print "CellData:" + CellData_input
    scenicAnalysis_input.append(CellData_input)
    # check that every input exists on HDFS
    for inputfiles in scenicAnalysis_input:
        if not hdfs_util.exist(inputfiles):
            print >> sys.stderr, 'input does not exist:', inputfiles
            sys.exit(-1)
    # first parameter passed to the jar
    scenicAnalysis_params.append(conf.get('root_dir'))

    # second and third parameters passed to the jar
    scenicAnalysis_params.append(province)
    scenicAnalysis_params.append(month)
    print 'scenic spot analysis start!'
    cluster['input_path'] = '/user/tele/trip'
    # note: the output path is hard-coded rather than derived from province/month
    cluster['output_path'] = '/user/tele/trip/BackEnd/ScenicSpotAnalysis/HaiNan/201512'
    cluster['params'] = scenicAnalysis_params
    cluster['main_class'] = conf.load_to_dict('ScenicAnalysis').get(
        'main_class')
    cluster['driver'] = conf.load_to_dict('ScenicAnalysis').get('driver')
    ScenicAnalysis_task = runner.SparkJob(**cluster)
    ScenicAnalysis_task.run()
    print 'scenic spot analysis end!'
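
Examples #6-#8 guard the Spark submission by checking that every input exists. A plausible stand-in for hdfs_util.exist (hypothetical; not the project's actual helper) shells out to hadoop fs -test -e, which exits with status 0 exactly when the path exists on HDFS:

import subprocess


def exist(hdfs_path):
    # 'hadoop fs -test -e <path>' returns exit status 0 iff the path exists
    return subprocess.call(['hadoop', 'fs', '-test', '-e', hdfs_path]) == 0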
Example #7
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('LocalUserFeature')  # load this module's default parameter configuration
    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'

    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'
    cluster = conf.load_to_dict('cluster')

    input_dir = args['input']
    if not input_dir:
        input_dir = '/user/tele/trip'

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'BackEnd',
                                  'LocalAnalysis', province, month,
                                  'LocalUserFeatureTemp.csv')

    scenicAnalysis_input = list()
    scenicAnalysis_params = list()
    print "Check input (file src is listed as follows)"
    #input 1
    LocalStopPoint_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                        province, month,
                                        '%sLocalStop.csv' % month)
    print "LocalStopPoint:" + LocalStopPoint_input
    scenicAnalysis_input.append(LocalStopPoint_input)
    #input 2
    UserHome_input = os.path.join(conf.get('root_dir'), 'Extraction', province,
                                  month, '%sHome.csv' % month)
    print "UserHome:" + UserHome_input
    scenicAnalysis_input.append(UserHome_input)
    #input 3
    UserWork_input = os.path.join(conf.get('root_dir'), 'Extraction', province,
                                  month, '%sWork.csv' % month)
    print "UserWork:" + UserWork_input
    scenicAnalysis_input.append(UserWork_input)
    #input 4
    LocalTrueUserInfo_input = os.path.join(conf.get('root_dir'), 'DataClean',
                                           province, month, '%sLocal' % month,
                                           '%sTrueLocal.csv' % month)
    print "LocalTrueUserInfo:" + LocalTrueUserInfo_input
    scenicAnalysis_input.append(LocalTrueUserInfo_input)
    #input 5
    CDRData_input = os.path.join(conf.get('root_dir'), 'SplitData', province,
                                 month, '%sCC.csv' % month)
    print "CDRData:" + CDRData_input
    scenicAnalysis_input.append(CDRData_input)
    #input 6
    ScenicData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                    'ScenicSpot', '%s.csv' % province)
    print "ScenicData:" + ScenicData_input
    scenicAnalysis_input.append(ScenicData_input)
    # check that every input exists on HDFS
    for inputfiles in scenicAnalysis_input:
        if not hdfs_util.exist(inputfiles):
            print >> sys.stderr, 'input does not exist:', inputfiles
            sys.exit(-1)
    # parameters passed to the jar
    scenicAnalysis_params.append(province)
    scenicAnalysis_params.append(month)
    print 'Extract Local User Feature start!'
    cluster['input_path'] = input_dir
    cluster['output_path'] = output_dir
    cluster['params'] = scenicAnalysis_params
    cluster['main_class'] = conf.load_to_dict('LocalUserFeature').get(
        'main_class')
    cluster['driver'] = conf.load_to_dict('LocalUserFeature').get('driver')
    ScenicAnalysis_task = runner.SparkJob(**cluster)
    ScenicAnalysis_task.run()
    print 'Extract Local User Feature end!'
Example #8
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('LongestPath')  # load this module's default parameter configuration
    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'

    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'
    cluster = conf.load_to_dict('cluster')

    input_dir = args['input']
    if not input_dir:
        input_dir = '/user/tele/trip'

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'Extraction', province,
                                  month, '%sReprestRtFromHmToWk.csv' % month)

    scenicAnalysis_input = list()
    scenicAnalysis_params = list()
    print "Check input (file src is listed as follows)"

    #input 1
    LocalStopPoint_input = os.path.join(conf.get('root_dir'), 'Extraction',
                                        province, month,
                                        '%sLocalStop.csv' % month)
    print "LocalStopPoint:" + LocalStopPoint_input
    scenicAnalysis_input.append(LocalStopPoint_input)
    #input 2
    UserHome_input = os.path.join(conf.get('root_dir'), 'Extraction', province,
                                  month, '%sHome.csv' % month)
    print "UserHome:" + UserHome_input
    scenicAnalysis_input.append(UserHome_input)
    #input 3
    UserWork_input = os.path.join(conf.get('root_dir'), 'Extraction', province,
                                  month, '%sWork.csv' % month)
    print "UserWork:" + UserWork_input
    scenicAnalysis_input.append(UserWork_input)
    #input 4
    CellData_input = os.path.join(conf.get('root_dir'), 'BasicInfo',
                                  'BaseStation', '%s.csv' % province)
    print "CellData:" + CellData_input
    scenicAnalysis_input.append(CellData_input)
    # check that every input exists on HDFS
    for inputfiles in scenicAnalysis_input:
        if not hdfs_util.exist(inputfiles):
            print >> sys.stderr, 'input does not exist:', inputfiles
            sys.exit(-1)
    # parameters passed to the jar
    scenicAnalysis_params.append(province)
    scenicAnalysis_params.append(month)
    print 'Extract Local Users Representative(Longest) Path From Home to Work start!'
    cluster['input_path'] = input_dir
    cluster['output_path'] = output_dir
    cluster['params'] = scenicAnalysis_params
    cluster['main_class'] = conf.load_to_dict('LongestPath').get('main_class')
    cluster['driver'] = conf.load_to_dict('LongestPath').get('driver')
    ScenicAnalysis_task = runner.SparkJob(**cluster)
    ScenicAnalysis_task.run()
    print 'Extract Local Users Representative(Longest) Path From Home to Work end!'
Example #9
def run(args):
    conf_file = args['conf']
    conf = conf_parser.ConfParser(conf_file)
    conf.load('DeOscillation')  # load the default de-oscillation parameter configuration
    month = args['month']
    if not month:
        if conf.has('month'):
            month = conf.get('month')
        else:
            assert False, 'the month is not set'

    province = args['province']
    if not province:
        if conf.has('province'):
            province = conf.get('province')
        else:
            assert False, 'the province is not set'

    user_type = ''
    if conf.has('user_type'):
        user_type = conf.get('user_type')
    if args['params']:
        # if user_type is passed as a runtime parameter, it must be the first
        # positional (unnamed) argument
        user_type = args['params'][0]
    if not user_type:
        assert False, 'the user_type is not set'

    input_dir = args['input']
    if not input_dir:
        input_dir = os.path.join(conf.get('root_dir'), 'DataClean', month)

    output_dir = args['output']
    if not output_dir:
        output_dir = os.path.join(conf.get('root_dir'), 'DeOscillation', month)
    cluster = conf.load_to_dict('cluster')

    # Stable Point
    print 'stable point start'
    if user_type == 'Local':
        stable_input = os.path.join(input_dir,
                                    '%sTrue%s.csv' % (month, user_type))
    else:
        stable_input = os.path.join(input_dir,
                                    '%sTotal%s.csv' % (month, user_type))
    print stable_input
    stable_output = os.path.join(output_dir,
                                 '%sStable%s.csv' % (month, user_type))
    stable_params = list()
    stable_params.append(
        conf.load_to_dict('StablePoint').get(
            'oscillation.stable.point.time.threshold', '15'))
    cluster['input_path'] = stable_input
    cluster['output_path'] = stable_output
    cluster['params'] = stable_params
    cluster['main_class'] = conf.load_to_dict('StablePoint').get('main_class')
    cluster['driver'] = conf.load_to_dict('StablePoint').get('driver')
    stable_task = runner.SparkJob(**cluster)
    stable_task.run()
    print 'stable point end'

    # Rules 1, 2, 3
    print 'rule1 2 3 start'
    rule123_input = stable_output
    rule123_output = os.path.join(output_dir,
                                  '%sRule123%s.csv' % (month, user_type))
    rule123_params = list()
    rule123_params.append(
        conf.load_to_dict('Rule1').get('oscillation.rule1.time.threshold',
                                       '2'))
    rule123_params.append(
        conf.load_to_dict('Rule2').get('oscillation.rule2.time.threshold',
                                       '1'))
    rule123_params.append(
        conf.load_to_dict('Rule2').get('oscillation.rule2.distance.threshold',
                                       '10'))
    rule123_params.append(
        conf.load_to_dict('Rule3').get('oscillation.rule3.speed.threshold',
                                       '250'))
    rule123_params.append(
        conf.load_to_dict('Rule3').get('oscillation.rule3.distance.threshold',
                                       '50'))
    cluster['input_path'] = rule123_input
    cluster['output_path'] = rule123_output
    cluster['params'] = rule123_params
    cluster['main_class'] = conf.load_to_dict('Rule1').get('main_class')
    cluster['driver'] = conf.load_to_dict('Rule1').get('driver')
    rule1_task = runner.SparkJob(**cluster)
    rule1_task.run()
    print 'rule123 end'

    # Rule4
    print 'rule4 start'
    rule4_input = rule123_output
    rule4_output = os.path.join(output_dir,
                                '%sRule4%s.csv' % (month, user_type))
    rule4_params = list()
    rule4_params.append(
        conf.load_to_dict('Rule4').get('oscillation.rule4.time.threshold',
                                       '60'))
    rule4_params.append(
        conf.load_to_dict('Rule4').get('oscillation.rule4.count.threshold',
                                       '3'))
    rule4_params.append(
        conf.load_to_dict('Rule4').get(
            'oscillation.rule4.uniq.count.threshold', '2'))
    cluster['input_path'] = rule4_input
    cluster['output_path'] = rule4_output
    cluster['params'] = rule4_params
    cluster['main_class'] = conf.load_to_dict('Rule4').get('main_class')
    cluster['driver'] = conf.load_to_dict('Rule4').get('driver')
    rule4_task = runner.SparkJob(**cluster)
    rule4_task.run()
    print 'rule4 end'
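
Every example ends by handing the assembled cluster dict to runner.SparkJob(**cluster).run(). The runner itself is not shown on this page; a minimal sketch of what such a wrapper might do (hypothetical; it assumes the jar takes inputDir and outputDir followed by the job-specific params, as the comments in Examples #3-#5 suggest) is a thin spark-submit builder:

import subprocess


class SparkJob(object):
    """Hypothetical sketch of runner.SparkJob as used in the examples above."""

    def __init__(self, input_path=None, output_path=None, params=None,
                 main_class=None, driver=None, **extra):
        self.input_path = input_path
        self.output_path = output_path
        self.params = params or []
        self.main_class = main_class  # fully qualified entry class in the jar
        self.driver = driver  # path to the application jar
        self.extra = extra  # month, province, queue settings, ...

    def run(self):
        # assumed argument order: inputDir outputDir <job-specific params>
        cmd = ['spark-submit', '--class', self.main_class, self.driver,
               self.input_path, self.output_path]
        cmd += [str(p) for p in self.params]
        print 'submitting:', ' '.join(cmd)
        return subprocess.call(cmd)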