Exemplo n.º 1
0
 def avg_with_cnt(self, data):
     """Return the record count and average value for every re-keyed group.

     Each ``(key, value)`` pair of ``data`` is re-keyed with
     ``adpu.build_key(self.combo, key)``. Values sharing a new key are
     folded into a ``(sum, count)`` accumulator, which is finally rendered
     as the string ``"<count>,<sum / count>"``.

     The lambdas index into their argument instead of using tuple
     parameters, because tuple-parameter unpacking was removed in Python 3
     (PEP 3113); this form parses and behaves the same on Python 2.

     :param data: collection of ``(key, value)`` pairs exposing ``map`` and
         ``combineByKey`` (presumably a Spark pair RDD -- confirm with callers).
     :return: collection of ``(label, "count,average")`` pairs.
     """
     # NOTE(review): under Python 2, ``sum / count`` truncates when the sum is
     # an int -- confirm the incoming values are floats.
     return data \
         .map(lambda kv: (adpu.build_key(self.combo, kv[0]), kv[1])) \
         .combineByKey(
             # First value seen for a key starts the (sum, count) accumulator.
             lambda value: (value, 1),
             # Fold one more value into an existing accumulator.
             lambda acc, value: (acc[0] + value, acc[1] + 1),
             # Merge two partial accumulators of the same key.
             lambda left, right: (left[0] + right[0], left[1] + right[1])) \
         .map(lambda pair:
              (pair[0], str(pair[1][1]) + "," + str(pair[1][0] / pair[1][1])))
Exemplo n.º 2
0
def main():
    """Parse the command-line arguments, start Spark and run the cube build.

    Reads the ``'input'`` and ``'master'`` entries returned by
    ``adpu.parse_arguments()``, loads the input as a text file and hands it
    to ``exec_build``. The SparkContext is stopped even when the build
    raises, so a failed run does not leave the context open.
    """
    # Build the arguments parser and parse the arguments
    args = adpu.parse_arguments()
    data_file = args['input']
    master = args['master']

    conf = SparkConf() \
        .setAppName('Cube Build Beta') \
        .setMaster(master) \
        .set("spark.hadoop.validateOutputSpecs", "false")  # TODO TEST PURPOSE, WILL BE REMOVED

    sc = SparkContext(conf=conf)
    try:
        data = sc.textFile(data_file)
        exec_build(data, args)
    finally:
        # Always release the context, including on failure.
        sc.stop()
Exemplo n.º 3
0
def exec_build(data, args):
    """Build, print and save one aggregated result set per combo.

    Steps, as performed below:

    1. Read ``'combo'``, ``'quarter'``, ``'output'`` and ``'target'`` from
       ``args``.
    2. Take the first line of ``data`` as the CSV header and locate the
       ``target`` column in it.
    3. Drop the header and keep only lines whose last field is in the months
       returned by ``adpu.get_12_months`` and whose fourth field is in
       ``adpu.all_states``.
    4. Sum the target column with ``Calculator.sum`` and cache the result.
    5. For every combo, join the filtered count/average with the min and the
       max, flatten each record to one comma-separated line, print it and
       save it under ``output + <first field of combo>``.

    Index-based lambdas and ``print(...)`` are used so the function parses on
    Python 3 as well as Python 2.

    :param data: lines of the input file (presumably a Spark RDD of CSV
        strings -- confirm with the caller).
    :param args: mapping holding the parsed command-line arguments.
    :raises ValueError: if ``target`` is not a column of the header
        (raised by ``list.index``).
    """
    # Get all the arguments from the parser
    combo_file = args['combo']
    qtr = args['quarter']
    output = args['output']
    target = args['target']

    # Find all months within 12 months
    all_months = adpu.get_12_months(qtr)

    all_combos = adpu.read_combo_file(combo_file)
    # TODO MIGHT BE REMOVED IF THE INPUT IS FROM HIVE TABLE
    header = data.first()

    # TODO NEED TO BE MODIFIED IF INPUT IS FROM HIVE TABLE
    # Position of the benchmark target variable/column in the data
    idx = header.strip().split(",").index(target)

    # Apply filters, months, states
    # TODO CREATE A FUNCTION TO APPLY A LIST OF FILTERS
    # TODO STATUS == 'A' OR STATUS == 'T'
    # TODO AND JOB_SCORE > 70.0
    # TODO AND ((RATE_TYPE == 'H' AND RATE_AMOUNT <500) OR (RATE_TYPE == 'S' AND RATE_AMOUNT < 40000))
    def keep_row(line):
        # Same checks, in the same order, as the former chain of three
        # filters, but each line is split only once.
        if line == header:
            return False
        fields = line.strip().split(",")
        return fields[-1] in all_months and fields[3] in adpu.all_states

    data = data.filter(keep_row)

    # Compute the total wage for each person within last 12 months
    cal = Calculator()
    cal.index = idx
    person_total = cal.sum(data=data)
    # Reused by every combo below, so keep it materialised.
    person_total.cache()

    # Iterate all combos
    for combo in all_combos:
        # Compute average
        calculator = Calculator(combo=combo)
        data_avg = calculator.avg_with_cnt(data=person_total)

        # Apply filter employee count > 180; the count is the first
        # comma-separated field of the "count,average" value.
        data_new = data_avg.filter(lambda kv: int(kv[1].split(",")[0]) > 180)

        # Compute min
        data_min = calculator.min(data=person_total)

        # Compute max
        data_max = calculator.max(data=person_total)

        # Flatten the joined value through its string form: split on commas
        # and strip quotes/parentheses from each piece.
        data_final = data_new.join(data_min).join(data_max) \
            .map(lambda kv: ",".join(kv[0].split(",") +
                [part.strip("'()") for part in str(kv[1]).split(",")])) \
            .repartition(1)

        # NOTE(review): collect() and saveAsTextFile() each evaluate
        # data_final; consider caching it if the printout is kept.
        for row in data_final.collect():
            print(row)

        output_path = output + combo.split(",")[0]
        data_final.saveAsTextFile(output_path)
Exemplo n.º 4
0
 def max(self, data):
     """Return the largest value for every re-keyed group.

     Each ``(key, value)`` pair of ``data`` is re-keyed with
     ``adpu.build_key(self.combo, key)``, then values sharing a new key are
     reduced to the greater one (the right-hand value wins on ties).

     The mapping lambda indexes into its argument instead of using a tuple
     parameter, because tuple-parameter unpacking was removed in Python 3
     (PEP 3113); this form parses and behaves the same on Python 2.

     :param data: collection of ``(key, value)`` pairs exposing ``map`` and
         ``reduceByKey`` (presumably a Spark pair RDD -- confirm with callers).
     :return: collection of ``(label, max_value)`` pairs.
     """
     return data \
         .map(lambda kv: (adpu.build_key(self.combo, kv[0]), kv[1])) \
         .reduceByKey(lambda a, b: a if a > b else b)