def reducer(stream):
    """Sum the counts received for each key and emit one total per key.

    Every ``(airport, count)`` pair read from *stream* through
    ``mapred.iter_key_values`` is accumulated (``count`` is converted with
    ``int``); once the stream is exhausted, each airport is sent together
    with its accumulated total.
    """
    airport_acc = {}
    for airport, count in mapred.iter_key_values(stream):
        airport_acc[airport] = airport_acc.get(airport, 0) + int(count)
    # items() exists on both Python 2 and Python 3, whereas iteritems()
    # was removed in Python 3.
    for airport, count in airport_acc.items():
        mapred.send(airport, count)
def mapper(stream):
    """Emit ``(weekday name, (ArrDelay, 1))`` for every curated record.

    The 'DayOfWeek' field is converted with ``int`` and used as a 1-based
    index into the weekday names (1 -> 'Monday', ..., 7 -> 'Sunday').
    """
    weekday_names = [
        'Monday',
        'Tuesday',
        'Wednesday',
        'Thursday',
        'Friday',
        'Saturday',
        'Sunday'
    ]
    wanted = ['DayOfWeek', 'ArrDelay']
    records = mapred.iter_curated_fields(stream, wanted)
    for day_number, arr_delay in records:
        # Shift the 1-based day number to a 0-based list index.
        index = int(day_number) - 1
        mapred.send(weekday_names[index], (arr_delay, 1))
def reducer_step1(stream):
    """Implement the first reduction step.

    The output is keyed against day/day-part so that the specific
    KeySplitOutputFormat can split the files.

    For every distinct ``(key, origin, dest)`` combination read from
    *stream*, only the value whose delay (compared as a ``float``) is the
    smallest is retained; on a tie the value seen first is kept.  Each
    retained value is then sent under its original key.
    """
    least_delay_legs = {}
    for key, value in mapred.iter_key_values(stream):
        # Values are 7-element records; positions 1, 2 and 3 hold origin,
        # destination and delay (the layout sent by mapper_step1).
        _, origin, dest, delay, _, _, _ = value
        leg_key = (key, origin, dest)
        if leg_key not in least_delay_legs:
            least_delay_legs[leg_key] = value
        else:
            _, _, _, best_delay, _, _, _ = least_delay_legs[leg_key]
            # Strict comparison: an equal delay does not replace the
            # record already stored for this leg.
            if float(delay) < float(best_delay):
                least_delay_legs[leg_key] = value
    # items() exists on both Python 2 and Python 3, whereas iteritems()
    # was removed in Python 3.
    for (key, _, _), value in least_delay_legs.items():
        mapred.send(key, value)
def mapper_step1(stream):
    """Produce the keys used by the first problem reduction step.

    That step selects the single best flight for each
    day/day-part/origin/dest.  For every curated record, the 'CRSDepTime'
    field is read as a minute of the day (first two characters as hours,
    the remainder as minutes).  The record is then sent keyed by
    ``(date, period)``, where ``period`` is ``'AM'`` before 12:00 and
    ``'PM'`` otherwise.  Records whose time cannot be parsed are skipped.
    """
    fields = [
        'FlightDate', 'Origin', 'Dest', 'ArrDelay', 'CRSDepTime',
        'UniqueCarrier', 'FlightNum'
    ]
    records = mapred.iter_curated_fields(stream, fields)
    for date, origin, dest, delay, time, carrier, flight in records:
        try:
            minute_of_day = int(time[:2]) * 60 + int(time[2:])
        except (TypeError, ValueError):
            # Best effort: drop records with a missing or malformed time.
            # Only parsing failures are caught, so KeyboardInterrupt,
            # SystemExit and genuine programming errors are not hidden.
            continue
        period = 'AM' if minute_of_day < 12 * 60 else 'PM'
        mapred.send(
            (date, period),
            (date, origin, dest, delay, time, carrier, flight))
def mapper(stream):
    """Emit ``((origin, dest, carrier), (ArrDelay, 1))`` per curated record."""
    wanted = ['Origin', 'Dest', 'UniqueCarrier', 'ArrDelay']
    records = mapred.iter_curated_fields(stream, wanted)
    for src, dst, airline, arr_delay in records:
        route_carrier = (src, dst, airline)
        # The constant 1 travels alongside the delay value.
        mapred.send(route_carrier, (arr_delay, 1))
def reducer(stream):
    """Emit the accumulated result for each (origin, dest, carrier) key.

    The accumulation itself is delegated to
    ``mapred.mean_accumulator_reducer``; each entry of the mapping it
    returns is sent unchanged.
    NOTE(review): presumably the values are (mean, count) pairs, as the
    variable name suggests -- confirm against the helper.
    """
    depdest_carr_mean_delay = mapred.mean_accumulator_reducer(stream)
    # items() exists on both Python 2 and Python 3, whereas iteritems()
    # was removed in Python 3.
    for (origin, dest, carrier), mean_count in depdest_carr_mean_delay.items():
        mapred.send((origin, dest, carrier), mean_count)
def mapper(stream):
    """Emit ``((origin, dest), (DepDelay, 1))`` for every curated record."""
    wanted = ['Origin', 'Dest', 'DepDelay']
    records = mapred.iter_curated_fields(stream, wanted)
    for src, dst, dep_delay in records:
        route = (src, dst)
        # The constant 1 travels alongside the delay value.
        mapred.send(route, (dep_delay, 1))
def mapper(stream):
    """Emit a count of 1 for the origin, then the destination, of each record."""
    wanted = ['Origin', 'Dest']
    for endpoints in mapred.iter_curated_fields(stream, wanted):
        src, dst = endpoints
        # Origin is sent first, destination second.
        for airport in (src, dst):
            mapred.send(airport, 1)
def reducer(stream):
    """Emit the accumulated result for each day-of-week key.

    The accumulation itself is delegated to
    ``mapred.mean_accumulator_reducer``; each entry of the mapping it
    returns is sent unchanged.
    NOTE(review): presumably the values are (mean, count) pairs, as the
    variable name suggests -- confirm against the helper.
    """
    dow_mean_delay = mapred.mean_accumulator_reducer(stream)
    # items() exists on both Python 2 and Python 3, whereas iteritems()
    # was removed in Python 3.
    for dow, mean_count in dow_mean_delay.items():
        mapred.send(dow, mean_count)
def mapper(stream):
    """Emit ``(carrier, (ArrDelay, 1))`` for every curated record."""
    wanted = ['UniqueCarrier', 'ArrDelay']
    records = mapred.iter_curated_fields(stream, wanted)
    for airline, arr_delay in records:
        # The constant 1 travels alongside the delay value.
        payload = (arr_delay, 1)
        mapred.send(airline, payload)
def reducer(stream):
    """Emit the accumulated result for each carrier key.

    The accumulation itself is delegated to
    ``mapred.mean_accumulator_reducer``; each entry of the mapping it
    returns is sent unchanged.
    NOTE(review): presumably the values are (mean, count) pairs, as the
    variable name suggests -- confirm against the helper.
    """
    carrier_mean_delay = mapred.mean_accumulator_reducer(stream)
    # items() exists on both Python 2 and Python 3, whereas iteritems()
    # was removed in Python 3.
    for carrier, mean_count in carrier_mean_delay.items():
        mapred.send(carrier, mean_count)