def __init__(self, worker_id, host, port):
    """Record the connection settings and create the worker's helpers.

    No connection is opened here; the final debug line only logs the
    target address.

    Args:
        worker_id: Identifier of this worker; appended to the logger name.
        host: Target address, stored on the instance and logged.
        port: Target port, stored on the instance and logged with ``%d``.
    """
    self.host, self.port = host, port
    self.worker_id = worker_id

    # Each worker owns one mapper and one reducer.
    self.mapper = Mapper()
    self.reducer = Reducer()

    logger_name = 'worker ' + str(self.worker_id)
    self.logger = logging.getLogger(logger_name)
    self.logger.debug('Worker connecting to %s:%d', self.host, self.port)
def set_attributes(self, submit_time, mapper_list, reducer_list, data_size_list):
    """Store the submit time and mappers, then build one Reducer per entry.

    For every position ``i`` in ``reducer_list`` a ``Reducer`` is created,
    named ``"R" + self.jobName[1:] + "-" + str(i)``, given
    ``data_size_list[i]`` and this object, configured through its own
    ``set_attributes`` and appended to ``self.reducerList``.

    Args:
        submit_time: Value stored as ``self.submitTime`` and forwarded to
            every reducer.
        mapper_list: Value stored as ``self.mapperList`` and forwarded to
            every reducer.
        reducer_list: One entry per reducer to create; each entry is passed
            as the first argument of that reducer's ``set_attributes``.
        data_size_list: Indexed in parallel with ``reducer_list``; must be
            at least as long, otherwise indexing raises ``IndexError``.
    """
    self.submitTime = submit_time
    self.mapperList = mapper_list

    for index, reducer_spec in enumerate(reducer_list):
        reducer_name = "R" + self.jobName[1:] + "-" + str(index)
        reducer = Reducer(reducer_name, data_size_list[index], self)
        reducer.set_attributes(reducer_spec, self.submitTime, mapper_list)
        self.reducerList.append(reducer)
def main():
    """Map three sample sentences, reduce the third result and print it."""
    mapper = Mapper()
    reducer = Reducer()

    # The first two results are never reduced, but the calls are kept
    # because mapper.map may carry state between invocations.
    first_mapping = mapper.map("Esta à Frase, fRASe tomas frase esta unica única")
    second_mapping = mapper.map("única Este cena\n. frase única")
    third_mapping = mapper.map("à frase à")

    # Only the last mapping is handed to the reducer, wrapped in a list.
    reduced = reducer.reduce([third_mapping])
    print("Reduced: ", reduced)
def start():
    """Run two map/reduce passes that produce ``PassengerData.csv``.

    Pass 1 maps ``AComp_Passenger_data.csv`` with ``mapDuplicates`` and
    reduces with ``redWrite`` into ``PassengerData.csv``.
    Pass 2 maps that freshly written file with ``mapSpelling`` and reduces
    again with the same reduce function and output path (neither is reset),
    overwriting the file in ``'w'`` mode.
    """
    mapper = Mapper()
    reducer = Reducer()

    # --- Pass 1: raw input -> PassengerData.csv ---
    mapper.setInputFile("./inputFiles/AComp_Passenger_data.csv")
    mapper.setMapFunction(StripErrorsUserCode.mapDuplicates)
    reducer.setRedFunction(StripErrorsUserCode.redWrite)
    reducer.setOutputFile("./inputFiles/PassengerData.csv")
    deduplicated = mapper.run()
    reducer.run(deduplicated, 'w')

    # --- Pass 2: PassengerData.csv -> PassengerData.csv (rewritten) ---
    # The mapper finishes reading before the reducer is asked to write.
    mapper.setInputFile("./inputFiles/PassengerData.csv")
    mapper.setMapFunction(StripErrorsUserCode.mapSpelling)
    spell_checked = mapper.run()
    reducer.run(spell_checked, 'w')
def start():
    """Run the Task 2 map/reduce job over the passenger data.

    Maps ``./inputFiles/PassengerData.csv`` with ``mapPassengerToFlight``,
    reduces with ``redCountPassengers`` and writes the result to
    ``./results/NumberOfPassengersOnEachFlight.csv`` in ``'w'`` mode.
    Judging by those names the output is presumably a passenger count per
    flight -- confirm against ``PassengersOnEachFlightUserCode``.
    """
    mapper = Mapper()
    reducer = Reducer()

    # Configure both ends of the job before running anything.
    mapper.setInputFile("./inputFiles/PassengerData.csv")
    mapper.setMapFunction(PassengersOnEachFlightUserCode.mapPassengerToFlight)
    reducer.setRedFunction(PassengersOnEachFlightUserCode.redCountPassengers)
    reducer.setOutputFile("./results/NumberOfPassengersOnEachFlight.csv")

    mapped_pairs = mapper.run()
    reducer.run(mapped_pairs, 'w')

    print(":: Task 2 complete")
def start():
    """Run the Task 3 map/reduce job over the passenger data.

    Maps ``./inputFiles/PassengerData.csv`` with ``mapReOrder``, reduces
    with ``redCalcFlightInfo`` and writes the result to
    ``./results/ListOfFlightsWithDurationAndAllPassengers.csv`` in ``'w'``
    mode. The exact row layout is defined by ``FlightInformationUserCode``.
    """
    mapper = Mapper()
    reducer = Reducer()

    # Configure both ends of the job before running anything.
    mapper.setInputFile("./inputFiles/PassengerData.csv")
    mapper.setMapFunction(FlightInformationUserCode.mapReOrder)
    reducer.setRedFunction(FlightInformationUserCode.redCalcFlightInfo)
    reducer.setOutputFile(
        "./results/ListOfFlightsWithDurationAndAllPassengers.csv")

    mapped_pairs = mapper.run()
    reducer.run(mapped_pairs, 'w')

    print(":: Task 3 complete")
import pickle
import sys

from Reducer import Reducer
from Trie import Trie

# Streaming reducer: reads "<key>\t<pickle file path>" lines from STDIN,
# merges the pickled tries of consecutive lines that share a key, and calls
# printWords on each merged trie once its key changes.
# Lines with the same key must be adjacent for the grouping to work.

current_key = None
current_trie = Trie()
key = None

# input comes from STDIN
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()

    # parse the input we got from mapper.py; a line without a tab raises
    # ValueError here.
    key, filename = line.split('\t', 1)

    # NOTE(review): pickle.load can execute arbitrary code while loading.
    # Only feed this script paths to pickle files produced by a trusted
    # mapper.
    # The context manager closes every file as soon as it is loaded; the
    # handle used to stay open for the rest of the run.
    with open(filename, mode='rb') as trie_file:
        t = pickle.load(trie_file)

    if current_key == key:
        # same key as the previous line: join tries
        Reducer.reduce(current_trie, t)
    else:
        if current_key:
            # key changed: traverse the finished trie before replacing it
            current_trie.printWords(path=current_key)
        current_trie = t
        current_key = key

# Flush the last group.
# NOTE(review): on empty input both names are still None, so this also calls
# printWords(path=None) on the initial empty Trie -- confirm that is harmless.
if current_key == key:
    current_trie.printWords(path=current_key)
class Worker():
    """Client that registers with a coordinator and answers its requests.

    Messages in both directions are framed as a decimal length, zero-padded
    to ``MAX_N_BYTES`` characters, followed by a JSON document of that
    length.
    """

    def __init__(self, worker_id, host, port):
        """Store the connection settings and create the mapper and reducer.

        Args:
            worker_id: Identifier sent on registration and used in the
                logger name.
            host: Target address, stored and logged.
            port: Target port, stored and logged with ``%d``.
        """
        self.host = host
        self.port = port
        self.worker_id = worker_id
        self.mapper = Mapper()
        self.reducer = Reducer()
        self.logger = logging.getLogger('worker ' + str(self.worker_id))
        self.logger.debug('Worker connecting to %s:%d', self.host, self.port)

    def parse_msg(self, msg):
        """Return ``msg`` prefixed with its zero-padded length header.

        The length is ``len(msg)`` in characters. It equals the encoded
        byte count for the ASCII-only output of ``json.dumps`` with its
        default settings, which is what this class passes in.
        """
        return str(len(msg)).zfill(MAX_N_BYTES) + msg

    def proccess_msg(self, msg):
        """Handle one decoded message and build the reply, if any.

        Args:
            msg: Mapping with a ``'task'`` key; map and reduce requests
                also carry a ``'value'`` key.

        Returns:
            A ``map_reply`` or ``reduce_reply`` dict for the matching
            request, otherwise ``None`` (``'done'`` and unknown tasks).
        """
        task = msg['task']
        if task == 'map_request':
            return {'task': 'map_reply',
                    'value': self.mapper.map(msg['value'])}
        if task == 'reduce_request':
            return {'task': 'reduce_reply',
                    'value': self.reducer.reduce(msg['value'])}
        if task != 'done':
            self.logger.debug('THIS IS NOT FOR ME: %s', task)
        return None

    def register(self):
        """Return the registration message for this worker."""
        return {'task': 'register', 'id': self.worker_id}

    async def _send(self, writer, message, host):
        """Serialize ``message`` as JSON, frame it and flush it to ``writer``."""
        framed = self.parse_msg(json.dumps(message))
        self.logger.info('Sending to: %s', host)
        writer.write(framed.encode())
        await writer.drain()

    async def _read_message(self, reader):
        """Read one framed message and return its decoded text.

        ``readexactly`` is used because a plain ``read(n)`` may return fewer
        than ``n`` bytes, which would truncate the header or the body. The
        body is decoded once, in one piece, so a multi-byte character can
        never be split across chunk boundaries.

        Returns:
            The message body as ``str``, or ``None`` when the peer closed or
            reset the connection before a whole message arrived.

        Raises:
            ValueError: If the header is not a decimal number.
        """
        try:
            header = await reader.readexactly(MAX_N_BYTES)
            total_size = int(header.decode())
            payload = await reader.readexactly(total_size)
        except (ConnectionResetError, asyncio.IncompleteReadError):
            return None
        return payload.decode()

    async def tcp_echo_client(self, host, port, loop):
        """Connect, register, then serve requests until the peer goes away.

        Args:
            host: Address to connect to.
            port: Port to connect to.
            loop: Kept for backward compatibility and ignored. The
                connection is opened on the running event loop, because
                ``asyncio.open_connection`` no longer accepts ``loop`` on
                Python 3.10+.
        """
        self.logger.debug('Openning connection')
        reader, writer = await asyncio.open_connection(host, port)
        try:
            # register on first time
            await self._send(writer, self.register(), host)

            while True:
                incoming = await self._read_message(reader)
                if incoming is None:
                    # give the backup coordinator time to start
                    await asyncio.sleep(3)
                    break

                self.logger.info('Received from: %s ', host)
                reply = self.proccess_msg(json.loads(incoming))
                if reply is not None:
                    await self._send(writer, reply, host)
        finally:
            # Close the writer even when an unexpected error escapes the loop.
            self.logger.info('Close the socket')
            writer.close()
def start():
    """Run the three chained map/reduce stages of Task 1.

    Stage 1 writes ``./results/Airports.csv`` from the airport list.
    Stage 2 appends to that same file (the reducer's output path is not
    changed) from the passenger data.
    Stage 3 maps the combined file and writes
    ``./results/NumberOfFlightsFromEachAirport.csv``.
    """
    mapper = Mapper()
    reducer = Reducer()

    # Stage 1: list every airport.
    # Output headings: 'Airport Code', 'Null'.
    mapper.setInputFile("./inputFiles/Top30_airports_LatLong.csv")
    mapper.setMapFunction(NoOfFlightsFromAirportsUserCode.mapUnusedAirports)
    reducer.setRedFunction(NoOfFlightsFromAirportsUserCode.redUnusedAirports)
    reducer.setOutputFile("./results/Airports.csv")
    listed_airports = mapper.run()
    reducer.run(listed_airports, 'w')

    # Stage 2: list the airports that occur in the passenger data.
    # Output headings: 'Airport Code', 'Number of flights from that airport'.
    # Mode 'a' adds these rows after the stage 1 rows.
    mapper.setInputFile("./inputFiles/PassengerData.csv")
    mapper.setMapFunction(NoOfFlightsFromAirportsUserCode.mapUsedAirports)
    reducer.setRedFunction(NoOfFlightsFromAirportsUserCode.redUsedAirports)
    departure_airports = mapper.run()
    reducer.run(departure_airports, 'a')

    # Stage 3: combine both results into the flights from each airport,
    # including airports that are never used.
    # Output headings: 'Airport code', 'No. of flights from that airport'.
    mapper.setMapFunction(NoOfFlightsFromAirportsUserCode.mapMakePairs)
    mapper.setInputFile("./results/Airports.csv")
    reducer.setOutputFile("./results/NumberOfFlightsFromEachAirport.csv")
    combined_airports = mapper.run()
    reducer.setRedFunction(NoOfFlightsFromAirportsUserCode.redCountFlights)
    reducer.run(combined_airports, 'w')

    print(":: Task 1 complete")
def start():
    """Run the three map/reduce stages of Task 4.

    Stage 1 maps the passenger data into ``./results/FlightDistances.csv``.
    Stages 2 and 3 both read that file and write
    ``./results/TotalDistanceTravelledByEachPassenger.csv`` and
    ``./results/DistanceOfEachFlight.csv`` respectively.
    Every stage writes in ``'w'`` mode.
    """
    mapper = Mapper()
    reducer = Reducer()

    # Stage 1: PassengerData.csv -> FlightDistances.csv
    mapper.setInputFile("./inputFiles/PassengerData.csv")
    mapper.setMapFunction(CalcDistanceUserCode.mapCalcFlightDistances)
    reducer.setRedFunction(CalcDistanceUserCode.redCalcFlightDistance)
    reducer.setOutputFile("./results/FlightDistances.csv")
    flight_pairs = mapper.run()
    reducer.run(flight_pairs, 'w')

    # Stage 2: FlightDistances.csv -> TotalDistanceTravelledByEachPassenger.csv
    mapper.setInputFile("./results/FlightDistances.csv")
    mapper.setMapFunction(CalcDistanceUserCode.mapTotalPassengerDistance)
    reducer.setRedFunction(CalcDistanceUserCode.redTotalPassengerDistance)
    reducer.setOutputFile(
        "./results/TotalDistanceTravelledByEachPassenger.csv")
    passenger_pairs = mapper.run()
    reducer.run(passenger_pairs, 'w')

    # Stage 3: FlightDistances.csv -> DistanceOfEachFlight.csv
    mapper.setInputFile("./results/FlightDistances.csv")
    mapper.setMapFunction(CalcDistanceUserCode.mapDistaces)
    reducer.setRedFunction(CalcDistanceUserCode.redDistances)
    reducer.setOutputFile("./results/DistanceOfEachFlight.csv")
    distance_pairs = mapper.run()
    reducer.run(distance_pairs, 'w')

    print(":: Task 4 complete")