def run(self, input_reader, output_writer):
    start_time = datetime.datetime.now()
    self._mapred.reset()
    print "INFO: start job %s on a single core" % self._mapred.__class__.__name__
    self._mapred.run_map(input_reader)
    if "combine" in dir(self._mapred):
        self._mapred.run_combine(self._mapred.data.items())
    if "reduce" not in dir(self._mapred):
        self._mapred.data_reduced = self._mapred.data
    else:
        self._mapred.run_reduce(self._mapred.data.items())
    output_writer.write(self._mapred.post_reduce())
    print "INFO: end job %s in %s with mem size of %d" % (self._mapred.__class__.__name__,
                                                          (datetime.datetime.now() - start_time),
                                                          mem.asizeof(self._mapred))
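
# Usage sketch for the single-core engine above (illustrative only: the engine,
# job, reader and writer names are assumptions, not guaranteed by this module):
#
#   job = WordCount()                       # a user-defined MapReduce job
#   engine = SingleCoreEngine(job)          # hypothetical constructor taking the job
#   engine.run(input_reader, output_writer)
#
# run() only needs an input_reader the job's run_map() can consume and an
# output_writer exposing write().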
def profile(self, input_reader, sample_size=100, max_memory=1000, core=cpu_count() - 1, hadoop_nodes=4):
    """
    Profile the MapReduce job against the input reader and return a recommendation and diagnostics.
    @param max_memory: SMP memory limit available for the job in MB (default: 1 GB)
    @return: recommended engine name, diagnostic data
    """
    diagnostics = {}
    if input_reader.is_distant():
        return HADOOP, diagnostics
    total_size = input_reader.get_estimated_size()
    map_delay = 0.0
    self.reset()
    if 'map' in dir(self):
        for line in input_reader.sample(sample_size):
            start = time.time()
            self.map(line)
            map_delay += time.time() - start
    elif 'map_partition' in dir(self):
        start = time.time()
        self.map_partition(input_reader.sample(sample_size))
        map_delay += time.time() - start
    else:
        raise Exception("ERROR: You have to implement a map() or map_partition() method")
    sample_size = sample_size if total_size >= sample_size else total_size
    mean_map_delay = map_delay / sample_size
    map_data_mem = mem.asizeof(self.data) / 1000000.0 * total_size / sample_size
    diagnostics['estimated-input-size'] = total_size
    diagnostics['mean-map-delay'] = mean_map_delay
    diagnostics['estimated-mem-size'] = map_data_mem
    if map_data_mem >= max_memory:
        engine = HADOOP
        diagnostics['estimated-delay'] = total_size * mean_map_delay / hadoop_nodes
    else:
        if mean_map_delay >= 1.0e-4:
            if map_data_mem * core >= max_memory:
                engine = HADOOP
                diagnostics['estimated-delay'] = total_size * mean_map_delay / hadoop_nodes
            else:
                engine = MULTI_CORE
                diagnostics['estimated-delay'] = total_size * mean_map_delay / (core if core > 0 else 1)
        else:
            engine = SINGLE_CORE
            diagnostics['estimated-delay'] = total_size * mean_map_delay
    return engine, diagnostics
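
# Worked example of the memory extrapolation above (numbers are illustrative):
# if a 100-line sample leaves 2 MB of map data (mem.asizeof(self.data) == 2e6)
# and the reader estimates 1,000,000 input lines, then
#   map_data_mem = 2e6 / 1000000.0 * 1000000 / 100 = 20000 MB
# which exceeds the default max_memory of 1000 MB, so HADOOP is recommended and
# the estimated delay is total_size * mean_map_delay / hadoop_nodes.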
def run(self, input_reader, output_writer, cpu=cpu_count() - 1, cache_line=100000):
    start_time = datetime.datetime.now()
    print "INFO: start job %s on %d cores" % (self._mapred.__class__.__name__, cpu)
    self._run_map(cpu, cache_line, input_reader)
    if "reduce" not in dir(self._mapred):
        self._mapred.data_reduced = self._mapred.data
    else:
        if len(self._mapred.data) < cpu:
            self._mapred.run_reduce(self._mapred.data.items())
        else:
            self._run_reduce(cpu)
    output_writer.write(self._mapred.post_reduce())
    print "INFO: end job %s in %s with mem size of %d" % (self._mapred.__class__.__name__,
                                                          (datetime.datetime.now() - start_time),
                                                          mem.asizeof(self))
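
# Minimal sketch of the kind of fan-out _run_map() and _run_reduce() perform,
# assuming multiprocessing.Pool; the real helpers are not shown here and also
# handle cache_line batching and merging of the per-worker dictionaries:
#
#   from multiprocessing import Pool
#
#   def _map_chunk(args):
#       mapred, chunk = args                      # job must be picklable
#       mapred.run_map(chunk)
#       return mapred.data
#
#   def run_map_sketch(mapred, cpu, lines):
#       chunks = [lines[i::cpu] for i in range(cpu)]
#       pool = Pool(processes=cpu)
#       try:
#           partials = pool.map(_map_chunk, [(mapred, c) for c in chunks])
#       finally:
#           pool.close()
#           pool.join()
#       return partials                           # per-worker dicts, still to merge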
def run(self, input_reader, output_writer):
    start_time = datetime.datetime.now()
    print "INFO: start job %s on hadoop" % (self._mapred.__class__.__name__)
    # hadoop = HadoopClient()
    hdfs_web_url = 'http://sandbox:50070/?user.name=predictiveds'  # getenv('HDFS_WEB_URL')
    (scheme, hostport, path, params, query, fragment) = urlparse(hdfs_web_url)
    host, port = hostport.split(':', 1)
    hadoop = WebHdfsClient(host, port, query)
    if getenv('POLYMR_HOME') is None:
        print "ERROR: $POLYMR_HOME has to be set to the polymr home directory"
        raise SystemError("$POLYMR_HOME has to be set to the polymr home directory")

    # Set metadata describing the input formatter
    format_class = input_reader.formatter.__class__
    input_source_file = inspect.getfile(format_class)
    input_module_name = format_class.__module__
    input_class_name = format_class.__name__
    input_format_source = "format = %s" % inspect.getsource(format_class)
    self._mapred.params['_input_meta'] = {
        'input_class_name': input_class_name,
        'input_module_name': input_module_name,
        'input_source': input_source_file,
        'input_options': input_reader.formatter.options
    }

    # Store params to broadcast to hadoop
    params_file_id = str(uuid.uuid1())
    cache_filename = '/var/tmp/%s' % params_file_id
    f = open(cache_filename, mode='w')
    f.write(json.dumps(self._mapred.params))
    f.close()

    # Manage the input types
    if input_reader.is_distant():
        hdfs_input = input_reader.filename
    else:
        hdfs_input = ".tmp/input-%s" % str(uuid.uuid1())
        hadoop.put_file(input_reader.to_file(), hdfs_input)
    if output_writer.is_distant():
        output_id = output_writer.filename
    else:
        output_id = ".tmp/output-%s" % str(uuid.uuid1())

    # Build and launch the hadoop streaming command
    cmds = "$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*streaming*.jar " \
           "-archives $POLYMR_HOME/polymr.zip#polymr " \
           "-files $POLYMR_HOME/streamer.py,%s,%s,%s " \
           "-input %s -output %s " \
           "-mapper 'streamer.py mapper %s %s %s'" % (self._source_file, cache_filename, input_source_file,
                                                      hdfs_input, output_id,
                                                      self._module_name, self._class_name, params_file_id)
    if "combine" in dir(self._mapred):
        cmds += " -combiner 'streamer.py combiner %s %s %s'" % (self._module_name, self._class_name, params_file_id)
    if "reduce" in dir(self._mapred):
        cmds += " -reducer 'streamer.py reducer %s %s %s'" % (self._module_name, self._class_name, params_file_id)
    print "INFO: %s" % cmds
    subprocess.check_output(cmds, shell=True)

    # Get the result back
    def load_line(line):
        key, value = line.split(";", 1)
        self._mapred.data_reduced[key] = [json.loads(value)]

    if output_writer.is_distant():
        pass  # nothing to do, the result already lives on HDFS
    elif output_writer.is_memory():
        output = hadoop.cat("%s" % output_id)
        map(load_line, output.strip().split("\n"))
        output_writer.write(self._mapred.post_reduce())
    else:
        hadoop.get_file(output_id, output_writer.filename)

    # Clean up temporary HDFS files
    if not input_reader.is_distant():
        hadoop.rm(hdfs_input)
    if not output_writer.is_distant():
        hadoop.rm(output_id)
    print "INFO: end job %s in %s with mem size of %d" % (self._mapred.__class__.__name__,
                                                          (datetime.datetime.now() - start_time),
                                                          mem.asizeof(self))
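
# For illustration, with placeholder values the streaming command printed above
# expands to something like the following (wordcount.py / WordCount stand in for
# the user's job module and class, <uuid> for the generated params_file_id):
#
#   $HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/contrib/streaming/hadoop-*streaming*.jar \
#       -archives $POLYMR_HOME/polymr.zip#polymr \
#       -files $POLYMR_HOME/streamer.py,wordcount.py,/var/tmp/<uuid>,formatter.py \
#       -input .tmp/input-<uuid> -output .tmp/output-<uuid> \
#       -mapper 'streamer.py mapper wordcount WordCount <uuid>' \
#       -combiner 'streamer.py combiner wordcount WordCount <uuid>' \
#       -reducer 'streamer.py reducer wordcount WordCount <uuid>'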