def setUpCluster(self):
    """ Setup Hadoop Cluster """
    self.startPilot()
    pcs = self.getPilotComputes()
    logger.info("Setup Hadoop Cluster")
    hadoopSetupTasks = []
    for i, pilot in enumerate(pcs):
        # DU that will hold the generated Hadoop configuration files.
        desc = util.getEmptyDU(self._pilots[i]['pilot_compute'])
        self._pilotInfo[i]['hadoopConfDir'] = self.compute_data_service.submit_data_unit(desc)
        # Pin the setup CU to the pilot hosting the configuration DU.
        setUpTask = util.setAffinity({}, self._pilotInfo[i]['hadoopConfDir'].data_unit_description)
        setUpTask['output_data'] = [
            {self._pilotInfo[i]['hadoopConfDir'].get_url():
                 ['mapred-site.xml', 'core-site.xml', 'slaves']}
        ]
        setUpTask['executable'] = "python"
        nodes = pilot.get_nodes()
        setUpTask['arguments'] = [self._setupScript, ",".join(nodes)]
        hadoopSetupTasks.append(self.compute_data_service.submit_compute_unit(setUpTask))
    util.waitCUs(hadoopSetupTasks)
    logger.info("Cluster ready")
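# A minimal sketch of the affinity helper used throughout this module.
# util.setAffinity lives in this project's util module; the label keys below
# are an assumption based on the Pilot-API convention of co-locating compute
# units with the pilot that hosts a data unit via affinity labels.
def _setAffinitySketch(cuDesc, duDesc):
    """ Illustrative only: pin a CU description to the DU's pilot. """
    cuDesc = dict(cuDesc)  # avoid mutating the caller's description
    cuDesc['affinity_datacenter_label'] = duDesc.get('affinity_datacenter_label')
    cuDesc['affinity_machine_label'] = duDesc.get('affinity_machine_label')
    return cuDesc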
def _chunk(self):
    """ Chunks input data if a Chunk task is defined """
    if self._chunkDesc:
        # For each file in every input DU, create a chunk task.
        logger.debug("Chunking input data")
        chunkCUs = []
        try:
            for inputDu in self._inputDus:
                # DU that collects the chunks produced from this input DU.
                temp = util.getEmptyDU(inputDu.data_unit_description)
                temp = self.compute_data_service.submit_data_unit(temp)
                temp.wait()
                for fName in inputDu.list_files():
                    # Copy the user-defined chunk description and pin it to
                    # the pilot hosting the input DU.
                    chunkTask = util.setAffinity(copy.copy(self._chunkDesc), inputDu.data_unit_description)
                    # Pass the input filename and the output prefix as arguments.
                    chunkTask['arguments'] = [fName, "%s-%s" % (fName, constant.CHUNK_FILE_PREFIX)]
                    # Collect the chunked files in the output DU.
                    chunkTask['output_data'] = [{temp.get_url(): ['*-chunk-*']}]
                    # Stage the input file into the chunk CU.
                    chunkTask["input_data"] = [{inputDu.get_url(): [fName]}]
                    if self._chunkExe is not None:
                        chunkTask["input_data"].append(self._chunkExe.get_url())
                    chunkCUs.append(self.compute_data_service.submit_compute_unit(chunkTask))
                self._chunkDus.append(temp)
            # Wait for the chunk DUs and CUs.
            logger.debug("Wait for chunk DUS/CUS")
            util.waitDUs(self._chunkDus)
            util.waitCUs(chunkCUs)
        except Exception as ex:
            self._clean(ex, "Chunk failed - Abort")
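# The chunk CU above invokes a user-defined script as
#   python <script> <input file> <input file>-<constant.CHUNK_FILE_PREFIX>
# and collects everything matching '*-chunk-*' into the output DU. A minimal
# chunker honouring that contract might look like this (illustrative sketch;
# the line-based splitting and the chunk size are assumptions, the real
# chunker is supplied by the user):
def chunkBySize(inFile, outPrefix, linesPerChunk=100000):
    """ Illustrative: split inFile into numbered '<outPrefix>-<n>' pieces. """
    part, lines = 0, []
    for line in open(inFile):
        lines.append(line)
        if len(lines) == linesPerChunk:
            with open("%s-%d" % (outPrefix, part), "w") as out:
                out.writelines(lines)
            part, lines = part + 1, []
    if lines:
        with open("%s-%d" % (outPrefix, part), "w") as out:
            out.writelines(lines)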
def _loadExecutables(self):
    """ Loads executables into Pilot-Data """
    # Stage the user-supplied chunk/map/reduce scripts into their own DUs so
    # the corresponding CUs can reference them by URL.
    if self._chunkDesc and self._chunkDesc.get('files', None):
        desc = util.getEmptyDU(self._pilots[0]['pilot_compute'])
        desc['file_urls'] = self._chunkDesc['files']
        self._chunkExe = self.compute_data_service.submit_data_unit(desc)
    if self._mapDesc and self._mapDesc.get('files', None):
        desc = util.getEmptyDU(self._pilots[0]['pilot_compute'])
        desc['file_urls'] = self._mapDesc['files']
        self._mapExe = self.compute_data_service.submit_data_unit(desc)
    if self._reduceDesc and self._reduceDesc.get('files', None):
        desc = util.getEmptyDU(self._pilots[0]['pilot_compute'])
        desc['file_urls'] = self._reduceDesc['files']
        self._reduceExe = self.compute_data_service.submit_data_unit(desc)
    # Wait for the executable DUs; a phase without a 'files' entry leaves
    # its executable DU as None.
    util.waitDUs([du for du in (self._chunkExe, self._mapExe, self._reduceExe)
                  if du is not None])
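# Illustrative (assumed) shape of the descriptions consumed above: the
# optional 'files' key lists URLs of user scripts to stage into Pilot-Data.
# The script name and path are hypothetical.
exampleMapDesc = {
    'executable': 'python',
    'arguments': ['wc-mapper.py'],
    'files': ['file://localhost/home/user/wc-mapper.py'],
}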
def _reduce(self):
    """ Reduce Phase """
    logger.debug("Creating DUS to store Reduce Output results")
    # Create a single DU to collect the output of all reduce tasks.
    temp = util.getEmptyDU(self._pilots[0]['pilot_compute'])
    self._outputDu = self.compute_data_service.submit_data_unit(temp)
    util.waitDUs([self._outputDu])
    reduceCUs = []
    pdString = "%s:%s" % (self.pdUrl.netloc, self.pdUrl.path)
    outputDir = os.path.join(pdString, self._outputDu.get_url().split(":")[-1])
    reduceArgs = self._reduceDesc.get('arguments', [])
    # Register the map partition files, which were written directly into the
    # reduce DU directories, with their DUs.
    rtemp = []
    for rdu in self.reduceDus:
        mapOutPath = os.path.join(self.pdUrl.path, rdu.get_url().split(":")[-1])
        rduFiles = [os.path.join(mapOutPath, f) for f in os.listdir(mapOutPath)]
        rdu.add_files(rduFiles, exists=True)
        rtemp.append(rdu)
    util.waitDUs(rtemp)
    # Create one reduce task per reduce DU.
    try:
        logger.debug("Creating & submitting Reduce tasks")
        for rdu in self.reduceDus:
            reduceTask = util.setAffinity(copy.copy(self._reduceDesc), rdu.data_unit_description)
            reduceTask['input_data'] = [rdu.get_url()]
            if self._iterOutputPrefixes:
                reduceFiles = [pref + "*" for pref in self._iterOutputPrefixes]
            else:
                reduceFiles = ['reduce-*']
            reduceTask['arguments'] = [outputDir, ",".join(reduceFiles)] + reduceArgs
            if self._reduceExe is not None:
                reduceTask["input_data"].append(self._reduceExe.get_url())
            reduceCUs.append(self.compute_data_service.submit_compute_unit(reduceTask))
        # Wait for the reduce CUs.
        util.waitCUs(reduceCUs)
    except Exception as ex:
        self._clean(ex, "Reduce Phase failed - Abort")
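# Each reduce CU is launched with arguments [outputDir, <comma-separated file
# patterns>, <user args>...]: the staged partition files land in the CU's
# working directory, and the reducer merges everything matching the patterns
# and writes the result under outputDir. A word-count sketch of that contract
# (illustrative only; the real reducer is user-defined):
import glob, os

def reduceMain(outputDir, patterns):
    """ Illustrative: merge 'key<TAB>count' partition files. """
    counts = {}
    for pattern in patterns.split(","):
        for fname in glob.glob(pattern):
            for line in open(fname):
                key, value = line.split("\t", 1)
                counts[key] = counts.get(key, 0) + int(value)
    with open(os.path.join(outputDir, "part-0"), "w") as out:
        for key in sorted(counts):
            out.write("%s\t%d\n" % (key, counts[key]))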
def _loadInputData(self):
    """ Loads input data into Pilot-Data """
    for pilot in self._pilots:
        if pilot['input_url'].startswith('redis'):
            # Already uploaded: reconnect to the existing Pilot-Data DU.
            self._inputDus.append(util.getDuUrl(pilot['input_url']))
        else:
            # Upload the input files into a new DU and record its URL so the
            # description can be reused on later runs.
            desc = util.getEmptyDU(pilot['pilot_compute'])
            desc['file_urls'] = util.getFileUrls(pilot['input_url'], self._pdFTP)
            temp = self.compute_data_service.submit_data_unit(desc)
            pilot['input_url'] = temp.get_url()
            self._inputDus.append(temp)
    util.waitDUs(self._inputDus)
    logger.debug("New Pilot-MapReduce descriptions with updated PD URLs; "
                 "use these descriptions to reuse already uploaded data")
    for pilot in self._pilots:
        logger.debug(pilot)
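# After _loadInputData runs once, each description's 'input_url' has been
# rewritten to the DU's URL in Pilot-Data (logged above). Feeding those
# descriptions back in on a later run takes the 'redis' branch and reconnects
# to the already-uploaded data instead of staging it again. Hypothetical
# shape of the rewritten field:
#
#   pilot['input_url'] = 'redis://<host>:<port>/<du-id>'   # value is illustrative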
def _map(self):
    """ Map Phase """
    # Create the output DUs, one per reduce task, which collect the results
    # of all map tasks.
    logger.debug("Creating DUS to store Map Output results")
    for _ in range(self._nbrReduces):
        temp = util.getEmptyDU(self._pilots[0]['pilot_compute'])
        self.reduceDus.append(self.compute_data_service.submit_data_unit(temp))
    util.waitDUs(self.reduceDus)
    pdString = "%s:%s" % (self.pdUrl.netloc, self.pdUrl.path)
    rduDirs = [os.path.join(pdString, rdu.get_url().split(":")[-1]) for rdu in self.reduceDus]
    rduString = ",".join(rduDirs)
    # Create one task per chunk across all chunk DUs.
    mapCUs = []
    try:
        logger.debug("Creating & submitting Map tasks")
        for cdu in self._chunkDus:
            for cfName in cdu.list_files():
                mapTask = util.setAffinity(copy.copy(self._mapDesc), cdu.data_unit_description)
                mapTask['arguments'] = [cfName, rduString] + self._mapDesc.get('arguments', [])
                mapTask["input_data"] = [{cdu.get_url(): [cfName]}]
                if self._iterDu:
                    # Stage the DU carrying the previous iteration's output
                    # and pass its filenames along as extra arguments.
                    mapTask["input_data"].append(self._iterDu.get_url())
                    for dui in self._iterDu.to_dict()["data_unit_items"]:
                        mapTask["arguments"].append(dui.__dict__["filename"])
                if self._mapExe is not None:
                    mapTask["input_data"].append(self._mapExe.get_url())
                mapCUs.append(self.compute_data_service.submit_compute_unit(mapTask))
        # Wait for the map CUs.
        util.waitCUs(mapCUs)
    except Exception as ex:
        self._clean(ex, "Map Phase failed - Abort")
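# Each map CU receives the chunk filename plus the comma-separated list of
# reduce-DU directories, and must scatter its partition files across those
# directories so that _reduce can later bind each directory's files to one
# reduce task. A hash-partitioning word-count sketch of that contract
# (illustrative only; the real mapper is user-defined, and the per-task
# filename suffix is an assumption to keep concurrent maps from clobbering
# each other while still matching the 'reduce-*' pattern used above):
import os

def scatterPartitions(chunkFile, rduString):
    """ Illustrative: hash-partition 'key<TAB>count' output across reduce dirs. """
    rduDirs = rduString.split(",")
    parts = [{} for _ in rduDirs]
    for line in open(chunkFile):
        for word in line.split():
            bucket = parts[hash(word) % len(rduDirs)]
            bucket[word] = bucket.get(word, 0) + 1
    taskId = os.path.basename(chunkFile)  # unique per map task (assumed)
    for i, rduDir in enumerate(rduDirs):
        with open(os.path.join(rduDir, "reduce-%s-%d" % (taskId, i)), "w") as out:
            for key, value in sorted(parts[i].items()):
                out.write("%s\t%d\n" % (key, value))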