def go(self): self.logger.info("Starting storagemapper run") super(storagemapper, self).go() # We read the storage node name out of the path # and append the local filename (ie, on the storage node) to the map # ---------------------------------------------------------------------- data = defaultdict(list) for filename in self.inputs['args']: host = filename.split(os.path.sep)[3] data[host].append(filename.split(host)[-1]) # Dump the generated mapping to a parset # ---------------------------------------------------------------------- parset = Parset() for host, filenames in data.iteritems(): parset.addStringVector(host, filenames) create_directory(os.path.dirname(self.inputs['mapfile'])) parset.writeFile(self.inputs['mapfile']) self.outputs['mapfile'] = self.inputs['mapfile'] return 0
def go(self): self.logger.info("Starting datamapper run") super(datamapper, self).go() # We build lists of compute-nodes per cluster and data-per-cluster, # then match them up to schedule jobs in a round-robin fashion. # ---------------------------------------------------------------------- clusterdesc = ClusterDesc(self.config.get('cluster', "clusterdesc")) if clusterdesc.subclusters: available_nodes = dict((cl.name, cycle(get_compute_nodes(cl))) for cl in clusterdesc.subclusters) else: available_nodes = { clusterdesc.name: cycle(get_compute_nodes(clusterdesc)) } data = defaultdict(list) for filename in self.inputs['args']: subcluster = filename.split(os.path.sep)[2] try: host = next(available_nodes[subcluster]) except KeyError as key: self.logger.error("%s is not a known cluster" % str(key)) raise data[host].append(filename) # Dump the generated mapping to a parset # ---------------------------------------------------------------------- parset = Parset() for host, filenames in data.items(): parset.addStringVector(host, filenames) parset.writeFile(self.inputs['mapfile']) self.outputs['mapfile'] = self.inputs['mapfile'] return 0