def __init__(self, cmd, cmd_def=None, runner=None, runner_conf=None,
             stdout=None, stderr=None, stdin=None, splits=None):
    '''It splits the cmd into subjobs and launches all of them.

    The interface mimics the subprocess.Popen one: stdin, stdout and
    stderr are accepted, but all of them should be files, PIPE will not
    work.
    The cmd_def list tells this Popen how to locate the input and output
    files in the cmd and how to split and join them. Look for the
    cmd_format in the streams.py file.

    keyword arguments:
    cmd -- a list with the cmd to parallelize
    cmd_def -- the cmd definition list (default []). A definition found
               inside the cmd itself takes precedence over this one
    runner -- which runner to use (default StdPopen)
    runner_conf -- extra parameters for the runner, handed over to
                   _launch_jobs as they are (default None)
    stdout -- a fhand to store the stdout (default None)
    stderr -- a fhand to store the stderr (default None)
    stdin -- a fhand with the stdin (default None)
    splits -- number of subjobs to generate (default
              self.default_splits(runner))

    It raises a ValueError if a stdin is given and no cmd_def is
    available.
    '''
    #we want the same interface as subprocess.popen
    #pylint: disable-msg=R0913
    self._retcode = None
    self._outputs_collected = False

    #without an explicit runner we fall back to StdPopen
    runner = StdPopen if runner is None else runner

    #the cmd_def embedded in the cmd wins, then the given one, and as a
    #last resort an empty definition
    cmd, embedded_cmd_def = get_cmd_def_from_cmd(cmd)
    cmd_def = embedded_cmd_def or cmd_def or []

    if stdin is not None and not cmd_def:
        raise ValueError('No cmd_def given but stdin present')

    #the number of splits is calculated only when it is not given
    if splits is None:
        splits = self.default_splits(runner)

    #the temporary split files will live inside this work dir
    work_dir = NamedTemporaryDir()
    self._work_dir = work_dir
    copy_file_mode('.', work_dir.name)

    #the main job
    self._job = {'cmd': cmd, 'work_dir': work_dir}

    #the subjobs are created and, afterwards, launched
    self._jobs = self._split_jobs(cmd, cmd_def, splits, work_dir,
                                  stdout=stdout, stderr=stderr,
                                  stdin=stdin)
    self._launch_jobs(self._jobs, runner=runner, runner_conf=runner_conf)
def _split_streams(streams, splits, work_dir):
    '''Given a list of streams it splits every stream in several splits.

    arguments:
    streams -- a list of dict-like streams. The keys read here are 'io'
               ('in' or 'out'), 'fhand' or 'fname' and, optionally,
               'special' and 'splitter'
    splits -- the requested number of splits
    work_dir -- passed as dir to NamedTemporaryDir, one temporary
                directory is created in it for every split

    It returns a tuple (new_streamss, work_dirs): new_streamss has one
    list of streams for every split, each stream being a copy of the
    original one with its 'fhand' or 'fname' replaced by the split file;
    work_dirs are the temporary directories really used.
    The final number of splits can be lower than the requested one because
    the first input stream processed decides how many splits are made.

    It raises a ValueError if an input stream that should be split has no
    splitter and a RuntimeError if the input streams end up divided in a
    different number of splits.
    '''

    def stream_file(stream):
        'It returns the fhand or fname of the stream, or None'
        #fhand takes precedence over fname
        if 'fhand' in stream:
            return stream['fhand']
        if 'fname' in stream:
            return stream['fname']
        return None

    def is_no_split(stream):
        'True if the stream is flagged as no_split'
        return 'special' in stream and 'no_split' in stream['special']

    def do_we_have_to_split(stream_index):
        'If the stream has to split a file it will return True'
        stream = streams[stream_index]
        #a no_split stream is never split, even if it has a file
        if is_no_split(stream):
            return False
        #maybe there is no file to split
        return stream_file(stream) is not None

    #which are the input and output streams?
    input_stream_indexes = []
    output_stream_indexes = []
    for index, stream in enumerate(streams):
        if stream['io'] == 'in':
            input_stream_indexes.append(index)
        elif stream['io'] == 'out':
            output_stream_indexes.append(index)

    #we create one work dir for every split
    work_dirs = []
    for _ in range(splits):
        dir_ = NamedTemporaryDir(dir=work_dir)
        work_dirs.append(dir_)
        copy_file_mode('.', dir_.name)

    #we have to do first the input files because the number of splits
    #could be changed by them.
    #The streams with a file to be split should go first: the first stream
    #processed is the only one allowed to change the number of splits, and
    #the streams that are not split just adapt to the remaining work_dirs.
    #False sorts before True, so the key is negated. The sort is stable.
    input_stream_indexes = sorted(
        input_stream_indexes,
        key=lambda stream_index: not do_we_have_to_split(stream_index))

    first = True
    split_files = {}
    for index in input_stream_indexes:
        stream = streams[index]
        #splitter
        if is_no_split(stream):
            splitter = create_non_splitter_splitter(copy_files=True)
        elif 'splitter' not in stream:
            msg = 'An splitter should be provided for every input stream'
            msg += ' missing for: ' + str(stream)
            raise ValueError(msg)
        else:
            splitter = stream['splitter']
        #if the splitter is a function we assume that it will know how to
        #split the given file, otherwise should be a registered type of
        #splitter or a regular expression
        if '__call__' not in dir(splitter):
            splitter = get_splitter(splitter)

        #we split the input files in the splits, every file will be in one
        #of the given work_dirs
        file_ = stream_file(stream)
        if file_ is None:
            #the stream might have no file associated
            files = [None] * len(work_dirs)
        else:
            files = splitter(file_, work_dirs)

        #the files len can be different than splits, in that case we
        #modify the splits or we raise an error
        if len(files) != splits:
            if first:
                splits = len(files)
                #we discard the empty temporary dirs
                work_dirs = work_dirs[0:splits]
            else:
                msg = 'Not all input files were divided in the same number'
                msg += ' of splits'
                raise RuntimeError(msg)
        first = False
        split_files[index] = files   #a list of files for every in stream

    #for the output we just create the new names, but we don't split any
    #file
    output_splitter = create_non_splitter_splitter(copy_files=False)
    for index in output_stream_indexes:
        stream = streams[index]
        if 'fhand' in stream:
            fname = stream['fhand']
        else:
            fname = stream['fname']
        #a list of files for every out stream
        split_files[index] = output_splitter(fname, work_dirs)

    #we need one new list of streams for every split
    new_streamss = []
    for split_index in range(splits):
        #the streams for one job
        new_streams = []
        for stream_index, stream in enumerate(streams):
            #we duplicate the original stream
            new_stream = stream.copy()
            #we set the new files
            new_file = split_files[stream_index][split_index]
            if 'fhand' in stream:
                new_stream['fhand'] = new_file
            else:
                new_stream['fname'] = new_file
            new_streams.append(new_stream)
        new_streamss.append(new_streams)

    return new_streamss, work_dirs
def splitter(file_, work_dirs):
    '''It splits the given file into several splits.

    Every split will be located in one of the work_dirs, although it is
    not guaranteed to create as many splits as work dirs. If in the file
    there are less items than work_dirs some work_dirs will be left empty.

    arguments:
    file_ -- the file to split, it can be an fhand or an fname
    work_dirs -- a list of directories (objects with a name attribute),
                 at most one split file is created in each of them

    It returns a list with the splits: fpaths if file_ was an fname or
    (already closed) fhands if file_ was an fhand.

    The header_extractor, footer_extractor, preprocesor, postprocesor,
    item_counter, item_splitter and expression used here are not
    arguments, they are taken from the enclosing scope.
    '''
    #the file_ can be an fname or an fhand. which one is it?
    file_is_str = isinstance(file_, str)
    if file_is_str:
        fname = file_
    else:
        fname = file_.name

    # do we have header?
    header_fhand = None
    if header_extractor is not None:
        header_fhand = NamedTemporaryFile()
        with open(fname) as fhand:
            header_extractor(fhand, header_fhand)

    # do we have footer?
    footer_fhand = None
    if footer_extractor is not None:
        footer_fhand = NamedTemporaryFile()
        with open(fname) as fhand:
            #the footer goes into its own file, not into the header one
            footer_extractor(fhand, footer_fhand)

    # File preprocess
    preprocessed_fhand = None
    if preprocesor is not None:
        suffix = os.path.splitext(fname)[-1]
        preprocessed_fhand = NamedTemporaryFile(suffix=suffix)
        with open(fname) as fhand:
            preprocesor(fhand, preprocessed_fhand)
        #the preprocessed file is read again by name, so its content has
        #to be on disk
        preprocessed_fhand.flush()
        fname = preprocessed_fhand.name

    #how many splits do we want?
    nsplits = len(work_dirs)
    #how many items are in the file? We assume that all files have the
    #same number of items
    with open(fname, 'r') as fhand:
        nitems = item_counter(fhand, expression)

    #how many splits are we going to create? and how many items will be in
    #every split?
    #if there are less items than splits we create just one split for
    #every item
    if nsplits > nitems:
        nsplits = nitems
    (nsplits1, nitems1), (nsplits2, nitems2) = _calculate_divisions(nitems,
                                                                    nsplits)

    #we have to create nsplits1 files with nitems1 in it and nsplits2
    #files with nitems2 items in it
    new_files = []
    suffix = os.path.splitext(fname)[-1]
    fhand = open(fname, 'r')
    try:
        items = item_splitter(fhand, expression)
        splits_made = 0
        for files_to_do, items_per_file in ((nsplits1, nitems1),
                                            (nsplits2, nitems2)):
            #we have to create files_to_do files with items_per_file items
            for _ in range(files_to_do):
                work_dir = work_dirs[splits_made]
                ofh = NamedTemporaryFile(dir=work_dir.name, delete=False,
                                         suffix=suffix)
                copy_file_mode(fhand.name, ofh.name)
                # header
                if header_fhand is not None:
                    header_fhand.seek(0)
                    ofh.write(header_fhand.read())
                for _ in range(items_per_file):
                    ofh.write(next(items))
                # footer
                if footer_fhand is not None:
                    footer_fhand.seek(0)
                    ofh.write(footer_fhand.read())
                #everything, footer included, should be on disk before
                #anybody else reads the split
                ofh.flush()
                #postprocess
                if postprocesor is not None:
                    newofh = NamedTemporaryFile(dir=work_dir.name,
                                                delete=False,
                                                suffix=suffix)
                    postprocesor(ofh, newofh)
                    #the split that has not been postprocessed is not
                    #needed anymore
                    ofh_path = ofh.name
                    ofh.close()
                    os.remove(ofh_path)
                    ofh = newofh
                if file_is_str:
                    new_files.append(ofh.name)
                else:
                    new_files.append(ofh)
                #we have to close the files otherwise we can run out of
                #files in the os filesystem
                ofh.close()
                splits_made += 1
    finally:
        fhand.close()
        #the auxiliary temporary files are removed when closed
        for tmp_fhand in (header_fhand, footer_fhand, preprocessed_fhand):
            if tmp_fhand is not None:
                tmp_fhand.close()
    return new_files