class DualSortedReader:
    """Given two sorted files of tags in a format supported by Pyicoteo, iterates through them returning them in order"""
    def __init__(self, file_a_path, file_b_path, format, read_half_open=False, logger=None):
        self.logger = logger
        self.file_a = open(file_a_path)
        self.file_b = open(file_b_path)
        # One reusable Cluster per file: holds the parsed form of the most
        # recently read line so the two pending lines can be compared.
        self.current_a = Cluster(cached=False, read=format, read_half_open=read_half_open, logger=self.logger)
        self.current_b = Cluster(cached=False, read=format, read_half_open=read_half_open, logger=self.logger)

    def close(self):
        """Close both underlying files (the original version leaked them)."""
        self.file_a.close()
        self.file_b.close()

    def __iter__(self):
        """Yield the raw lines of both files merged in sorted order.

        Fixes over the original implementation:
        - Uses ``next(fh, None)`` instead of letting StopIteration escape
          the generator; a leaked StopIteration is a RuntimeError under
          PEP 479 (Python 3.7+), which the old tail-drain loops relied on.
        - No NameError when one of the files is empty from the start (the
          old code yielded a never-assigned ``line_b``/``line_a``).
        """
        safe_reader = SafeReader(self.logger)
        line_a = line_b = None
        while True:
            # Refill the A side only when its pending cluster was consumed.
            if not self.current_a:
                line_a = next(self.file_a, None)
                if line_a is None:
                    # File A exhausted: flush the pending B line (if any was
                    # actually parsed), then the remainder of file B.
                    if self.current_b:
                        yield line_b
                    for remaining in self.file_b:
                        yield remaining
                    return
                safe_reader.safe_read_line(self.current_a, line_a)
            if not self.current_b:
                line_b = next(self.file_b, None)
                if line_b is None:
                    # File B exhausted: flush pending A line, then the rest of A.
                    if self.current_a:
                        yield line_a
                    for remaining in self.file_a:
                        yield remaining
                    return
                safe_reader.safe_read_line(self.current_b, line_b)
            # Emit the smaller of the two pending clusters and clear it so the
            # next iteration reads a replacement line from that file.
            if self.current_a < self.current_b:
                self.current_a.clear()
                yield line_a
            else:
                self.current_b.clear()
                yield line_b
class BigSort:
    """
    This class can sort huge files without loading them fully into memory.
    Based on a recipe by Tomasz Bieruta.
    """
    def __init__(self, file_format, read_half_open=False, frag_size=0, id=0, logger=True,
                 filter_chunks=True, push_distance=0, buffer_size=320000, temp_file_size=8000000):
        self.logger = logger
        self.file_format = file_format
        self.frag_size = frag_size
        self.push_distance = push_distance
        self.buffer_size = buffer_size        # number of lines per in-memory chunk
        self.temp_file_size = temp_file_size  # buffering size passed to open() for temp files
        self.filter_chunks = filter_chunks
        try:
            # self.cluster only exists when a file_format was given; filter_chunk
            # must not be used otherwise.
            if self.file_format:
                self.cluster = Cluster(read=self.file_format, write=self.file_format,
                                       read_half_open=read_half_open, write_half_open=read_half_open,
                                       logger=self.logger)
        except ConversionNotSupported:
            self.logger.error('')
            self.logger.error('Reading "%s" is not supported (unknown format).\n'%self.file_format)
            list_available_formats()
        self.id = id

    def skipHeaderLines(self, key, experiment_file):
        """Advance *experiment_file* past header lines.

        A line is considered valid when *key* can be applied to it without
        raising (probed via a single-element list sort).  On success the file
        is seeked back so the valid line is re-read by the sorter; on failure
        the offending line stays consumed.  Capped at 400 header lines.
        """
        validLine = False
        count = 0
        while not validLine and count < 400:  # file formats with more than 400 lines of header should die anyway
            try:
                currentPos = experiment_file.tell()
                line = [experiment_file.readline()]
                line.sort(key=key)  # probe: does the sort key accept this line?
                experiment_file.seek(currentPos)
                validLine = True
            except Exception:  # narrowed from a bare except: invalid header line, keep skipping
                count += 1

    def remove_chunks(self, chunks):
        """Best-effort removal of the temporary chunk files."""
        for chunk in chunks:
            try:
                os.remove(chunk)
            except OSError:  # already removed (or never created); ignore
                pass

    def filter_chunk(self, chunk):
        """Parse, optionally extend/push, and re-serialize every line of *chunk*.

        Requires ``self.cluster`` (i.e. a truthy file_format at construction).
        Lines failing the quality filter or failing to parse are dropped.
        """
        filtered_chunk = []
        for line in chunk:
            if self.cluster.reader.quality_filter(line):
                self.cluster.clear()
                try:
                    self.cluster.read_line(line)
                    if self.frag_size:
                        self.cluster.extend(self.frag_size)
                    if self.push_distance:
                        self.cluster.push(self.push_distance)
                except InvalidLine:
                    if self.logger:
                        self.logger.debug('Discarding middle invalid line: %s'%line)
                if not self.cluster.is_empty():
                    filtered_chunk.append(self.cluster.write_line())
        return filtered_chunk

    def sort(self, input, output=None, key=None, tempdirs=None):
        """External merge sort of the file at path *input*.

        Returns an open handle on the sorted result (at *output*, or a
        temporary path when *output* is None).

        Fixes over the original: *tempdirs* no longer uses a mutable default
        (the old ``tempdirs=[]`` accumulated gettempdir() across calls);
        *chunks* is bound before the try so a KeyboardInterrupt cannot raise
        NameError in the handler; files are opened in text mode throughout so
        chunk writing and merging agree (the old 'rb'/'w+b' vs text-mode merge
        mix breaks on Python 3); chunk handles are closed.
        """
        if key is None:
            # unless explicitly specified, sort with the default lambda
            key = sorting_lambda(self.file_format)
        if not tempdirs:
            tempdirs = [gettempdir()]
        input_file = open(input, 'r', self.temp_file_size)
        self.skipHeaderLines(key, input_file)
        chunks = []  # paths of the sorted temporary chunk files
        try:
            input_iterator = iter(input_file)
            for tempdir in cycle(tempdirs):
                current_chunk = list(islice(input_iterator, self.buffer_size))
                if self.filter_chunks:
                    current_chunk = self.filter_chunk(current_chunk)
                if not current_chunk:  # input exhausted
                    break
                if self.logger:
                    self.logger.debug("Chunk: len current_chunk: %s chunks: %s temp_file_size %s buffer_size %s"%(len(current_chunk), len(chunks), self.temp_file_size, self.buffer_size))
                current_chunk.sort(key=key)
                output_chunk = open(os.path.join(tempdir, '%06i_%s_%s'%(len(chunks), os.getpid(), self.id)), 'w+', self.temp_file_size)
                output_chunk.writelines(current_chunk)
                output_chunk.flush()
                output_chunk.close()  # merge() reopens by name; the old code leaked this handle
                chunks.append(output_chunk.name)
        except KeyboardInterrupt:
            # If there is an interruption, delete all temporary files and raise the exception for further processing.
            print('Removing temporary files...')
            self.remove_chunks(chunks)
            raise
        finally:
            input_file.close()
        if output is None:
            output = "%s/tempsort%s_%s"%(tempdirs[0], os.getpid(), self.id)
        output_file = open(output, 'w', self.temp_file_size)
        try:
            output_file.writelines(self.merge(chunks, key))
        finally:
            self.remove_chunks(chunks)
            output_file.close()
        return open(output)

    def merge(self, chunks, key=None):
        """k-way merge of the sorted chunk files, yielding lines in order.

        NOTE(review): like the original, this calls remove_chunks as soon as
        ANY chunk is exhausted (or empty); on POSIX the still-open handles
        keep the remaining data readable.  Preserved as-is — confirm this is
        intentional before changing it.
        """
        if self.logger:
            self.logger.info("... Merging chunks...")
        if key is None:
            key = lambda x: x
        values = []
        # Prime the heap with the first line of every chunk.  The index keeps
        # heap tuples comparable when two keys are equal.
        for index, chunk in enumerate(chunks):
            try:
                chunk_file = open(chunk)
                iterator = iter(chunk_file)
                value = next(iterator)
            except StopIteration:
                chunk_file.close()  # empty chunk: nothing to merge from it
                self.remove_chunks(chunks)
            else:
                heappush(values, (key(value), index, value, iterator, chunk_file))
        while values:
            k, index, value, iterator, chunk = heappop(values)
            yield value
            try:
                value = next(iterator)
            except StopIteration:
                chunk.close()  # this chunk is drained; the old code leaked the handle
                self.remove_chunks(chunks)
            else:
                heappush(values, (key(value), index, value, iterator, chunk))