Example #1

# Cluster and SafeReader are Pyicoteo classes, assumed to be imported from the package's own modules.
class DualSortedReader:
    """Given two sorted files of tags in a format supported by Pyicoteo, iterates through them returning them in order"""
    def __init__(self, file_a_path, file_b_path, format, read_half_open=False, logger=None):
        self.logger = logger
        self.file_a = open(file_a_path)
        self.file_b = open(file_b_path)
        self.current_a = Cluster(cached=False, read=format, read_half_open=read_half_open, logger=self.logger)
        self.current_b = Cluster(cached=False, read=format, read_half_open=read_half_open, logger=self.logger)
        
    def __iter__(self):
        stop_a = True # True if StopIteration was raised while reading file a, False if it was file b
        safe_reader = SafeReader(self.logger)
        try:
            while True:
                if not self.current_a:
                    stop_a = True
                    line_a = self.file_a.next()
                    safe_reader.safe_read_line(self.current_a, line_a)
                
                if not self.current_b:
                    stop_a = False
                    line_b = self.file_b.next()
                    safe_reader.safe_read_line(self.current_b, line_b)
                
                if self.current_a < self.current_b:
                    self.current_a.clear()
                    yield line_a
                else:
                    self.current_b.clear()
                    yield line_b
        except StopIteration: # one file is exhausted; we still need to yield the remainder of the other one
            if stop_a:
                if self.current_b: # flush the pending line from file b that was read but not yet yielded
                    yield line_b
                for line_b in self.file_b:
                    yield line_b
            else:
                if self.current_a: # flush the pending line from file a that was read but not yet yielded
                    yield line_a
                for line_a in self.file_a:
                    yield line_a
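
A minimal usage sketch for DualSortedReader (the file names and the 'bed' format value are hypothetical; any sorted tag format supported by Pyicoteo would do):

reader = DualSortedReader('sample_a.bed', 'sample_b.bed', format='bed')  # hypothetical inputs
for line in reader:
    print line,  # lines arrive in merged sorted order, trailing newline included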
Example #2

import os
from heapq import heappush, heappop
from itertools import cycle, islice
from tempfile import gettempdir
# Cluster, InvalidLine, ConversionNotSupported, sorting_lambda and list_available_formats
# are Pyicoteo names, assumed to be imported from the package's own modules.
class BigSort:
    """
    This class can sort huge files without loading them fully into memory.
    Based on a recipe by Tomasz Bieruta.

    """
    def __init__(self, file_format, read_half_open=False, frag_size=0, id=0, logger=None, filter_chunks=True, push_distance=0, buffer_size=320000, temp_file_size=8000000):
        self.logger = logger
        self.file_format = file_format
        self.frag_size = frag_size
        self.push_distance = push_distance
        self.buffer_size = buffer_size
        self.temp_file_size = temp_file_size
        self.filter_chunks = filter_chunks
        try:
            if self.file_format:
                self.cluster = Cluster(read=self.file_format, write=self.file_format, read_half_open=read_half_open, write_half_open=read_half_open, logger=self.logger)
        except ConversionNotSupported:
            self.logger.error('Reading "%s" is not supported (unknown format).\n' % self.file_format)
            list_available_formats()

        self.id = id
        
    def skipHeaderLines(self, key, experiment_file):
        """Advance the file past its header: a line counts as a header line if the sorting key cannot be computed from it."""
        validLine = False
        count = 0
        while not validLine and count < 400: # file formats with more than 400 lines of header should die anyway
            try:
                currentPos = experiment_file.tell()
                line = [experiment_file.readline()]
                line.sort(key=key) # probe: key() raises on header lines, succeeds on data lines
                experiment_file.seek(currentPos) # rewind so the first data line is read again by the caller
                validLine = True
            except Exception:
                count += 1 # header line: leave it behind and probe the next one

    def remove_chunks(self, chunks):
        for chunk in chunks:
            try:
                os.remove(chunk)
            except OSError: # the chunk file may already be gone
                pass
    
    def filter_chunk(self, chunk):
        filtered_chunk = []
        for line in chunk:
            if self.cluster.reader.quality_filter(line):    
                self.cluster.clear()
                try:           
                    self.cluster.read_line(line)
                    if self.frag_size:
                        self.cluster.extend(self.frag_size)

                    if self.push_distance:
                        self.cluster.push(self.push_distance)

                except InvalidLine:
                    if self.logger: self.logger.debug('Discarding middle invalid line: %s'%line)
                                   
                if not self.cluster.is_empty():
                    filtered_chunk.append(self.cluster.write_line())

        return filtered_chunk

    def sort(self, input, output=None, key=None, tempdirs=None):
        if key is None: # unless explicitly specified, sort with the default lambda for the format
            key = sorting_lambda(self.file_format)

        if not tempdirs: # None instead of a mutable default argument; fall back to the system temp dir
            tempdirs = [gettempdir()]

        input_file = open(input,'rb',self.temp_file_size)
        self.skipHeaderLines(key, input_file)
        try:
            input_iterator = iter(input_file)
            chunks = []
            for tempdir in cycle(tempdirs):
                current_chunk = list(islice(input_iterator, self.buffer_size))
                if self.filter_chunks:
                    current_chunk = self.filter_chunk(current_chunk) 
                if current_chunk:
                    if self.logger: self.logger.debug("Chunk: len current_chunk: %s chunks: %s temp_file_size %s buffer_size %s"%(len(current_chunk), len(chunks), self.temp_file_size, self.buffer_size))
                    current_chunk.sort(key=key)
                    output_chunk = open(os.path.join(tempdir,'%06i_%s_%s'%(len(chunks), os.getpid(), self.id)),'w+b',self.temp_file_size)
                    output_chunk.writelines(current_chunk)
                    output_chunk.flush()
                    output_chunk.seek(0)
                    chunks.append(output_chunk.name)
                else:
                    break

        except KeyboardInterrupt: # If there is an interruption, delete all temporary files and raise the exception for further processing.
            print 'Removing temporary files...'
            self.remove_chunks(chunks)
            raise

        finally:
            input_file.close()
        
        if output is None:       
            output = "%s/tempsort%s_%s"%(tempdirs[0], os.getpid(), self.id)
        
        output_file = open(output,'wb',self.temp_file_size)
        
        try:
            output_file.writelines(self.merge(chunks,key))
        finally:
            self.remove_chunks(chunks)

        output_file.close()
        return open(output)

    def merge(self, chunks, key=None):
        if self.logger: self.logger.info("... Merging chunks...")
        if key is None:
            key = lambda x : x

        values = []
        for index, chunk in enumerate(chunks):
            try:
                chunk_file = open(chunk)
                iterator = iter(chunk_file)
                value = iterator.next()
            except StopIteration:
                self.remove_chunks(chunks)
                # a chunks.remove(chunk) used to live here; maybe there is some magic I missed, but I don't think it did anything useful
            else:
                heappush(values, (key(value), index, value, iterator, chunk_file))

        while values:
            k, index, value, iterator, chunk = heappop(values)
            yield value
            try:
                value = iterator.next()
            except StopIteration:
                self.remove_chunks(chunks)
                # there was more remove-chunks magic here as well
            else:
                heappush(values, (key(value), index, value, iterator, chunk))
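
A minimal usage sketch for BigSort (the 'bed' format, the file names and the process() consumer are hypothetical; sort() returns an open handle on the sorted result):

import logging
sorter = BigSort('bed', logger=logging.getLogger('bigsort'))  # the logger is optional but silences nothing if given
sorted_file = sorter.sort('reads.bed', output='reads_sorted.bed')
for line in sorted_file:
    process(line)  # hypothetical downstream consumer
sorted_file.close()

Note that merge() is a plain k-way merge: each open chunk contributes its next line to a heap of (key, index, value, iterator, file) tuples, and repeatedly popping the smallest tuple yields the globally sorted stream, the same role the standard library's heapq.merge plays.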