def readConf(self):
    '''Parse the configobj file ``self.filename`` and return its clusters.

    The top-level ``clusters`` entry lists section names; each of those
    sections provides a ``name`` and a ``servers`` list; each server section
    provides ``ip``, ``port``, ``secure``, ``modealt`` and a ``vhosts`` list;
    each vhost section provides ``name`` and ``burl``.

    Returns a list of Cluster objects whose ``servers`` have been filled in.
    Raises SyntaxError when ``clusters``, ``servers`` or ``vhosts`` is not a
    list.
    '''
    config = ConfigObj(self.filename)
    parsed = []

    if not isinstance(self._getConfigValue(config, 'clusters'), list):
        raise SyntaxError('Configuration error [%s] - clusters is not a list. Add a coma to create one' % self.filename)

    for cluster_key in self._getConfigValue(config, 'clusters'):
        cluster = Cluster()
        cluster.name = self._getConfigValue(config, cluster_key, 'name')

        server_keys = self._getConfigValue(config, cluster_key, 'servers')
        if not isinstance(server_keys, list):
            raise SyntaxError('Configuration error [%s] - servers is not a list. Add a coma to create one' % self.filename)

        for server_key in server_keys:
            srv = Server()
            # Plain attributes copied verbatim from the server section.
            for field in ('ip', 'port', 'secure', 'modealt'):
                setattr(srv, field, self._getConfigValue(config, server_key, field))

            vhost_keys = self._getConfigValue(config, server_key, 'vhosts')
            if not isinstance(vhost_keys, list):
                raise SyntaxError('Configuration error [%s] - [%s].vhosts is not a list. Add a coma to create one' % (self.filename, server_key))

            # A server without any vhost gets a default (empty) one.
            if not vhost_keys:
                srv.add_vhost('')
            for vhost_key in self._getConfigValue(config, server_key, 'vhosts'):
                vhost_name = self._getConfigValue(config, vhost_key, 'name')
                vhost_burl = self._getConfigValue(config, vhost_key, 'burl')
                srv.add_vhost(vhost_name, vhost_burl)

            cluster.servers.append(srv)

        parsed.append(cluster)

    return parsed
def readConf(self):
    '''Read a configuration file (configobj format) and return a list of Clusters.

    The ``clusters`` entry, every cluster's ``servers`` entry and every
    server's ``vhosts`` entry must be lists of section names.

    Returns a list of Cluster objects with their ``servers`` populated.
    Raises SyntaxError when one of those entries is not a list.
    '''
    result = []
    config = ConfigObj(self.filename)

    clusters = self._getConfigValue(config, 'clusters')
    if not isinstance(clusters, list):
        raise SyntaxError(
            'Configuration error [%s] - clusters is not a list. Add a coma to create one' % self.filename)

    for c in clusters:
        cluster = Cluster()
        cluster.name = self._getConfigValue(config, c, 'name')

        # A non-list value (e.g. a bare string) would otherwise be iterated
        # item by item and produce confusing lookups, so reject it up front.
        servers = self._getConfigValue(config, c, 'servers')
        if not isinstance(servers, list):
            raise SyntaxError(
                'Configuration error [%s] - servers is not a list. Add a coma to create one' % self.filename)

        for s in servers:
            srv = Server()
            srv.ip = self._getConfigValue(config, s, 'ip')
            srv.port = self._getConfigValue(config, s, 'port')
            srv.secure = self._getConfigValue(config, s, 'secure')
            srv.modealt = self._getConfigValue(config, s, 'modealt')

            vhosts = self._getConfigValue(config, s, 'vhosts')
            if not isinstance(vhosts, list):
                raise SyntaxError(
                    'Configuration error [%s] - [%s].vhosts is not a list. Add a coma to create one' % (self.filename, s))

            # If no vhost is defined, switch to a default one.
            if not vhosts:
                srv.add_vhost('')
            for vh in vhosts:
                vhost_name = self._getConfigValue(config, vh, 'name')
                vhost_burl = self._getConfigValue(config, vh, 'burl')
                srv.add_vhost(vhost_name, vhost_burl)

            cluster.servers.append(srv)

        # Appending cluster object to returned result
        result.append(cluster)

    return result
class DualSortedReader:
    """Given two sorted files of tags in a format supported by Pyicoteo,
    iterates through them returning their lines in order."""

    def __init__(self, file_a_path, file_b_path, format, read_half_open=False, logger=None):
        """Open both files and prepare one uncached Cluster per file.

        file_a_path, file_b_path -- paths of the two sorted files
        format                   -- passed to Cluster as its ``read`` format
        read_half_open           -- forwarded to both Clusters
        logger                   -- forwarded to the Clusters and the SafeReader
        """
        self.logger = logger
        self.file_a = open(file_a_path)
        self.file_b = open(file_b_path)
        self.current_a = Cluster(cached=False, read=format, read_half_open=read_half_open, logger=self.logger)
        self.current_b = Cluster(cached=False, read=format, read_half_open=read_half_open, logger=self.logger)

    def __iter__(self):
        """Yield the raw lines of both files, merged by comparing the parsed
        clusters (``current_a < current_b``).

        When one file runs out, the line still pending from the other file
        (if any) is yielded, followed by the rest of that file.
        """
        stop_a = True  # indicates if StopIteration was raised by file a (True) or file b (False)
        safe_reader = SafeReader(self.logger)
        # Initialised so that an empty file does not leave these names unbound
        # when the end-of-file handler below runs.
        line_a = line_b = None
        try:
            while 1:
                if not self.current_a:
                    stop_a = True
                    line_a = next(self.file_a)
                    safe_reader.safe_read_line(self.current_a, line_a)
                if not self.current_b:
                    stop_a = False
                    line_b = next(self.file_b)
                    safe_reader.safe_read_line(self.current_b, line_b)
                if self.current_a < self.current_b:
                    self.current_a.clear()
                    yield line_a
                else:
                    self.current_b.clear()
                    yield line_b
        except StopIteration:
            # One file is exhausted: we still need to emit the remainder of
            # the other one, starting with the line already read from it.
            if stop_a:
                pending, remaining = line_b, self.file_b
            else:
                pending, remaining = line_a, self.file_a
            if pending is not None:
                yield pending
            # A plain loop ends cleanly at EOF instead of letting
            # StopIteration escape from the generator.
            for line in remaining:
                yield line
def readConf(self):
    '''Read a configuration file (ConfigParser/INI format) and return a list of Clusters.

    Section ``main`` holds a comma-separated ``clusters`` option. Each cluster
    section holds ``name`` and a comma-separated ``servers`` option. Each
    server section holds ``ip``, ``port``, optional booleans ``secure`` and
    ``modealt`` (both default to false) and an optional comma-separated
    ``vhosts`` option. Each vhost section holds ``name`` and ``burl``.

    A server whose ``vhosts`` option is missing or blank gets a single
    default vhost (``add_vhost('')``).
    '''
    config = CP.ConfigParser({'secure': 'false', 'modealt': 'false', })
    config.read(self.filename)
    result = []
    clusters = config.get('main', 'clusters').split(',')
    for c in clusters:
        cluster = Cluster()
        cluster.name = config.get(c, 'name')
        for s in config.get(c, 'servers').split(','):
            srv = Server()
            srv.ip = config.get(s, 'ip')
            srv.port = config.get(s, 'port')
            srv.secure = config.getboolean(s, 'secure')
            srv.modealt = config.getboolean(s, 'modealt')

            try:
                raw_vhosts = config.get(s, 'vhosts')
            except CP.NoOptionError:
                raw_vhosts = ''
            # str.split never returns an empty list ("".split(",") == [""]),
            # so blank entries are dropped explicitly; otherwise an empty
            # option would trigger a lookup of the section named ''.
            vhosts = [vh for vh in raw_vhosts.split(',') if vh.strip()]

            if not vhosts:
                # No vhost defined: switch to a default one.
                srv.add_vhost('')
            for vh in vhosts:
                vhost_name = config.get(vh, 'name')
                vhost_burl = config.get(vh, 'burl')
                srv.add_vhost(vhost_name, vhost_burl)

            cluster.servers.append(srv)

        # Appending cluster object to returned result
        result.append(cluster)

    return result
def _read_next_tag(self): """Loads the cache if the line read by the cursor is not there yet. If the line is empty, it means that the end of file was reached, so this function sends a signal for the parent function to halt. If the region is stranded, the only tags returned will be the ones of that strand""" try: line = self.file_iterator.readline() except StopIteration: return True if line == '': return True self.current_tag = Cluster(read=self.experiment_format, read_half_open=self.read_half_open, rounding=self.rounding, cached=False, logger=self.logger) self.safe_read_line(self.current_tag, line) return False
def __init__(self, file_format, read_half_open=False, frag_size=0, id=0, logger=True, filter_chunks=True, push_distance=0, buffer_size = 320000, temp_file_size = 8000000):
    """Store the sorting options and, when a format is given, build the
    Cluster used to read and write lines in that format.

    If the Cluster constructor raises ConversionNotSupported, the problem is
    reported through ``self.logger.error`` and ``list_available_formats()``
    is called; ``self.cluster`` is then left unset.
    """
    self.logger = logger
    self.file_format = file_format
    self.frag_size = frag_size
    self.push_distance = push_distance
    self.buffer_size = buffer_size
    self.temp_file_size = temp_file_size
    self.filter_chunks = filter_chunks

    if self.file_format:
        try:
            # The same format and half-open setting are used for both
            # reading and writing.
            self.cluster = Cluster(
                read=self.file_format,
                write=self.file_format,
                read_half_open=read_half_open,
                write_half_open=read_half_open,
                logger=self.logger,
            )
        except ConversionNotSupported:
            self.logger.error('')
            self.logger.error('Reading "%s" is not supported (unknown format).\n'%self.file_format)
            list_available_formats()

    self.id = id
def get_overlaping_counts(self, region, overlap=1):
    """Count the tags of the file that intersect ``region``.

    Reading resumes at ``self.slow_seek``. Tags that sort before the region
    (smaller name, or same name and ending before ``region.start``) are
    skipped while ``slow_seek`` is moved forward. Then every tag with the
    region's name and ``start <= region.end`` is examined: it is counted when
    ``tag.overlap(region) >= overlap`` and, if the region has a strand, the
    tag's strand equals it.

    Returns the number of counted tags (0 if the file ends while skipping).
    """
    # Load the last seek position and start from an empty tag.
    self.file_iterator.seek(self.slow_seek)
    self.current_tag = Cluster()

    # Advance the slow seek until the current tag is no longer before the region.
    while True:
        before_region = (self.current_tag.name < region.name) or (
            self.current_tag.name == region.name and region.start > self.current_tag.end)
        if not before_region:
            break
        self.slow_seek = self.file_iterator.tell()
        if self._read_next_tag():
            return 0

    # Count the intersections.
    counts = 0
    while self.current_tag.start <= region.end and self.current_tag.name == region.name:
        if self.current_tag.overlap(region) >= overlap and (
                not region.strand or region.strand == self.current_tag.strand):
            counts += 1
        if self._read_next_tag():
            break
    return counts
def __initvalues(self):
    """Reset the reading state: seek offset 0 and an empty current tag."""
    self.slow_seek = 0
    self.current_tag = Cluster()
class SortedFileCountReader:
    """
    Keeps a cursor on a sorted file and answers "how many reads overlap this
    region?" queries by scanning forward from the cursor.

    Only the *counts* are kept, never the reads themselves, so huge clusters
    of reads do not cause memory problems.
    """

    def __init__(self, file_path, experiment_format, read_half_open=False, rounding=True, cached=True, logger=None):
        """Open ``file_path`` and initialise the cursor.

        Every constructor argument is stored as an attribute of the same name.
        """
        # Must stay the first statement: it copies exactly the arguments.
        self.__dict__.update(locals())
        self.file_iterator = open_file(file_path, format=experiment_format, logger=logger)
        if logger:
            self.logger.debug('Fetcher used for %s: Sequential Sorted Counts Reader'%file_path)
        self.safe_reader = SafeReader(logger=logger)
        self.__initvalues()

    def rewind(self):
        """Go back to the beginning of the file and reset the reading state."""
        self.file_iterator.seek(0)
        self.__initvalues()

    def __initvalues(self):
        """Reset the reading state: seek offset 0 and an empty current tag."""
        self.slow_seek = 0
        self.current_tag = Cluster()

    def _read_next_tag(self):
        """Read the next line of the file into ``self.current_tag``.

        Returns True when there is nothing left to read (``readline`` returned
        an empty string or raised StopIteration), which tells the caller to
        halt; returns False after a line has been loaded.
        """
        try:
            raw_line = self.file_iterator.readline()
        except StopIteration:
            raw_line = ''  # treated exactly like end of file below

        if raw_line == '':
            return True

        self.current_tag = Cluster(
            read=self.experiment_format,
            read_half_open=self.read_half_open,
            rounding=self.rounding,
            cached=False,
            logger=self.logger,
        )
        self.safe_read_line(self.current_tag, raw_line)
        return False

    def get_overlaping_counts(self, region, overlap=1):
        """Count the tags of the file that intersect ``region``.

        Tags sorting before the region are skipped (moving ``slow_seek``
        forward); then each tag with the region's name and
        ``start <= region.end`` is counted when ``tag.overlap(region)`` is at
        least ``overlap`` and, if the region has a strand, the strands match.
        """
        # Load the last seek position and start from an empty tag.
        self.file_iterator.seek(self.slow_seek)
        self.current_tag = Cluster()

        # Advance the slow seek until the current tag is no longer before the region.
        while True:
            before_region = (self.current_tag.name < region.name) or (
                self.current_tag.name == region.name and region.start > self.current_tag.end)
            if not before_region:
                break
            self.slow_seek = self.file_iterator.tell()
            if self._read_next_tag():
                return 0

        # Count the intersections.
        counts = 0
        while self.current_tag.start <= region.end and self.current_tag.name == region.name:
            if self.current_tag.overlap(region) >= overlap and (
                    not region.strand or region.strand == self.current_tag.strand):
                counts += 1
            if self._read_next_tag():
                break
        return counts

    def safe_read_line(self, cluster, line):
        """Delegate parsing of ``line`` into ``cluster`` to the SafeReader."""
        self.safe_reader.safe_read_line(cluster, line)
def __init__(self, file_a_path, file_b_path, format, read_half_open=False, logger=None):
    """Open both input files and create one uncached Cluster per file.

    ``format`` is used as the Clusters' ``read`` format; ``read_half_open``
    and ``logger`` are forwarded to both of them.
    """
    self.logger = logger
    self.file_a = open(file_a_path)
    self.file_b = open(file_b_path)

    # Both clusters are configured identically.
    cluster_options = dict(
        cached=False,
        read=format,
        read_half_open=read_half_open,
        logger=self.logger,
    )
    self.current_a = Cluster(**cluster_options)
    self.current_b = Cluster(**cluster_options)
class BigSort:
    """
    This class can sort huge files without loading them fully into memory.
    Based on a recipe by Tomasz Bieruta.

    The input is cut into chunks of ``buffer_size`` lines, each chunk is
    sorted in memory and written to a temporary file, and the temporary files
    are finally merged with a heap.
    """

    def __init__(self, file_format, read_half_open=False, frag_size=0, id=0, logger=True, filter_chunks=True, push_distance=0, buffer_size = 320000, temp_file_size = 8000000):
        """Store the sorting options.

        file_format    -- format used by the Cluster for reading and writing;
                          when falsy no Cluster is created
        read_half_open -- forwarded to the Cluster for reading and writing
        frag_size      -- if non-zero, passed to ``cluster.extend`` while filtering
        id             -- included in the temporary file names
        logger         -- object used for logging
        filter_chunks  -- when true, chunks go through ``filter_chunk`` before sorting
        push_distance  -- if non-zero, passed to ``cluster.push`` while filtering
        buffer_size    -- number of lines per chunk
        temp_file_size -- buffering argument used when opening files
        """
        # NOTE(review): the default ``logger=True`` has no logging methods, so
        # callers presumably always pass a real logger (or None) - confirm.
        self.logger = logger
        self.file_format = file_format
        self.frag_size = frag_size
        self.push_distance = push_distance
        self.buffer_size = buffer_size
        self.temp_file_size = temp_file_size
        self.filter_chunks = filter_chunks
        try:
            if self.file_format:
                self.cluster = Cluster(read=self.file_format, write=self.file_format, read_half_open=read_half_open, write_half_open=read_half_open, logger=self.logger)
        except ConversionNotSupported:
            self.logger.error('')
            self.logger.error('Reading "%s" is not supported (unknown format).\n'%self.file_format)
            list_available_formats()

        self.id = id

    def skipHeaderLines(self, key, experiment_file):
        """Advance ``experiment_file`` past leading lines that ``key`` cannot
        process, leaving it positioned at the first line it accepts.

        At most 400 lines are tried.
        """
        validLine = False
        count = 0
        while not validLine and count < 400:  # file formats with more than 400 lines of header should die anyway
            try:
                currentPos = experiment_file.tell()
                line = [experiment_file.readline()]
                line.sort(key=key)
                # The key worked: rewind so this line is not lost.
                experiment_file.seek(currentPos)
                validLine = True
            except Exception:
                # The key rejected the line: treat it as a header line.
                count += 1

    def remove_chunks(self, chunks):
        """Delete the given temporary files, ignoring those that cannot be removed."""
        for chunk in chunks:
            try:
                os.remove(chunk)
            except OSError:
                # Best effort: the file may already be gone.
                pass

    def filter_chunk(self, chunk):
        """Return the lines of ``chunk`` that pass the reader's quality filter,
        re-written by the Cluster after the optional extend/push operations.

        Lines raising InvalidLine are discarded.
        """
        filtered_chunk = []
        for line in chunk:
            if self.cluster.reader.quality_filter(line):
                self.cluster.clear()
                try:
                    self.cluster.read_line(line)
                    if self.frag_size:
                        self.cluster.extend(self.frag_size)

                    if self.push_distance:
                        self.cluster.push(self.push_distance)

                except InvalidLine:
                    if self.logger: self.logger.debug('Discarding middle invalid line: %s'%line)

                if not self.cluster.is_empty():
                    filtered_chunk.append(self.cluster.write_line())

        return filtered_chunk

    def sort(self, input, output=None, key=None, tempdirs=None):
        """Sort the file ``input`` and return the sorted file, opened for reading.

        input    -- path of the file to sort
        output   -- path of the result; defaults to a ``tempsort`` file in the
                    first temporary directory
        key      -- sort key; defaults to ``sorting_lambda(self.file_format)``
        tempdirs -- directories used in turn for the chunk files; defaults to
                    the system temporary directory

        The chunk files are removed before returning, also when an error or a
        KeyboardInterrupt interrupts the sort (the exception is re-raised).
        """
        if key is None:  # unless explicitly specified, sort with the default lambda
            key = sorting_lambda(self.file_format)

        # Work on a private list: neither a shared default nor the caller's
        # list is modified.
        tempdirs = list(tempdirs) if tempdirs else [gettempdir()]

        chunks = []  # bound before the try so the handlers can always use it
        input_file = open(input, 'rb', self.temp_file_size)
        try:
            self.skipHeaderLines(key, input_file)
            input_iterator = iter(input_file)
            for tempdir in cycle(tempdirs):
                raw_chunk = list(islice(input_iterator, self.buffer_size))
                if not raw_chunk:
                    break  # end of input
                if self.filter_chunks:
                    current_chunk = self.filter_chunk(raw_chunk)
                else:
                    current_chunk = raw_chunk
                if not current_chunk:
                    # Everything in this chunk was filtered out; the input may
                    # still hold more lines, so keep reading.
                    continue
                if self.logger: self.logger.debug("Chunk: len current_chunk: %s chunks: %s temp_file_size %s buffer_size %s"%(len(current_chunk), len(chunks), self.temp_file_size, self.buffer_size))
                current_chunk.sort(key=key)
                chunk_path = os.path.join(tempdir, '%06i_%s_%s'%(len(chunks), os.getpid(), self.id))
                output_chunk = open(chunk_path, 'w+b', self.temp_file_size)
                # Registered before writing so that it is cleaned up on failure.
                chunks.append(chunk_path)
                try:
                    output_chunk.writelines(current_chunk)
                finally:
                    output_chunk.close()
        except KeyboardInterrupt:
            # If there is an interruption, delete all temporary files and
            # raise the exception for further processing.
            print('Removing temporary files...')
            self.remove_chunks(chunks)
            raise
        except Exception:
            # Do not leave temporary files behind on any other failure.
            self.remove_chunks(chunks)
            raise
        finally:
            input_file.close()

        if output is None:
            output = "%s/tempsort%s_%s"%(tempdirs[0], os.getpid(), self.id)

        try:
            output_file = open(output, 'wb', self.temp_file_size)
            try:
                output_file.writelines(self.merge(chunks, key))
            finally:
                output_file.close()
        finally:
            self.remove_chunks(chunks)

        return open(output)

    def merge(self, chunks, key=None):
        """Yield the lines of the sorted files ``chunks`` in merged order.

        chunks -- paths of files whose lines are already sorted by ``key``
        key    -- sort key; defaults to the line itself

        Once the merge ends (or the generator is closed), the chunk files are
        closed and removed.
        """
        if self.logger: self.logger.info("... Merging chunks...")
        if key is None:
            key = lambda x : x

        values = []
        open_files = []
        try:
            for index, chunk in enumerate(chunks):
                chunk_file = open(chunk)
                open_files.append(chunk_file)
                iterator = iter(chunk_file)
                try:
                    value = next(iterator)
                except StopIteration:
                    continue  # empty chunk: nothing to merge from it
                # ``index`` breaks ties between equal keys, keeping the heap
                # from ever comparing the iterators.
                heappush(values, (key(value), index, value, iterator))

            while values:
                k, index, value, iterator = heappop(values)
                yield value
                try:
                    value = next(iterator)
                except StopIteration:
                    continue  # this chunk is exhausted
                heappush(values, (key(value), index, value, iterator))
        finally:
            # Chunks are only deleted once none of them is being read anymore.
            for chunk_file in open_files:
                chunk_file.close()
            self.remove_chunks(chunks)