Пример #1
0
    def __init__(self, filename, region=None):
        """
        Open a VCF source and read its header.

        :param filename: path of the input. '-' reads from stdin; a name
            ending in '.gz' is opened with gzip and one ending in '.bz2'
            with bz2. A '.gz' file that has a '.tbi' file next to it is
            read through the external ``tabix`` program when a region is
            given. Anything else is opened as a plain file.
        :param region: optional region string, either 'SEQ' or
            'SEQ:START-END' (START and END are parsed as integers).
        :raises StopIteration: when the reader's ``peek`` yields nothing
            before a header or record line is seen (presumably end of
            input - depends on Peekorator). Kept for backward
            compatibility.
        """
        self.filename = filename
        self.region = False
        self.region_str = region
        self.region_has_pos = False
        # set by next() once a locus inside the region has been returned
        self.been_in_region = False

        self.lg = lg.getLogger('PSVCF')

        if region:
            self.region = True
            _x = self.region_str.split(':')
            self.region_seq = _x[0]
            if len(_x) > 1:
                self.region_has_pos = True
                _y = _x[1].split('-')
                self.region_start = int(_y[0])
                self.region_end = int(_y[1])

        self.file_mode = 'file'
        if filename == '-':
            self.file_mode = 'stdin'
            self.F = Peekorator(sys.stdin)
        elif filename[-3:] == '.gz' and os.path.exists(filename + '.tbi') \
                and self.region:
            self.lg.debug('tabix mode')
            self.file_mode = 'tabix'
            # Build the argument list directly: splitting a formatted
            # string on whitespace breaks on file names containing spaces.
            cl = ['tabix', '-h', filename, self.region_str]
            self.lg.debug("running tabis: %s" % ' '.join(cl))
            self.TABIX_PROCESS = sp.Popen(cl, stdout=sp.PIPE)
            self.F = Peekorator(self.TABIX_PROCESS.stdout)
        elif filename[-3:] == '.gz':
            self.lg.debug('Opening as gzip')
            self.F = Peekorator(gzip.open(self.filename))
        elif filename[-4:] == '.bz2':
            self.lg.debug('Opening as bz2')
            self.F = Peekorator(bz2.BZ2File(self.filename))
        else:
            self.lg.debug('normal file mode')
            self.F = Peekorator(open(self.filename))

        self.meta_header_lines = []
        self.header_line = ""

        # Read the header by peeking, so that the first record line is
        # left unconsumed for the iterator.
        while True:
            line = self.F.peek
            if not line:
                raise StopIteration
            if not line.strip():
                # blank line - skip
                self.F.next()
            elif line[:2] == '##':
                self.F.next()
                self.meta_header_lines.append(line.strip())
            elif line[:6] == '#CHROM':
                # Column header. The line is not consumed here; next()
                # recognises '#CHROM' lines and interprets them again.
                self.interpret_header(line)
                break
            elif line[0] != '#':
                # first record reached without a column header
                self.create_dummy_header()
                break
            else:
                # A '#' line that is neither '##' nor '#CHROM': consume
                # it, otherwise the same line is peeked forever.
                self.F.next()

        self.lg.info('Opening %s' % self.filename)
Пример #2
0
class PSVCF(object):
    """
    Iterator over the records of a VCF file, optionally limited to a region.

    The constructor opens the input and reads the header; iterating then
    yields one ``Locus`` per record line. When a region is given, only
    loci on the region's sequence (and inside START-END when positions
    are given) are returned.
    """

    # The nine fixed columns that must start a '#CHROM' header line.
    _FIXED_COLUMNS = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL',
                      'FILTER', 'INFO', 'FORMAT']

    def __init__(self, filename, region=None):
        """
        Open a VCF source and read its header.

        :param filename: path of the input. '-' reads from stdin; a name
            ending in '.gz' is opened with gzip and one ending in '.bz2'
            with bz2. A '.gz' file that has a '.tbi' file next to it is
            read through the external ``tabix`` program when a region is
            given. Anything else is opened as a plain file.
        :param region: optional region string, either 'SEQ' or
            'SEQ:START-END' (START and END are parsed as integers).
        :raises StopIteration: when the reader's ``peek`` yields nothing
            before a header or record line is seen (presumably end of
            input - depends on Peekorator). Kept for backward
            compatibility.
        """
        self.filename = filename
        self.region = False
        self.region_str = region
        self.region_has_pos = False
        # set by next() once a locus inside the region has been returned
        self.been_in_region = False

        self.lg = lg.getLogger('PSVCF')

        if region:
            self.region = True
            _x = self.region_str.split(':')
            self.region_seq = _x[0]
            if len(_x) > 1:
                self.region_has_pos = True
                _y = _x[1].split('-')
                self.region_start = int(_y[0])
                self.region_end = int(_y[1])

        self.file_mode = 'file'
        if filename == '-':
            self.file_mode = 'stdin'
            self.F = Peekorator(sys.stdin)
        elif filename[-3:] == '.gz' and os.path.exists(filename + '.tbi') \
                and self.region:
            self.lg.debug('tabix mode')
            self.file_mode = 'tabix'
            # Build the argument list directly: splitting a formatted
            # string on whitespace breaks on file names containing spaces.
            cl = ['tabix', '-h', filename, self.region_str]
            self.lg.debug("running tabis: %s" % ' '.join(cl))
            self.TABIX_PROCESS = sp.Popen(cl, stdout=sp.PIPE)
            self.F = Peekorator(self.TABIX_PROCESS.stdout)
        elif filename[-3:] == '.gz':
            self.lg.debug('Opening as gzip')
            self.F = Peekorator(gzip.open(self.filename))
        elif filename[-4:] == '.bz2':
            self.lg.debug('Opening as bz2')
            self.F = Peekorator(bz2.BZ2File(self.filename))
        else:
            self.lg.debug('normal file mode')
            self.F = Peekorator(open(self.filename))

        self.meta_header_lines = []
        self.header_line = ""

        # Read the header by peeking, so that the first record line is
        # left unconsumed for the iterator.
        while True:
            line = self.F.peek
            if not line:
                raise StopIteration
            if not line.strip():
                # blank line - skip
                self.F.next()
            elif line[:2] == '##':
                self.F.next()
                self.meta_header_lines.append(line.strip())
            elif line[:6] == '#CHROM':
                # Column header. The line is not consumed here; next()
                # recognises '#CHROM' lines and interprets them again.
                self.interpret_header(line)
                break
            elif line[0] != '#':
                # first record reached without a column header
                self.create_dummy_header()
                break
            else:
                # A '#' line that is neither '##' nor '#CHROM': consume
                # it, otherwise the same line is peeked forever.
                self.F.next()

        self.lg.info('Opening %s' % self.filename)

    def __iter__(self):
        """
        Return self; the object is its own iterator.
        """
        return self

    def create_dummy_header(self):
        """
        Install a default column header with a single sample named 'sample'.
        """
        columns = self._FIXED_COLUMNS + ['sample']
        self.header_line = "\t".join(columns)
        self.sample_names = columns[9:]

    def interpret_header(self, line):
        """
        Store a '#CHROM' header line and extract the sample names from it.

        :param line: the raw header line; surrounding whitespace is
            stripped and the rest is split on tabs.
        :raises AssertionError: when the first nine columns are not the
            fixed VCF columns.
        """
        self.header_line = line.strip()
        ls = self.header_line.split("\t")
        # Raised explicitly (rather than via ``assert``) so the check is
        # not stripped under ``python -O``; the exception type is the same.
        if ls[:9] != self._FIXED_COLUMNS:
            raise AssertionError(
                "unexpected VCF header columns: %r" % (ls[:9],))
        self.sample_names = ls[9:]

    def simple_names(self):
        """
        Return simplified sample names

        Each name is reduced to its basename with every occurrence of
        '.vcf', '.bam' and '.bz2' removed.
        """
        def _simple_name(s):
            s = os.path.basename(s)
            for ext in ('.vcf', '.bam', '.bz2'):
                s = s.replace(ext, '')
            return s
        return [_simple_name(x) for x in self.sample_names]

    def next(self):
        """
        Return the next locus, honouring the region filter if one is set.

        Blank lines are skipped; '##' lines are appended to the meta
        header and '#CHROM' lines are re-interpreted as the column header.

        :raises StopIteration: at end of input, or once the region has
            been passed.
        :raises SampleParseError: when ``Locus`` cannot parse a line.
        """
        while True:
            line = self.F.next()
            if not line:
                # EOF
                raise StopIteration
            line = line.strip()
            if not line:
                # empty line
                continue
            if line[:2] == '##':
                self.meta_header_lines.append(line)
                continue
            if line[:6] == '#CHROM':
                self.interpret_header(line)
                continue

            try:
                loc = Locus(line, sample_names=self.sample_names)
            except SampleParseError:
                self.lg.critical("could not parse line")
                self.lg.critical(line)
                # bare raise keeps the original exception and traceback
                raise

            if not self.region:
                # No region is specified - return regardless
                return loc

            if loc.seq != self.region_seq:
                if self.been_in_region:
                    # Left the region's sequence after having been in it
                    # (assumes records are grouped by sequence): done.
                    self.fin()
                # Region not reached yet: skip this locus rather than
                # letting it fall through to the position checks.
                continue

            if not self.region_has_pos:
                self.been_in_region = True
                return loc

            if loc.pos < self.region_start:
                continue
            if loc.pos > self.region_end:
                raise StopIteration

            self.been_in_region = True
            return loc

    # Python 3 iterator-protocol alias for next().
    __next__ = next

    def fin(self):
        """
        Finish iterations

        Closes the underlying reader in 'file' and 'tabix' mode (stdin is
        left open) and then always raises StopIteration.
        """
        if self.file_mode in ('file', 'tabix'):
            self.F.close()
        raise StopIteration

    def add_meta(self, k, v):
        """
        Set a meta key/value pair - will be added to the header
        """
        self.meta_header_lines.append("##%s=%s" % (k, v))

    def build_header(self):
        """
        Return a new header

        The result is the meta lines followed by the column header line,
        newline-terminated. A new list is built, so the stored meta lines
        are not modified and repeated calls return the same text.
        """
        rv = self.meta_header_lines + [self.header_line]
        return "\n".join(rv) + "\n"