Example #1
0
    def next(self):
        """Return the next GFFFeature (or Header/Comment) from the reader.

        NOTE(review): this copy of the method is truncated in this file;
        the grouping loop of the full implementation is not shown here.
        """

        def handle_parse_error(parse_error):
            """Report, count, and sample a ParseError; never re-raise."""
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate,
                                                   "__call__"):
                    # Use the parameter, not the caller's except-variable.
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(parse_error)))

            # For debugging, uncomment this to propagate parsing exceptions
            # up; it reveals the underlying reason for an unexpected
            # StopIteration exception.
            # raise parse_error

        #
        # Get next GFFFeature
        #
        # raw_size accumulates the byte length of the lines consumed.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where the iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError as e:
                    handle_parse_error(e)
                # Counted whether or not the line parsed; acts as a
                # poor-man's finally (kept for Python 2.4 compatibility).
                raw_size += len(self.current_line)
Example #2
0
    def next(self):
        """Return the next GFFFeature (or Header/Comment) from the reader.

        NOTE(review): this copy of the method is truncated in this file;
        the grouping loop of the full implementation is not shown here.
        """

        def handle_parse_error(parse_error):
            """Report, count, and sample a ParseError; never re-raise."""
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate,
                                                   "__call__"):
                    # Use the parameter, not the caller's except-variable.
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(parse_error)))

            # For debugging, uncomment this to propagate parsing exceptions
            # up; it reveals the underlying reason for an unexpected
            # StopIteration exception.
            # raise parse_error

        #
        # Get next GFFFeature
        #
        # raw_size accumulates the byte length of the lines consumed.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where the iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError as e:
                    handle_parse_error(e)
                # Counted whether or not the line parsed; acts as a
                # poor-man's finally (kept for Python 2.4 compatibility).
                raw_size += len(self.current_line)
Example #3
0
    def next(self):
        """Return the next GFFFeature assembled from grouped intervals.

        Consecutive intervals are grouped into one feature via the GFF
        'group' attribute, the GFF3 'ID'/'Parent' attributes, or the GTF
        'transcript_id' attribute. Headers and comments are returned as-is
        with their raw size attached. Iteration ends via the underlying
        reader's StopIteration.
        """

        def handle_parse_error(parse_error):
            """Report, count, and sample a ParseError; never re-raise."""
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate,
                                                   "__call__"):
                    # Fixed: use the parameter instead of the enclosing
                    # except-variable `e` the original leaked in here.
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(parse_error)))

            # For debugging, uncomment this to propagate parsing exceptions
            # up; it reveals the underlying reason for an unexpected
            # StopIteration exception.
            # raise parse_error

        # raw_size accumulates the byte length of all lines in the feature.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where the iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError as e:
                    handle_parse_error(e)
                # Counted whether or not the line parsed; acts as a
                # poor-man's finally (kept for Python 2.4 compatibility).
                raw_size += len(self.current_line)

        # If header or comment, clear the seed interval and return it with
        # its size.
        if isinstance(self.seed_interval, (Header, Comment)):
            return_val = self.seed_interval
            return_val.raw_size = len(self.current_line)
            self.seed_interval = None
            self.seed_interval_line_len = 0
            return return_val

        # Initialize feature identifiers from the seed interval.
        feature_group = self.seed_interval.attributes.get('group', None)  # GFF
        feature_id = self.seed_interval.attributes.get('ID', None)  # GFF3
        feature_transcript_id = self.seed_interval.attributes.get(
            'transcript_id', None)  # GTF

        # Read all intervals associated with the seed.
        feature_intervals = [self.seed_interval]
        while True:
            try:
                interval = GenomicIntervalReader.next(self)
                raw_size += len(self.current_line)
            except StopIteration:
                # No more intervals to read, but the last feature still
                # needs to be returned.
                interval = None
                raw_size += len(self.current_line)
                break
            except ParseError as e:
                handle_parse_error(e)
                raw_size += len(self.current_line)
                continue

            # Ignore comments (their size stays counted in raw_size).
            if isinstance(interval, Comment):
                continue

            # Determine whether the interval is part of the feature.
            part_of = False
            group = interval.attributes.get('group', None)
            # GFF test:
            if group and feature_group == group:
                part_of = True
            # GFF3 test:
            parent_id = interval.attributes.get('Parent', None)
            cur_id = interval.attributes.get('ID', None)
            if (cur_id and cur_id == feature_id) or \
                    (parent_id and parent_id == feature_id):
                part_of = True
            # GTF test:
            transcript_id = interval.attributes.get('transcript_id', None)
            if transcript_id and transcript_id == feature_transcript_id:
                part_of = True

            # If the interval is not part of the feature, clean up and break.
            if not part_of:
                # Adjust raw size because the current line is not part of
                # this feature.
                raw_size -= len(self.current_line)
                break

            # Interval is associated with the feature.
            feature_intervals.append(interval)

        # The last interval read is the seed for the next feature.
        self.seed_interval = interval
        self.seed_interval_line_len = len(self.current_line)

        # Return feature.
        feature = GFFFeature(self, self.chrom_col, self.feature_col,
                             self.start_col, self.end_col, self.strand_col,
                             self.score_col, self.default_strand,
                             fix_strand=self.fix_strand,
                             intervals=feature_intervals, raw_size=raw_size)

        # Convert to BED coords?
        if self.convert_to_bed_coord:
            convert_gff_coords_to_bed(feature)

        return feature
Example #4
0
def getRegionsAndGroups(regionsFileName, onlyMultiplesOf=1):
    # reads a bed file containing the position
    # of genomic intervals
    # In case is hash sign '#' is found in the
    # file, this is considered as a delimiter
    # to split the heatmap into groups

    regions = []
    regionsDict = OrderedDict()
    regionGroups = [(0, '')]

    prevInterval = None
    duplicates = 0
    totalIntervals = 0
    includedIntervals = 0
    # drop some lines
    for ginterval in GenomicIntervalReader(
            open(regionsFileName, 'r').readlines()):
        totalIntervals += 1
        if ginterval.__str__()[0] == '#':
            if includedIntervals > 1 and includedIntervals - regionGroups[-1][
                    0] > 1:
                label = ginterval.__str__()[1:]
                newLabel = label
                if label in regionsDict.keys():
                    # loop to find a unique label name
                    i = 0
                    while True:
                        i += 1
                        newLabel = label + "_r" + str(i)
                        if newLabel not in regionsDict.keys():
                            break

                regionsDict[newLabel] = regions[:]
                regions = []
            continue
        # if the list of regions is to big, only consider a fraction of the data
        if totalIntervals % onlyMultiplesOf != 0:
            continue
        # skip regions that have the same position as the previous.
        # This assumes that the regions file given is sorted
        if prevInterval and prevInterval.chrom == ginterval.chrom and \
                prevInterval.start == ginterval.start and \
                prevInterval.end == ginterval.end:
            if args.verbose:
                print "Gene in same region already included:  %s %s:%s-%s. Skipping" % (
                    ginterval.fields[3], ginterval.chrom, ginterval.start,
                    ginterval.end)

            duplicates += 1
            continue
        else:
            prevInterval = ginterval

        regions.append(intervalWrapper(ginterval))
        includedIntervals += 1

    if len(regions):
        regionsDict[args.regionsLabel] = regions

    if args.verbose:
        print "%d (%.2f) regions covering the exact same interval were found" % \
            (duplicates,
             float(duplicates) *100 / totalIntervals)

    return regionsDict
Create a site profile vector showing the average signal accumulated from a
bigwig file around the center of each interval from a BED file.

Output is the average signal value at that relative position across the 
intervals.

usage: %prog bigwig_file.bw padding < bed_file.bed 
"""

import sys
from numpy import *

from bx.intervals.io import GenomicIntervalReader
from bx.bbi.bigwig_file import BigWigFile

# BigWig is a binary format: open in binary mode so byte reads are exact.
bw = BigWigFile(open(sys.argv[1], "rb"))
padding = int(sys.argv[2])
# Accumulators over the 2*padding window centered on each interval.
totals = zeros(padding * 2, dtype=float64)
valid = zeros(padding * 2, dtype=int32)

for interval in GenomicIntervalReader(sys.stdin):
    # Integer midpoint so the query bounds are exact genomic positions
    # (the original passed float coordinates via floor()).
    center = (interval.start + interval.end) // 2
    values = bw.get_as_array(interval.chrom, center - padding, center + padding)
    # Skip intervals on chromosomes absent from the bigwig file.
    if values is None:
        continue
    # Determine which positions had data and mask the rest for totalling.
    invalid = isnan(values)
    values[invalid] = 0
    totals += values
    valid += (~invalid)

# Average signal per relative position (positions never covered yield nan).
savetxt(sys.stdout, totals / valid)
Example #6
0
class GFFReaderWrapper(NiceReaderWrapper):
    """
    Reader wrapper for GFF files.

    Wrapper has two major functions:

    1. group entries for GFF file (via group column), GFF3 (via id attribute),
       or GTF (via gene_id/transcript id);
    2. convert coordinates from GFF format--starting and ending coordinates
       are 1-based, closed--to the 'traditional'/BED interval format--0 based,
       half-open. This is useful when using GFF files as inputs to tools that
       expect traditional interval format.
    """
    def __init__(self,
                 reader,
                 chrom_col=0,
                 feature_col=2,
                 start_col=3,
                 end_col=4,
                 strand_col=6,
                 score_col=5,
                 fix_strand=False,
                 convert_to_bed_coord=False,
                 **kwargs):
        """Set up the wrapped reader plus GFF grouping/conversion state."""
        # Delegate the generic interval-reader configuration to the base.
        NiceReaderWrapper.__init__(self, reader, chrom_col=chrom_col,
                                   start_col=start_col, end_col=end_col,
                                   strand_col=strand_col,
                                   fix_strand=fix_strand, **kwargs)
        # GFF-specific column positions and options.
        self.feature_col = feature_col
        self.score_col = score_col
        self.convert_to_bed_coord = convert_to_bed_coord
        # Bookkeeping used while grouping intervals into features.
        self.last_line = None
        self.cur_offset = 0
        self.seed_interval = None
        self.seed_interval_line_len = 0

    def parse_row(self, line):
        """Build a GFFInterval from one tab-separated GFF line."""
        fields = line.split("\t")
        return GFFInterval(self, fields, self.chrom_col, self.feature_col,
                           self.start_col, self.end_col, self.strand_col,
                           self.score_col, self.default_strand,
                           fix_strand=self.fix_strand)

    def next(self):
        """Return the next GFFFeature (or Header/Comment).

        NOTE(review): this copy of the method is truncated in this file;
        the loop that groups intervals into a feature continues past what
        is shown.
        """

        def handle_parse_error(parse_error):
            """Report, count, and sample a ParseError; never re-raise."""
            if self.outstream:
                if self.print_delegate and hasattr(self.print_delegate,
                                                   "__call__"):
                    # Fixed: use the parameter instead of the enclosing
                    # except-variable `e` the original leaked in here.
                    self.print_delegate(self.outstream, parse_error, self)
            self.skipped += 1
            # No reason to stuff an entire bad file into memory.
            if self.skipped < 10:
                self.skipped_lines.append(
                    (self.linenum, self.current_line, str(parse_error)))

            # For debugging, uncomment this to propagate parsing exceptions
            # up; it reveals the underlying reason for an unexpected
            # StopIteration exception.
            # raise parse_error

        # raw_size accumulates the byte length of all lines in the feature.
        raw_size = self.seed_interval_line_len

        # If there is no seed interval, set one. Also, if there are no more
        # intervals to read, this is where the iterator dies.
        if not self.seed_interval:
            while not self.seed_interval:
                try:
                    self.seed_interval = GenomicIntervalReader.next(self)
                except ParseError as e:
                    handle_parse_error(e)
                # Counted whether or not the line parsed; acts as a
                # poor-man's finally (kept for Python 2.4 compatibility).
                raw_size += len(self.current_line)

        # If header or comment, clear the seed interval and return it with
        # its size.
        if isinstance(self.seed_interval, (Header, Comment)):
            return_val = self.seed_interval
            return_val.raw_size = len(self.current_line)
            self.seed_interval = None
            self.seed_interval_line_len = 0
            return return_val

        # Initialize feature identifiers from the seed interval.
        feature_group = self.seed_interval.attributes.get('group', None)  # GFF
        feature_id = self.seed_interval.attributes.get('ID', None)  # GFF3
        feature_transcript_id = self.seed_interval.attributes.get(
            'transcript_id', None)  # GTF

        # Read all intervals associated with the seed.
        feature_intervals = []
        feature_intervals.append(self.seed_interval)
        while True:
            try:
                interval = GenomicIntervalReader.next(self)
                raw_size += len(self.current_line)
            except StopIteration:
                # No more intervals to read, but the last feature still
                # needs to be returned.
                interval = None
                raw_size += len(self.current_line)
                break
            except ParseError as e:
                handle_parse_error(e)
                raw_size += len(self.current_line)
                continue
Example #7
0
def main():
    """Write the complement of the intervals in an input file.

    Chromosome lengths come from the ``--lengths`` file; with ``--all`` the
    result is computed by subtracting the input from whole-chromosome
    intervals, otherwise via bx's ``complement`` with a chrom->length dict.

    NOTE(review): Python 2 ``except X, exc`` syntax; relies on module-level
    names (doc_optparse, parse_cols_arg, NiceReaderWrapper, subtract,
    complement, fail, GenomicInterval, GenomicIntervalReader, ParseError,
    fileinput). May continue past the visible end of this chunk.
    """
    allchroms = False

    options, args = doc_optparse.parse(__doc__)
    try:
        chr_col_1, start_col_1, end_col_1, strand_col_1 = parse_cols_arg(
            options.cols1)
        lengths = options.lengths
        if options.all:
            allchroms = True
        in_fname, out_fname = args
    except:
        # NOTE(review): bare except — any bad/missing argument lands here;
        # doc_optparse.exception() presumably reports usage and exits.
        doc_optparse.exception()

    # Wrap the input so malformed lines can be skipped/reported by the
    # reader rather than being immediately fatal.
    g1 = NiceReaderWrapper(fileinput.FileInput(in_fname),
                           chrom_col=chr_col_1,
                           start_col=start_col_1,
                           end_col=end_col_1,
                           strand_col=strand_col_1,
                           fix_strand=True)

    lens = dict()
    chroms = list()
    # dbfile is used to determine the length of each chromosome.  The lengths
    # are added to the lens dict and passed to the complement operation code
    # in bx.
    dbfile = fileinput.FileInput(lengths)

    if dbfile:
        if not allchroms:
            # Build the chrom -> length mapping used by complement().
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    lens[fields[0]] = int(fields[1])
            except:
                # assume LEN doesn't exist or is corrupt somehow
                pass
        elif allchroms:
            # Build whole-chromosome intervals ("chrom\t0\tend") that the
            # input will be subtracted from.
            try:
                for line in dbfile:
                    fields = line.split("\t")
                    end = int(fields[1])
                    chroms.append("\t".join([fields[0], "0", str(end)]))
            except:
                # NOTE(review): bare except silently discards a corrupt
                # lengths file; the allchroms fallback below compensates.
                pass

    # Safety...if the dbfile didn't exist and we're on allchroms, then
    # default to generic complement
    if allchroms and len(chroms) == 0:
        allchroms = False

    if allchroms:
        chromReader = GenomicIntervalReader(chroms)
        generator = subtract([chromReader, g1])
    else:
        generator = complement(g1, lens)

    out_file = open(out_fname, "w")

    try:
        for interval in generator:
            # GenomicInterval rows are re-joined into tab-separated text;
            # anything else (e.g. header lines) is written verbatim.
            if type(interval) is GenomicInterval:
                out_file.write("%s\n" % "\t".join(interval))
            else:
                out_file.write("%s\n" % interval)
    except ParseError, exc:
        # Close the partial output before failing with a clear message.
        out_file.close()
        fail("Invalid file format: %s" % str(exc))