예제 #1
0
 def calc_bin(self, _bin=None):
     """
     Return the genomic bin for this feature.

     A `_bin` that is not None is handed back untouched.  Otherwise the bin
     is computed with `bins.bins(self.start, self.end, one=True)`; if that
     call raises TypeError, None is returned.
     """
     if _bin is not None:
         return _bin
     try:
         return bins.bins(self.start, self.end, one=True)
     except TypeError:
         return None
예제 #2
0
파일: feature.py 프로젝트: rmzelle/gffutils
 def calc_bin(self, _bin=None):
     """
     Resolve the bin of this feature.

     When the caller supplies `_bin` it wins; only a missing (None) value
     triggers a computation from `self.start` and `self.end`.  A TypeError
     from `bins.bins` results in None.
     """
     if _bin is None:
         try:
             computed = bins.bins(self.start, self.end, one=True)
         except TypeError:
             computed = None
         return computed
     return _bin
예제 #3
0
 def calc_bin(self, _bin=None):
     """
     Determine the smallest UCSC genomic bin containing this feature.

     An explicitly provided `_bin` is returned as-is.  If `_bin` is None,
     the bin is derived from `self.start` and `self.end`; None is returned
     when `bins.bins` raises TypeError.
     """
     if _bin is not None:
         return _bin

     result = None
     try:
         result = bins.bins(self.start, self.end, one=True)
     except TypeError:
         # Bin could not be computed from these coordinates.
         pass
     return result
예제 #4
0
파일: feature.py 프로젝트: arnikz/gffutils
 def calc_bin(self, _bin=None):
     """
     Smallest UCSC genomic bin that contains this feature.

     Returns `_bin` unchanged when it is not None; otherwise returns
     `bins.bins(self.start, self.end, one=True)`, or None if that call
     raises TypeError.
     """
     needs_computation = _bin is None
     if not needs_computation:
         return _bin
     try:
         return bins.bins(self.start, self.end, one=True)
     except TypeError:
         return None
예제 #5
0
def _bin_from_dict(d):
    """
    Given a dictionary yielded by the parser, return the genomic "UCSC" bin
    """
    try:
        start = int(d['start'])
        end = int(d['end'])
        return bins.bins(start, end, one=True)

    # e.g., if "."
    except ValueError:
        return None
예제 #6
0
def _bin_from_dict(d):
    """
    Given a dictionary yielded by the parser, return the genomic "UCSC" bin
    """
    try:
        start = int(d['start'])
        end = int(d['end'])
        return bins.bins(start, end, one=True)

    # e.g., if "."
    except ValueError:
        return None
예제 #7
0
    def interfeatures(self, features, new_featuretype=None,
                      merge_attributes=True, dialect=None,
                      attribute_func=None, update_attributes=None):
        """
        Construct new features representing the space between features.

        For example, if `features` is a list of exons, then this method will
        return the introns.  If `features` is a list of genes, then this method
        will return the intergenic regions.

        Each new feature spans the gap between two *consecutive* input
        features on the same chromosome: it starts one base after the previous
        feature's stop and ends one base before the current feature's start.
        Providing N features on a single chromosome yields N - 1 new features;
        nothing is yielded for a pair that straddles a chromosome change.

        This method purposefully does *not* do any merging or sorting of
        coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
        when selecting features use the `order_by` kwarg, e.g.,
        `db.features_of_type('gene', order_by=('seqid', 'start'))`.

        Parameters
        ----------
        features : iterable of :class:`feature.Feature` instances
            Sorted, merged iterable

        new_featuretype : string or None
            The new features will all be of this type, or, if None (default)
            then each featuretype is constructed from the two neighboring
            features, e.g., `inter_exon_exon`.

        merge_attributes : bool
            If True, new features' attributes will be a merge of the two
            neighboring features' attributes (via `helpers.merge_attributes`).
            This is useful if you have provided a list of exons; the introns
            will then retain the transcript and/or gene parents.  If False,
            the new feature starts with an empty attributes dictionary.

        dialect : dict or None
            Accepted for backward compatibility; it is not used when building
            the new features.

        attribute_func : callable or None
            Accepted for backward compatibility.  NOTE(review): this
            implementation never calls it -- confirm whether it should be
            applied to the neighboring attributes before merging.

        update_attributes : dict
            After attributes have been merged, this dictionary is used to
            update (replace parts of) the attributes dictionary.

        Returns
        -------
        A generator that yields the objects produced by
        `self._feature_returner` for each gap.
        """
        last_feature = None
        for f in features:
            # no inter-feature for the first one
            if last_feature is None:
                last_feature = f
                continue

            if last_feature.chrom != f.chrom:
                # We've moved to a new chromosome.  For example, if we're
                # getting intergenic regions from all genes, they will be on
                # different chromosomes. We still assume sorted features, but
                # don't complain if they're on different chromosomes -- just
                # move on, restarting from this feature so that coordinates
                # from the previous chromosome are never reused.
                last_feature = f
                continue

            # Name the gap after this specific pair of neighbors unless the
            # caller fixed the featuretype.  A local is used so the choice
            # made for one pair does not leak into the next one.
            if new_featuretype is None:
                featuretype = 'inter_%s_%s' % (
                    last_feature.featuretype, f.featuretype)
            else:
                featuretype = new_featuretype

            # Neighbors on different strands give an unstranded gap.
            if last_feature.strand != f.strand:
                strand = '.'
            else:
                strand = f.strand

            # Shrink so the gap does not overlap either neighbor.
            interfeature_start = last_feature.stop + 1
            interfeature_stop = f.start - 1

            if merge_attributes:
                new_attributes = helpers.merge_attributes(
                    last_feature.attributes, f.attributes)
            else:
                new_attributes = {}

            if update_attributes:
                new_attributes.update(update_attributes)

            new_bin = bins.bins(
                interfeature_start, interfeature_stop, one=True)
            fields = dict(
                seqid=last_feature.chrom,
                source='gffutils_derived',
                featuretype=featuretype,
                start=interfeature_start,
                end=interfeature_stop,
                score='.',
                strand=strand,
                frame='.',
                attributes=new_attributes,
                bin=new_bin)

            yield self._feature_returner(**fields)

            # Advance so the next gap is measured (and its attributes are
            # merged) against the immediate neighbor, not the first feature.
            last_feature = f
예제 #8
0
    def region(self, region=None, seqid=None, start=None, end=None,
               strand=None, featuretype=None, completely_within=False):
        """
        Return features within specified genomic coordinates.

        Specifying genomic coordinates can be done in a flexible manner

        Parameters
        ----------
        region : string, tuple, or Feature instance
            If string, then of the form "seqid:start-end" (a bare "seqid" and
            a third ":strand" token are also parsed).  If tuple, then
            (seqid, start, end).  If :class:`Feature`, then use the feature's
            seqid, start, and end values.

            This argument is mutually exclusive with start/end/seqid.

            *Note*: when a :class:`Feature` is provided, this implementation
            also takes the feature's strand, replacing the `strand` kwarg.
            NOTE(review): earlier documentation stated the feature's strand
            is ignored by design -- confirm which behavior is intended.

        strand : + | - | . | None
            If `strand` is provided, then only those features exactly matching
            `strand` will be returned. So `strand='.'` will only return
            unstranded features. Default is `strand=None` which does not
            restrict by strand.

        seqid, start, end, strand
            Mutually exclusive with `region`.  These kwargs can be used to
            approximate slice notation; see "Details" section below.

        featuretype : None, string, or iterable
            If not None, then restrict output.  If string, then only report
            that feature type.  If iterable, then report all featuretypes in
            the iterable.

        completely_within : bool
            By default (`completely_within=False`), returns features that
            partially or completely overlap `region`.  If
            `completely_within=True`, features that are completely within
            `region` will be returned.

        Notes
        -----

        The meaning of `seqid`, `start`, and `end` is interpreted as follows:

        ====== ====== ===== ======================================
        seqid  start  end   meaning
        ====== ====== ===== ======================================
        str    int    int   equivalent to `region` kwarg
        None   int    int   features from all chroms within coords
        str    None   int   equivalent to [:end] slice notation
        str    int    None  equivalent to [start:] slice notation
        None   None   None  equivalent to FeatureDB.all_features()
        ====== ====== ===== ======================================

        If performance is a concern, use `completely_within=True`. This allows
        the query to be optimized by only looking for features that fall in the
        precise genomic bin (same strategy as UCSC Genome Browser and
        BEDTools). Otherwise all features' start/stop coords need to be
        searched to see if they partially overlap the region of interest.

        Examples
        --------

        - `region(seqid="chr1", start=1000)` returns all features on chr1 that
          start or extend past position 1000

        - `region(seqid="chr1", start=1000, completely_within=True)` returns
          all features on chr1 that start past position 1000.

        - `region("chr1:1-100", strand="+", completely_within=True)` returns
          only plus-strand features that completely fall within positions 1 to
          100 on chr1.

        Raises
        ------
        ValueError
            If `region` is supplied together with `seqid`, `start`, or `end`.

        Returns
        -------
        A generator object that yields :class:`Feature` objects.
        """
        # Argument handling.
        if region is not None:
            if (seqid is not None) or (start is not None) or (end is not None):
                raise ValueError(
                    "If region is supplied, do not supply seqid, "
                    "start, or end as separate kwargs")
            if isinstance(region, six.string_types):
                toks = region.split(':')
                if len(toks) == 1:
                    seqid = toks[0]
                    start, end = None, None
                else:
                    seqid, coords = toks[:2]
                    if len(toks) == 3:
                        strand = toks[2]
                    start, end = coords.split('-')

            elif isinstance(region, Feature):
                seqid = region.seqid
                start = region.start
                end = region.end
                strand = region.strand

            # otherwise assume it's a tuple
            else:
                seqid, start, end = region[:3]

        # e.g.,
        #   completely_within=True..... start >= {start} AND end <= {end}
        #   completely_within=False.... start <  {end}   AND end >  {start}
        if completely_within:
            start_op = '>='
            end_op = '<='
        else:
            start_op = '<'
            end_op = '>'
            # For an overlap test each column is compared against the
            # *opposite* boundary, so the two values trade places here.
            end, start = start, end

        # Every WHERE condition is collected in `conditions`, with its
        # parameters appended to `args` in the same order.  Emitting the
        # WHERE keyword only when at least one condition exists keeps the
        # statement valid when no restriction at all was requested.
        conditions = []
        args = []
        if seqid is not None:
            conditions.append('seqid = ?')
            args.append(seqid)
        if start is not None:
            start = int(start)
            conditions.append('start %s ?' % start_op)
            args.append(start)
        if end is not None:
            end = int(end)
            conditions.append('end %s ?' % end_op)
            args.append(end)

        # Only use bins if we have defined boundaries and completely_within is
        # True. Otherwise you can't know how far away a feature stretches
        # (which means bins are not computable ahead of time)
        if (start is not None) and (end is not None) and completely_within:
            if start <= bins.MAX_CHROM_SIZE and end <= bins.MAX_CHROM_SIZE:
                _bins = list(bins.bins(start, end, one=False))
                # See issue #45
                if 0 < len(_bins) < 900:
                    conditions.append(
                        '( %s )' % ' or '.join(['bin = ?'] * len(_bins)))
                    args += _bins

        # Add the featuretype clause
        if featuretype is not None:
            if isinstance(featuretype, six.string_types):
                featuretype = [featuretype]
            else:
                # Materialize once so a one-shot iterable supplies both the
                # placeholders and the matching arguments.
                featuretype = list(featuretype)
            conditions.append(
                '(%s)' % ' or '.join(['featuretype = ?'] * len(featuretype)))
            args.extend(featuretype)

        if strand is not None:
            conditions.append('strand = ?')
            args.append(strand)

        query = constants._SELECT
        if conditions:
            query += ' WHERE ' + ' AND '.join(conditions)

        c = self.conn.cursor()
        self._last_query = query
        self._last_args = args
        # NOTE: when completely_within is False, 'start' and 'end' below hold
        # the traded values from the overlap handling above.
        self._context = {
            'start': start,
            'end': end,
            'seqid': seqid,
            'region': region,
        }
        c.execute(query, tuple(args))
        for i in c:
            yield self._feature_returner(**i)
예제 #9
0
파일: create.py 프로젝트: linsson/gffutils
    def _update_relations(self):

        if not self.infer_gene_extent:
            return

        # TODO: do any indexes speed this up?
        c = self.conn.cursor()
        c2 = self.conn.cursor()

        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')

        logger.info('Inferring gene and transcript extents, '
                    'and writing to tempfile')
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        tmp = '/tmp/gffutils'
        fout = open(tmp, 'w')

        self._tmpfile = tmp

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting the
        # distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now consider
        # each transcript a *child* -- so that relations.parent (on the first
        # line of the query) will be the first-level parent of the transcript
        # (the gene).
        #
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only one
        # is added.  To ensure this, the results are ordered by the gene ID.

        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature,))

        # Now we iterate through those results (using a new cursor) to infer
        # the extent of transcripts and genes.

        last_gene_id = None
        n_features = 0
        for transcript_id, gene_id in c:
            # transcript extent
            c2.execute(
                '''
                SELECT MIN(start), MAX(end), strand, seqid
                FROM features
                JOIN relations ON
                features.id = relations.child
                WHERE parent = ? AND featuretype == ?
                ''', (transcript_id, self.subfeature))
            transcript_start, transcript_end, strand, seqid = c2.fetchone()
            transcript_attributes = {
                self.transcript_key: [transcript_id],
                self.gene_key: [gene_id]
            }
            transcript_bin = bins.bins(
                transcript_start, transcript_end, one=True)

            # Write out to file; we'll be reading it back in shortly.  Omit
            # score, frame, source, and extra since they will always have the
            # same default values (".", ".", "gffutils_derived", and []
            # respectively)

            fout.write('\t'.join(map(str, [
                transcript_id,
                seqid,
                transcript_start,
                transcript_end,
                strand,
                'transcript',
                transcript_bin,
                helpers._jsonify(transcript_attributes)
            ])) + '\n')

            n_features += 1

            # Infer gene extent, but only if we haven't done so already.
            if gene_id != last_gene_id:
                c2.execute(
                    '''
                    SELECT MIN(start), MAX(end), strand, seqid
                    FROM features
                    JOIN relations ON
                    features.id = relations.child
                    WHERE parent = ? AND featuretype == ?
                    ''', (gene_id, self.subfeature))
                gene_start, gene_end, strand, seqid = c2.fetchone()
                gene_attributes = {self.gene_key: [gene_id]}
                gene_bin = bins.bins(gene_start, gene_end, one=True)

                fout.write('\t'.join(map(str, [
                    gene_id,
                    seqid,
                    gene_start,
                    gene_end,
                    strand,
                    'gene',
                    gene_bin,
                    helpers._jsonify(gene_attributes)
                ])) + '\n')

            last_gene_id = gene_id
            n_features += 1

        fout.close()

        def derived_feature_generator():
            """
            Generator of items from the file that was just created...
            """
            keys = ['parent', 'seqid', 'start', 'end', 'strand',
                    'featuretype', 'bin', 'attributes']
            for line in open(fout.name):
                d = dict(list(zip(keys, line.strip().split('\t'))))
                d.pop('parent')
                d['score'] = '.'
                d['source'] = 'gffutils_derived'
                d['frame'] = '.'
                d['extra'] = []
                d['attributes'] = helpers._unjsonify(d['attributes'])
                f = feature.Feature(**d)
                f.id = self._id_handler(f)
                yield f

        # Drop the indexes so the inserts are faster
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('DROP INDEX IF EXISTS relationschild')

        # Insert the just-inferred transcripts and genes.  TODO: should we
        # *always* use "merge" here for the merge_strategy?
        logger.info("Importing inferred features into db")
        last_perc = None
        for i, f in enumerate(derived_feature_generator()):
            perc = int(i / float(n_features) * 100)
            if perc != last_perc:
                sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
                sys.stderr.flush()
            last_perc = perc
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, 'merge')
                c.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

        logger.info("Committing changes")
        self.conn.commit()
        os.unlink(fout.name)
예제 #10
0
파일: create.py 프로젝트: drubin23/gffutils
    def _update_relations(self):

        if not self.infer_gene_extent:
            return

        # TODO: do any indexes speed this up?
        c = self.conn.cursor()
        c2 = self.conn.cursor()

        logger.info("Creating relations(parent) index")
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('CREATE INDEX relationsparent ON relations (parent)')
        logger.info("Creating relations(child) index")
        c.execute('DROP INDEX IF EXISTS relationschild')
        c.execute('CREATE INDEX relationschild ON relations (child)')

        logger.info('Inferring gene and transcript extents, '
                    'and writing to tempfile')
        tmp = tempfile.NamedTemporaryFile(delete=False).name
        tmp = '/tmp/gffutils'
        fout = open(tmp, 'w')

        self._tmpfile = tmp

        # This takes some explanation...
        #
        # First, the nested subquery gets the level-1 parents of
        # self.subfeature featuretypes.  For an on-spec GTF file,
        # self.subfeature = "exon". So this subquery translates to getting the
        # distinct level-1 parents of exons -- which are transcripts.
        #
        # OK, so this first subquery is now a list of transcripts; call it
        # "firstlevel".
        #
        # Then join firstlevel on relations, but the trick is to now consider
        # each transcript a *child* -- so that relations.parent (on the first
        # line of the query) will be the first-level parent of the transcript
        # (the gene).
        #
        #
        # The result is something like:
        #
        #   transcript1     gene1
        #   transcript2     gene1
        #   transcript3     gene2
        #
        # Note that genes are repeated; below we need to ensure that only one
        # is added.  To ensure this, the results are ordered by the gene ID.

        c.execute(
            '''
            SELECT DISTINCT firstlevel.parent, relations.parent
            FROM (
                SELECT DISTINCT parent
                FROM relations
                JOIN features ON features.id = relations.child
                WHERE features.featuretype = ?
                AND relations.level = 1
            )
            AS firstlevel
            JOIN relations ON firstlevel.parent = child
            WHERE relations.level = 1
            ORDER BY relations.parent
            ''', (self.subfeature,))

        # Now we iterate through those results (using a new cursor) to infer
        # the extent of transcripts and genes.

        last_gene_id = None
        n_features = 0
        for transcript_id, gene_id in c:
            # transcript extent
            c2.execute(
                '''
                SELECT MIN(start), MAX(end), strand, seqid
                FROM features
                JOIN relations ON
                features.id = relations.child
                WHERE parent = ? AND featuretype == ?
                ''', (transcript_id, self.subfeature))
            transcript_start, transcript_end, strand, seqid = c2.fetchone()
            transcript_attributes = {
                self.transcript_key: [transcript_id],
                self.gene_key: [gene_id]
            }
            transcript_bin = bins.bins(
                transcript_start, transcript_end, one=True)

            # Write out to file; we'll be reading it back in shortly.  Omit
            # score, frame, source, and extra since they will always have the
            # same default values (".", ".", "gffutils_derived", and []
            # respectively)

            fout.write('\t'.join(map(str, [
                transcript_id,
                seqid,
                transcript_start,
                transcript_end,
                strand,
                'transcript',
                transcript_bin,
                helpers._jsonify(transcript_attributes)
            ])) + '\n')

            n_features += 1

            # Infer gene extent, but only if we haven't done so already.
            if gene_id != last_gene_id:
                c2.execute(
                    '''
                    SELECT MIN(start), MAX(end), strand, seqid
                    FROM features
                    JOIN relations ON
                    features.id = relations.child
                    WHERE parent = ? AND featuretype == ?
                    ''', (gene_id, self.subfeature))
                gene_start, gene_end, strand, seqid = c2.fetchone()
                gene_attributes = {self.gene_key: [gene_id]}
                gene_bin = bins.bins(gene_start, gene_end, one=True)

                fout.write('\t'.join(map(str, [
                    gene_id,
                    seqid,
                    gene_start,
                    gene_end,
                    strand,
                    'gene',
                    gene_bin,
                    helpers._jsonify(gene_attributes)
                ])) + '\n')

            last_gene_id = gene_id
            n_features += 1

        fout.close()

        def derived_feature_generator():
            """
            Generator of items from the file that was just created...
            """
            keys = ['parent', 'seqid', 'start', 'end', 'strand',
                    'featuretype', 'bin', 'attributes']
            for line in open(fout.name):
                d = dict(list(zip(keys, line.strip().split('\t'))))
                d.pop('parent')
                d['score'] = '.'
                d['source'] = 'gffutils_derived'
                d['frame'] = '.'
                d['extra'] = []
                d['attributes'] = helpers._unjsonify(d['attributes'])
                f = feature.Feature(**d)
                f.id = self._id_handler(f)
                yield f

        # Drop the indexes so the inserts are faster
        c.execute('DROP INDEX IF EXISTS relationsparent')
        c.execute('DROP INDEX IF EXISTS relationschild')

        # Insert the just-inferred transcripts and genes.  TODO: should we
        # *always* use "merge" here for the merge_strategy?
        logger.info("Importing inferred features into db")
        last_perc = None
        for i, f in enumerate(derived_feature_generator()):
            perc = int(i / float(n_features) * 100)
            if perc != last_perc:
                sys.stderr.write('%s of %s (%s%%)\r' % (i, n_features, perc))
                sys.stderr.flush()
            last_perc = perc
            try:
                self._insert(f, c)
            except sqlite3.IntegrityError:
                fixed, final_strategy = self._do_merge(f, 'merge')
                c.execute(
                    '''
                    UPDATE features SET attributes = ?
                    WHERE id = ?
                    ''', (helpers._jsonify(fixed.attributes),
                          fixed.id))

        logger.info("Committing changes")
        self.conn.commit()
        os.unlink(fout.name)
예제 #11
0
def make_query(args,
               other=None,
               limit=None,
               strand=None,
               featuretype=None,
               extra=None,
               order_by=None,
               reverse=False,
               completely_within=False):
    """
    Multi-purpose, bare-bones ORM function.

    This function composes queries given some commonly-used kwargs that can be
    passed to FeatureDB methods (like .parents(), .children(), .all_features(),
    .features_of_type()).  It handles, in one place, things like restricting to
    featuretype, limiting to a genomic range, limiting to one strand, or
    returning results ordered by different criteria.

    Additional filtering/subsetting/sorting behavior should be added here.

    (Note: this ended up having better performance (and flexibility) than
    sqlalchemy)

    This function also provides support for additional JOINs etc (supplied via
    the `other` kwarg) and extra conditional clauses (`extra` kwarg).  See the
    `_QUERY` var below for the order in which they are used.

    Parameters
    ----------
    args : list
        Arguments that will ultimately be supplied to the sqlite3 execute
        call.  Must already contain exactly one item per "?" found in `other`
        and `extra`.  This list is extended *in place* as clauses that need
        arguments are added.

    other : string or None
        SQL inserted directly after the SELECT (e.g., a JOIN).  If it contains
        the word "where" (case-insensitive), every later clause is prefixed
        with "AND" instead of "WHERE".

    limit : string, tuple, or None
        Genomic region as "seqid:start-end" or (seqid, start, end).

    strand : string or None
        If given, adds "features.strand = ?".

    featuretype : string, iterable of strings, or None
        A single featuretype, or several that are matched with "IN".

    extra : string or None
        Additional conditional clause placed before the featuretype clause.

    order_by : string, iterable of strings, or None
        Field(s) to order by.  Items of an iterable must be in
        `constants._gffkeys_extra`, or be "file_order" or "length".
        "length" is translated to "(end - start)".  A bare string is *not*
        validated against that list.

    reverse : bool
        If True, order descending ("DESC"); otherwise ascending ("ASC").
        Only used when `order_by` is given.

    completely_within : bool
        If True, `limit` selects features with start >= start and
        end <= end.  Otherwise it selects features with start <= end and
        end >= start.

    Returns
    -------
    (query, args) where `query` is the SQL string and `args` is the same list
    object that was passed in.

    Raises
    ------
    ValueError
        If `len(args)` differs from the number of "?" in `other` + `extra`,
        or if an item of an iterable `order_by` is not a valid key.
    """

    _QUERY = ("{_SELECT} {OTHER} {EXTRA} {FEATURETYPE} "
              "{LIMIT} {STRAND} {ORDER_BY}")

    # Construct a dictionary `d` that will be used later as _QUERY.format(**d).
    # Default is just _SELECT, which returns all records in the features table.
    d = dict(_SELECT=constants._SELECT,
             OTHER=other or "",
             FEATURETYPE="",
             LIMIT="",
             STRAND="",
             ORDER_BY="",
             EXTRA=extra or "")

    # If `other` and `extra` take args (that is, they have "?" in them), then
    # they should have been provided in `args`.
    required_args = (d['EXTRA'] + d['OTHER']).count('?')
    if len(args) != required_args:
        raise ValueError('Not enough args (%s) for subquery' % args)

    # IMPORTANT: the order in which things are processed below is the same as
    # the order of the placeholders in _QUERY, so that args are appended in
    # the same order as their "?" appear in the final query.

    if featuretype:
        # e.g., "featuretype = ?"  or  "featuretype IN (?,?)"
        if isinstance(featuretype, six.string_types):
            d['FEATURETYPE'] = "features.featuretype = ?"
            args.append(featuretype)
        else:
            # Materialize first: a one-shot iterator would otherwise be
            # exhausted by counting placeholders, leaving nothing to bind.
            featuretype = list(featuretype)
            d['FEATURETYPE'] = ("features.featuretype IN  (%s)" %
                                (','.join(["?" for _ in featuretype])))
            args.extend(featuretype)

    if limit:
        # Restrict to a genomic region, using the bins for performance.
        if isinstance(limit, six.string_types):
            seqid, startstop = limit.split(':')
            start, end = startstop.split('-')
        else:
            seqid, start, end = limit

        # Identify possible bins
        _bins = bins.bins(int(start), int(end), one=False)

        # Use different overlap conditions
        if completely_within:
            d['LIMIT'] = ("features.seqid = ? AND features.start >= ? "
                          "AND features.end <= ?")
            args.extend([seqid, start, end])

        else:
            d['LIMIT'] = ("features.seqid = ? AND features.start <= ? "
                          "AND features.end >= ?")
            # Note order (end, start)
            args.extend([seqid, end, start])

        # Add bin clause. See issue #45.  For very large bin lists the clause
        # is skipped; the coordinate predicates above remain in the query.
        if len(_bins) < 900:
            d['LIMIT'] += " AND features.bin IN (%s)" % (','.join(
                map(str, _bins)))

    if strand:
        # e.g., "strand = '+'"
        d['STRAND'] = "features.strand = ?"
        args.append(strand)

    # TODO: implement file_order!
    valid_order_by = constants._gffkeys_extra + ['file_order', 'length']
    if order_by:
        # Default (no order_by) is essentially random order.
        #
        # e.g. "ORDER BY seqid, start DESC"
        if isinstance(order_by, six.string_types):
            # NOTE: a bare string is interpolated into the SQL without being
            # checked against `valid_order_by` (long-standing behavior), so it
            # must not come from untrusted input.
            order_keys = [order_by]
        else:
            order_keys = []
            for k in order_by:
                if k not in valid_order_by:
                    raise ValueError("%s not a valid order-by value in %s" %
                                     (k, valid_order_by))
                order_keys.append(k)

        # There's no length field, so order by end - start.  This applies to
        # the bare-string form too; previously order_by='length' produced
        # "ORDER BY length", which names no column.
        order_keys = ['(end - start)' if k == 'length' else k
                      for k in order_keys]

        direction = 'DESC' if reverse else 'ASC'
        d['ORDER_BY'] = 'ORDER BY %s %s' % (','.join(order_keys), direction)

    # Ensure only one "WHERE" is included; the rest get "AND ".
    where = "where" in d['OTHER'].lower()
    for i in ['EXTRA', 'FEATURETYPE', 'LIMIT', 'STRAND']:
        if d[i]:
            if not where:
                d[i] = "WHERE " + d[i]
                where = True
            else:
                d[i] = "AND " + d[i]

    return _QUERY.format(**d), args
예제 #12
0
파일: interface.py 프로젝트: daler/gffutils
    def interfeatures(self, features, new_featuretype=None,
                      merge_attributes=True, dialect=None,
                      attribute_func=None, update_attributes=None):
        """
        Construct new features representing the space between features.

        For example, if `features` is a list of exons, then this method will
        return the introns.  If `features` is a list of genes, then this method
        will return the intergenic regions.

        N consecutive features on the same chromosome yield N - 1 new
        features.  No feature is created across a change of chromosome.

        This method purposefully does *not* do any merging or sorting of
        coordinates, so you may want to use :meth:`FeatureDB.merge` first, or
        when selecting features use the `order_by` kwarg, e.g.,
        `db.features_of_type('gene', order_by=('seqid', 'start'))`.

        Parameters
        ----------
        features : iterable of :class:`feature.Feature` instances
            Sorted, merged iterable

        new_featuretype : string or None
            The new features will all be of this type, or, if None (default)
            then each featuretype is constructed from the two neighboring
            features, e.g., `inter_exon_exon`.

        merge_attributes : bool
            If True, each new feature's attributes are the result of
            `helpers.merge_attributes` applied to the two neighboring
            features' attributes.  If False, the new feature starts with an
            empty attributes dictionary.

        dialect : dict or None
            Accepted for API compatibility.  It is currently not forwarded to
            `self._feature_returner`.

        attribute_func : callable or None
            Accepted for API compatibility.  It is currently not applied by
            this implementation.

        update_attributes : dict
            After attributes have been merged, this dictionary is used to
            update (and thereby replace parts of) the attributes dictionary.

        Returns
        -------
        A generator that yields :class:`Feature` objects
        """
        last_feature = None
        for f in features:
            # There is no inter-feature before the very first feature.
            # Likewise, when we've moved to a new chromosome (e.g., getting
            # intergenic regions from all genes) we don't complain -- we just
            # restart from this feature so that the next gap is measured from
            # *its* end rather than from a coordinate on the old chromosome.
            if last_feature is None or last_feature.chrom != f.chrom:
                last_feature = f
                continue

            # Name each gap after its own pair of neighbors; keep the
            # caller's `new_featuretype` untouched so one pair's name does not
            # leak onto later pairs.
            if new_featuretype is None:
                featuretype = 'inter_%s_%s' % (
                    last_feature.featuretype, f.featuretype)
            else:
                featuretype = new_featuretype

            # Neighbors on different strands give an unstranded gap.
            if last_feature.strand != f.strand:
                strand = '.'
            else:
                strand = f.strand

            # Shrink by one on each side so the gap excludes the flanking
            # features' own boundary positions.
            interfeature_start = last_feature.stop + 1
            interfeature_stop = f.start - 1

            if merge_attributes:
                new_attributes = helpers.merge_attributes(
                    last_feature.attributes, f.attributes)
            else:
                new_attributes = {}

            if update_attributes:
                new_attributes.update(update_attributes)

            fields = dict(
                seqid=last_feature.chrom,
                source='gffutils_derived',
                featuretype=featuretype,
                start=interfeature_start,
                end=interfeature_stop,
                score='.',
                strand=strand,
                frame='.',
                attributes=new_attributes,
                bin=bins.bins(
                    interfeature_start, interfeature_stop, one=True))

            yield self._feature_returner(**fields)

            # Advance, so the next gap is built from this feature's end,
            # strand, featuretype and attributes.
            last_feature = f
예제 #13
0
파일: interface.py 프로젝트: daler/gffutils
    def region(self, region=None, seqid=None, start=None, end=None,
               strand=None, featuretype=None, completely_within=False):
        """
        Return features within specified genomic coordinates.

        Specifying genomic coordinates can be done in a flexible manner

        Parameters
        ----------
        region : string, tuple, or Feature instance
            If string, then of the form "seqid", "seqid:start-end" or
            "seqid:start-end:strand".  If tuple, then (seqid, start, end).
            If :class:`Feature`, then use the feature's seqid, start, and end
            values.

            This argument is mutually exclusive with start/end/seqid.

            *Note*: if a :class:`Feature` is provided, its strand is only
            used when the `strand` kwarg is None; an explicit `strand` kwarg
            takes precedence.  A strand given as the third token of a string
            region replaces the `strand` kwarg.

        strand : + | - | . | None
            If `strand` is provided, then only those features exactly matching
            `strand` will be returned. So `strand='.'` will only return
            unstranded features. Default is `strand=None` which does not
            restrict by strand.

        seqid, start, end
            Mutually exclusive with `region`.  These kwargs can be used to
            approximate slice notation; see "Notes" section below.

        featuretype : None, string, or iterable
            If not None, then restrict output.  If string, then only report
            that feature type.  If iterable, then report all featuretypes in
            the iterable.

        completely_within : bool
            By default (`completely_within=False`), returns features that
            partially or completely overlap `region`.  If
            `completely_within=True`, features that are completely within
            `region` will be returned.

        Notes
        -----

        The meaning of `seqid`, `start`, and `end` is interpreted as follows:

        ====== ====== ===== ======================================
        seqid  start  end   meaning
        ====== ====== ===== ======================================
        str    int    int   equivalent to `region` kwarg
        None   int    int   features from all chroms within coords
        str    None   int   equivalent to [:end] slice notation
        str    int    None  equivalent to [start:] slice notation
        None   None   None  equivalent to FeatureDB.all_features()
        ====== ====== ===== ======================================

        If performance is a concern, use `completely_within=True`. This allows
        the query to be optimized by only looking for features that fall in the
        precise genomic bin (same strategy as UCSC Genome Browser and
        BEDTools). Otherwise all features' start/stop coords need to be
        searched to see if they partially overlap the region of interest.

        Examples
        --------

        - `region(seqid="chr1", start=1000)` returns all features on chr1 that
          start or extend past position 1000

        - `region(seqid="chr1", start=1000, completely_within=True)` returns
          all features on chr1 that start past position 1000.

        - `region("chr1:1-100", strand="+", completely_within=True)` returns
          only plus-strand features that completely fall within positions 1 to
          100 on chr1.

        Returns
        -------
        A generator object that yields :class:`Feature` objects.

        Raises
        ------
        ValueError
            If `region` is supplied together with `seqid`, `start` or `end`.
            Because this is a generator, the error surfaces on first
            iteration.
        """
        # Argument handling.
        if region is not None:
            if (seqid is not None) or (start is not None) or (end is not None):
                raise ValueError(
                    "If region is supplied, do not supply seqid, "
                    "start, or end as separate kwargs")
            if isinstance(region, six.string_types):
                toks = region.split(':')
                if len(toks) == 1:
                    seqid = toks[0]
                    start, end = None, None
                else:
                    seqid, coords = toks[:2]
                    if len(toks) == 3:
                        strand = toks[2]
                    start, end = coords.split('-')

            elif isinstance(region, Feature):
                seqid = region.seqid
                start = region.start
                end = region.end
                # Don't let the feature's strand clobber an explicit `strand`
                # kwarg; only fall back to it when none was given.
                if strand is None:
                    strand = region.strand

            # otherwise assume it's a tuple
            else:
                seqid, start, end = region[:3]

        # e.g.,
        #   completely_within=True..... start >= {start} AND end <= {end}
        #   completely_within=False.... start <  {end}   AND end >  {start}
        #
        # `start_bound` is the value compared against the `start` column and
        # `end_bound` the value compared against the `end` column; for
        # overlap queries these are the requested end and start respectively.
        if completely_within:
            start_op, end_op = '>=', '<='
            start_bound, end_bound = start, end
        else:
            start_op, end_op = '<', '>'
            start_bound, end_bound = end, start

        # Every entry of `conditions` is a complete boolean expression; they
        # are AND-ed together at the end.  `args` grows in the same order.
        conditions = []
        args = []
        if seqid is not None:
            conditions.append('seqid = ?')
            args.append(seqid)
        if start_bound is not None:
            start_bound = int(start_bound)
            conditions.append('start %s ?' % start_op)
            args.append(start_bound)
        if end_bound is not None:
            end_bound = int(end_bound)
            conditions.append('end %s ?' % end_op)
            args.append(end_bound)

        # Only use bins if we have defined boundaries and completely_within is
        # True. Otherwise you can't know how far away a feature stretches
        # (which means bins are not computable ahead of time)
        if (completely_within and start_bound is not None
                and end_bound is not None):
            if (start_bound <= bins.MAX_CHROM_SIZE
                    and end_bound <= bins.MAX_CHROM_SIZE):
                _bins = list(bins.bins(start_bound, end_bound, one=False))
                # See issue #45: keep the number of placeholders bounded.
                if 0 < len(_bins) < 900:
                    conditions.append(
                        '( %s )' % ' or '.join(['bin = ?'] * len(_bins)))
                    args += _bins

        # Add the featuretype clause
        if featuretype is not None:
            if isinstance(featuretype, six.string_types):
                featuretype = [featuretype]
            else:
                # Materialize so a one-shot iterator isn't exhausted by
                # counting placeholders before its values are bound.
                featuretype = list(featuretype)
            conditions.append(
                '(%s)' % ' or '.join(['featuretype = ?'] * len(featuretype)))
            args.extend(featuretype)

        if strand is not None:
            conditions.append('strand = ?')
            args.append(strand)

        # Only emit WHERE when there is something to filter on; with no
        # restrictions at all this selects every feature instead of producing
        # a dangling "WHERE".
        query = constants._SELECT
        if conditions:
            query = '%s WHERE %s' % (query, ' AND '.join(conditions))

        c = self.conn.cursor()
        self._last_query = query
        self._last_args = args
        # Records the bounds as applied to the start/end columns (i.e.
        # swapped relative to the request for overlap queries), as before.
        self._context = {
            'start': start_bound,
            'end': end_bound,
            'seqid': seqid,
            'region': region,
        }
        c.execute(query, tuple(args))
        for i in c:
            yield self._feature_returner(**i)
예제 #14
0
    def region(self, region, featuretype=None, completely_within=False):
        """
        Return features with any part overlapping `region`.

        Parameters
        ----------
        region : string, tuple, or Feature instance
            If string, then of the form "seqid:start-end", optionally followed
            by ":strand".  If tuple, then (seqid, start, end), optionally
            with strand as a fourth item.  If :class:`Feature`, then use the
            feature's seqid, start, end and strand values.

        featuretype : None, string, or iterable
            If not None, then restrict output.  If string, then only report
            that feature type.  If iterable, then report all featuretypes in
            the iterable.

        completely_within : bool
            If False (default), returns features that overlap `region`, even
            partially.  If True, only return features that are completely
            within `region`.

        Returns
        -------
        A generator object that yields the objects built by
        `self._feature_returner` for each matching row.
        """
        strand = None
        if isinstance(region, six.string_types):
            toks = region.split(':')
            seqid, coords = toks[:2]
            if len(toks) == 3:
                strand = toks[2]
            start, end = coords.split('-')

        elif isinstance(region, Feature):
            seqid = region.seqid
            start = region.start
            end = region.end
            strand = region.strand
        else:
            seqid, start, end = region[:3]
            if len(region) == 4:
                strand = region[3]

        # Coordinates parsed from a string are text; convert once so the bins
        # and the bound query parameters both use integers.
        start, end = int(start), int(end)

        if completely_within:
            position_clause = 'start >= ? AND end <= ?'
            args = [seqid, start, end]
        else:
            position_clause = 'start < ? AND end > ?'
            # note start/end swap
            args = [seqid, end, start]

        query = ' '.join([
            constants._SELECT,
            'WHERE seqid = ? AND', position_clause])

        # Get a list of all possible bins for this region.  Each bin is bound
        # as its own "?" placeholder, so cap how many are used: a very large
        # region would otherwise exceed the number of host parameters SQLite
        # accepts.  When the bin clause is skipped, the coordinate predicates
        # above still restrict the result.
        _bins = list(bins.bins(start, end, one=False))
        if 0 < len(_bins) < 900:
            query += ' AND ( %s )' % ' or '.join(['bin = ?'] * len(_bins))
            args += _bins

        # Add the featuretype clause
        if featuretype is not None:
            if isinstance(featuretype, six.string_types):
                featuretype = [featuretype]
            else:
                # Materialize so a one-shot iterator isn't exhausted by
                # counting placeholders before its values are bound.
                featuretype = list(featuretype)
            feature_clause = ' or '.join(
                ['featuretype = ?' for _ in featuretype])
            query += ' AND (%s) ' % feature_clause
            args.extend(featuretype)

        if strand is not None:
            strand_clause = ' and strand = ? '
            query += strand_clause
            args.append(strand)

        c = self.conn.cursor()
        c.execute(query, tuple(args))
        for i in c:
            yield self._feature_returner(**i)
예제 #15
0
    def interfeatures(self, features, new_featuretype=None,
                      merge_attributes=True, dialect=None):
        """
        Construct new features representing the space between features.

        For example, if `features` is a list of exons, then this method will
        return the introns.  If `features` is a list of genes, then this method
        will return the intergenic regions.

        Providing N features will return N - 1 new features.

        This method purposefully does *not* do any merging or sorting of
        coordinates, so you may want to use :meth:`FeatureDB.merge` first.

        All features must share the same strand and the same chromosome;
        this is checked with assertions.

        Parameters
        ----------
        features : iterable of :class:`feature.Feature` instances
            Sorted, merged iterable

        new_featuretype : string or None
            The new features will all be of this type, or, if None (default)
            then each featuretype is constructed from the two neighboring
            features, e.g., `inter_exon_exon`.

        merge_attributes : bool
            If True (default), each new feature's attributes are the result
            of `helpers.merge_attributes` applied to the two neighboring
            features' attributes.  This is useful if you have provided a list
            of exons; the introns will then retain the transcript and/or gene
            parents.  If False, the new feature gets an empty attributes
            dictionary.

        dialect : dict or None
            Accepted for API compatibility.  It is currently not forwarded to
            `self._feature_returner`.

        Returns
        -------
        A generator that yields the objects built by
        `self._feature_returner`.
        """
        last_feature = None
        for f in features:
            # no inter-feature for the first one
            if last_feature is None:
                last_feature = f
                continue

            # Name each gap after its own pair of neighbors; keep the
            # caller's `new_featuretype` untouched so one pair's name does not
            # leak onto later pairs.
            if new_featuretype is None:
                featuretype = 'inter_%s_%s' % (
                    last_feature.featuretype, f.featuretype)
            else:
                featuretype = new_featuretype

            # Kept as assertions so the exception type seen by callers is
            # unchanged.
            assert last_feature.strand == f.strand, \
                "features must all be on the same strand"
            assert last_feature.chrom == f.chrom, \
                "features must all be on the same chromosome"

            # Shrink by one on each side so the gap excludes the flanking
            # features' own boundary positions.
            interfeature_start = last_feature.stop + 1
            interfeature_stop = f.start - 1

            # Honor the `merge_attributes` flag rather than always merging.
            if merge_attributes:
                new_attributes = helpers.merge_attributes(
                    last_feature.attributes, f.attributes)
            else:
                new_attributes = {}

            fields = dict(
                seqid=last_feature.chrom,
                source='gffutils_derived',
                featuretype=featuretype,
                start=interfeature_start,
                end=interfeature_stop,
                score='.',
                strand=last_feature.strand,
                frame='.',
                attributes=new_attributes,
                bin=bins.bins(
                    interfeature_start, interfeature_stop, one=True))

            yield self._feature_returner(**fields)

            # Advance, so the next gap is built from this feature's end,
            # featuretype and attributes rather than the first feature's.
            last_feature = f
예제 #16
0
def make_query(args, other=None, limit=None, strand=None, featuretype=None,
               extra=None, order_by=None, reverse=False,
               completely_within=False):
    """
    Compose a SQL query string and its arguments from common kwargs.

    This function composes queries given some commonly-used kwargs that can be
    passed to FeatureDB methods (like .parents(), .children(), .all_features(),
    .features_of_type()).  It handles, in one place, things like restricting to
    featuretype, limiting to a genomic range, limiting to one strand, or
    returning results ordered by different criteria.

    Additional filtering/subsetting/sorting behavior should be added here.

    (Note: this ended up having better performance (and flexibility) than
    sqlalchemy)

    This function also provides support for additional JOINs etc (supplied via
    the `other` kwarg) and extra conditional clauses (`extra` kwarg).  See the
    `_QUERY` var below for the order in which they are used.

    Parameters
    ----------
    args : list
        Arguments that will ultimately be supplied to the sqlite3 execute
        call.  Must already contain exactly one item per "?" found in `other`
        and `extra`.  This list is extended *in place* as clauses that need
        arguments are added.

    other : string or None
        SQL inserted directly after the SELECT (e.g., a JOIN).  If it contains
        the word "where" (case-insensitive), every later clause is prefixed
        with "AND" instead of "WHERE".

    limit : string, tuple, or None
        Genomic region as "seqid:start-end" or (seqid, start, end).

    strand : string or None
        If given, adds "features.strand = ?".

    featuretype : string, iterable of strings, or None
        A single featuretype, or several that are matched with "IN".

    extra : string or None
        Additional conditional clause placed before the featuretype clause.

    order_by : string, iterable of strings, or None
        Field(s) to order by.  Items of an iterable must be in
        `constants._gffkeys_extra`, or be "file_order" or "length".
        "length" is translated to "(end - start)".  A bare string is *not*
        validated against that list.

    reverse : bool
        If True, order descending ("DESC"); otherwise ascending ("ASC").
        Only used when `order_by` is given.

    completely_within : bool
        If True, `limit` selects features with start >= start and
        end <= end.  Otherwise it selects features with start <= end and
        end >= start.

    Returns
    -------
    (query, args) where `query` is the SQL string and `args` is the same list
    object that was passed in.

    Raises
    ------
    ValueError
        If `len(args)` differs from the number of "?" in `other` + `extra`,
        or if an item of an iterable `order_by` is not a valid key.
    """

    _QUERY = ("{_SELECT} {OTHER} {EXTRA} {FEATURETYPE} "
              "{LIMIT} {STRAND} {ORDER_BY}")

    # Construct a dictionary `d` that will be used later as _QUERY.format(**d).
    # Default is just _SELECT, which returns all records in the features table.
    d = dict(_SELECT=constants._SELECT, OTHER="", FEATURETYPE="", LIMIT="",
             STRAND="", ORDER_BY="", EXTRA="")

    if other:
        d['OTHER'] = other
    if extra:
        d['EXTRA'] = extra

    # If `other` and `extra` take args (that is, they have "?" in them), then
    # they should have been provided in `args`.
    required_args = (d['EXTRA'] + d['OTHER']).count('?')
    if len(args) != required_args:
        raise ValueError('Not enough args (%s) for subquery' % args)

    # IMPORTANT: the order in which things are processed below is the same as
    # the order of the placeholders in _QUERY, so that args are appended in
    # the same order as their "?" appear in the final query.

    if featuretype:
        # e.g., "featuretype = ?"  or  "featuretype IN (?,?)"
        if isinstance(featuretype, six.string_types):
            d['FEATURETYPE'] = "features.featuretype = ?"
            args.append(featuretype)
        else:
            # Materialize first: a one-shot iterator would otherwise be
            # exhausted by counting placeholders, leaving nothing to bind.
            featuretype = list(featuretype)
            d['FEATURETYPE'] = (
                "features.featuretype IN  (%s)"
                % (','.join(["?" for _ in featuretype]))
            )
            args.extend(featuretype)

    if limit:
        # Restrict to a genomic region, using the bins for performance.
        if isinstance(limit, six.string_types):
            seqid, startstop = limit.split(':')
            start, end = startstop.split('-')
        else:
            seqid, start, end = limit

        # Identify possible bins
        _bins = bins.bins(int(start), int(end), one=False)

        # Use different overlap conditions
        if completely_within:
            d['LIMIT'] = (
                "features.seqid = ? AND features.start >= ? "
                "AND features.end <= ?"
            )
            args.extend([seqid, start, end])

        else:
            d['LIMIT'] = (
                "features.seqid = ? AND features.start <= ? "
                "AND features.end >= ?"
            )
            # Note order (end, start)
            args.extend([seqid, end, start])

        # Add bin clause, but only while the list stays reasonably small; for
        # regions spanning very many bins the clause is skipped and the
        # coordinate predicates above remain in the query.
        if len(_bins) < 900:
            d['LIMIT'] += (
                " AND features.bin IN (%s)" % (','.join(map(str, _bins))))

    if strand:
        # e.g., "strand = '+'"
        d['STRAND'] = "features.strand = ?"
        args.append(strand)

    # TODO: implement file_order!
    valid_order_by = constants._gffkeys_extra + ['file_order', 'length']
    if order_by:
        # Default (no order_by) is essentially random order.
        #
        # e.g. "ORDER BY seqid, start DESC"
        if isinstance(order_by, six.string_types):
            # NOTE: a bare string is interpolated into the SQL without being
            # checked against `valid_order_by` (long-standing behavior), so it
            # must not come from untrusted input.
            order_keys = [order_by]
        else:
            order_keys = []
            for k in order_by:
                if k not in valid_order_by:
                    raise ValueError("%s not a valid order-by value in %s"
                                     % (k, valid_order_by))
                order_keys.append(k)

        # There's no length field, so order by end - start.  This applies to
        # the bare-string form too; previously order_by='length' produced
        # "ORDER BY length", which names no column.
        order_keys = ['(end - start)' if k == 'length' else k
                      for k in order_keys]

        direction = 'DESC' if reverse else 'ASC'
        d['ORDER_BY'] = 'ORDER BY %s %s' % (','.join(order_keys), direction)

    # Ensure only one "WHERE" is included; the rest get "AND ".
    where = "where" in d['OTHER'].lower()
    for i in ['EXTRA', 'FEATURETYPE', 'LIMIT', 'STRAND']:
        if d[i]:
            if not where:
                d[i] = "WHERE " + d[i]
                where = True
            else:
                d[i] = "AND " + d[i]

    return _QUERY.format(**d), args
예제 #17
0
    def __init__(self, seqid=".", source=".", featuretype=".",
                 start=".", end=".", score=".", strand=".", frame=".",
                 attributes=None, extra=None, bin=None, id=None, dialect=None,
                 file_order=None, keep_order=False, sort_attribute_values=False):
        """
        Build a feature record, normalizing the raw field values on the way in.

        The constructor accepts the fields in several interchangeable forms
        (placeholders, strings, already-parsed containers) and stores a
        canonical version of each on the instance.  It is mostly intended to
        be called by FeatureDB machinery; for everyday use the
        :func:`feature_from_line` function is the recommended entry point.

        Parameters
        ----------

        seqid : string
            Sequence name (often a chromosome).

        source : string
            Where the feature came from, e.g. the originating database or the
            program that predicted it.

        featuretype : string
            Kind of feature, such as "gene", "exon" or "TSS".

        start, end : int-like, "." or None
            1-based coordinates, with start expected to be <= end.  The "."
            placeholder (the default) and None are both stored as None; any
            other value is passed through `int()`.

        score : string
            Kept as given; no numeric conversion is applied here.

        strand : "+" | "-" | "."
            Strand of the feature; "." when strand is not relevant.

        frame : "0" | "1" | "2"
            Coding frame.  0 is in-frame; 1 means one extra base precedes the
            first codon; 2 means two extra bases precede it.  "Beginning" is
            strand specific: for a minus-strand feature it is the end
            coordinate.  Defaults to the "." placeholder.

        attributes : string, dict or None
            A string is first tried as serialized JSON; if decoding fails it
            is parsed as the original key/vals attribute string, and the
            dialect detected during that parse is adopted when no `dialect`
            was supplied.  A non-empty dictionary is used as-is.  Empty or
            missing values become a new, empty `dict_class` instance.

            Either way, the `attributes` attribute of the instance ends up
            being a dictionary.

        extra : string, list or None
            Fields beyond the nine canonical GFF/GTF columns.  A string is
            first tried as serialized JSON and otherwise split on tabs; a
            non-empty list is used as-is; empty or missing values become a
            new empty list.

        bin : int or None
            UCSC genomic bin.  When None it is computed from the normalized
            start/end; if that computation raises TypeError (for instance
            because a coordinate is None) the bin is left as None.

        id : None or string
            Database-specific primary key.  Expected to be set only for
            features that were read back from a database.

        dialect : dict or None
            Dialect used when rebuilding the attribute string.  If nothing is
            supplied and nothing was inferred from `attributes`, the default
            `constants.dialect` is used.

        file_order : int
            The sqlite3 `rowid` of the feature, as supplied by FeatureDB.

        keep_order : bool
            Whether printed attributes should follow the order given in the
            dialect.  Off by default because the sorting is costly across
            many features.

        sort_attribute_values : bool
            Whether the values of each attribute should be sorted on
            printing.  Mainly useful for tests that compare against fixed
            expected output; off by default for the same cost reason.

        """
        # Coordinates: "." and None both mean "not available" and are stored
        # as None; everything else is coerced to an integer.
        start = None if start == "." or start is None else int(start)
        end = None if end == "." or end is None else int(end)

        # Attributes always end up as a dictionary.  `dict_class` is a
        # module-level hook so the mapping implementation can be swapped
        # (ordered, defaultdict, ...) for testing.
        if not attributes:
            attributes = dict_class()

        if isinstance(attributes, six.string_types):
            try:
                # First guess: the string is a JSON-serialized mapping.
                attributes = helpers._unjsonify(attributes, isattributes=True)
            except simplejson.JSONDecodeError:
                # Not JSON, so treat it as the raw attributes column and
                # parse it; this also reports the dialect it was written in.
                attributes, inferred_dialect = parser._split_keyvals(
                    attributes)

                # An explicitly supplied dialect takes precedence.
                if not dialect:
                    dialect = inferred_dialect

        # Extra fields always end up as a list: JSON first, and if that does
        # not decode, fall back to splitting a tab-delimited string.
        if not extra:
            extra = []

        if isinstance(extra, six.string_types):
            try:
                extra = helpers._unjsonify(extra)
            except simplejson.JSONDecodeError:
                extra = extra.split('\t')

        # Only derive the bin when the caller did not supply one.
        if bin is None:
            try:
                bin = bins.bins(start, end, one=True)
            except TypeError:
                # The bin could not be computed from these coordinates;
                # leave it as None.
                pass

        # The canonical GFF/GTF columns.
        self.seqid = seqid
        self.source = source
        self.featuretype = featuretype
        self.start = start
        self.end = end
        self.score = score
        self.strand = strand
        self.frame = frame
        self.attributes = attributes

        # Bookkeeping and output options.
        self.extra = extra
        self.bin = bin
        self.id = id
        self.dialect = dialect if dialect else constants.dialect
        self.file_order = file_order
        self.keep_order = keep_order
        self.sort_attribute_values = sort_attribute_values