示例#1
0
def missing_tags(record):
    """Report coding genes whose first CDS carries no ``product`` qualifier.

    Genes without any CDS sub-feature are logged and skipped; they count
    as neither good nor bad.

    Returns a tuple ``(good, bad, results, qc_features)`` where ``results``
    holds the offending CDS features and ``qc_features`` the matching
    markers built by ``gen_qc_feature``.
    """
    flagged = []
    markers = []
    n_good = 0
    n_bad = 0

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            log.warn("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue

        # Only the first CDS of each gene is inspected.
        first_cds = cds_list[0]
        if "product" in first_cds.qualifiers:
            n_good += 1
            continue

        log.info("Missing product tag on %s", get_gff3_id(gene))
        marker = gen_qc_feature(
            first_cds.location.start,
            first_cds.location.end,
            "Missing product tag",
            strand=first_cds.strand,
        )
        markers.append(marker)
        flagged.append(first_cds)
        n_bad += 1

    return n_good, n_bad, flagged, markers
示例#2
0
def excessive_overlap(record, excess=15, excess_divergent=30):
    """
    Find excessive overlaps between the CDSs of coding genes.

    Every unordered pair of coding genes is compared using the first CDS
    sub-feature of each gene. An overlap is excessive when it spans at
    least ``excess`` bases for CDSs on the same strand, or at least
    ``excess_divergent`` bases for CDSs on different strands.

    Genes without a CDS sub-feature are logged once and left out of the
    comparison, but they still count towards the gene total used for
    ``good``.

    Both thresholds are expected to be positive: the penalty added to
    ``bad`` is the overlap length divided by the smaller threshold.

    Returns a tuple ``(good, bad, results, qc_features)``. Each entry of
    ``results`` is ``(gene_a, gene_b, first, last)`` where ``first`` and
    ``last`` are the first and last overlapping positions (inclusive).
    """
    results = []
    bad = 0
    qc_features = []

    # Materialise once: needed for both the pairwise scan and the total.
    gene_list = list(coding_genes(record.features))

    # Pick the first CDS of every gene up front instead of once per pair.
    # TODO: not recursive.
    with_cds = []
    for gene in gene_list:
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            log.warning("Gene missing subfeatures; %s", get_gff3_id(gene))
            continue
        with_cds.append((gene, cds_list[0]))

    for (gene_a, cds_a), (gene_b, cds_b) in itertools.combinations(with_cds, 2):
        # The shared region of two intervals is [max(starts), min(ends));
        # this avoids building a set of every covered position.
        overlap_start = max(int(cds_a.location.start),
                            int(cds_b.location.start))
        overlap_end = min(int(cds_a.location.end), int(cds_b.location.end))
        overlap = overlap_end - overlap_start
        if overlap <= 0:
            # Disjoint CDSs can never be an excessive overlap.
            continue

        same_strand = cds_a.location.strand == cds_b.location.strand
        threshold = excess if same_strand else excess_divergent
        if overlap < threshold:
            continue

        bad += float(overlap) / float(min(excess, excess_divergent))
        first = overlap_start
        last = overlap_end - 1
        qc_features.append(
            gen_qc_feature(first, last, "Excessive Overlap", id_src=gene_a))
        results.append((gene_a, gene_b, first, last))

    # Good isn't accurate here. It's a triangle number and just ugly, but we
    # don't care enough to fix it.
    good = max(int(len(gene_list) - bad), 0)
    return good, int(bad), results, qc_features
def bad_gene_model(record):
    """Check that each coding gene's exons and CDSs agree with each other.

    For every coding gene, exons longer than 10 and all CDS sub-features
    are collected. A gene is reported when:

    * it has no such exon or no CDS (logged; no QC marker is created),
    * the number of exons differs from the number of CDSs, or
    * after sorting both by start, a paired exon and CDS differ in length.

    Returns a tuple ``(good, bad, results, qc_features)``. Each entry of
    ``results`` is ``(gene_id, exon_or_None, cds_or_None, message)``.

    NOTE(review): the returned bad count is ``len(results) + bad``. Every
    increment of ``bad`` is paired with an entry in ``results``, so those
    findings are counted twice -- confirm whether that weighting is intended.
    """
    mismatch_msg = 'Mismatched number of exons and CDSs in gff3 representation'
    length_msg = 'CDS does not extend to full length of gene'

    def by_start(feature):
        return feature.location.start

    findings = []
    markers = []
    n_good = 0
    n_bad = 0

    for gene in coding_genes(record.features):
        exons = [
            exon for exon in genes(gene.sub_features, feature_type='exon')
            if len(exon) > 10
        ]
        CDSs = list(genes(gene.sub_features, feature_type='CDS'))

        if not exons or not CDSs:
            log.warn("Could not handle %s, %s", exons, CDSs)
            summary = '{0} exons, {1} CDSs'.format(len(exons), len(CDSs))
            findings.append((get_gff3_id(gene), None, None, summary))
            continue

        if len(exons) != len(CDSs):
            findings.append((get_gff3_id(gene), None, None, mismatch_msg))
            markers.append(gen_qc_feature(
                gene.location.start, gene.location.end,
                mismatch_msg,
                strand=gene.strand,
                id_src=gene
            ))
            n_bad += 1
            continue

        # Same count: pair exons with CDSs in order of start coordinate.
        pairs = zip(sorted(exons, key=by_start), sorted(CDSs, key=by_start))
        for exon, cds in pairs:
            if len(exon) == len(cds):
                n_good += 1
                continue

            findings.append((get_gff3_id(gene), exon, cds, length_msg))
            markers.append(gen_qc_feature(
                exon.location.start, exon.location.end,
                length_msg,
                strand=exon.strand,
                id_src=gene
            ))
            n_bad += 1

    return n_good, len(findings) + n_bad, findings, markers
def weird_starts(record):
    """Flag coding genes whose first CDS begins with an unusual start codon.

    For each coding gene the first CDS sub-feature is extracted from
    ``record.seq``. Its first three and last three bases are stored on the
    CDS feature as ``__start`` and ``__stop``. Any start codon other than
    ATG, TTG or GTG is reported, and the CDS additionally gets an
    ``__error`` message.

    Genes without a CDS sub-feature are logged and skipped.

    Returns a tuple ``(good, bad, results, qc_features, overall)`` where
    ``results`` holds the flagged CDS features and ``overall`` maps every
    start codon seen to the number of CDSs using it.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []
    overall = {}

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type='CDS'))
        if not cds_list:
            log.warning("No CDS for gene %s", get_gff3_id(gene))
            continue
        seq = cds_list[0]

        seq_str = str(seq.extract(record.seq))
        start_codon = seq_str[0:3]
        # Slice rather than index: seq_str[-3] is a single base, and it
        # raises IndexError for sequences shorter than three bases.
        stop_codon = seq_str[-3:]
        seq.__start = start_codon
        seq.__stop = stop_codon
        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon in ('ATG', 'TTG', 'GTG'):
            good += 1
            continue

        log.warning("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
        seq.__error = 'Unusual start codon %s' % start_codon

        # Mark the three bases at the 5' end of the CDS. On the reverse
        # strand the coordinates are handed over in descending order, as
        # before -- assumes gen_qc_feature copes with that; TODO confirm.
        if seq.strand > 0:
            s = seq.location.start
            e = seq.location.start + 3
        else:
            s = seq.location.end
            e = seq.location.end - 3

        results.append(seq)
        qc_features.append(gen_qc_feature(
            s, e,
            'Weird start codon',
            strand=seq.strand,
            id_src=gene
        ))
        bad += 1

    return good, bad, results, qc_features, overall
示例#5
0
    def featureBox(self, feature, rowData, class_group, row):
        """Build the SVG rectangle that represents ``feature`` on ``row``.

        The horizontal position and width scale the feature's offset from
        the row start, and its length, by ``calc_width`` over
        ``_internal_maxrowlength``. The vertical position starts from the
        row's baseline and is shifted by strand (only when
        ``separate_strands`` is set) and by a three-way lane derived from
        the start coordinate and strand.

        ``class_group`` is accepted but not used here.

        Returns a tuple ``(rect, x, y, w, h)``.
        """
        loc = feature.location
        box_height = 15

        left = (self.calc_width * (loc.start - rowData[row]["start"]) /
                self._internal_maxrowlength + self.x_offset)
        top = (row - 1) * self.ils + self.y_offset - box_height / 2

        span = abs(loc.end - loc.start)
        box_width = float(self.calc_width * span) / self._internal_maxrowlength

        if self.separate_strands:
            top += -30 * loc.strand

        # Alternate the lane according to frame.
        lane = (loc.start - 2 * loc.strand + 1) % 3
        top += 10 * lane - 10 * loc.strand

        rect = self.svg.rect(
            id=base64.b32encode(get_gff3_id(feature.feature)),
            insert=(left, top),
            size=(box_width, box_height),
            style="fill:%s;stroke-width:0.5;" % feature.color,
        )
        return rect, left, top, box_width, box_height
示例#6
0
    def featureLabel(self, feature, rowData, class_group, row, label, x, y, w,
                     h):
        """Build an SVG group holding a feature's label and its callout line.

        ``x``, ``y``, ``w`` and ``h`` describe the feature's box. The text
        is roughly centred on the box, offset left by 4 units per label
        character. For a positive strand it sits above the box, otherwise
        below. The callout is a vertical line at the box's horizontal
        centre, running from the box edge towards the text.

        ``rowData``, ``class_group`` and ``row`` are accepted but not used
        here.

        Returns the SVG group element.
        """
        def encoded_id():
            # Element ids are derived from the base32-encoded feature id.
            return base64.b32encode(get_gff3_id(feature.feature))

        text_x = x + w / 2 - len(label) * 4
        mid_x = x + (float(w) / 2)
        text_y = y + h / 2 + 40

        if feature.location.strand > 0:
            text_y -= 70
            callout_start = (mid_x, y)
            callout_end = (mid_x, text_y)
        else:
            callout_start = (mid_x, y + h)
            callout_end = (mid_x, text_y - h)

        group = self.svg.g(id="%s_g" % encoded_id())

        text_element = self.svg.text(
            id="label_text_%s" % encoded_id(),
            text=label,
            x=[text_x],
            y=[text_y],
            style=FONT_STYLE,
        )
        group.add(text_element)

        callout = self.svg.line(
            id="label_callout_%s" % encoded_id(),
            start=callout_start,
            end=callout_end,
        )
        group.add(callout)

        return group
示例#7
0
def missing_rbs(record, lookahead_min=5, lookahead_max=15):
    """
    Identify coding genes with a missing or badly placed RBS.

    For a gene with no annotated RBS, the window from ``lookahead_min`` to
    ``lookahead_max`` bases upstream of the gene is clamped to the record,
    extracted, and scanned with ``NaiveSDCaller``. The gene gets two
    attributes: ``__upstream`` (the lower-cased window, highlighted when the
    caller reports a site) and ``__message``. Such genes always count as bad.

    For a gene with an annotated RBS, the gap between the first RBS and the
    first CDS is measured; the gene is bad only when that gap exceeds
    ``lookahead_max``. A gene that has an RBS but no CDS sub-feature is
    logged and skipped.

    Returns a tuple ``(good, bad, results, qc_features, any_rbss)`` where
    ``results`` holds the flagged genes and ``any_rbss`` tells whether any
    gene had an annotated RBS.
    """
    results = []
    good = 0
    bad = 0
    qc_features = []
    sd_finder = NaiveSDCaller()

    any_rbss = False

    for gene in coding_genes(record.features):
        # Check if there are RBSs, TODO: make this recursive. Each feature in
        # gene.sub_features can also have sub_features.
        rbss = get_rbs_from(gene)

        if len(rbss) == 0:
            # Get the window lookahead_min to lookahead_max upstream.
            if gene.strand > 0:
                start = gene.location.start - lookahead_max
                end = gene.location.start - lookahead_min
            else:
                start = gene.location.end + lookahead_min
                end = gene.location.end + lookahead_max
            # Keep the window on the genome so that extraction does not
            # reach outside the record.
            (start,
             end) = __ensure_location_in_bounds(start=start,
                                                end=end,
                                                parent_length=len(record))
            # Temporary feature, used only to extract the sequence.
            tmp = SeqFeature(FeatureLocation(start, end, strand=gene.strand),
                             type="domain")
            seq = str(tmp.extract(record.seq))

            # Defaults, overwritten below if the caller reports a site.
            gene.__upstream = seq.lower()
            gene.__message = "No RBS annotated, None found"

            sds = sd_finder.list_sds(seq)
            if len(sds) > 0:
                sd = sds[0]
                gene.__upstream = sd_finder.highlight_sd(
                    seq.lower(), sd["start"], sd["end"])
                gene.__message = "Unannotated but valid RBS"

            qc_features.append(
                gen_qc_feature(start,
                               end,
                               "Missing RBS",
                               strand=gene.strand,
                               id_src=gene))

            bad += 1
            results.append(gene)
            continue

        if len(rbss) > 1:
            # Report the count; the message previously logged the first
            # RBS's id in its place.
            log.warning("%s RBSs found for gene %s", len(rbss),
                        get_gff3_id(gene))
        any_rbss = True

        # Only the first RBS and first CDS are compared.
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            # Without a CDS there is nothing to measure the distance to.
            log.warning("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue
        cds = cds_list[0]
        rbs = rbss[0]

        # Get the distance between the two.
        if gene.strand > 0:
            distance = cds.location.start - rbs.location.end
        else:
            distance = rbs.location.start - cds.location.end

        if distance <= lookahead_max:
            good += 1
            continue

        # The RBS is too far away; annotate that.
        gene.__message = "RBS too far away (%s nt)" % distance
        qc_features.append(
            gen_qc_feature(
                rbs.location.start,
                rbs.location.end,
                gene.__message,
                strand=gene.strand,
                id_src=gene,
            ))
        bad += 1
        results.append(gene)

    return good, bad, results, qc_features, any_rbss
示例#8
0
def weird_starts(record):
    """Flag coding genes whose first CDS begins with an unusual start codon.

    For each coding gene the first CDS sub-feature is extracted from
    ``record.seq``. Its first three and last three bases are stored on the
    CDS feature as ``__start`` and ``__stop``. Any start codon other than
    ATG, TTG or GTG is reported, and the CDS additionally gets an
    ``__error`` message.

    Genes without a CDS sub-feature are logged and skipped. A CDS shorter
    than three bases is fatal: a message is written to stderr and the
    process exits with status 2.

    NOTE: CDS lengths that are not a multiple of three are not checked here.

    Returns a tuple ``(good, bad, results, qc_features, overall)`` where
    ``results`` holds the flagged CDS features and ``overall`` maps every
    start codon seen to the number of CDSs using it.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []
    overall = {}

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            log.warning("No CDS for gene %s", get_gff3_id(gene))
            continue
        seq = cds_list[0]

        seq_str = str(seq.extract(record.seq))
        if len(seq_str) < 3:
            sys.stderr.write("Fatal Error: CDS of length less than 3 at " +
                             str(seq.location) + '\n')
            # sys.exit rather than the bare exit(), which is only injected
            # by the site module.
            sys.exit(2)

        start_codon = seq_str[0:3]
        # Slice rather than index: seq_str[-3] is a single base, not a codon.
        stop_codon = seq_str[-3:]
        seq.__start = start_codon
        seq.__stop = stop_codon
        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon in ("ATG", "TTG", "GTG"):
            good += 1
            continue

        log.warning("Weird start codon (%s) on %s", start_codon,
                    get_gff3_id(gene))
        seq.__error = "Unusual start codon %s" % start_codon

        # Mark the three bases at the 5' end of the CDS. On the reverse
        # strand the coordinates are handed over in descending order, as
        # before -- assumes gen_qc_feature copes with that; TODO confirm.
        if seq.strand > 0:
            s = seq.location.start
            e = seq.location.start + 3
        else:
            s = seq.location.end
            e = seq.location.end - 3

        results.append(seq)
        qc_features.append(
            gen_qc_feature(s,
                           e,
                           "Weird start codon",
                           strand=seq.strand,
                           id_src=gene))
        bad += 1

    return good, bad, results, qc_features, overall