def missing_tags(record):
    """Find coding genes whose first CDS has no ``product`` qualifier.

    For every gene yielded by ``coding_genes(record.features)`` the first CDS
    sub-feature is inspected. Genes without any CDS sub-feature are logged
    and counted as neither good nor bad.

    Returns a 4-tuple ``(good, bad, results, qc_features)``:

    * ``good`` -- number of genes whose first CDS carries a ``product`` tag
    * ``bad`` -- number of genes whose first CDS lacks it
    * ``results`` -- the offending CDS features
    * ``qc_features`` -- one QC feature per offending CDS, spanning that CDS
    """
    results = []
    qc_features = []
    good = 0
    bad = 0

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            log.warning("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue

        # Only the first CDS of each gene is checked.
        cds = cds_list[0]
        if "product" in cds.qualifiers:
            good += 1
            continue

        log.info("Missing product tag on %s", get_gff3_id(gene))
        qc_features.append(
            gen_qc_feature(
                cds.location.start,
                cds.location.end,
                "Missing product tag",
                strand=cds.strand,
            )
        )
        results.append(cds)
        bad += 1

    return good, bad, results, qc_features
def excessive_overlap(record, excess=15, excess_divergent=30):
    """Find pairs of coding genes whose CDSs overlap excessively.

    Every unordered pair of genes from ``coding_genes(record.features)`` is
    compared using the first CDS sub-feature of each gene. An overlap is
    excessive when it is at least ``excess`` bases for CDSs on the same
    strand, or at least ``excess_divergent`` bases for CDSs on different
    strands. Both thresholds are expected to be positive.

    Returns a 4-tuple ``(good, bad, results, qc_features)``:

    * ``results`` -- tuples ``(gene_a, gene_b, first, last)`` where ``first``
      and ``last`` are the first and last overlapping base indices
    * ``qc_features`` -- one "Excessive Overlap" QC feature per flagged pair
    * ``bad`` -- summed overlap lengths divided by the smaller threshold,
      truncated to an int
    * ``good`` -- number of coding genes minus ``bad``, floored at zero
    """
    results = []
    qc_features = []
    bad = 0

    for gene_a, gene_b in itertools.combinations(coding_genes(record.features), 2):
        # Get the CDS from the subfeature list.
        # TODO: not recursive.
        cds_a = list(genes(gene_a.sub_features, feature_type="CDS"))
        cds_b = list(genes(gene_b.sub_features, feature_type="CDS"))

        if not cds_a:
            log.warning("Gene missing subfeatures; %s", get_gff3_id(gene_a))
            continue
        if not cds_b:
            log.warning("Gene missing subfeatures; %s", get_gff3_id(gene_b))
            continue

        loc_a = cds_a[0].location
        loc_b = cds_b[0].location

        # The overlap of two half-open spans [start, end) is itself a span,
        # so its size and bounds follow from the endpoints; there is no need
        # to materialise a set of every base in both CDSs.
        overlap_start = max(int(loc_a.start), int(loc_b.start))
        overlap_end = min(int(loc_a.end), int(loc_b.end))
        overlap_len = overlap_end - overlap_start
        if overlap_len <= 0:
            # The two CDSs do not share any base.
            continue

        threshold = excess if loc_a.strand == loc_b.strand else excess_divergent
        if overlap_len >= threshold:
            bad += float(overlap_len) / float(min(excess, excess_divergent))
            # Report the first and last shared base (inclusive bounds).
            first = overlap_start
            last = overlap_end - 1
            qc_features.append(
                gen_qc_feature(first, last, "Excessive Overlap", id_src=gene_a)
            )
            results.append((gene_a, gene_b, first, last))

    # Good isn't accurate here. It's a triangle number and just ugly, but we
    # don't care enough to fix it.
    gene_count = len(list(coding_genes(record.features)))
    good = max(0, int(gene_count - bad))
    return good, int(bad), results, qc_features
def bad_gene_model(record):
    """Find coding genes whose exon and CDS sub-features disagree.

    For each gene from ``coding_genes(record.features)`` the exon
    sub-features longer than 10 bases and all CDS sub-features are collected.

    * If the gene lacks either kind, it is logged and recorded in ``results``
      with an ``"N exons, M CDSs"`` message.
    * If the counts differ, the whole gene is flagged.
    * Otherwise exons and CDSs are paired in order of start position and
      every pair with differing lengths is flagged; matching pairs count as
      good.

    Returns a 4-tuple ``(good, bad_total, results, qc_features)`` where
    ``results`` holds tuples ``(gene_id, exon_or_None, cds_or_None, message)``
    and ``bad_total`` is ``len(results) + bad``.
    """
    count_mismatch_msg = 'Mismatched number of exons and CDSs in gff3 representation'
    length_mismatch_msg = 'CDS does not extend to full length of gene'

    results = []
    qc_features = []
    good = 0
    bad = 0

    for gene in coding_genes(record.features):
        exons = [
            x for x in genes(gene.sub_features, feature_type='exon')
            if len(x) > 10
        ]
        CDSs = list(genes(gene.sub_features, feature_type='CDS'))

        if len(exons) < 1 or len(CDSs) < 1:
            log.warning("Could not handle %s, %s", exons, CDSs)
            results.append((
                get_gff3_id(gene),
                None,
                None,
                '{0} exons, {1} CDSs'.format(len(exons), len(CDSs)),
            ))
            continue

        if len(exons) != len(CDSs):
            results.append((get_gff3_id(gene), None, None, count_mismatch_msg))
            qc_features.append(gen_qc_feature(
                gene.location.start,
                gene.location.end,
                count_mismatch_msg,
                strand=gene.strand,
                id_src=gene,
            ))
            bad += 1
            continue

        # Same number of exons and CDSs: compare them pairwise by position.
        ordered_exons = sorted(exons, key=lambda x: x.location.start)
        ordered_cdss = sorted(CDSs, key=lambda x: x.location.start)
        for exon, cds in zip(ordered_exons, ordered_cdss):
            if len(exon) == len(cds):
                good += 1
                continue

            results.append((get_gff3_id(gene), exon, cds, length_mismatch_msg))
            qc_features.append(gen_qc_feature(
                exon.location.start,
                exon.location.end,
                length_mismatch_msg,
                strand=exon.strand,
                id_src=gene,
            ))
            bad += 1

    # NOTE(review): every flagged mismatch is appended to ``results`` *and*
    # increments ``bad``, so it is counted twice here, while genes that could
    # not be handled are counted once. Kept as-is because callers consume
    # this number; confirm whether the double count is intended.
    return good, len(results) + bad, results, qc_features
def weird_starts(record):
    """Find coding genes whose first CDS has an unusual start codon.

    The first CDS sub-feature of each gene from
    ``coding_genes(record.features)`` is extracted from ``record.seq``. Its
    first and last three bases are stored on the CDS as ``__start`` and
    ``__stop``. Start codons other than ATG, TTG or GTG are flagged: the CDS
    gets an ``__error`` message and a QC feature covering the start codon.
    Genes without a CDS sub-feature are logged and skipped.

    Returns a 5-tuple ``(good, bad, results, qc_features, overall)`` where
    ``results`` lists the flagged CDS features and ``overall`` maps each
    observed start codon to the number of times it was seen.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []
    overall = {}

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type='CDS'))
        if not cds_list:
            log.warning("No CDS for gene %s", get_gff3_id(gene))
            continue

        seq = cds_list[0]
        seq_str = str(seq.extract(record.seq))
        start_codon = seq_str[0:3]
        # Slice (not index) so the whole trailing codon is captured rather
        # than a single base; slicing also cannot raise on a short sequence.
        stop_codon = seq_str[-3:]
        seq.__start = start_codon
        seq.__stop = stop_codon

        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon in ('ATG', 'TTG', 'GTG'):
            good += 1
            continue

        log.warning("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
        seq.__error = 'Unusual start codon %s' % start_codon

        # The start codon sits at the low-coordinate end of a forward-strand
        # CDS and at the high-coordinate end otherwise. Either way the span
        # is passed with start <= end.
        if seq.strand > 0:
            s = seq.location.start
            e = seq.location.start + 3
        else:
            s = seq.location.end - 3
            e = seq.location.end

        results.append(seq)
        qc_features.append(gen_qc_feature(
            s, e,
            'Weird start codon',
            strand=seq.strand,
            id_src=gene,
        ))
        bad += 1

    return good, bad, results, qc_features, overall
def featureBox(self, feature, rowData, class_group, row):
    """Build the SVG rectangle for one feature on the given row.

    The horizontal position and width are the feature's offset from the row
    start and its length, both scaled by
    ``calc_width / _internal_maxrowlength``. The box height is fixed at 15.

    Returns a 5-tuple ``(rect, x, y, w, h)``: the ``self.svg.rect`` element
    followed by the geometry that was used to create it.
    """
    location = feature.location
    box_height = 15

    offset_in_row = location.start - rowData[row]["start"]
    box_x = (
        self.calc_width * offset_in_row / self._internal_maxrowlength
        + self.x_offset
    )
    box_y = (row - 1) * self.ils + self.y_offset - box_height / 2

    span = abs(location.end - location.start)
    box_w = float(self.calc_width * span) / self._internal_maxrowlength

    if self.separate_strands:
        # Shift the box to one side of the row depending on its strand.
        box_y += -30 * location.strand
        # Alternate according to frame.
        # NOTE(review): this offset is assumed to belong under
        # ``separate_strands`` together with the strand shift -- confirm.
        frame = (location.start - 2 * location.strand + 1) % 3
        box_y += (10 * frame - 10 * location.strand)

    # NOTE(review): base64.b32encode requires bytes on Python 3 -- confirm
    # what get_gff3_id returns here.
    rect = self.svg.rect(
        id=base64.b32encode(get_gff3_id(feature.feature)),
        insert=(box_x, box_y),
        size=(box_w, box_height),
        style="fill:%s;stroke-width:0.5;" % feature.color,
    )
    return (rect, box_x, box_y, box_w, box_height)
def featureLabel(self, feature, rowData, class_group, row, label, x, y, w, h):
    """Build an SVG group holding a feature's text label and callout line.

    ``x``, ``y``, ``w`` and ``h`` are the geometry of the feature's box. The
    label sits 40 units below the box's vertical centre, or 30 units above
    it when the feature's strand is positive. A vertical line at the box's
    horizontal centre connects the box edge to the label.

    Returns the ``self.svg.g`` group containing the text and the line.
    """

    def encoded_id():
        # Encoded feature id shared by the group, text and line element ids.
        return base64.b32encode(get_gff3_id(feature.feature))

    on_forward = feature.location.strand > 0

    # Text is shifted left by four units per character from the box centre.
    text_x = x + w / 2 - len(label) * 4
    centre_x = x + (float(w) / 2)
    text_y = y + h / 2 + 40
    if on_forward:
        text_y -= 70

    if on_forward:
        # Callout runs from the box's ``y`` edge to the label position.
        line_from = (centre_x, y)
        line_to = (centre_x, text_y)
    else:
        # Callout runs from the box's ``y + h`` edge to ``h`` short of the label.
        line_from = (centre_x, y + h)
        line_to = (centre_x, text_y - h)

    group = self.svg.g(id="%s_g" % encoded_id())

    text_element = self.svg.text(
        id="label_text_%s" % encoded_id(),
        text=label,
        x=[text_x],
        y=[text_y],
        style=FONT_STYLE,
    )
    group.add(text_element)

    line_element = self.svg.line(
        id="label_callout_%s" % encoded_id(),
        start=line_from,
        end=line_to,
    )
    group.add(line_element)

    return group
def missing_rbs(record, lookahead_min=5, lookahead_max=15):
    """Identify gene features with a missing or distant RBS.

    For each gene from ``coding_genes(record.features)``:

    * If ``get_rbs_from(gene)`` finds no RBS, the sequence lying
      ``lookahead_min`` to ``lookahead_max`` bases upstream of the gene is
      extracted and scanned with ``NaiveSDCaller``. The gene is counted as
      bad either way; its ``__upstream`` attribute holds the (possibly
      highlighted) upstream sequence and ``__message`` says whether an
      unannotated RBS was found.
    * If an RBS is annotated, the distance between the first RBS and the
      first CDS is measured. Distances above ``lookahead_max`` are flagged
      as bad with a ``__message`` on the gene; otherwise the gene is good.
      A gene that has an RBS but no CDS sub-feature is logged and skipped.

    Returns a 5-tuple ``(good, bad, results, qc_features, any_rbss)`` where
    ``results`` lists the flagged genes and ``any_rbss`` is True if at least
    one gene had an annotated RBS.
    """
    results = []
    good = 0
    bad = 0
    qc_features = []
    sd_finder = NaiveSDCaller()
    any_rbss = False

    for gene in coding_genes(record.features):
        # Check if there are RBSs, TODO: make this recursive. Each feature in
        # gene.sub_features can also have sub_features.
        rbss = get_rbs_from(gene)

        if len(rbss) == 0:
            # No RBS annotated: look at the window just upstream of the gene.
            if gene.strand > 0:
                start = gene.location.start - lookahead_max
                end = gene.location.start - lookahead_min
            else:
                start = gene.location.end + lookahead_min
                end = gene.location.end + lookahead_max

            # We have to ensure the feature is ON the genome, otherwise we
            # may be trying to access a location outside of the length of
            # the genome, which would be bad.
            (start, end) = __ensure_location_in_bounds(
                start=start, end=end, parent_length=len(record)
            )

            # Temporary feature to extract sequence
            tmp = SeqFeature(
                FeatureLocation(start, end, strand=gene.strand), type="domain"
            )
            seq = str(tmp.extract(record.seq))

            # Set the default properties
            gene.__upstream = seq.lower()
            gene.__message = "No RBS annotated, None found"

            # Try and do an automated shinefind call
            sds = sd_finder.list_sds(seq)
            if len(sds) > 0:
                sd = sds[0]
                gene.__upstream = sd_finder.highlight_sd(
                    seq.lower(), sd["start"], sd["end"]
                )
                gene.__message = "Unannotated but valid RBS"

            # The gene is reported whether or not the scan found a candidate.
            qc_features.append(
                gen_qc_feature(
                    start, end, "Missing RBS", strand=gene.strand, id_src=gene
                )
            )
            bad += 1
            results.append(gene)
            continue

        if len(rbss) > 1:
            # Log how many RBSs were found; only the first one is used below.
            log.warning("%s RBSs found for gene %s", len(rbss), get_gff3_id(gene))
        any_rbss = True

        # get first RBS/CDS
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            # Without a CDS there is nothing to measure the RBS against.
            log.warning("Gene missing CDS subfeature %s", get_gff3_id(gene))
            continue
        cds = cds_list[0]
        rbs = rbss[0]

        # Get the distance between the two
        if gene.strand > 0:
            distance = cds.location.start - rbs.location.end
        else:
            distance = rbs.location.start - cds.location.end

        # If the RBS is too far away, annotate that
        if distance > lookahead_max:
            gene.__message = "RBS too far away (%s nt)" % distance
            qc_features.append(
                gen_qc_feature(
                    rbs.location.start,
                    rbs.location.end,
                    gene.__message,
                    strand=gene.strand,
                    id_src=gene,
                )
            )
            bad += 1
            results.append(gene)
        else:
            good += 1

    return good, bad, results, qc_features, any_rbss
def weird_starts(record):
    """Find coding genes whose first CDS has an unusual start codon.

    The first CDS sub-feature of each gene from
    ``coding_genes(record.features)`` is extracted from ``record.seq``. Its
    first and last three bases are stored on the CDS as ``__start`` and
    ``__stop``. Start codons other than ATG, TTG or GTG are flagged: the CDS
    gets an ``__error`` message and a QC feature covering the start codon.
    Genes without a CDS sub-feature are logged and skipped.

    A CDS shorter than three bases is treated as fatal: a message is written
    to stderr and the process exits with status 2.

    Returns a 5-tuple ``(good, bad, results, qc_features, overall)`` where
    ``results`` lists the flagged CDS features and ``overall`` maps each
    observed start codon to the number of times it was seen.
    """
    good = 0
    bad = 0
    qc_features = []
    results = []
    overall = {}

    for gene in coding_genes(record.features):
        cds_list = list(genes(gene.sub_features, feature_type="CDS"))
        if not cds_list:
            log.warning("No CDS for gene %s", get_gff3_id(gene))
            continue

        seq = cds_list[0]
        seq_str = str(seq.extract(record.seq))

        if len(seq_str) < 3:
            sys.stderr.write(
                "Fatal Error: CDS of length less than 3 at "
                + str(seq.location) + '\n'
            )
            # sys.exit rather than the site-provided exit(), which is only
            # meant for interactive use and is not always available.
            sys.exit(2)

        # NOTE: a check for CDS lengths that are not a multiple of three was
        # sketched here previously but is not enabled.

        start_codon = seq_str[0:3]
        # Slice (not index) so the whole trailing codon is captured rather
        # than a single base.
        stop_codon = seq_str[-3:]
        seq.__start = start_codon
        seq.__stop = stop_codon

        overall[start_codon] = overall.get(start_codon, 0) + 1

        if start_codon in ("ATG", "TTG", "GTG"):
            good += 1
            continue

        log.warning("Weird start codon (%s) on %s", start_codon, get_gff3_id(gene))
        seq.__error = "Unusual start codon %s" % start_codon

        # The start codon sits at the low-coordinate end of a forward-strand
        # CDS and at the high-coordinate end otherwise. Either way the span
        # is passed with start <= end.
        if seq.strand > 0:
            s = seq.location.start
            e = seq.location.start + 3
        else:
            s = seq.location.end - 3
            e = seq.location.end

        results.append(seq)
        qc_features.append(
            gen_qc_feature(s, e, "Weird start codon", strand=seq.strand, id_src=gene)
        )
        bad += 1

    return good, bad, results, qc_features, overall