def run(self):
    """Build a tail-tools reference from a genome and a subset of its annotation.

    ``self.genes`` is a comma separated list of extraction specs, each made
    of exactly four '/' separated fields; they are passed on to
    ``get_genes``. ``self.rename`` optionally holds comma separated
    ``old=new`` pairs, applied to both the annotation seqids and the FASTA
    record names. The filtered GFF and renamed FASTA are written to a
    temporary directory and handed, together with ``self.extra``, to
    ``reference_directory.Make_tt_reference``.
    """
    extractions = [spec.split('/') for spec in self.genes.split(',')]
    for fields in extractions:
        assert len(fields) == 4

    rename = {}
    if self.rename:
        for pair in self.rename.split(','):
            old_name, new_name = pair.split('=')
            rename[old_name] = new_name

    # The result is not used below; the call is retained because it may
    # have side effects (TODO confirm against get_workspace).
    work = self.get_workspace()

    with workspace.tempspace() as temp:
        gff_path = temp/'temp.gff'
        fasta_path = temp/'temp.fa'

        features = list(annotation.read_annotations(self.annotation))
        for feature in features:
            feature.seqid = rename.get(feature.seqid, feature.seqid)
        annotation.write_gff3(
            gff_path, get_genes(features, extractions, self.log))
        # Drop the annotation list before the genome is streamed through
        del features

        with open(fasta_path, 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                # Only the first whitespace-delimited word of the header is kept
                seq_id = name.split()[0]
                io.write_fasta(f, rename.get(seq_id, seq_id), seq)

        # Must run while the temporary files still exist
        builder = reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ fasta_path, gff_path ] + self.extra,
            index = self.index,
            shrimp = self.shrimp,
            bowtie = self.bowtie,
            star = self.star
            )
        builder.run()
def run(self):
    """Build a tail-tools reference from a genome and a subset of its annotation.

    ``self.genes`` is a comma separated list of extraction specs, each made
    of exactly four '/' separated fields; they are passed on to
    ``get_genes``. ``self.rename`` optionally holds comma separated
    ``old=new`` pairs, applied to both the annotation seqids and the FASTA
    record names. The filtered GFF and renamed FASTA are written to a
    temporary directory and handed to
    ``reference_directory.Make_tt_reference``.
    """
    extractions = [spec.split('/') for spec in self.genes.split(',')]
    for fields in extractions:
        assert len(fields) == 4

    rename = {}
    if self.rename:
        for pair in self.rename.split(','):
            old_name, new_name = pair.split('=')
            rename[old_name] = new_name

    # The result is not used below; the call is retained because it may
    # have side effects (TODO confirm against get_workspace).
    work = self.get_workspace()

    with workspace.tempspace() as temp:
        gff_path = temp/'temp.gff'
        fasta_path = temp/'temp.fa'

        features = list(annotation.read_annotations(self.annotation))
        for feature in features:
            feature.seqid = rename.get(feature.seqid, feature.seqid)
        annotation.write_gff3(
            gff_path, get_genes(features, extractions, self.log))
        # Drop the annotation list before the genome is streamed through
        del features

        with open(fasta_path, 'wb') as f:
            for name, seq in io.read_sequences(self.genome):
                # Only the first whitespace-delimited word of the header is kept
                seq_id = name.split()[0]
                io.write_fasta(f, rename.get(seq_id, seq_id), seq)

        # Must run while the temporary files still exist
        builder = reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ fasta_path, gff_path ],
            index = self.index,
            )
        builder.run()
def run(self):
    """Build a tail-tools reference from an Ensembl release.

    Requires ``self.release``, ``self.species``, ``self.assembly`` and
    ``self.dna`` to be set. When ``self.download`` is true the genome FASTA
    and GFF3 files are fetched with rsync into an ``ensembl`` sub-workspace
    of the output workspace; otherwise they are expected to already be
    there. ``self.genes`` and ``self.rename`` select and rename content
    before it is handed to ``reference_directory.Make_tt_reference``.
    """
    for required in (self.release, self.species, self.assembly, self.dna):
        assert required

    extractions = [spec.split('/') for spec in self.genes.split(',')]
    for fields in extractions:
        assert len(fields) == 4

    rename = {}
    if self.rename:
        for pair in self.rename.split(','):
            old_name, new_name = pair.split('=')
            rename[old_name] = new_name

    work = self.get_workspace()
    ensembl = workspace.Workspace(work/'ensembl')

    release_root = "rsync://ftp.ensembl.org/ensembl/pub/release-"+self.release
    species_dir = self.species.lower()
    assembly_prefix = self.species+"."+self.assembly+"."

    genome_filename = assembly_prefix+self.dna+".fa.gz"
    genome_url = release_root+"/fasta/"+species_dir+"/dna/"+genome_filename

    gff_filename = assembly_prefix+self.release+".gff3.gz"
    gff_url = release_root+"/gff3/"+species_dir+"/"+gff_filename

    if self.download:
        for url, filename in ((genome_url, genome_filename),
                              (gff_url, gff_filename)):
            self.log.log("Fetching "+url+"\n")
            io.execute(['rsync','-aP',url, ensembl/filename])

    with workspace.tempspace() as temp:
        gff_path = temp/'temp.gff'
        fasta_path = temp/'temp.fa'

        features = list(annotation.read_annotations(ensembl/gff_filename))
        for feature in features:
            feature.seqid = rename.get(feature.seqid, feature.seqid)
        annotation.write_gff3(
            gff_path, get_genes(features, extractions, self.log))
        # Drop the annotation list before the genome is streamed through
        del features

        with open(fasta_path, 'wb') as f:
            for name, seq in io.read_sequences(ensembl/genome_filename):
                # Only the first whitespace-delimited word of the header is kept
                seq_id = name.split()[0]
                io.write_fasta(f, rename.get(seq_id, seq_id), seq)

        # Must run while the temporary files still exist
        builder = reference_directory.Make_tt_reference(
            self.output_dir,
            filenames = [ fasta_path, gff_path ],
            index = self.index,
            )
        builder.run()
def run(self):
    """Annotate each peak in ``self.child`` with the gene(s) it relates to.

    Features are read from ``self.parent``. For every peak the base just
    before its 3' end is looked up, in order of preference, against
    extended 3' UTRs, exons, the extended region downstrand of genes, and
    whole genes ("Intron"); if none of these hit, genes found with the
    reversed query are used ("Antisense"). Matching genes are recorded in
    the peak's attributes. Genes are written to ``<prefix>-parent.gff`` and
    the annotated peaks to ``<prefix>-child.gff``.
    """
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    assert all(len(feature.parents) <= 1 for feature in items)

    genes = [feature for feature in items if feature.type == "gene"]
    exons = [feature for feature in items if feature.type == "exon"]
    downstrand_genes = [
        _extend(_three_prime(gene), self.extension) for gene in genes]
    utrs = [
        _extend(feature, self.extension)
        for feature in items if feature.type == "three_prime_utr"]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    def owning_genes(found):
        # UTR and exon hits are reported as their parent's parent
        # (presumably feature -> transcript -> gene; confirm against input)
        return [feature.parents[0].parents[0] for feature in found]

    def attr_values(found, key):
        return [feature.attr.get(key, "") for feature in found]

    def ids_of(found):
        return [feature.get_id() for feature in found]

    # Tried in order; the first lookup that finds anything wins.
    lookups = (
        ("3'UTR", lambda q: owning_genes(utr_index.get(q, True))),
        ("Exon", lambda q: owning_genes(exon_index.get(q, True))),
        # For non-coding RNAs, which don't have a 3' UTR
        ("Downstrand", lambda q: downstrand_gene_index.get(q, True)),
        ("Intron", lambda q: gene_index.get(q, True)),
    )

    peaks = list(annotation.read_annotations(self.child))
    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1,0)

        for hit_to, lookup in lookups:
            hits = lookup(query)
            if hits:
                break

        # Always computed, so antisense genes are reported even when a
        # sense hit exists
        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            hit_to = "Antisense"
            hits = antisense_hits

        if hits:
            peak.attr["Parent"] = join_descriptions(ids_of(hits), ",")
            peak.attr["Relation"] = hit_to
            peak.attr["Name"] = join_descriptions(attr_values(hits, "Name"))
            peak.attr["Product"] = hit_to + " " + join_descriptions(attr_values(hits, "Product"))
            peak.attr["Biotype"] = join_descriptions(attr_values(hits, "Biotype"))

        if antisense_hits:
            peak.attr["Antisense_parent"] = join_descriptions(ids_of(antisense_hits), ",")
            peak.attr["Antisense_name"] = join_descriptions(attr_values(antisense_hits, "Name"))
            peak.attr["Antisense_product"] = "Antisense " + join_descriptions(attr_values(antisense_hits, "Product"))
            peak.attr["Antisense_biotype"] = join_descriptions(attr_values(antisense_hits, "Biotype"))

    annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    """Locate primers from a CSV file in the reference and write the hits as GFF.

    The CSV file must have "id" and "primer" columns (headings are matched
    case-insensitively). The first ``self.skip`` bases of each primer are
    discarded, then the remainder and its reverse complement are searched
    for in every reference sequence. Each hit becomes a "region" annotation
    ``self.length`` bases long, anchored at the start of a forward match or
    at the end of a reverse-complement match. All annotations are written
    to ``self.prefix + ".gff"``.

    Raises:
        AssertionError: a required column is missing, an ID contains a
            space, or a primer is too short or contains a character other
            than A, C, G or T.
        config.Error: a primer has more than 100 hits, or (after all rows
            have been processed) one or more primers had no hits at all.
    """
    seqs = env.load_ref(self.reference).seqs

    result = []
    errors = []
    with open(self.csv_file, "rU") as f:
        reader = csv.reader(f)
        # The next() builtin works on Python 2.6+ and 3, unlike reader.next()
        headings = [item.lower() for item in next(reader)]
        assert "id" in headings
        assert "primer" in headings
        id_col = headings.index("id")
        primer_col = headings.index("primer")

        for row in reader:
            # Skip blank rows and rows where both fields are empty
            if len(row) == 0 or (not row[id_col].strip() and not row[primer_col].strip()):
                continue

            primer_id = row[id_col].strip()
            assert " " not in primer_id, "ID contains space: " + primer_id

            primer = row[primer_col].strip().upper()
            assert len(primer) > self.skip, "Primer too short: " + primer_id
            # The previous check asserted on a list, which is truthy for any
            # non-empty primer, so it could never fail. all() actually
            # inspects the characters, which also guarantees the primer is
            # safe to use as a regular expression below.
            assert all(char in "ACGT" for char in primer), "Primer not ACGT: " + primer_id
            primer = primer[self.skip:]

            # Compile once per primer rather than once per reference sequence
            forward_re = re.compile(primer, re.IGNORECASE)
            reverse_re = re.compile(bio.reverse_complement(primer), re.IGNORECASE)

            hits = []
            for seq_name in seqs:
                seq = seqs[seq_name]
                for match in forward_re.finditer(seq):
                    hits.append((seq_name, 1, match.start(), match.start() + self.length))
                for match in reverse_re.finditer(seq):
                    hits.append((seq_name, -1, match.end() - self.length, match.end()))
                if len(hits) > 100:
                    raise config.Error("Many many hits for " + primer_id + ".")

            if not hits:
                # Collect all misses so they can be reported together
                errors.append("No hits for " + primer_id + ".")
                continue

            if len(hits) > 1:
                self.log.log("Warning: %d hits for %s.\n" % (len(hits), primer_id))

            for i, hit in enumerate(hits):
                hit_name = primer_id
                if len(hits) > 1:
                    # Disambiguate multiple hits of the same primer
                    hit_name += "-%dof%d" % (i + 1, len(hits))
                result.append(
                    annotation.Annotation(
                        seqid=hit[0],
                        source="tail-tools",
                        type="region",
                        start=hit[2],
                        end=hit[3],
                        strand=hit[1],
                        attr=dict(ID=hit_name, Primer=primer)))

    if errors:
        raise config.Error("\n".join(errors))

    annotation.write_gff3(self.prefix + ".gff", result)
def run(self):
    """Build the reference directory and derive 3' UTR annotations.

    Runs ``nesoni.Make_reference`` on ``self.filenames``, then re-reads the
    resulting ``reference.gff`` and, for every gene:

    * resets the gene span to the union of its mRNAs,
    * stores a ``max_extension`` attribute on the gene, its mRNAs and their
      exons (only the last exon of an mRNA gets a non-zero value),
    * creates one ``three_prime_utr`` annotation per mRNA that has both
      exons and CDS, and one per gene.

    ``reference.gff`` is rewritten with the per-mRNA UTRs appended, and the
    per-gene UTRs are written to ``utr.gff``.

    Raises:
        AssertionError: a gene has no mRNA children, or an mRNA's strand or
            seqid differs from its gene's.
    """
    work = self.get_workspace()
    # The version param is removed up front and only restored on the last
    # line, presumably so that an interrupted build is not mistaken for a
    # complete reference -- TODO confirm.
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames = self.filenames,
        snpeff = False,
        cs = 'ifavailable' if self.index else False,
        ls = False,
        bowtie = 'ifavailable' if self.index else False,
        ).run()

    annotations = list(annotation.read_annotations(work/'reference.gff'))
    annotation.link_up_annotations(annotations)

    # Indexes handed to _max_extension: all exons, and the 3' end of each mRNA
    exon_index = span_index.index_annotations([
        item for item in annotations if item.type == "exon"
        ])

    mrna_end_index = span_index.index_annotations([
        item.three_prime()
        for item in annotations
        if item.type == "mRNA"
        ])

    mrna_utrs = [ ]
    gene_utrs = [ ]

    for gene in annotations:
        if gene.type != 'gene': continue

        mrnas = [ item for item in gene.children if item.type == 'mRNA' ]
        assert mrnas, "Gene without any mRNAs: "+gene.get_id()

        gene.attr['color'] = '#880088'
        # Gene span becomes exactly the span covered by its mRNAs
        gene.start = min(item.start for item in mrnas)
        gene.end = max(item.end for item in mrnas)
        gene.attr["max_extension"] = str(_max_extension(gene, exon_index, mrna_end_index))

        # One entry per mRNA that has exons and CDS: the coordinate of the
        # 5' end of that mRNA's 3' UTR
        gene_utr_5primes = [ ]

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(_max_extension(mrna, exon_index, mrna_end_index))

            cdss = [ item for item in mrna.children if item.type == 'CDS' ]
            exons = [ item for item in mrna.children if item.type == 'exon' ]
            if not exons: continue

            # link_up_annotations sorts children, so the final exon really is final.
            # NOTE(review): this relies on that sort being in transcript
            # (strand-aware) order -- confirm.
            for item in exons[:-1]:
                item.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            # mRNAs without CDS get no UTR and do not contribute to the gene UTR
            if not cdss: continue

            # For each exon reaching the 3' end of the CDS or beyond, the
            # position where its UTR portion begins
            mrna_utr_5primes = [ ]

            if gene.strand >= 0:
                cds_3prime = max(item.end for item in cdss)
                for item in exons:
                    if item.end >= cds_3prime:
                        mrna_utr_5primes.append(max(item.start,cds_3prime))
            else:
                cds_3prime = min(item.start for item in cdss)
                for item in exons:
                    if item.start <= cds_3prime:
                        mrna_utr_5primes.append(min(item.end,cds_3prime))

            # The UTR runs from the most 5' of those positions to the mRNA's
            # 3' end, and is forced to be at least one base long.
            if mrna.strand >= 0:
                utr_start = min(mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                utr_end = max(utr_start+1,mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                utr_end = max(mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                utr_start = min(mrna.start,utr_end-1)
                gene_utr_5primes.append(utr_end)

            # The UTR inherits the mRNA's attributes, is parented on the mRNA
            # and takes the ID "<mRNA ID>-3UTR"
            attr = mrna.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID']+'-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source = 'tt',
                type = 'three_prime_utr',
                seqid = mrna.seqid,
                strand = mrna.strand,
                start = utr_start,
                end = utr_end,
                attr = attr,
                )
            max_ext = _max_extension(utr, exon_index, mrna_end_index)
            utr.attr["max_extension"] = str(max_ext)
            # Only include if there is an annotated 3' UTR or the end is not
            # in the middle of some other isoform's exon
            if utr_end-utr_start+max_ext > 1:
                mrna_utrs.append(utr)

        # Gene-level UTR: starts at the most 3' of the per-mRNA UTR starts
        # (max on the forward strand, min on the reverse strand) and runs to
        # the gene's 3' end. With no per-mRNA UTRs it is a single base at the
        # gene's 3' end.
        if gene.strand >= 0:
            utr_start = max(gene_utr_5primes) if gene_utr_5primes else gene.end
            utr_end = max(utr_start+1,gene.end)
        else:
            utr_end = min(gene_utr_5primes) if gene_utr_5primes else gene.start
            utr_start = min(gene.start,utr_end-1)

        attr = gene.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID']+'-3UTR'
        attr['color'] = '#008888'
        utr = annotation.Annotation(
            source = 'tt',
            type = 'three_prime_utr',
            seqid = gene.seqid,
            strand = gene.strand,
            start = utr_start,
            end = utr_end,
            attr = attr,
            )
        utr.attr["max_extension"] = str(_max_extension(utr, exon_index, mrna_end_index))
        gene_utrs.append(utr)

    # reference.gff is overwritten in place with the per-mRNA UTRs appended
    annotation.write_gff3(work/'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work/'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """Locate primers from a CSV file in the reference and write the hits as GFF.

    The CSV file must have "id" and "primer" columns (headings are matched
    case-insensitively). The first ``self.skip`` bases of each primer are
    discarded, then the remainder and its reverse complement are searched
    for in every reference sequence. Each hit becomes a "region" annotation
    ``self.length`` bases long, anchored at the start of a forward match or
    at the end of a reverse-complement match. All annotations are written
    to ``self.prefix+".gff"``.

    Raises:
        AssertionError: a required column is missing, an ID contains a
            space, or a primer is too short or contains a character other
            than A, C, G or T.
        config.Error: a primer has more than 100 hits, or (after all rows
            have been processed) one or more primers had no hits at all.
    """
    seqs = env.load_ref(self.reference).seqs

    result = [ ]
    errors = [ ]
    with open(self.csv_file, "rU") as f:
        reader = csv.reader(f)
        # The next() builtin works on Python 2.6+ and 3, unlike reader.next()
        headings = [ item.lower() for item in next(reader) ]
        assert "id" in headings
        assert "primer" in headings
        id_col = headings.index("id")
        primer_col = headings.index("primer")

        for row in reader:
            # Skip blank rows and rows where both fields are empty
            if len(row) == 0 or (not row[id_col].strip() and not row[primer_col].strip()):
                continue

            primer_id = row[id_col].strip()
            assert " " not in primer_id, "ID contains space: "+primer_id

            primer = row[primer_col].strip().upper()
            assert len(primer) > self.skip, "Primer too short: "+primer_id
            # Asserting on a list comprehension is always true for a
            # non-empty primer, so the old check never rejected anything.
            # all() tests every character, and thereby also guarantees the
            # primer contains no regular expression metacharacters.
            assert all( char in "ACGT" for char in primer ), "Primer not ACGT: "+primer_id
            primer = primer[self.skip:]

            # Compile once per primer rather than once per reference sequence
            forward_re = re.compile(primer, re.IGNORECASE)
            reverse_re = re.compile(bio.reverse_complement(primer), re.IGNORECASE)

            hits = [ ]
            for seq_name in seqs:
                seq = seqs[seq_name]
                for match in forward_re.finditer(seq):
                    hits.append( (seq_name, 1, match.start(), match.start()+self.length) )
                for match in reverse_re.finditer(seq):
                    hits.append( (seq_name, -1, match.end()-self.length, match.end()) )
                if len(hits) > 100:
                    raise config.Error("Many many hits for "+primer_id+".")

            if not hits:
                # Collect all misses so they can be reported together
                errors.append("No hits for "+primer_id+".")
                continue

            if len(hits) > 1:
                self.log.log("Warning: %d hits for %s.\n" % (len(hits),primer_id))

            for i, hit in enumerate(hits):
                hit_name = primer_id
                if len(hits) > 1:
                    # Disambiguate multiple hits of the same primer
                    hit_name += "-%dof%d" % (i+1,len(hits))
                result.append(annotation.Annotation(
                    seqid = hit[0],
                    source = "tail-tools",
                    type = "region",
                    start = hit[2],
                    end = hit[3],
                    strand = hit[1],
                    attr = dict( ID=hit_name, Primer=primer )
                    ))

    if errors:
        raise config.Error("\n".join(errors))

    annotation.write_gff3(self.prefix+".gff", result)
def run(self):
    """Build the reference directory and derive 3' UTR annotations.

    Runs ``nesoni.Make_reference`` on ``self.filenames``, then re-reads the
    resulting ``reference.gff`` and, for every gene:

    * resets the gene span to the union of its mRNAs,
    * stores a ``max_extension`` attribute on the gene, its mRNAs and their
      exons (only the last exon of an mRNA gets a non-zero value),
    * creates one ``three_prime_utr`` annotation per mRNA that has both
      exons and CDS, and one per gene.

    ``reference.gff`` is rewritten with the per-mRNA UTRs appended, and the
    per-gene UTRs are written to ``utr.gff``.

    Raises:
        AssertionError: a gene has no mRNA children, or an mRNA's strand or
            seqid differs from its gene's.
    """
    work = self.get_workspace()
    # The version param is removed up front and only restored on the last
    # line, presumably so that an interrupted build is not mistaken for a
    # complete reference -- TODO confirm.
    work.update_param(remove=['tail_tools_reference_version'])

    nesoni.Make_reference(
        self.output_dir,
        filenames=self.filenames,
        snpeff=False,
        cs='ifavailable' if self.index else False,
        ls=False,
        bowtie='ifavailable' if self.index else False,
    ).run()

    annotations = list(annotation.read_annotations(work / 'reference.gff'))
    annotation.link_up_annotations(annotations)

    # Indexes handed to _max_extension: all exons, and the 3' end of each mRNA
    exon_index = span_index.index_annotations(
        [item for item in annotations if item.type == "exon"])

    mrna_end_index = span_index.index_annotations([
        item.three_prime() for item in annotations if item.type == "mRNA"
    ])

    mrna_utrs = []
    gene_utrs = []

    for gene in annotations:
        if gene.type != 'gene':
            continue

        mrnas = [item for item in gene.children if item.type == 'mRNA']
        assert mrnas, "Gene without any mRNAs: " + gene.get_id()

        gene.attr['color'] = '#880088'
        # Gene span becomes exactly the span covered by its mRNAs
        gene.start = min(item.start for item in mrnas)
        gene.end = max(item.end for item in mrnas)
        gene.attr["max_extension"] = str(
            _max_extension(gene, exon_index, mrna_end_index))

        # One entry per mRNA that has exons and CDS: the coordinate of the
        # 5' end of that mRNA's 3' UTR
        gene_utr_5primes = []

        for mrna in mrnas:
            assert mrna.strand == gene.strand, mrna
            assert mrna.seqid == gene.seqid, mrna

            mrna.attr["max_extension"] = str(
                _max_extension(mrna, exon_index, mrna_end_index))

            cdss = [item for item in mrna.children if item.type == 'CDS']
            exons = [item for item in mrna.children if item.type == 'exon']
            if not exons:
                continue

            # link_up_annotations sorts children, so the final exon really is final.
            # NOTE(review): this relies on that sort being in transcript
            # (strand-aware) order -- confirm.
            for item in exons[:-1]:
                item.attr["max_extension"] = "0"
            exons[-1].attr["max_extension"] = mrna.attr["max_extension"]

            # mRNAs without CDS get no UTR and do not contribute to the gene UTR
            if not cdss:
                continue

            # For each exon reaching the 3' end of the CDS or beyond, the
            # position where its UTR portion begins
            mrna_utr_5primes = []

            if gene.strand >= 0:
                cds_3prime = max(item.end for item in cdss)
                for item in exons:
                    if item.end >= cds_3prime:
                        mrna_utr_5primes.append(max(
                            item.start, cds_3prime))
            else:
                cds_3prime = min(item.start for item in cdss)
                for item in exons:
                    if item.start <= cds_3prime:
                        mrna_utr_5primes.append(min(item.end, cds_3prime))

            # The UTR runs from the most 5' of those positions to the mRNA's
            # 3' end, and is forced to be at least one base long.
            if mrna.strand >= 0:
                utr_start = min(
                    mrna_utr_5primes) if mrna_utr_5primes else mrna.end
                utr_end = max(utr_start + 1, mrna.end)
                gene_utr_5primes.append(utr_start)
            else:
                utr_end = max(
                    mrna_utr_5primes) if mrna_utr_5primes else mrna.start
                utr_start = min(mrna.start, utr_end - 1)
                gene_utr_5primes.append(utr_end)

            # The UTR inherits the mRNA's attributes, is parented on the mRNA
            # and takes the ID "<mRNA ID>-3UTR"
            attr = mrna.attr.copy()
            attr['Parent'] = attr['ID']
            attr['ID'] = attr['ID'] + '-3UTR'
            attr['color'] = '#008888'
            utr = annotation.Annotation(
                source='tt',
                type='three_prime_utr',
                seqid=mrna.seqid,
                strand=mrna.strand,
                start=utr_start,
                end=utr_end,
                attr=attr,
            )
            max_ext = _max_extension(utr, exon_index, mrna_end_index)
            utr.attr["max_extension"] = str(max_ext)
            # Only include if there is an annotated 3' UTR or the end is not
            # in the middle of some other isoform's exon
            if utr_end - utr_start + max_ext > 1:
                mrna_utrs.append(utr)

        # Gene-level UTR: starts at the most 3' of the per-mRNA UTR starts
        # (max on the forward strand, min on the reverse strand) and runs to
        # the gene's 3' end. With no per-mRNA UTRs it is a single base at the
        # gene's 3' end.
        if gene.strand >= 0:
            utr_start = max(
                gene_utr_5primes) if gene_utr_5primes else gene.end
            utr_end = max(utr_start + 1, gene.end)
        else:
            utr_end = min(
                gene_utr_5primes) if gene_utr_5primes else gene.start
            utr_start = min(gene.start, utr_end - 1)

        attr = gene.attr.copy()
        attr['Parent'] = attr['ID']
        attr['ID'] = attr['ID'] + '-3UTR'
        attr['color'] = '#008888'
        utr = annotation.Annotation(
            source='tt',
            type='three_prime_utr',
            seqid=gene.seqid,
            strand=gene.strand,
            start=utr_start,
            end=utr_end,
            attr=attr,
        )
        utr.attr["max_extension"] = str(
            _max_extension(utr, exon_index, mrna_end_index))
        gene_utrs.append(utr)

    # reference.gff is overwritten in place with the per-mRNA UTRs appended
    annotation.write_gff3(work / 'reference.gff', annotations + mrna_utrs)
    annotation.write_gff3(work / 'utr.gff', gene_utrs)

    work.update_param(tail_tools_reference_version=work.VERSION)
def run(self):
    """Annotate sufficiently long-tailed peaks with the gene(s) they relate to.

    Features are read from ``self.parent`` and peaks from ``self.child``;
    peaks whose ``mean_tail`` attribute (default "0.0") is below
    ``self.min_tail`` are discarded. For every remaining peak the base just
    before its 3' end is looked up, in order of preference, against
    extended 3' UTRs, exons, the extended region downstrand of genes, and
    whole genes ("Intron"); if none of these hit, genes found with the
    reversed query are used ("Antisense"). Genes are written to
    ``<prefix>-parent.gff`` and the annotated peaks to
    ``<prefix>-child.gff``.
    """
    items = list(annotation.read_annotations(self.parent))
    annotation.link_up_annotations(items)
    assert all(len(feature.parents) <= 1 for feature in items)

    genes = [feature for feature in items if feature.type == "gene"]
    exons = [feature for feature in items if feature.type == "exon"]
    downstrand_genes = [
        _extend(_three_prime(gene), self.extension) for gene in genes]
    utrs = [
        _extend(feature, self.extension)
        for feature in items if feature.type == "three_prime_utr"]

    gene_index = span_index.index_annotations(genes)
    downstrand_gene_index = span_index.index_annotations(downstrand_genes)
    exon_index = span_index.index_annotations(exons)
    utr_index = span_index.index_annotations(utrs)

    def sense_hits(query):
        # Returns (relation, genes) for the first lookup that finds anything.
        # UTR and exon hits are reported as their parent's parent
        # (presumably feature -> transcript -> gene; confirm against input).
        found = [feature.parents[0].parents[0]
                 for feature in utr_index.get(query, True)]
        if found:
            return "3'UTR", found

        found = [feature.parents[0].parents[0]
                 for feature in exon_index.get(query, True)]
        if found:
            return "Exon", found

        # For non-coding RNAs, which don't have a 3' UTR
        found = downstrand_gene_index.get(query, True)
        if found:
            return "Downstrand", found

        return "Intron", gene_index.get(query, True)

    def attr_values(found, key):
        return [feature.attr.get(key, "") for feature in found]

    def ids_of(found):
        return [feature.get_id() for feature in found]

    # "not (x < min)" rather than "x >= min" keeps the original's treatment
    # of values that compare false both ways (NaN)
    peaks = [
        peak for peak in annotation.read_annotations(self.child)
        if not (float(peak.attr.get("mean_tail","0.0")) < self.min_tail)
    ]

    for peak in peaks:
        # Query is final base in genome before poly(A) starts
        query = peak.three_prime().shifted(-1,0)

        hit_to, hits = sense_hits(query)

        # Always computed, so antisense genes are reported even when a
        # sense hit exists
        antisense_hits = gene_index.get(query.reversed(), True)
        if not hits:
            hit_to, hits = "Antisense", antisense_hits

        if hits:
            peak.attr["Parent"] = join_descriptions(ids_of(hits), ",")
            peak.attr["Relation"] = hit_to
            peak.attr["Name"] = join_descriptions(attr_values(hits, "Name"))
            peak.attr["Product"] = hit_to + " " + join_descriptions(attr_values(hits, "Product"))
            peak.attr["Biotype"] = join_descriptions(attr_values(hits, "Biotype"))

        if antisense_hits:
            peak.attr["Antisense_parent"] = join_descriptions(ids_of(antisense_hits), ",")
            peak.attr["Antisense_name"] = join_descriptions(attr_values(antisense_hits, "Name"))
            peak.attr["Antisense_product"] = "Antisense " + join_descriptions(attr_values(antisense_hits, "Product"))
            peak.attr["Antisense_biotype"] = join_descriptions(attr_values(antisense_hits, "Biotype"))

    annotation.write_gff3(self.prefix+"-parent.gff", genes) #Hmm
    annotation.write_gff3(self.prefix+"-child.gff", peaks)
def run(self):
    """Call one 3' end per reference UTR from the most highly expressed peak.

    For every UTR in the reference, the peaks whose ``Parent`` attribute
    equals the UTR's ``Parent`` and whose ``Relation`` is "3'UTR" are
    considered, provided they end after position 0 in coordinates relative
    to the UTR extended by ``self.extension``. The candidate with the
    largest count, summed over ``self.samples`` (or all samples if none are
    given), is chosen; ties go to the smallest peak ID.

    Writes ``<prefix>-peaks.gff`` (chosen peaks), ``<prefix>-utrs.gff``
    (UTR 5' end spanned to the peak's 3' end) and ``<prefix>-genes.gff``
    (gene 5' end spanned to the peak's 3' end), and prints how many UTRs
    were and were not called.
    """
    ref = env.load_ref(self.reference_dir)
    analysis = env.load_analysis(self.analysis_dir)

    samples = self.samples
    if not samples:
        samples = analysis.peak_counts['Count'].value_type().keys()

    def total_count(peak_id):
        return sum(analysis.peak_counts['Count'][peak_id][sample] for sample in samples)

    # Group peaks by the gene they were assigned to
    peaks_of = collections.defaultdict(list)
    for peak in analysis.peaks.values():
        if "Parent" in peak.attr:
            peaks_of[peak.attr["Parent"]].append(peak)

    called_peaks = [ ]
    called_utrs = [ ]
    called_genes = [ ]
    n_good = 0
    n_bad = 0

    for utr in ref.utrs.values():
        parent_id = utr.attr['Parent']
        query = _extend(utr, self.extension)

        candidates = [ ]
        for peak in peaks_of.get(parent_id, ()):
            if peak.attr.get("Relation") != "3'UTR":
                continue
            if peak.relative_to(query).end <= 0:
                continue
            # Negated count so that the smallest tuple is the highest count
            candidates.append((-total_count(peak.get_id()), peak.get_id()))

        if not candidates:
            n_bad += 1
            continue

        # min() selects the same element that sorting and taking [0] would
        best_peak_id = min(candidates)[1]

        peak = analysis.peaks_asis[best_peak_id].copy()
        peak.attr['Parent'] = parent_id
        peak.attr['Name'] = utr.attr.get('Name','')
        peak.attr['Product'] = utr.attr.get('Product','')
        called_peaks.append(peak)

        called_utr = utr.five_prime().span_with(peak.three_prime())
        called_utr.attr['Peak'] = peak.get_id()
        called_utrs.append(called_utr)

        called_gene = ref.genes[parent_id].five_prime().span_with(peak.three_prime())
        called_gene.attr['Peak'] = peak.get_id()
        called_genes.append(called_gene)

        n_good += 1

    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print("%d UTR called" % n_good)
    print("%d no UTR called" % n_bad)

    annotation.write_gff3(self.prefix + '-peaks.gff', called_peaks)
    annotation.write_gff3(self.prefix + '-utrs.gff', called_utrs)
    annotation.write_gff3(self.prefix + '-genes.gff', called_genes)