def computeOverlapGO(infile, outfile):
    '''compute overlap between coding markers and windows.

    Only markers in certain GO categories are counted. This is done by
    setting the gene_id and transcript_id of a marker to those of the
    ENSEMBL gene that it overlaps. The gene list is filtered first to
    keep only those ids with valid GO associations.
    '''

    to_cluster = False

    filter_goid = set(IOTools.readList(open(PARAMS["filename_gofilter"])))
    filter_genes = set()

    E.info("number of goids: %i" % len(filter_goid))

    for l in open(PARAMS["filename_go"]):
        f, id, goid, desc, evd = l[:-1].split("\t")[:5]
        if goid in filter_goid:
            filter_genes.add(id)

    tmpfile1 = P.getTempFile(dir=".")

    for line in open("ensembl.diff.genes_ovl"):
        a, b = line[:-1].split("\t")
        if b not in filter_genes:
            continue
        tmpfile1.write(line)

    E.info("number of genes taken: %i" % len(filter_genes))

    tmpfile1.close()
    tmpfilename1 = tmpfile1.name
    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py --rename=gene \
    --apply=%(tmpfilename1)s \
    < %(infile)s > %(tmpfilename)s '''
    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed)
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''
    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
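# The GO filter above assumes two flat-file layouts: the file named by
# PARAMS["filename_gofilter"] holds one GO id per line, and the file named
# by PARAMS["filename_go"] is tab-separated with at least five columns
# (function class, gene id, GO id, description, evidence code). A minimal
# self-contained sketch of that filtering step follows; the helper name and
# filenames are hypothetical, not part of the pipeline above.

def readGOFilteredGenes(gofilter_path, go_path):
    '''return the set of gene ids annotated with at least one GO id
    listed in gofilter_path (illustrative helper).'''
    with open(gofilter_path) as infile:
        filter_goid = set(line.strip() for line in infile if line.strip())
    filter_genes = set()
    with open(go_path) as infile:
        for line in infile:
            fields = line.rstrip("\n").split("\t")[:5]
            if len(fields) < 5:
                continue
            go_class, gene_id, goid, description, evidence = fields
            if goid in filter_goid:
                filter_genes.add(gene_id)
    return filter_genes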
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64',
                               'integer', 'illumina-1.8'),
                      help="guess quality score format and set quality "
                      "scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64',
                               'integer', 'illumina-1.8'),
                      help="quality score format to assume if ambiguous "
                      "[default=%default].")

    parser.add_option("--sample", dest="sample", type="float",
                      help="sample a proportion of reads [default=%default].")

    parser.add_option("--pair", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--outfile-pair", dest="outfile_pair", type="string",
                      help="if data is paired, filename for second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option("--uniq", dest="uniq", action="store_true",
                      help="remove duplicate reads (by name) "
                      "[default=%default].")

    parser.add_option("--apply", dest="apply", type="string",
                      help="apply a filter to fastq file (taking only reads "
                      "in filename) [default=%default].")

    parser.add_option("--trim3", dest="trim3", type="int",
                      help="trim # bases from 3' end [default=%default].")

    parser.add_option("--sort", dest="sort", action="store_true",
                      help="sort fastq by sequence id [default=%default].")

    parser.add_option("--seed", dest="seed", type="int",
                      help="seed for random number generator "
                      "[default=%default].")

    parser.add_option("--renumber-ids", dest="renumber_ids", type="string",
                      help="rename reads in file by pattern "
                      "[default=%default]")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        sample=None,
        trim3=None,
        pair=None,
        apply=None,
        uniq=False,
        outfile_pair=None,
        sort=None,
        seed=None,
        renumber_ids=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    if options.change_format:
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.change_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.sample:
        sample_threshold = min(1.0, options.sample)
        random.seed(options.seed)

        if options.pair:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            # single-ended input: sample reads from stdin only
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.apply:
        ids = set(IOTools.readList(IOTools.openFile(options.apply)))
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.trim3:
        trim3 = options.trim3
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.uniq:
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.sort:
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.outfile_pair:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--outfile-pair)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.outfile_pair, "w")

            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1), \
                "paired files do not contain the same reads - " \
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0],
                                entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0],
                                entries2[entry][1]))

    elif options.renumber_ids:
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_ids % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
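# Sampling above is a per-read Bernoulli draw against sample_threshold, so
# the output holds approximately, not exactly, sample * input reads; for
# paired data a single draw gates both mates, keeping the two files in
# sync. A standalone sketch of the same idea, with made-up read ids and a
# hypothetical helper name:

import random

def sampleRecords(records, threshold):
    '''yield each record independently with probability threshold
    (illustrative helper mirroring the --sample branch above).'''
    for record in records:
        if random.random() <= threshold:
            yield record

random.seed(42)
kept = list(sampleRecords(("read_%06i" % i for i in range(1000)), 0.1))
print("kept %i of 1000 reads" % len(kept))  # close to 100, varies with seed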
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: gtf2alleles.py 2886 2010-04-07 08:47:46Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")
    parser.add_option("-t", "--tablename", dest="tablename", type="string",
                      help="tablename to get variants from (in samtools "
                      "pileup format) [default=%default].")
    parser.add_option("-d", "--database", dest="database", type="string",
                      help="sqlite3 database [default=%default].")
    parser.add_option("-f", "--exons-file", dest="filename_exons",
                      type="string",
                      help="filename with transcript model information "
                      "(gtf formatted file) [default=%default].")
    parser.add_option("-r", "--filename-reference", dest="filename_reference",
                      type="string",
                      help="filename with transcript models of a reference "
                      "gene set. Stop codons that do not overlap any of the "
                      "exons in this file are ignored (gtf-formatted file) "
                      "[default=%default].")
    parser.add_option("--vcf-file", dest="filename_vcf", type="string",
                      help="filename with variants in VCF format. Should be "
                      "indexed by tabix [default=%default].")
    parser.add_option("--pileup-file", dest="filename_pileup", type="string",
                      help="filename with variants in samtools pileup "
                      "format. Should be indexed by tabix "
                      "[default=%default].")
    parser.add_option("--vcf-sample", dest="vcf_sample", type="string",
                      help="sample id for species of interest in vcf "
                      "formatted file [default=%default].")
    parser.add_option("-s", "--seleno-tsv-file", dest="filename_seleno",
                      type="string",
                      help="filename of a list of transcript ids that are "
                      "selenoproteins [default=%default].")
    parser.add_option("-m", "--module", dest="modules", type="choice",
                      action="append",
                      choices=("gene-counts", "transcript-effects"),
                      help="modules to apply [default=%default].")
    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      action="append",
                      choices=("all", "peptide", "cds", "table", "gtf",
                               "map"),
                      help="sections to output [default=%default].")
    parser.add_option("-k", "--with-knockouts", dest="with_knockouts",
                      action="store_true",
                      help="add alleles that are knocked out to fasta and "
                      "gtf files [default=%default].")

    parser.set_defaults(
        genome_file=None,
        filename_exons=None,
        filename_reference=None,
        filename_seleno=None,
        modules=[],
        border=200,
        separator="|",
        tablename=None,
        database="csvdb",
        output=[],
        with_knockouts=False,
        filename_vcf=None,
        vcf_sample=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ninput, nskipped, noutput = 0, 0, 0

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.filename_seleno:
        seleno = set(IOTools.readList(open(options.filename_seleno, "r")))
    else:
        seleno = {}

    infile_gtf = GTF.gene_iterator(GTF.iterator(options.stdin))

    # acquire variants from SQLite database
    if options.tablename:
        if not options.database:
            raise ValueError("please supply both database and tablename")
        variant_getter = VariantGetterSqlite(
            options.database, options.tablename)
    elif options.filename_pileup:
        variant_getter = VariantGetterPileup(options.filename_pileup)
    elif options.filename_vcf:
        variant_getter = VariantGetterVCF(
            options.filename_vcf, options.vcf_sample)
    else:
        raise ValueError("please specify a source of variants.")

    if len(options.output) == 0 or "all" in options.output:
        output_all = True
    else:
        output_all = False

    if "cds" in options.output or output_all:
        outfile_cds = E.openOutputFile("cds.fasta")
    else:
        outfile_cds = None

    if "map" in options.output or output_all:
        outfile_map = E.openOutputFile("map.psl")
    else:
        outfile_map = None

    if "peptide" in options.output or output_all:
        outfile_peptides = E.openOutputFile("peptides.fasta")
    else:
        outfile_peptides = None

    if "table" in options.output or output_all:
        outfile_alleles = E.openOutputFile("table")
        outfile_alleles.write("\t".join(
            ("gene_id", "transcript_id", "allele_id", "contig", "strand",
             "is_wildtype", ("\t".join(Allele._fields)))) + "\n")
    else:
        outfile_alleles = None

    if "gtf" in options.output or output_all:
        outfile_gtf = E.openOutputFile("gtf")
    else:
        outfile_gtf = None

    # id separator
    separator = options.separator

    for transcripts in infile_gtf:

        gene_id = transcripts[0][0].gene_id

        overall_start = min([min([x.start for x in y]) for y in transcripts])
        overall_end = max([max([x.end for x in y]) for y in transcripts])
        contig = transcripts[0][0].contig
        strand = transcripts[0][0].strand
        is_positive_strand = Genomics.IsPositiveStrand(strand)
        lcontig = fasta.getLength(contig)
        E.info("%s: started processing on %s:%i..%i (%s)" %
               (gene_id, contig, overall_start, overall_end, strand))

        ninput += 1
        extended_start = max(0, overall_start - options.border)
        extended_end = min(lcontig, overall_end + options.border)

        # if contig.startswith("chr"): contig = contig[3:]

        variants = variant_getter(contig, extended_start, extended_end)

        E.debug("%s: found %i variants in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print "# collected variants:", variants

        # collect intron/exon sequences
        # coordinates are forward/reverse
        # also updates the coordinates in transcripts
        all_exons, all_introns = collectExonIntronSequences(
            transcripts, fasta)

        # update variants such that they use the same coordinates
        # as the transcript
        variants = Variants.updateVariants(variants, lcontig, strand)

        # deal with overlapping but consistent variants
        variants = Variants.mergeVariants(variants)

        E.debug("%s: found %i variants after merging in %s:%i..%i" %
                (gene_id, len(variants), contig, extended_start,
                 extended_end))

        if E.global_options.loglevel >= 10:
            print "# merged variants:", variants

        # collect coordinate offsets and remove conflicting variants
        variants, removed_variants, offsets = Variants.buildOffsets(
            variants, contig=contig)

        if len(removed_variants) > 0:
            E.warn("removed %i conflicting variants" % len(removed_variants))
            for v in removed_variants:
                E.info("removed variant: %s" % str(v))

        E.info("%i variants after filtering" % len(variants))

        if len(variants) > 0:
            # build variants
            indexed_variants = Variants.indexVariants(variants)

            # update exon sequences according to variants
            variant_exons = buildVariantSequences(indexed_variants,
                                                  all_exons)

            # update intron sequences according to variants
            variant_introns = buildVariantSequences(indexed_variants,
                                                    all_introns)

            if E.global_options.loglevel >= 10:
                for key in variant_exons:
                    print "exon", key
                    Genomics.printPrettyAlignment(
                        all_exons[key],
                        variant_exons[key][0],
                        variant_exons[key][1],
                    )
                for key in variant_introns:
                    print "intron", key
                    Genomics.printPrettyAlignment(
                        all_introns[key][:30] + all_introns[key][-30:],
                        variant_introns[key][0][:30] +
                        variant_introns[key][0][-30:],
                        variant_introns[key][1][:30] +
                        variant_introns[key][1][-30:])
        else:
            variant_exons, variant_introns = None, None

        for transcript in transcripts:

            transcript.sort(key=lambda x: x.start)

            transcript_id = transcript[0].transcript_id

            alleles = buildAlleles(transcript,
                                   variant_exons,
                                   variant_introns,
                                   all_exons,
                                   all_introns,
                                   offsets,
                                   is_seleno=transcript_id in seleno,
                                   reference_coordinates=False,
                                   )

            ##############################################################
            # output
            for aid, al in enumerate(alleles):

                allele, map_cds2reference = al

                reference_cds_sequence = buildCDSSequence(
                    transcript, all_exons)
                is_wildtype = reference_cds_sequence == allele.cds

                allele_id = str(aid)
                assert len(allele.exon_starts) == allele.nexons
                assert len(allele.cds_starts) == allele.nexons
                assert len(allele.frames) == allele.nexons

                # the output id
                outid = separator.join((gene_id, transcript_id, allele_id))

                # output map between cds and reference
                if outfile_map and map_cds2reference:
                    match = Blat.Match()
                    match.mQueryId = allele_id
                    match.mQueryLength = allele.cds_len
                    match.mSbjctId = contig
                    match.mSbjctLength = lcontig
                    match.strand = strand
                    match.fromMap(map_cds2reference, use_strand=True)
                    outfile_map.write("%s\n" % str(match))

                # only output sequences for genes that have not been
                # knocked out, unless required
                if not allele.is_nmd_knockout or options.with_knockouts:

                    if outfile_gtf:
                        gtf = GTF.Entry()
                        gtf.gene_id = gene_id
                        gtf.transcript_id = transcript_id
                        gtf.addAttribute("allele_id", allele_id)
                        gtf.contig = contig
                        gtf.strand = strand
                        gtf.feature = "CDS"
                        gtf.source = "gtfxnsps"
                        l = 0
                        last_cds_start = allele.cds_starts[0]
                        gtf.start = allele.exon_starts[0]
                        gtf.frame = allele.frames[0]

                        for exon_start, cds_start, frame in zip(
                                allele.exon_starts[1:],
                                allele.cds_starts[1:],
                                allele.frames[1:]):
                            cds_length = cds_start - last_cds_start
                            gtf.end = gtf.start + cds_length
                            if not is_positive_strand:
                                gtf.start, gtf.end = \
                                    lcontig - gtf.end, lcontig - gtf.start
                            outfile_gtf.write(str(gtf) + "\n")

                            gtf.start = exon_start
                            gtf.frame = frame

                            l += cds_length
                            last_cds_start = cds_start

                        cds_length = len(allele.cds) - last_cds_start
                        gtf.end = gtf.start + cds_length
                        if not is_positive_strand:
                            gtf.start, gtf.end = \
                                lcontig - gtf.end, lcontig - gtf.start
                        outfile_gtf.write(str(gtf) + "\n")

                    if outfile_cds:
                        outfile_cds.write(">%s\n%s\n" % (outid, allele.cds))
                    if outfile_peptides:
                        outfile_peptides.write(
                            ">%s\n%s\n" % (outid, allele.peptide))

                # reformat for tabular output
                allele = allele._replace(
                    cds_starts=",".join(map(str, allele.cds_starts)),
                    exon_starts=",".join(map(str, allele.exon_starts)),
                    frames=",".join(map(str, allele.frames)))

                # convert reference coordinates to positive strand
                # coordinates
                if allele.reference_first_stop_start >= 0 \
                        and not is_positive_strand:
                    allele = allele._replace(
                        reference_first_stop_start=lcontig -
                        allele.reference_first_stop_end,
                        reference_first_stop_end=lcontig -
                        allele.reference_first_stop_start,
                    )

                if outfile_alleles:
                    outfile_alleles.write("%s\t%s\n" % (
                        "\t".join((gene_id,
                                   transcript_id,
                                   allele_id,
                                   contig,
                                   strand,
                                   "%i" % is_wildtype)),
                        "\t".join(map(str, allele))))

                noutput += 1
                # only output first allele (debugging)
                # break

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    # write footer and output benchmark information.
    E.Stop()
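# The strand conversions above all use the same identity: on a contig of
# length lcontig, a half-open interval [start, end) on one strand maps to
# [lcontig - end, lcontig - start) on the other. A self-contained check of
# that flip (helper name is illustrative only):

def flipInterval(start, end, lcontig):
    '''map an interval to the opposite strand of a contig.'''
    return lcontig - end, lcontig - start

rev = flipInterval(10, 25, 100)                       # -> (75, 90)
assert flipInterval(rev[0], rev[1], 100) == (10, 25)  # flipping twice is a no-op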
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentage information to each line.")

    parser.add_option("-t", "--header-names", dest="headers", type="string",
                      help="comma separated list of headers. If empty or "
                      "set to '-', filenames are used.")

    parser.add_option("--skip-header", dest="add_header",
                      action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--output-with-header", dest="write_header",
                      action="store_true",
                      help="write header and exit.")

    parser.add_option("--with-title", dest="with_title", action="store_true",
                      help="use column titles in input data [%default].")

    parser.add_option("--no-title", dest="with_title", action="store_false",
                      help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    (options, args) = E.Start(parser)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

    if options.write_header:
        sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as "
                    "there are filenames.")

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(
                IOTools.openFile(f, "r"), with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    if not headers:
        # fall back to column titles if present, otherwise to filenames;
        # headers given on the command line are kept as-is
        headers = titles if titles else args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)

            options.stdout.write(
                "%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" %
                (headers[x], headers[y],
                 l1, l2,
                 len(set1.union(set2)),
                 len(set1.intersection(set2)),
                 len(set1.difference(set2)),
                 len(set2.difference(set1))))

            if options.add_percent:
                if len(set1) == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif len(set2) == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    i = len(set1.intersection(set2))
                    ri, r1, r2 = (
                        i / float(len(set1.union(set2))),
                        len(set1.difference(set2)) / float(l1),
                        len(set2.difference(set1)) / float(l2))
                    c1, c2 = (i / float(l1), i / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" + ("\t".join(
                        [options.percent_format for z in range(6)])) %
                    (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.Stop()
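# A toy worked example of the pairwise statistics emitted above, using
# in-memory sets instead of input files. With set1 = {a, b, c} and
# set2 = {b, c, d}: union = 4, inter = 2, unique1 = unique2 = 1,
# pinter = 2/4 = 0.5 and both coverages pcov1 = pcov2 = 2/3.

set1 = set(["a", "b", "c"])
set2 = set(["b", "c", "d"])
n1, n2 = len(set1), len(set2)
union = len(set1.union(set2))
inter = len(set1.intersection(set2))
unique1 = len(set1.difference(set2))
unique2 = len(set2.difference(set1))
print("%i\t%i\t%i\t%i\t%i\t%i" % (n1, n2, union, inter, unique1, unique2))
print("pinter=%5.2f pcov1=%5.2f pcov2=%5.2f" %
      (inter / float(union), inter / float(n1), inter / float(n2)))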
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("apply", "change-format", "renumber-reads",
                               "sample", "sort", "trim3", "trim5", "unique",
                               "grep"),
                      help="method to apply [%default]")

    parser.add_option(
        "--target-format", dest="target_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="guess quality score format and set quality scores "
        "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous "
        "[default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample. "
        "Provide a proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option(
        "--pair-fastq-file", dest="pair", type="string",
        help="if data is paired, filename with second pair. "
        "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option(
        "--num-bases", dest="nbases", type="int",
        help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        change_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        map_tsv_file=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)
        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename pattern for "
                    "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            # single-ended input: sample reads from stdin only
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        # identifiers to keep are supplied via --map-tsv-file
        ids = set(IOTools.readList(IOTools.openFile(options.map_tsv_file)))
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1), \
                "paired files do not contain the same reads - " \
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0],
                                entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0],
                                entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
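# The single-file sort above shells out to "paste - - - -", which folds
# each four-line fastq record onto one tab-separated line so that plain
# sort can order records by identifier before tr unfolds them again. A
# pure-Python sketch of the same fold-sort-unfold idea (it reads the whole
# stream into memory, like the paired branch; the helper name is made up):

import sys

def sortFastq(infile, outfile):
    '''sort a fastq stream by record identifier (illustrative helper).'''
    lines = [line.rstrip("\n") for line in infile]
    records = [lines[i:i + 4] for i in range(0, len(lines), 4)]
    records.sort(key=lambda record: record[0])
    for record in records:
        outfile.write("\n".join(record) + "\n")

# usage: sortFastq(sys.stdin, sys.stdout)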
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file",
                      type="string",
                      help="filename with genome.")

    # the value is read below as options.feature
    parser.add_option("-f", "--features", dest="feature", type="string",
                      help="feature to collect [default=None].")

    parser.add_option("-i", "--files", dest="files", action="append",
                      help="use multiple annotations [default=None].")

    parser.add_option("-a", "--annotations", dest="annotations",
                      type="string",
                      help="aggregate name for annotations if only single "
                      "file is provided from STDIN [default=None].")

    parser.add_option("--input-filename-map", dest="input_filename_map",
                      type="string",
                      help="filename with a map of gene_ids to categories "
                      "[default=None].")

    parser.add_option("--output-filename-synonyms",
                      dest="output_filename_synonyms", type="string",
                      help="output filename for synonyms. For workspace "
                      "building, the gff source will be used as the id "
                      "(instead of the contig) [default=None].")

    parser.add_option("-m", "--max-length", dest="max_length", type="int",
                      help="maximum segment length [default=None].")

    parser.add_option("-s", "--section", dest="section", type="choice",
                      choices=("segments", "annotations",
                               "annotations-genes", "annotations-go",
                               "workspace", "annotations-gff"),
                      help="annotator section [default=None].")

    parser.add_option("--subset", dest="subsets", type="string",
                      action="append",
                      help="add filenames to delimit subsets within the gff "
                      "files. The syntax is filename.gff,label,filename.ids "
                      "[default=None].")

    parser.add_option("--remove-regex", dest="remove_regex", type="string",
                      help="regular expression of contigs to remove "
                      "[default=None].")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        output_filename_synonyms=None,
        input_format="gff",
        remove_regex=None,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith("annotations"):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile(options.remove_regex)

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered(GFF.iterator(options.stdin),
                                         feature=options.feature)

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals(iterator, with_records=with_records)
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments(
                options.stdout,
                intervals,
                options.section,
                outfile_synonyms=outfile_synonyms,
                max_length=options.max_length,
                remove_regex=options.remove_regex)

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, \
            "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         feature=options.feature)

        geneid2categories = IOTools.readMultiMap(
            open(options.input_filename_map, "r"))

        category2segments = collections.defaultdict(list)

        for contig, gffs in GTF.readAsIntervals(
                iterator, with_gene_id=True).items():
            if options.remove_regex and options.remove_regex.search(contig):
                continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories:
                    continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write(
                    "%s\t%i\t%s\t(%i,%i)\n" %
                    (prefix, nsegments, contig, start, end))
                nsegments += 1

        for category, segments in category2segments.iteritems():
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (category, "\t".join(["%i" % x for x in segments])))
            E.info("set %s annotated with %i segments" %
                   (category, len(segments)))

    elif options.section == "annotations":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered(GFF.iterator(sys.stdin),
                                                 feature=options.feature)
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                # the file handle must stay open while the iterator
                # is consumed below
                infile = open(filename, "r")
                iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                                 feature=options.feature)
            else:
                infile = open(filename, "r")
                iterator = GTF.iterator_filtered(GFF.iterator(infile),
                                                 feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (filename,
                     "\t".join(["%i" % x
                                for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))
            else:
                raise ValueError("don't know how to filter %s" % filename)

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator(sys.stdin)
            else:
                iterator = GTF.iterator_filtered(
                    GFF.iterator(open(filename, "r")))

            segments = collections.defaultdict(list)
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append(
                    (gff.contig, gff.start, gff.end))

            feature2segments = {}

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    options.stdout.write(
                        "%s\t%i\t%s\t(%i,%i)\n" %
                        (prefix, nsegments, contig, start, end))
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

            for feature, id_range in feature2segments.iteritems():
                start, end = id_range
                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (feature,
                     "\t".join(["%i" % x for x in xrange(start, end)])))
                E.info("set %s annotated with %i segments" %
                       (feature, end - start))

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments

            assert filename.endswith(".gtf") or \
                filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, " \
                "received %s" % filename

            infile = IOTools.openFile(filename)
            iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                             feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                # output all
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (filename,
                     "\t".join(["%i" % x
                                for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))
            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label, label2segments = \
                    collections.defaultdict(list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1

                for label, segments in label2segments.iteritems():
                    options.stdout.write(
                        "##Ann\t%s\t%s\n" %
                        (label, "\t".join(["%i" % x for x in segments])))
                    E.info("set %s (%s) annotated with %i segments" %
                           (label, filename, len(segments)))

    E.info("ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
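# Every section above writes the same two-part plain-text layout: one
# "<prefix>\t<segment id>\t<contig>\t(<start>,<end>)" line per segment,
# followed by "##Ann\t<label>\t<id>..." lines tying segment ids to a
# label. A toy writer for that layout, with made-up intervals and labels
# and a hypothetical helper name:

import sys

def writeAnnotatorSegments(outfile, label2intervals, prefix="##Id"):
    '''emit segment and ##Ann lines for a dict of label -> intervals
    (illustrative helper, not part of the script above).'''
    nsegments = 0
    label2ids = {}
    for label, intervals in sorted(label2intervals.items()):
        ids = []
        for contig, start, end in intervals:
            outfile.write("%s\t%i\t%s\t(%i,%i)\n" %
                          (prefix, nsegments, contig, start, end))
            ids.append(nsegments)
            nsegments += 1
        label2ids[label] = ids
    for label, ids in sorted(label2ids.items()):
        outfile.write("##Ann\t%s\t%s\n" %
                      (label, "\t".join(["%i" % x for x in ids])))

writeAnnotatorSegments(sys.stdout,
                       {"exons": [("chr1", 100, 200), ("chr2", 50, 80)]})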
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("apply", "change-format", "renumber-reads",
                               "sample", "sort", "trim3", "trim5", "unique",
                               "grep"),
                      help="method to apply [%default]")

    parser.add_option("--target-format", dest="target_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer',
                               'illumina-1.8'),
                      help="guess quality score format and set quality scores "
                      "to format [default=%default].")

    parser.add_option(
        "--guess-format", dest="guess_format", type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer', 'illumina-1.8'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option(
        "--sample-size", dest="sample_size", type="float",
        help="proportion of reads to sample, e.g. 0.1 for 10%, "
        "0.5 for 50%, etc [default=%default].")

    parser.add_option("--pair-fastq-file", dest="pair", type="string",
                      help="if data is paired, filename with second pair. "
                      "Implemented for sampling [default=%default].")

    parser.add_option(
        "--map-tsv-file", dest="map_tsv_file", type="string",
        help="filename with tab-separated identifiers mapping for "
        "method apply [default=%default].")

    parser.add_option("--num-bases", dest="nbases", type="int",
                      help="number of bases to trim [default=%default].")

    parser.add_option(
        "--seed", dest="seed", type="int",
        help="seed for random number generator [default=%default].")

    parser.add_option(
        "--pattern-identifier", dest="renumber_pattern", type="string",
        help="rename reads in file by pattern [default=%default]")

    parser.add_option(
        "--grep-pattern", dest="grep_pattern", type="string",
        help="subset to reads matching pattern [default=%default]")

    parser.set_defaults(
        method=None,
        target_format=None,
        guess_format=None,
        sample_size=0.1,
        nbases=0,
        pair=None,
        map_tsv_file=None,
        seed=None,
        renumber_pattern="read_%010i",
        grep_pattern=".*")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    c = E.Counter()

    if options.method == "change-format":
        for record in Fastq.iterate_convert(options.stdin,
                                            format=options.target_format,
                                            guess=options.guess_format):
            c.input += 1
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "grep":
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.match(options.grep_pattern, record.seq):
                options.stdout.write("%s\n" % record)
                c.output += 1

    elif options.method == "sample":
        sample_threshold = min(1.0, options.sample_size)

        random.seed(options.seed)

        if options.pair:
            if not options.output_filename_pattern:
                raise ValueError("please specify output filename pattern for "
                                 "second pair (--output-filename-pattern)")

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    outfile1.write("%s\n" % record1)
                    outfile2.write("%s\n" % record2)
        else:
            for record in Fastq.iterate(options.stdin):
                c.input += 1
                if random.random() <= sample_threshold:
                    c.output += 1
                    options.stdout.write("%s\n" % record)

    elif options.method == "apply":
        ids = set(IOTools.readList(IOTools.openFile(options.map_tsv_file)))

        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if re.sub(" .*", "", record.identifier).strip() in ids:
                c.output += 1
                options.stdout.write("%s\n" % record)

    elif options.method == "trim3":
        trim3 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim(trim3)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "trim5":
        trim5 = options.nbases
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.trim5(trim5)
            options.stdout.write("%s\n" % record)
            c.output += 1

    elif options.method == "unique":
        keys = set()
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            if record.identifier in keys:
                continue
            else:
                keys.add(record.identifier)
            options.stdout.write("%s\n" % record)
            c.output += 1

    # Need to change this to incorporate both pairs
    elif options.method == "sort":
        if not options.pair:
            # This is quicker for a single fastq file: hand the work to the
            # shell, which reads this process' stdin and writes its stdout.
            statement = "paste - - - - | sort -k1,1 -t ' ' | tr '\t' '\n'"
            os.system(statement)
        else:
            if not options.output_filename_pattern:
                raise ValueError(
                    "please specify output filename for second pair "
                    "(--output-filename-pattern)")
            E.warn("consider sorting individual fastq files - "
                   "this is memory intensive")
            entries1 = {}
            entries2 = {}

            for record1, record2 in itertools.izip(
                    Fastq.iterate(options.stdin),
                    Fastq.iterate(IOTools.openFile(options.pair))):
                entries1[record1.identifier[:-2]] = (record1.seq,
                                                     record1.quals)
                entries2[record2.identifier[:-2]] = (record2.seq,
                                                     record2.quals)

            outfile1 = options.stdout
            outfile2 = IOTools.openFile(options.output_filename_pattern, "w")
            assert len(set(entries1.keys()).intersection(
                set(entries2.keys()))) == len(entries1),\
                "paired files do not contain the same reads "\
                "need to reconcile files"

            for entry in sorted(entries1):
                outfile1.write("@%s/1\n%s\n+\n%s\n" %
                               (entry, entries1[entry][0],
                                entries1[entry][1]))
                outfile2.write("@%s/2\n%s\n+\n%s\n" %
                               (entry, entries2[entry][0],
                                entries2[entry][1]))

    elif options.method == "renumber-reads":
        id_count = 1
        for record in Fastq.iterate(options.stdin):
            c.input += 1
            record.identifier = options.renumber_pattern % id_count
            id_count += 1
            options.stdout.write("@%s\n%s\n+\n%s\n" %
                                 (record.identifier, record.seq,
                                  record.quals))
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
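# A hedged usage sketch for the method-based interface above. The script
# name fastq2fastq.py and the filenames are assumptions; the options are
# taken from the parser definitions.
#
#   # sample ~10% of read pairs, keeping both files in sync:
#   python fastq2fastq.py --method=sample --sample-size=0.1 \
#          --seed=42 \
#          --pair-fastq-file=in.2.fastq \
#          --output-filename-pattern=out.2.fastq \
#          < in.1.fastq > out.1.fastq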
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv == None: argv = sys.argv

    parser = E.OptionParser( version = "%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
                             usage = globals()["__doc__"] )

    parser.add_option( "-g", "--genome-file", dest="genome_file", type="string",
                       help="filename with genome." )

    parser.add_option( "-f", "--features", dest="feature", type="string",
                       help="feature to collect [default=None]." )

    parser.add_option( "-i", "--files", dest="files", action="append",
                       help="use multiple annotations [default=None]." )

    parser.add_option( "-a", "--annotations", dest="annotations", type="string",
                       help="aggregate name for annotations if only single file is provided from STDIN [default=None]." )

    parser.add_option( "--input-filename-map", dest="input_filename_map", type="string",
                       help="filename with a map of gene_ids to categories [default=None]." )

    parser.add_option( "--output-filename-synonyms", dest="output_filename_synonyms", type="string",
                       help="output filename for synonyms. For workspace building, the gff source will be used as the id (instead of the contig) [default=None]." )

    parser.add_option( "-m", "--max-length", dest="max_length", type="int",
                       help="maximum segment length [default=None]." )

    parser.add_option( "-s", "--section", dest="section", type="choice",
                       choices=("segments", "annotations", "annotations-genes",
                                "annotations-go", "workspace", "annotations-gff" ),
                       help="annotator section [default=None]." )

    parser.add_option( "--subset", dest="subsets", type="string", action="append",
                       help="add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids [default=None]." )

    parser.add_option( "--remove-regex", dest="remove_regex", type="string",
                       help="regular expression of contigs to remove [default=None]." )

    parser.set_defaults(
        genome_file = None,
        feature = None,
        section = "segments",
        annotations = "annotations",
        max_length = 100000,
        files = [],
        subsets = [],
        input_filename_map = None,
        output_filename_synonyms = None,
        input_format = "gff",
        remove_regex = None,
        )

    (options, args) = E.Start( parser )

    options.files += args
    if len(options.files) == 0: options.files.append("-")
    options.files = list( itertools.chain( *[ re.split( "[,; ]+", x) for x in options.files ] ) )

    if options.subsets:
        subsets = collections.defaultdict( list )
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split( "," )
            subsets[filename_gff].append( (label, filename_ids) )
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith( "annotations" ):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile( options.remove_regex )

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered( GFF.iterator( options.stdin ),
                                          feature=options.feature )

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals( iterator, with_records = with_records )
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments( options.stdout,
                                               intervals,
                                               options.section,
                                               outfile_synonyms = outfile_synonyms,
                                               max_length = options.max_length,
                                               remove_regex = options.remove_regex )

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered( GTF.iterator( options.stdin ),
                                          feature=options.feature )

        geneid2categories = IOTools.readMultiMap( open( options.input_filename_map, "r") )

        category2segments = collections.defaultdict( list )

        for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():
            if options.remove_regex and options.remove_regex.search( contig ): continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories: continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                nsegments += 1

        for category, segments in category2segments.iteritems():
            options.stdout.write("##Ann\t%s\t%s\n" % (category, "\t".join( ["%i" % x for x in segments ] ) ) )
            E.info( "set %s annotated with %i segments" % (category, len(segments)) )

    elif options.section == "annotations":

        for filename in options.files:

            E.info( "adding filename %s" % filename )

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered( GFF.iterator( sys.stdin ),
                                                  feature=options.feature )
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                # do not use a with-block here: the iterator is consumed
                # further below, after the block would have closed the file.
                infile = open( filename, "r")
                iterator = GTF.iterator_filtered( GTF.iterator( infile ),
                                                  feature=options.feature )
            else:
                infile = open( filename, "r")
                iterator = GTF.iterator_filtered( GFF.iterator( infile ),
                                                  feature=options.feature )

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals( iterator ).items():
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for x in gffs:
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, x[0], x[1] ) )
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join( ["%i" % x for x in range(start, nsegments) ] ) ) )
                E.info( "set %s annotated with %i segments" % (filename, nsegments - start) )
            else:
                raise ValueError("don't know how to filter %s" % filename )

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator( sys.stdin )
            else:
                iterator = GTF.iterator_filtered( GFF.iterator( open( filename, "r") ) )

            segments = collections.defaultdict( list )
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append( (gff.contig, gff.start, gff.end) )

            feature2segments = {}

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

            for feature, id_range in feature2segments.iteritems():
                start, end = id_range
                options.stdout.write("##Ann\t%s\t%s\n" % (feature, "\t".join( ["%i" % x for x in xrange( start, end) ] ) ) )
                E.info( "set %s annotated with %i segments" % (feature, end - start) )

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info( "adding filename %s" % filename )

            start = nsegments

            assert filename.endswith(".gtf") or filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, received %s" % filename

            infile = IOTools.openFile( filename )
            iterator = GTF.iterator_filtered( GTF.iterator( infile ),
                                              feature=options.feature )

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                ## output all
                for contig, gffs in GTF.readAsIntervals( iterator ).items():
                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for x in gffs:
                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, x[0], x[1] ) )
                        nsegments += 1

                options.stdout.write("##Ann\t%s\t%s\n" % (filename, "\t".join( ["%i" % x for x in range(start, nsegments) ] ) ) )
                E.info( "set %s annotated with %i segments" % (filename, nsegments - start) )
            else:
                ## create subsets
                E.debug("applying subsets for %s" % filename )

                geneid2label, label2segments = collections.defaultdict(list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList( open(filename_ids, "r") )
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append( label )
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals( iterator, with_gene_id = True ).items():

                    if options.remove_regex and options.remove_regex.search( contig ): continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label: continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write( "%s\t%i\t%s\t(%i,%i)\n" % (prefix, nsegments, contig, start, end ) )
                        nsegments += 1

                for label, segments in label2segments.iteritems():
                    options.stdout.write("##Ann\t%s\t%s\n" % (label, "\t".join( ["%i" % x for x in segments ] ) ) )
                    E.info( "set %s (%s) annotated with %i segments" % (label, filename, len(segments)) )

    E.info( "ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" % (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
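# A hedged usage sketch (filenames are made up; the script name comes from
# the $Id$ tag above). Build annotator input for a gene set split into
# labelled subsets:
#
#   python gff2annotator2tsv.py --section=annotations-genes \
#          --features=exon \
#          --subset=genes.gtf,upregulated,up.ids \
#          genes.gtf > genes.annotations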
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bamfile", type="string",
                      help="input bamfile to filter reads from")

    parser.add_option("-r", "--reads", dest="reads", type="choice",
                      choices=("mapped", "unmapped"),
                      help="type of read to keep")

    parser.add_option("-s", "--scriptsdir", dest="scriptsdir", type="string",
                      help="CGAT scripts directory")

    parser.add_option("-i", "--invert", dest="invert", action="store_true",
                      help="invert selection - if for example unmapped reads "
                      "aren't output")

    parser.set_defaults(bamfile=None,
                        reads="mapped",
                        scriptsdir="/ifs/devel/nicki/cgat_git/cgat/scripts",
                        invert=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()
    c.input_alignments = 0
    c.input_reads = 0
    c.output_reads = 0

    # output text file for reads TO KEEP
    bam = pysam.Samfile(options.bamfile, "rb")
    temp = P.getTempFile(".")

    E.info("iterating over bam file")
    for alignment in bam.fetch(until_eof=True):
        c.input_alignments += 1
        if options.reads == "unmapped":
            if alignment.is_unmapped:
                temp.write(alignment.qname + "\n")
        elif options.reads == "mapped":
            if not alignment.is_unmapped:
                temp.write(alignment.qname + "\n")
    temp.close()

    tempname = temp.name

    E.info("filtering fastq file")
    # filter fastq file against the set of read names kept above
    ids = set(IOTools.readList(IOTools.openFile(tempname)))
    c.input_alignments = len(ids)
    for fastq in Fastq.iterate(options.stdin):
        c.input_reads += 1
        if (fastq.identifier.endswith("/1") or
                fastq.identifier.endswith("/2")) \
                and " " not in fastq.identifier:
            identifier = fastq.identifier[:-2]
        elif len(fastq.identifier.split(" ")) == 2:
            identifier = fastq.identifier.split(" ")[0]
        else:
            identifier = fastq.identifier
        if not options.invert:
            if identifier in ids:
                c.output_reads += 1
                options.stdout.write("%s\n" % fastq)
        else:
            if identifier in ids:
                continue
            c.output_reads += 1
            options.stdout.write("%s\n" % fastq)
    E.info(c)
    os.unlink(tempname)

    # write footer and output benchmark information.
    E.Stop()
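# A hedged usage sketch. The script name is hypothetical - the code above
# does not reveal it - and the filenames are made up:
#
#   python filter_fastq_by_bam.py --bamfile=aligned.bam --reads=unmapped \
#          < reads.fastq > unmapped.fastq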
    if len(args) < 2:
        raise ValueError( "please supply at least two filenames." )

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError( "please supply the same number of headers as there are filenames." )

    for f in args:
        if options.with_title:
            title, data = IOTools.readList( open(f, "r"), with_title = options.with_title )
            titles.append( title )
        else:
            data = IOTools.readList( open(f, "r") )
        sets.append( set( data ) )

    # fall back to titles or filenames only if no headers were supplied
    if not headers:
        if titles:
            headers = titles
        else:
            headers = args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
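# A hedged sketch of the pairwise statistics this loop appears to be
# setting up; the excerpt breaks off here, so the exact fields written by
# the original are unknown:
#
#   nunion = len(set1.union(set2))
#   ninter = len(set1.intersection(set2))
#   nunique1 = len(set1.difference(set2))
#   nunique2 = len(set2.difference(set1))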