def score(args):
    """
    %prog score blastfile query.fasta A.ids

    Add up the scores for each query seq. Go through the lines and for each
    query sequence, add up the scores when subject is in each pile by A.ids.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added to it.
    from jcvi.formats.base import SetFile
    from jcvi.formats.fasta import Fasta

    p = OptionParser(score.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        # `not None` is True, so a usage error exits with a non-zero status
        # (assuming print_help() returns None, as optparse's does).
        sys.exit(not p.print_help())

    blastfile, fastafile, idsfile = args
    ids = SetFile(idsfile)

    # Accumulate, per query, the score of every hit whose subject is in `ids`.
    scores = defaultdict(int)
    for b in Blast(blastfile):
        if b.subject in ids:
            scores[b.query] += b.score

    logging.debug("A total of {0} ids loaded.".format(len(ids)))

    # Report one line per FASTA record, in the order given by
    # iterkeys_ordered(); records without a qualifying hit are reported as 0.
    f = Fasta(fastafile)
    for s in f.iterkeys_ordered():
        # .get() keeps the defaultdict from growing an entry per missing key.
        # Single-argument print(...) is valid in both Python 2 and Python 3.
        print("\t".join((s, str(scores.get(s, 0)))))
def script(args):
    """
    %prog script gffile cdna.fasta genome.fasta

    Parse gmap gff and produce script for sim4db to refine.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added to it.
    p = OptionParser(script.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        # Exit non-zero on a usage error, matching the sibling actions
        # (`not None` is True; a bare sys.exit(None) would report success).
        sys.exit(not p.print_help())

    gffile, cdnafasta, genomefasta = args
    scriptfile = gffile + ".script"
    gff = Gff(gffile)

    # Map every sequence name to its 0-based position in its FASTA file;
    # those positions are what get written after -e and -D below.
    cdnas = Fasta(cdnafasta, lazy=True)
    cdnas = dict((x, i) for (i, x) in enumerate(cdnas.iterkeys_ordered()))
    genomes = Fasta(genomefasta, lazy=True)
    genomes = dict((x, i) for (i, x) in enumerate(genomes.iterkeys_ordered()))

    extra = 50000  # 50-kb region surrounding the locus

    # `with` guarantees the script file is flushed and closed, even when a
    # lookup below raises part-way through the GFF.
    with open(scriptfile, "w") as fw:
        for g in gff:
            if g.type != "mRNA":
                continue

            ci = cdnas[g.attributes["Name"][0]]
            gi = genomes[g.seqid]

            strand = "-r" if g.strand == "-" else "-f"
            # Pad the locus on both sides, clamping the left edge at 0.
            start = max(0, g.start - extra)
            end = g.end + extra

            fw.write("{0} -e {1} -D {2} {3} {4}\n"
                     .format(strand, ci, gi, start, end))
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added to it.
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
                 help="fasta file containing reads from the original libraries [default: %default]")
    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
                 help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)
    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # Every missing-input error exits with status 1 so that calling
    # pipelines can tell the failure apart from a successful run.
    if not op.isfile(origlibfile):
        logging.error(
            "Original library reads file `{0}` does not exist!".format(
                origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error(
            "Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error(
            "Rearrayed library reads file `{0}` does not exist!".format(
                rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning(
            "Output directory `{0}` missing. Creating it now...".format(
                opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    clone_pair = itemgetter(0, 3)  # source_clone, dest_clone columns

    # Context managers close the log, the lookup table and each per-pair
    # FASTA file even if writing a record fails.
    with open(logfile, 'w') as log, open(lookuptblfile, 'r') as fp:
        for row in fp:
            if not row.strip():
                continue  # tolerate blank lines instead of raising IndexError

            # Drop the line terminator so it cannot leak into the last column.
            origprefix, rearrayprefix = clone_pair(
                row.rstrip('\r\n').split('\t'))
            libpair = origprefix + '_' + rearrayprefix
            outfile = opts.output_folder + '/' + libpair + '.fasta'

            # NOTE(review): the prefixes are applied as regular expressions
            # anchored at the start of each id, so regex metacharacters in a
            # clone name are not matched literally.
            orig_match = re.compile(origprefix).match
            rearray_match = re.compile(rearrayprefix).match

            with open(outfile, 'w') as ofp:
                for o in origlibids:
                    if orig_match(o):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')

                for r in rearraylibids:
                    if rearray_match(r):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

            print(outfile, file=log)

    logging.debug('Wrote log file `{0}`'.format(logfile))
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so implementation notes are kept in comments rather than added to it.
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
            help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
            help="fasta file containing reads from the original libraries [default: %default]")
    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
            help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)
    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # A missing input is an error: exit with status 1 rather than the
    # implicit 0 so that wrapping scripts notice the failure.
    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit(1)

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    pick_clones = itemgetter(0, 3)  # 1st and 4th columns: source/dest clone

    # `with` closes the log and the lookup table (previously never closed)
    # on every exit path, including exceptions raised while writing.
    with open(logfile, 'w') as log, open(lookuptblfile, 'r') as fp:
        for row in fp:
            if not row.strip():
                # A blank line has no 4th column; skip it instead of crashing.
                continue

            # Strip the line terminator before splitting so a 4-column table
            # does not end up with a newline inside the dest_clone prefix.
            columns = row.rstrip('\r\n').split('\t')
            origprefix, rearrayprefix = pick_clones(columns)
            libpair = origprefix + '_' + rearrayprefix
            outfile = opts.output_folder + '/' + libpair + '.fasta'

            # NOTE(review): re.match treats each prefix as a regular
            # expression anchored at the start of the id; metacharacters in
            # clone names are therefore not matched literally.
            is_orig = re.compile(origprefix).match
            is_rearray = re.compile(rearrayprefix).match

            with open(outfile, 'w') as ofp:
                for o in origlibids:
                    if is_orig(o):
                        SeqIO.write(origlibFasta[o], ofp, 'fasta')

                for r in rearraylibids:
                    if is_rearray(r):
                        SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

            # file.write works under both Python 2 and 3, unlike `print >>`.
            log.write(outfile + '\n')

    logging.debug('Wrote log file `{0}`'.format(logfile))