def action(args):
    if args.inplace and args.infile is sys.stdin:
        log.error('Error: cannot use the --inplace option with stdin')
        return

    if args.rename:
        raise NotImplementedError

    reader = csv.DictReader(args.infile)
    fieldnames = reader.fieldnames or []

    new_fields = parse_extras(args.add) if args.add else {}
    if new_fields:
        fieldnames.extend(new_fields.keys())
        reader = imap(lambda row: dict(row, **new_fields), reader)

    if args.inplace:
        outfile = tmp(args.infile.name)
    else:
        outfile = args.outfile

    with opener(outfile, 'w') as fout:
        writer = csv.DictWriter(fout, fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(reader)

    if args.inplace:
        os.rename(fout.name, args.infile.name)
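# A minimal sketch of the kind of helper parse_extras is assumed to be (the
# real implementation is not shown here); it assumes --add takes
# comma-separated key:value pairs such as "run:42,platform:miseq".
from collections import OrderedDict

def parse_extras_sketch(add):
    """Parse 'key:value,key:value' into an ordered mapping of new columns."""
    return OrderedDict(field.split(':', 1) for field in add.split(','))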
def action(args):
    if args.is_file:
        seqs = fastalite(opener(args.seqs))
        for s in seqs:
            seq = reversed(s.seq)
            seq = [rev_comp[se] for se in seq]
            seq = ''.join(seq)
            args.out_fasta.write('>{}\n{}\n'.format(s.description, seq))
    else:
        seq = [rev_comp[s] for s in args.seqs]
        seq = ''.join(seq)
        args.out.write(seq)
        args.out.write('\n')

    if args.rlefile and args.out_rle:
        reader = csv.reader(args.rlefile)
        writer = csv.writer(args.out_rle)

        # try to determine if first row is a header; we'll assume that
        # the first row, second column is a run-length encoding if
        # it's at least half digits.
        name, rle = reader.next()
        if sum(c.isdigit() for c in rle) / float(len(rle)) > 0.5:
            writer.writerow([name, ''.join(reversed(rle))])
        else:
            assert [name, rle] == rle_fieldnames
            writer.writerow([name, rle])

        for name, rle in reader:
            writer.writerow([name, ''.join(reversed(rle))])
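# The action above assumes a module-level rev_comp lookup table mapping each
# base to its complement. A minimal sketch (an assumption, not necessarily
# the project's actual table) covering unambiguous DNA bases, N, and gaps:
rev_comp_sketch = {
    'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
    'a': 't', 't': 'a', 'g': 'c', 'c': 'g',
    'N': 'N', 'n': 'n', '-': '-', '.': '.',
}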
def test03(self):
    outdir = self.mkoutdir()
    fa = path.join(datadir, 'F1_3', 'trimmed_rle.fasta')
    rle = path.join(datadir, 'F1_3', 'trimmed_rle_nohead.csv.bz2')
    fa_out = path.join(outdir, 'rc.fasta')
    rle_out = path.join(outdir, 'rc.csv.bz2')

    self.main([
        '--is-file',
        '--out-fasta', fa_out,
        '--out-rle', rle_out,
        '--rlefile', rle,
        fa
    ])

    self.assertTrue(path.exists(fa_out))
    self.assertTrue(path.exists(rle_out))

    with opener(rle) as infile, opener(rle_out) as outfile:
        self.assertEqual(len(list(infile)), len(list(outfile)))
def build_parser(parser):
    parser.add_argument('fasta',
                        type=lambda f: fastalite(opener(f)),
                        help='input file containing raw reads')
    parser.add_argument('--sample-id',
                        help='sample id to pull reads for')
    parser.add_argument(
        '--map-file',
        type=Csv2Dict(value='sample_id',
                      fieldnames=['sequence_id', 'sample_id']),
        help='csv(.bz2) file containing sequence_id,sample_id in the rows.')
    parser.add_argument('-o', '--out',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='fasta output file')
def action(args):
    fasta = fastalite(args.fasta)

    spec_map = DictReader(args.specimen_map,
                          fieldnames=['readname', 'specimen'])
    spec_map = {s['readname']: s['specimen'] for s in spec_map}

    def by_specimen(f):
        return spec_map[f.id]

    groups = sorted(fasta, key=by_specimen)
    groups = groupby(groups, key=by_specimen)

    for spec, fasta in groups:
        fasta = ('>{}\n{}'.format(f.description, f.seq) for f in fasta)
        fasta = '\n'.join(fasta)
        filename = path.join(args.outdir, '{}.fasta.bz2'.format(spec))
        with opener(filename, 'w') as out:
            out.write(fasta)
def build_parser(parser):
    parser.add_argument(
        'blast_file', nargs='?', default=sys.stdin, type=Opener('r'),
        help='CSV tabular blast file of query and subject hits')
    parser.add_argument(
        '--all-one-group', dest='all_one_group', action='store_true',
        help="""If --map is not provided, the default behavior is to treat
                all reads as one group; use this option to treat each read
                as a separate group [%(default)s]""")
    parser.add_argument(
        '-a', '--asterisk', default=100, metavar='PERCENT', type=float,
        help='place an asterisk next to any species above this threshold [%(default)s]')
    parser.add_argument(
        '--copy-numbers', metavar='CSV', type=Opener(),
        help='columns: tax_id, median')
    parser.add_argument(
        '-c', '--coverage', default=95, metavar='PERCENT', type=float,
        help='percent of alignment coverage of blast result [%(default)s]')
    parser.add_argument(
        '--details-identity', metavar='PERCENT', type=float, default=90,
        help='Minimum identity to include blast hits in details file')
    parser.add_argument(
        '--details-full', action='store_true',
        help='do not limit out_details to only the largest cluster per assignment')
    parser.add_argument(
        '--exclude-by-taxid', metavar='CSV',
        type=lambda f: set(e['tax_id'] for e in DictReader(
            opener(f), fieldnames=['tax_id'])),
        default={},
        help='column: tax_id')
    parser.add_argument(
        '--group-def', metavar='INT', action='append', default=[],
        help="""define a group threshold for a particular rank overriding
                --target-max-group-size. example: genus:2""")
    parser.add_argument(
        '--group-label', metavar='LABEL', default='all',
        help='Single group label for reads')
    parser.add_argument(
        '-o', '--out', default=sys.stdout, type=Opener('w'), metavar='CSV',
        help="""columns: specimen, max_percent, min_percent, max_coverage,
                min_coverage, assignment_id, assignment, clusters, reads,
                pct_reads, corrected, pct_corrected, target_rank, hi, low,
                tax_ids""")
    parser.add_argument(
        '-m', '--map', metavar='CSV', type=Opener(), default={},
        help='columns: name, specimen')
    parser.add_argument(
        '--max-ambiguous', metavar='INT', default=3, type=int,
        help='Maximum ambiguous count in reference sequences [%(default)s]')
    parser.add_argument(
        '--max-identity', default=100, metavar='PERCENT', type=float,
        help='maximum identity threshold for accepting matches [<= %(default)s]')
    parser.add_argument(
        '--min-cluster-size', default=0, metavar='INT', type=int,
        help='minimum cluster size to include in classification output')
    parser.add_argument(
        '--min-identity', default=99, metavar='PERCENT', type=float,
        help='minimum identity threshold for accepting matches [> %(default)s]')
    parser.add_argument(
        '-s', '--seq-info', required=True, metavar='CSV', type=Opener(),
        help='seq info file(s) to match sequence ids to taxids [%(default)s]')
    parser.add_argument(
        '-t', '--taxonomy', required=True, metavar='CSV',
        type=Csv2Dict('tax_id'),
        help='tax table of taxids and species names [%(default)s]')
    parser.add_argument(
        '-O', '--out-detail',
        type=lambda f: DictWriter(
            opener(f, 'w'), extrasaction='ignore',
            fieldnames=[
                'specimen', 'assignment', 'assignment_id', 'qseqid', 'sseqid',
                'pident', 'coverage', 'ambig_count', 'accession', 'tax_id',
                'tax_name', 'target_rank', 'rank', 'hi', 'low'
            ]),
        metavar='CSV',
        help="""columns: specimen, assignment, assignment_id, qseqid, sseqid,
                pident, coverage, ambig_count, accession, tax_id, tax_name,
                target_rank, rank, hi, low""")
    parser.add_argument(
        '--target-max-group-size', metavar='INTEGER', default=3, type=int,
        help="""group multiple target-rank assignments that exceed a
                threshold to a higher rank [%(default)s]""")
    parser.add_argument(
        '--target-rank', metavar='RANK', default='species',
        help='Rank at which to classify. Default: "%(default)s"')
    parser.add_argument(
        '-w', '--weights', metavar='CSV', type=Opener(),
        help='columns: name, weight')

    ### csv.Sniffer.has_header is *not* reliable enough
    parser.add_argument(
        '--has-header', action='store_true',
        help='specify this if blast data has a header')
def write_pickle(self, pth, data):
    with opener(pth, 'wb') as f:
        cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)
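# A matching reader is assumed elsewhere in the class; a minimal sketch under
# the same conventions (opener + cPickle), not necessarily the project's
# exact method:
def read_pickle_sketch(self, pth):
    with opener(pth, 'rb') as f:
        return cPickle.load(f)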
def action(args):
    if args.remote and not args.remote_database:
        log.error("bioy blast: error: please specify a remote database")
        return
    elif not args.remote and not args.database:
        log.error("bioy blast: error: please specify path to local database")
        return

    command = ['blastn']
    command += ['-query', args.fasta]

    if args.remote:
        command += ['-remote']
        command += ['-db', args.remote_database]
    else:
        command += ['-db', args.database]
        command += ['-num_threads', str(args.threads)]

    command += ['-perc_identity', args.id]
    command += ['-outfmt', '6 ' + args.outfmt.replace(',', ' ')]
    command += ['-strand', args.strand]

    if args.max:
        command += ['-max_target_seqs', args.max]

    log.info(' '.join(command))

    if args.dry_run:
        sys.exit(0)

    pipe = Popen(command, stdout=PIPE, stderr=PIPE)
    results, errors = pipe.communicate()

    if errors:
        log.error(errors)

    # split tab lines
    lines = (r.strip().split('\t') for r in StringIO(results))

    header = args.outfmt.split(',')

    # match with fieldnames
    lines = (zip(header, l) for l in lines)

    # make into dict
    lines = [dict(l) for l in lines]

    # Replace blast's local alignment query coverage with a global
    # coverage calculation
    if 'qcovs' in args.outfmt.split(',') or isinstance(args.coverage, float):
        for l in lines:
            l['qcovs'] = (float(l['qend']) - float(l['qstart']) + 1) \
                / float(l['qlen']) * 100
            l['qcovs'] = '{0:.2f}'.format(l['qcovs'])

    if isinstance(args.coverage, float):
        lines = [l for l in lines if float(l['qcovs']) >= args.coverage]

    if args.nohits:
        # to get nohits first we need to know about the hits
        qids = groupby(lines, key=itemgetter('qseqid'))
        qids = set(q for q, _ in qids)

        # now we can build a list of nohits
        nohits = []
        for q in fastalite(opener(args.fasta)):
            if q.id not in qids:
                nohits.append(q)

        # convert nohits into DictWriter format
        nohits = (dict(qseqid=q.id) for q in nohits)

        # append to lines
        lines = chain(lines, nohits)

    out = DictWriter(args.out, fieldnames=header, extrasaction='ignore')

    if args.header:
        out.writeheader()

    out.writerows(lines)
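# Worked example of the global qcovs calculation in the action above
# (illustrative values only, not taken from the original code): a hit
# spanning qstart=11..qend=110 on a query of qlen=200 gives 50% coverage,
# regardless of the local qcovs value blastn itself reported.
qcovs_example = (110.0 - 11.0 + 1) / 200.0 * 100  # == 50.0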