Example #1
def action(args):
    if args.inplace and args.infile is sys.stdin:
        log.error('Error: cannot use the --inplace option with stdin')
        return

    if args.rename:
        raise NotImplementedError

    reader = csv.DictReader(args.infile)
    fieldnames = reader.fieldnames or []

    new_fields = parse_extras(args.add) if args.add else {}

    if new_fields:
        fieldnames.extend(new_fields.keys())
        # itertools.imap (Python 2): lazily merge the new fields into each row
        reader = imap(lambda row: dict(row, **new_fields), reader)

    if args.inplace:
        outfile = tmp(args.infile.name)
    else:
        outfile = args.outfile

    with opener(outfile, 'w') as fout:
        writer = csv.DictWriter(fout, fieldnames, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(reader)

    if args.inplace:
        os.rename(fout.name, args.infile.name)
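Every example on this page leans on the opener helper. Its implementation isn't shown here, but from the usage above (plain paths, .bz2 paths, and already-open handles all appear to be accepted) a minimal extension-dispatching sketch might look like the following; the gzip branch and the exact behavior are assumptions, not bioy's actual code:

import bz2
import gzip


def opener(f, mode='r'):
    # Sketch only: pass through objects that are already file-like,
    # otherwise dispatch on the filename extension.
    if hasattr(f, 'read') or hasattr(f, 'write'):
        return f
    if f.endswith('.bz2'):
        return bz2.BZ2File(f, mode)
    if f.endswith('.gz'):  # assumed; only .bz2 appears in these examples
        return gzip.open(f, mode)
    return open(f, mode)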
Example #2
def action(args):
    if args.is_file:
        seqs = fastalite(opener(args.seqs))
        for s in seqs:
            seq = reversed(s.seq)
            seq = [rev_comp[se] for se in seq]
            seq = ''.join(seq)
            args.out_fasta.write('>{}\n{}\n'.format(s.description, seq))
    else:
        seq = [rev_comp[s] for s in args.seqs]
        seq = ''.join(seq)
        args.out.write(seq)
        args.out.write('\n')

    if args.rlefile and args.out_rle:
        reader = csv.reader(args.rlefile)
        writer = csv.writer(args.out_rle)

        # try to determine if first row is a header; we'll assume that
        # the first row, second column is a run-length encoding if
        # it's at least half digits.
        name, rle = reader.next()  # Python 2; use next(reader) in Python 3
        if sum(c.isdigit() for c in rle) / float(len(rle)) > 0.5:
            writer.writerow([name, ''.join(reversed(rle))])
        else:
            assert [name, rle] == rle_fieldnames
            writer.writerow([name, rle])

        for name, rle in reader:
            writer.writerow([name, ''.join(reversed(rle))])
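The rev_comp lookup used in both branches isn't defined in this snippet; it maps each base character to its complement. A minimal DNA table (the real mapping may also cover IUPAC ambiguity codes) would be:

rev_comp = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N'}

# reverse, then complement each base, exactly as the example does:
assert ''.join(rev_comp[b] for b in reversed('AACGT')) == 'ACGTT'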
Example #3
    def test03(self):
        outdir = self.mkoutdir()
        fa = path.join(datadir, 'F1_3', 'trimmed_rle.fasta')
        rle = path.join(datadir, 'F1_3', 'trimmed_rle_nohead.csv.bz2')

        fa_out = path.join(outdir, 'rc.fasta')
        rle_out = path.join(outdir, 'rc.csv.bz2')

        self.main(
            ['--is-file', '--out-fasta', fa_out, '--out-rle',
             rle_out, '--rlefile', rle, fa])

        self.assertTrue(path.exists(fa_out))
        self.assertTrue(path.exists(rle_out))

        with opener(rle) as infile, opener(rle_out) as outfile:
            self.assertEqual(len(list(infile)), len(list(outfile)))
Example #4
    def test03(self):
        outdir = self.mkoutdir()
        fa = path.join(datadir, 'F1_3', 'trimmed_rle.fasta')
        rle = path.join(datadir, 'F1_3', 'trimmed_rle_nohead.csv.bz2')

        fa_out = path.join(outdir, 'rc.fasta')
        rle_out = path.join(outdir, 'rc.csv.bz2')

        self.main([
            '--is-file', '--out-fasta', fa_out, '--out-rle', rle_out,
            '--rlefile', rle, fa
        ])

        self.assertTrue(path.exists(fa_out))
        self.assertTrue(path.exists(rle_out))

        with opener(rle) as infile, opener(rle_out) as outfile:
            self.assertEqual(len(list(infile)), len(list(outfile)))
Example #5
def build_parser(parser):
    parser.add_argument('fasta',
            type = lambda f: fastalite(opener(f)),
            help = 'input file containing raw reads')
    parser.add_argument('--sample-id',
            help = 'sample id to pull reads for')
    parser.add_argument('--map-file',
            type = Csv2Dict(value = 'sample_id', fieldnames=['sequence_id','sample_id']),
            help = 'csv(.bz2) file containing sequence_id,sample_id in the rows.')
    parser.add_argument('-o', '--out',
            type = Opener('w'),
            default = sys.stdout,
            help = 'fasta output file')
Example #6
def build_parser(parser):
    parser.add_argument('fasta',
                        type=lambda f: fastalite(opener(f)),
                        help='input file containing raw reads')
    parser.add_argument('--sample-id', help='sample id to pull reads for')
    parser.add_argument(
        '--map-file',
        type=Csv2Dict(value='sample_id',
                      fieldnames=['sequence_id', 'sample_id']),
        help='csv(.bz2) file containing sequence_id,sample_id in the rows.')
    parser.add_argument('-o',
                        '--out',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='fasta output file')
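Opener('w') is used here as an argparse type= callable, analogous to argparse.FileType but extension-aware. A plausible minimal version, a sketch wrapping an opener-style function like the one above rather than bioy's actual class:

class Opener(object):
    """Factory for argparse: Opener('w')('out.csv.bz2') opens for writing."""

    def __init__(self, mode='r'):
        self.mode = mode

    def __call__(self, f):
        return opener(f, self.mode)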
Example #7
def action(args):
    fasta = fastalite(args.fasta)

    spec_map = DictReader(args.specimen_map, fieldnames = ['readname', 'specimen'])
    spec_map = {s['readname']:s['specimen'] for s in spec_map}

    def by_specimen(f):
        return spec_map[f.id]

    groups = sorted(fasta, key = by_specimen)
    groups = groupby(groups, key = by_specimen)

    for spec, fasta in groups:
        fasta = ('>{}\n{}'.format(f.description, f.seq) for f in fasta)
        fasta = '\n'.join(fasta)

        filename = path.join(args.outdir, '{}.fasta.bz2'.format(spec))

        with opener(filename, 'w') as out:
            out.write(fasta)
Example #8
def action(args):
    fasta = fastalite(args.fasta)

    spec_map = DictReader(args.specimen_map,
                          fieldnames=['readname', 'specimen'])
    spec_map = {s['readname']: s['specimen'] for s in spec_map}

    def by_specimen(f):
        return spec_map[f.id]

    groups = sorted(fasta, key=by_specimen)
    groups = groupby(groups, key=by_specimen)

    for spec, fasta in groups:
        fasta = ('>{}\n{}'.format(f.description, f.seq) for f in fasta)
        fasta = '\n'.join(fasta)

        filename = path.join(args.outdir, '{}.fasta.bz2'.format(spec))

        with opener(filename, 'w') as out:
            out.write(fasta)
Example #9
def build_parser(parser):
    parser.add_argument(
        'blast_file',
        nargs='?',
        default=sys.stdin,
        type=Opener('r'),
        help='CSV tabular blast file of query and subject hits')
    parser.add_argument(
        '--all-one-group',
        dest='all_one_group',
        action='store_true',
        help="""If --map is not provided, the default behavior is to treat
                    all reads as one group; use this option to treat
                    each read as a separate group [%(default)s]""")
    parser.add_argument(
        '-a',
        '--asterisk',
        default=100,
        metavar='PERCENT',
        type=float,
        help='place an asterisk next to any species above this percent threshold [%(default)s]')
    parser.add_argument('--copy-numbers',
                        metavar='CSV',
                        type=Opener(),
                        help='columns: tax_id, median')
    parser.add_argument(
        '-c',
        '--coverage',
        default=95,
        metavar='PERCENT',
        type=float,
        help='percent of alignment coverage of blast result [%(default)s]')
    parser.add_argument(
        '--details-identity',
        metavar='PERCENT',
        help='Minimum identity to include blast hits in details file',
        type=float,
        default=90)
    parser.add_argument(
        '--details-full',
        action='store_true',
        help='do not limit out_details to only largest cluster per assignment')
    parser.add_argument('--exclude-by-taxid',
                        metavar='CSV',
                        type=lambda f: set(e['tax_id'] for e in DictReader(
                            opener(f), fieldnames=['tax_id'])),
                        default={},
                        help='column: tax_id')
    parser.add_argument(
        '--group-def',
        metavar='INT',
        action='append',
        default=[],
        help="""define a group threshold for a particular rank overriding
                      --target-max-group-size. example: genus:2""")
    parser.add_argument('--group-label',
                        metavar='LABEL',
                        default='all',
                        help='Single group label for reads')
    parser.add_argument(
        '-o',
        '--out',
        default=sys.stdout,
        type=Opener('w'),
        metavar='CSV',
        help="""columns: specimen, max_percent, min_percent, max_coverage,
                      min_coverage, assignment_id, assignment, clusters, reads,
                      pct_reads, corrected, pct_corrected, target_rank, hi, low, tax_ids"""
    )
    parser.add_argument('-m',
                        '--map',
                        metavar='CSV',
                        type=Opener(),
                        default={},
                        help='columns: name, specimen')
    parser.add_argument(
        '--max-ambiguous',
        metavar='INT',
        default=3,
        type=int,
        help='Maximum ambiguous count in reference sequences [%(default)s]')
    parser.add_argument(
        '--max-identity',
        default=100,
        metavar='PERCENT',
        type=float,
        help='maximum identity threshold for accepting matches [<= %(default)s]'
    )
    parser.add_argument(
        '--min-cluster-size',
        default=0,
        metavar='INT',
        type=int,
        help='minimum cluster size to include in classification output')
    parser.add_argument(
        '--min-identity',
        default=99,
        metavar='PERCENT',
        type=float,
        help='minimum identity threshold for accepting matches [> %(default)s]'
    )
    parser.add_argument(
        '-s',
        '--seq-info',
        required=True,
        metavar='CSV',
        type=Opener(),
        help='seq info file(s) to match sequence ids to taxids [%(default)s]')
    parser.add_argument(
        '-t',
        '--taxonomy',
        required=True,
        metavar='CSV',
        type=Csv2Dict('tax_id'),
        help='tax table of taxids and species names [%(default)s]')
    parser.add_argument(
        '-O',
        '--out-detail',
        type=lambda f: DictWriter(
            opener(f, 'w'),
            extrasaction='ignore',
            fieldnames=[
                'specimen', 'assignment', 'assignment_id', 'qseqid', 'sseqid',
                'pident', 'coverage', 'ambig_count', 'accession', 'tax_id',
                'tax_name', 'target_rank', 'rank', 'hi', 'low'
            ]),
        metavar='CSV',
        help="""columns: specimen, assignment, assignment_id,
                      qseqid, sseqid, pident, coverage, ambig_count,
                      accession, tax_id, tax_name, target_rank, rank, hi, low"""
    )
    parser.add_argument('--target-max-group-size',
                        metavar='INTEGER',
                        default=3,
                        type=int,
                        help="""group multiple target-rank assignments that
                      exceed a threshold to a higher rank [%(default)s]""")
    parser.add_argument(
        '--target-rank',
        metavar='RANK',
        help='Rank at which to classify. Default: "%(default)s"',
        default='species')
    parser.add_argument('-w',
                        '--weights',
                        metavar='CSV',
                        type=Opener(),
                        help='columns: name, weight')
    ### csv.Sniffer.has_header is *not* reliable enough
    parser.add_argument('--has-header',
                        action='store_true',
                        help='specify this if blast data has a header')
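Csv2Dict also serves as an argparse type=, turning a csv path into a dict. The two call styles above (Csv2Dict('tax_id') and Csv2Dict(value='sample_id', fieldnames=[...])) suggest a key column plus an optional single value column; the sketch below is an assumption built on that reading, not the real implementation:

import csv


class Csv2Dict(object):
    def __init__(self, index=None, value=None, fieldnames=None):
        self.index = index  # key column; assumed to default to the first fieldname
        self.value = value  # optional value column; else store the whole row
        self.fieldnames = fieldnames

    def __call__(self, f):
        reader = csv.DictReader(opener(f), fieldnames=self.fieldnames)
        key = self.index or reader.fieldnames[0]
        if self.value:
            return dict((row[key], row[self.value]) for row in reader)
        return dict((row[key], row) for row in reader)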
Example #10
    def write_pickle(self, pth, data):
        with opener(pth, 'wb') as f:
            cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)
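The symmetric reader isn't shown on this page; assuming the same opener helper, it would be as follows (cPickle is Python 2; use pickle on Python 3):

    def read_pickle(self, pth):
        with opener(pth, 'rb') as f:
            return cPickle.load(f)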
Example #11
def action(args):

    if args.remote and not args.remote_database:
        log.error("bioy blast: error: please specify a remote database")
        return
    elif not args.remote and not args.database:
        log.error("bioy blast: error: please specify path to local database")
        return

    command = ['blastn']
    command += ['-query', args.fasta]
    if args.remote:
        command += ['-remote']
        command += ['-db', args.remote_database]
    else:
        command += ['-db', args.database]
        command += ['-num_threads', str(args.threads)]
    command += ['-perc_identity', args.id]
    command += ['-outfmt', '6 ' + args.outfmt.replace(',', ' ')]
    command += ['-strand', args.strand]

    if args.max:
        command += ['-max_target_seqs', args.max]

    log.info(' '.join(command))

    if args.dry_run:
        sys.exit(0)

    pipe = Popen(command, stdout = PIPE, stderr = PIPE)

    results, errors = pipe.communicate()

    if errors:
        log.error(errors)

    # split tab lines
    lines = (r.strip().split('\t') for r in StringIO(results))

    header = args.outfmt.split(',')
    # match with fieldnames
    lines = (zip(header, l) for l in lines)

    # make into dict
    lines = [dict(l) for l in lines]

    # Replace blast's local alignment query coverage with global coverage calculation
    if 'qcovs' in args.outfmt.split(',') or isinstance(args.coverage, float):
        for l in lines:
            l['qcovs'] = (float(l['qend']) - float(l['qstart']) + 1) \
                    / float(l['qlen']) * 100
            l['qcovs'] = '{0:.2f}'.format(l['qcovs'])
    if isinstance(args.coverage, float):
        lines = [l for l in lines if float(l['qcovs']) >= args.coverage]

    if args.nohits:
        # to get nohits first we need to know about the hits
        qids = groupby(lines, key = itemgetter('qseqid'))
        qids = set(q for q,_ in qids)

        # now we can build a list of nohits
        nohits = []
        for q in fastalite(opener(args.fasta)):
            if q.id not in qids:
                nohits.append(q)

        # convert nohits into DictWriter format
        nohits = (dict(qseqid = q.id) for q in nohits)

        # append to lines
        lines = chain(lines, nohits)

    out = DictWriter(args.out,
                     fieldnames = header,
                     extrasaction = 'ignore')

    if args.header:
        out.writeheader()

    out.writerows(lines)
Example #12
def action(args):

    if args.remote and not args.remote_database:
        log.error("bioy blast: error: please specify a remote database")
        return
    elif not args.remote and not args.database:
        log.error("bioy blast: error: please specify path to local database")
        return

    command = ['blastn']
    command += ['-query', args.fasta]
    if args.remote:
        command += ['-remote']
        command += ['-db', args.remote_database]
    else:
        command += ['-db', args.database]
        command += ['-num_threads', str(args.threads)]
    command += ['-perc_identity', args.id]
    command += ['-outfmt', '6 ' + args.outfmt.replace(',', ' ')]
    command += ['-strand', args.strand]

    if args.max:
        command += ['-max_target_seqs', args.max]

    log.info(' '.join(command))

    if args.dry_run:
        sys.exit(0)

    pipe = Popen(command, stdout=PIPE, stderr=PIPE)

    results, errors = pipe.communicate()

    if errors:
        log.error(errors)

    # split tab lines
    lines = (r.strip().split('\t') for r in StringIO(results))

    header = args.outfmt.split(',')
    # match with fieldnames
    lines = (zip(header, l) for l in lines)

    # make into dict
    lines = [dict(l) for l in lines]

    # Replace blast's local alignment query coverage with global coverage calculation
    if 'qcovs' in args.outfmt.split(',') or isinstance(args.coverage, float):
        for l in lines:
            l['qcovs'] = (float(l['qend']) - float(l['qstart']) + 1) \
                    / float(l['qlen']) * 100
            l['qcovs'] = '{0:.2f}'.format(l['qcovs'])
    if isinstance(args.coverage, float):
        lines = [l for l in lines if float(l['qcovs']) >= args.coverage]

    if args.nohits:
        # to get nohits first we need to know about the hits
        qids = groupby(lines, key=itemgetter('qseqid'))
        qids = set(q for q, _ in qids)

        # now we can build a list of nohits
        nohits = []
        for q in fastalite(opener(args.fasta)):
            if q.id not in qids:
                nohits.append(q)

        # convert nohits into DictWriter format
        nohits = (dict(qseqid=q.id) for q in nohits)

        # append to lines
        lines = chain(lines, nohits)

    out = DictWriter(args.out, fieldnames=header, extrasaction='ignore')

    if args.header:
        out.writeheader()

    out.writerows(lines)
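The qcovs recalculation in both versions replaces BLAST's locally-reported query coverage with a global figure: alignment span over full query length. As a standalone helper (hypothetical name) with a worked example:

def global_coverage(qstart, qend, qlen):
    """Percent of the full query covered by the alignment span."""
    return (float(qend) - float(qstart) + 1) / float(qlen) * 100


# an alignment spanning positions 11..100 of a 100 bp query:
# (100 - 11 + 1) / 100 * 100 = 90.0 percent
assert global_coverage('11', '100', '100') == 90.0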
Example #13
def build_parser(parser):
    parser.add_argument('blast_file',
            nargs = '?',
            default = sys.stdin,
            type = Opener('r'),
            help = 'CSV tabular blast file of query and subject hits')
    parser.add_argument('--all-one-group',
            dest = 'all_one_group',
            action = 'store_true',
            help = """If --map is not provided, the default behavior is to treat
                    all reads as one group; use this option to treat
                    each read as a separate group [%(default)s]""")
    parser.add_argument('-a', '--asterisk',
            default = 100,
            metavar='PERCENT',
            type = float,
            help = 'place an asterisk next to any species above this percent threshold [%(default)s]')
    parser.add_argument('--copy-numbers',
            metavar = 'CSV',
            type = Opener(),
            help = 'columns: tax_id, median')
    parser.add_argument('-c', '--coverage',
            default = 95,
            metavar = 'PERCENT',
            type = float,
            help = 'percent of alignment coverage of blast result [%(default)s]')
    parser.add_argument('--details-identity',
            metavar = 'PERCENT',
            help = 'Minimum identity to include blast hits in details file',
            type = float,
            default = 90)
    parser.add_argument('--details-full',
            action = 'store_true',
            help = 'do not limit out_details to only largest cluster per assignment')
    parser.add_argument('--exclude-by-taxid',
            metavar = 'CSV',
            type = lambda f: set(e['tax_id'] for e in DictReader(opener(f), fieldnames = ['tax_id'])),
            default = {},
            help = 'column: tax_id')
    parser.add_argument('--group-def',
            metavar = 'INT',
            action = 'append',
            default = [],
            help = """define a group threshold for a particular rank overriding
                      --target-max-group-size. example: genus:2""")
    parser.add_argument('--group-label',
            metavar = 'LABEL',
            default = 'all',
            help = 'Single group label for reads')
    parser.add_argument('-o', '--out',
            default = sys.stdout,
            type = Opener('w'),
            metavar = 'CSV',
            help = """columns: specimen, max_percent, min_percent, max_coverage,
                      min_coverage, assignment_id, assignment, clusters, reads,
                      pct_reads, corrected, pct_corrected, target_rank, hi, low, tax_ids""")
    parser.add_argument('-m', '--map',
            metavar = 'CSV',
            type = Opener(),
            default = {},
            help = 'columns: name, specimen')
    parser.add_argument('--max-ambiguous',
            metavar = 'INT',
            default = 3,
            type = int,
            help = 'Maximum ambiguous count in reference sequences [%(default)s]')
    parser.add_argument('--max-identity',
            default = 100,
            metavar = 'PERCENT',
            type = float,
            help = 'maximum identity threshold for accepting matches [<= %(default)s]')
    parser.add_argument('--min-cluster-size',
            default = 0,
            metavar = 'INT',
            type = int,
            help = 'minimum cluster size to include in classification output')
    parser.add_argument('--min-identity',
            default = 99,
            metavar = 'PERCENT',
            type = float,
            help = 'minimum identity threshold for accepting matches [> %(default)s]')
    parser.add_argument('-s', '--seq-info',
            required = True,
            metavar = 'CSV',
            type = Opener(),
            help = 'seq info file(s) to match sequence ids to taxids [%(default)s]')
    parser.add_argument('-t', '--taxonomy',
            required = True,
            metavar = 'CSV',
            type = Csv2Dict('tax_id'),
            help = 'tax table of taxids and species names [%(default)s]')
    parser.add_argument('-O', '--out-detail',
            type  = lambda f: DictWriter(opener(f, 'w'), extrasaction = 'ignore', fieldnames = [
                'specimen', 'assignment', 'assignment_id', 'qseqid', 'sseqid', 'pident', 'coverage', 'ambig_count',
                'accession', 'tax_id', 'tax_name', 'target_rank', 'rank', 'hi', 'low'
                ]),
            metavar = 'CSV',
            help = """columns: specimen, assignment, assignment_id,
                      qseqid, sseqid, pident, coverage, ambig_count,
                      accession, tax_id, tax_name, target_rank, rank, hi, low""")
    parser.add_argument('--target-max-group-size',
            metavar = 'INTEGER',
            default = 3,
            type = int,
            help = """group multiple target-rank assignments that
                      exceed a threshold to a higher rank [%(default)s]""")
    parser.add_argument('--target-rank',
            metavar='RANK',
            help = 'Rank at which to classify. Default: "%(default)s"',
            default = 'species')
    parser.add_argument('-w', '--weights',
            metavar = 'CSV',
            type = Opener(),
            help = 'columns: name, weight')
    ### csv.Sniffer.has_header is *not* reliable enough
    parser.add_argument('--has-header', action = 'store_true',
            help = 'specify this if blast data has a header')