def build_parser(parser):
    """Register this subcommand's command-line arguments on ``parser``.

    Adds the two positional inputs (``seqs`` and ``seq_info``) followed by
    the optional taxonomy input, the output destinations, and the
    grouping/clustering options.
    """
    # positional inputs
    parser.add_argument('seqs', help='Named sequences')
    parser.add_argument('seq_info', help='Sequence info file')

    taxonomy_help = ('Taxonomy as taxtable; optional if a grouping term '
                     'is available in seq_info')
    parser.add_argument('-t', '--taxonomy', help=taxonomy_help)

    # outputs
    parser.add_argument('--seq-info-out', help='subset of original seq_info')
    derep_help = ('mapping of input sequences to dereplicated representatives. '
                  '`group` column corresponds to the field identified by '
                  '--group-on.')
    parser.add_argument('--derep-map-out', help=derep_help)
    parser.add_argument(
        '--seqs-out',
        default=sys.stdout,
        type=util.file_opener('w'))

    # grouping and clustering
    parser.add_argument(
        '-g', '--group-on',
        default='species',
        help='Field in seq_info on which to group sequences')
    parser.add_argument(
        '--id',
        type=float,
        default=1.0,
        help='Clustering identity between 0 and 1 [default: %(default).3f]')
    parser.add_argument(
        '-i', '--include',
        type=util.file_opener('r'),
        help='Optional file containing list of group labels to include')
    threads_help = ('Number of threads to use for clustering each group '
                    '[default is one thread per available CPU core]')
    parser.add_argument('--threads', help=threads_help)
def build_parser(p):
    """Register this subcommand's command-line arguments on ``p``.

    Positionals are the sequence file and its metadata file (both opened
    for reading). Four optional output paths each carry a default file
    name, and a "Filtering options" group holds the ambiguity and length
    thresholds.
    """
    p.add_argument('fasta_file', type=file_opener('r'), help='sequence file')
    p.add_argument('seqinfo_file', type=file_opener('r'),
                   help='Sequence metadata')

    # The four output options differ only in flag and default file name.
    default_note = '[default %(default)s]'
    output_defaults = (
        ('--named-seqs', 'named.seqs.fasta'),
        ('--named-info', 'named.seq_info.csv'),
        ('--unnamed-seqs', 'unnamed.seqs.fasta'),
        ('--unnamed-info', 'unnamed.seq_info.csv'),
    )
    for flag, filename in output_defaults:
        p.add_argument(flag, default=filename, help=default_note)

    filtering = p.add_argument_group('Filtering options')
    filtering.add_argument(
        '-a', '--prop-ambig-cutoff',
        type=float,
        default=0.01,
        help=('Maximum proportion of characters in sequence which may be '
              'ambiguous [default: %(default).2f]'))
    filtering.add_argument(
        '-l', '--min-length',
        type=int,
        default=1200,
        help='Minimum sequence length [default: %(default)d]')
def build_parser(p):
    """Register this subcommand's command-line arguments on ``p``.

    Positionals, in order: the input file, the taxonomy database path, the
    FASTA output and the CSV taxonomy output. ``--no-header`` clears the
    ``header`` flag, which is true by default.
    """
    # inputs
    reader = util.file_opener('r')
    p.add_argument('infile', type=reader, help='Input file, gzipped')
    p.add_argument('database', help='Path to taxonomy database')

    # outputs
    fasta_help = ("Path to write sequences in FASTA format. "
                  "Specify '.gz' or '.bz2' extension to compress.")
    p.add_argument('fasta_out', type=util.file_opener('w'), help=fasta_help)
    p.add_argument(
        'output',
        metavar='tax_out',
        type=argparse.FileType('w'),
        help='Output path to write taxonomic information in CSV format')

    p.add_argument(
        '--no-header',
        dest='header',
        action='store_false',
        default=True,
        help="Don't write a header")
def build_parser(p):
    """Register this subcommand's command-line arguments on ``p``.

    Positionals are the sequence file and its metadata file (both opened
    for reading) followed by ``named_base`` and ``unnamed_base``. A
    "Filtering options" group holds the ambiguity and length thresholds.
    """
    p.add_argument('fasta_file', type=file_opener('r'), help='sequence file')
    p.add_argument('seqinfo_file', type=file_opener('r'),
                   help='Sequence metadata')

    # Plain positionals with no extra configuration.
    for positional in ('named_base', 'unnamed_base'):
        p.add_argument(positional)

    filtering = p.add_argument_group('Filtering options')
    ambig_help = ('Maximum proportion of characters in sequence which may be '
                  'ambiguous [default: %(default).2f]')
    filtering.add_argument(
        '-a', '--prop-ambig-cutoff',
        type=float,
        default=0.01,
        help=ambig_help)
    filtering.add_argument(
        '-l', '--min-length',
        type=int,
        default=1200,
        help='Minimum sequence length [default: %(default)d]')
def action(args):
    """Deduplicate sequences by content and write the reduced outputs.

    Reads ``args.seq_info`` (CSV indexed by ``seqname``) and hashes every
    record in ``args.sequences`` with SHA-1 after upper-casing it. Rows are
    grouped on that hash plus any comma-separated columns named in
    ``args.group_by``. One representative row is kept per group (the last
    row after an optional sort on ``args.prefer_columns``) and gains a
    ``weight`` column holding the group size.

    The reduced table is written to ``args.out_info`` and the matching
    sequences are written in FASTA format to ``args.out``.
    """
    dtype = {'gi': str, 'tax_id': str, 'species': str}
    seq_info = pandas.read_csv(
        args.seq_info, dtype=dtype, index_col='seqname')

    log.info('reading sequences')
    seqhashes = {}
    with util.file_opener()(args.sequences) as sequences_in:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            seq = str(record.seq).replace('\n', '').upper()
            # hashlib only accepts bytes on Python 3; passing the str
            # directly raises TypeError there.
            digest = hashlib.sha1(seq.encode('utf-8')).hexdigest()
            seqhashes[record.name] = digest

    seqhash = pandas.Series(data=seqhashes, name='seqhash')
    seqhash.index.name = 'seqname'
    seq_info = seq_info.join(seqhash)

    group_by = ['seqhash']
    if args.group_by:
        group_by.extend(args.group_by.split(','))

    def choose_rep(df):
        """Return the single representative row for one group."""
        if args.prefer_columns:
            df = df.sort_values(by=args.prefer_columns.split(','))
        # Copy so that adding a column does not write into a slice of the
        # group frame (avoids pandas' SettingWithCopyWarning).
        rep = df.tail(1).copy()
        rep['weight'] = len(df)
        return rep

    log.info('choosing seq_info representatives')
    seq_info = seq_info.groupby(
        by=group_by, group_keys=False).apply(choose_rep)
    # The hash was only a grouping key; it is not part of the output.
    seq_info = seq_info.drop('seqhash', axis=1)

    log.info('writing seqinfo')
    seq_info.to_csv(args.out_info, quoting=csv.QUOTE_NONNUMERIC)

    log.info('writing dedup file')
    # Second pass over the input: emit only the records that survived.
    with util.file_opener()(args.sequences) as sequences_in, \
            util.file_opener('w')(args.out) as sequences_out:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            if record.name in seq_info.index:
                fasta_out = '>{}\n{}\n'.format(record.name, str(record.seq))
                sequences_out.write(fasta_out)
def action(args):
    """Deduplicate sequences by content and write the reduced outputs.

    Reads ``args.seq_info`` (CSV indexed by ``seqname``, all columns as
    strings) and hashes every record in ``args.sequences`` with SHA-1 after
    upper-casing it. Rows are grouped on that hash plus any comma-separated
    columns named in ``args.group_by``. One representative row is kept per
    group (the last row after an optional sort on ``args.prefer_columns``)
    and gains a ``weight`` column holding the group size.

    The reduced table is written to ``args.out_info`` and the matching
    sequences are written in FASTA format to ``args.out``.
    """
    seq_info = pandas.read_csv(
        args.seq_info, dtype=str, index_col='seqname')

    log.info('reading sequences')
    seqhashes = {}
    with util.file_opener()(args.sequences) as sequences_in:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            seq = str(record.seq).replace('\n', '').upper()
            # hashlib only accepts bytes on Python 3; passing the str
            # directly raises TypeError there.
            digest = hashlib.sha1(seq.encode('utf-8')).hexdigest()
            seqhashes[record.name] = digest

    seqhash = pandas.Series(data=seqhashes, name='seqhash')
    seqhash.index.name = 'seqname'
    seq_info = seq_info.join(seqhash)

    group_by = ['seqhash']
    if args.group_by:
        group_by.extend(args.group_by.split(','))

    def choose_rep(df):
        """Return the single representative row for one group."""
        if args.prefer_columns:
            df = df.sort_values(by=args.prefer_columns.split(','))
        # Copy so that adding a column does not write into a slice of the
        # group frame (avoids pandas' SettingWithCopyWarning).
        rep = df.tail(1).copy()
        rep['weight'] = len(df)
        return rep

    log.info('choosing seq_info representatives')
    seq_info = seq_info.groupby(
        by=group_by, group_keys=False).apply(choose_rep)
    # The hash was only a grouping key; it is not part of the output.
    seq_info = seq_info.drop('seqhash', axis=1)

    log.info('writing seqinfo')
    seq_info.to_csv(args.out_info, quoting=csv.QUOTE_NONNUMERIC)

    log.info('writing dedup file')
    # Second pass over the input: emit only the records that survived.
    with util.file_opener()(args.sequences) as sequences_in, \
            util.file_opener('w')(args.out) as sequences_out:
        for record in util.Counter(SeqIO.parse(sequences_in, 'fasta')):
            if record.name in seq_info.index:
                fasta_out = '>{}\n{}\n'.format(record.name, str(record.seq))
                sequences_out.write(fasta_out)
def build_parser(p):
    """Register this subcommand's command-line arguments on ``p``.

    Inputs are a FASTA file, a seq_info CSV and an optional references
    CSV. Output options are split into one argument group for seq_info
    tables and one for sequence files; every output is opened for writing.
    A third group carries the optional filtering thresholds.
    """
    # inputs
    p.add_argument(
        'fasta',
        metavar='FASTA',
        type=util.file_opener('r'),
        help='sequence file')
    p.add_argument('seqinfo', metavar='CSV', help='Sequence metadata')
    p.add_argument(
        '--references',
        metavar='CSV',
        help='csv file with columns: version,pubmed_id')

    # outputs
    info_group = p.add_argument_group("outputs for seq_info's")
    info_options = (
        ('--named-info', 'taxid_classified column is True'),
        ('--unnamed-info', 'taxid_classified column is False'),
        ('--type-info', 'rows where is_type column is True'),
        ('--published-info',
         'requires references.csv. Return seq_info with pubmed_ids'),
        ('--references-info',
         'requires references.csv. '
         'Return columns [version, accession, pubmed_id]'),
    )
    for flag, description in info_options:
        info_group.add_argument(
            flag,
            metavar='CSV',
            type=util.file_opener('w'),
            help=description)

    seq_group = p.add_argument_group('outputs for sequences')
    seq_options = (
        ('--named-seqs', 'where taxid_classified column is True'),
        ('--unnamed-seqs', 'where taxid_classified column is False'),
        ('--type-seqs', 'where is_type column is True'),
        ('--published-seqs',
         'requires references.csv. Return sequences with pubmed_ids'),
    )
    for flag, description in seq_options:
        seq_group.add_argument(
            flag,
            metavar='FASTA',
            type=util.file_opener('w'),
            help=description)

    # filtering switches
    filtering = p.add_argument_group('filtering options')
    filtering.add_argument(
        '-a', '--prop-ambig-cutoff',
        type=float,
        help=('Maximum proportion of characters in sequence which may be '
              'ambiguous'))
    filtering.add_argument(
        '-l', '--min-length',
        type=int,
        help='Minimum sequence length')