def revcomp(par):
    """Write the reversed and/or complemented records of a FASTA file.

    ``par`` is a mapping with the keys:

    * ``inp_f``      -- input FASTA, opened with ``utils.openr``
    * ``out_f``      -- output FASTA, opened with ``utils.openw``
    * ``reverse``    -- truthy to reverse every sequence
    * ``complement`` -- truthy to complement every sequence

    The description of each written record is set to "RC", "R" or "C"
    depending on the transformation applied; the record id is kept.
    When neither flag is set the input is not read and no record is written.
    """
    do_reverse = par["reverse"]
    do_complement = par["complement"]

    def _transform(rec):
        # Reverse-complement is the only one of the three operations that
        # SeqRecord offers as a method.
        if do_reverse and do_complement:
            return rec.reverse_complement(id=rec.id, description="RC")
        if do_reverse:
            # SeqRecord has no reverse(); slicing with a negative step
            # returns a new record that keeps the id.
            out = rec[::-1]
            out.description = "R"
            return out
        # SeqRecord has no complement() either: copy the record and swap in
        # the complemented Seq (same length as the original).
        out = rec[:]
        out.seq = rec.seq.complement()
        out.description = "C"
        return out

    with utils.openw(par["out_f"]) as outf:
        if not (do_reverse or do_complement):
            SeqIO.write([], outf, "fasta")
            return
        # Keep the input handle in a `with` so it is closed once the lazy
        # generator has been fully consumed by SeqIO.write.
        with utils.openr(par["inp_f"]) as inpf:
            res = (_transform(r) for r in SeqIO.parse(inpf, "fasta"))
            SeqIO.write(res, outf, "fasta")
def revcomp(par):
    """Write the reversed and/or complemented records of a FASTA file.

    ``par`` is a mapping with the keys:

    * ``inp_f``      -- input FASTA, opened with ``utils.openr``
    * ``out_f``      -- output FASTA, opened with ``utils.openw``
    * ``reverse``    -- truthy to reverse every sequence
    * ``complement`` -- truthy to complement every sequence

    The description of each written record is set to "RC", "R" or "C"
    depending on the transformation applied; the record id is kept.
    When neither flag is set the input is not read and no record is written.
    """
    do_reverse = par['reverse']
    do_complement = par['complement']

    def _transform(rec):
        # Reverse-complement is the only one of the three operations that
        # SeqRecord offers as a method.
        if do_reverse and do_complement:
            return rec.reverse_complement(id=rec.id, description="RC")
        if do_reverse:
            # SeqRecord has no reverse(); slicing with a negative step
            # returns a new record that keeps the id.
            out = rec[::-1]
            out.description = "R"
            return out
        # SeqRecord has no complement() either: copy the record and swap in
        # the complemented Seq (same length as the original).
        out = rec[:]
        out.seq = rec.seq.complement()
        out.description = "C"
        return out

    with utils.openw(par['out_f']) as outf:
        if not (do_reverse or do_complement):
            SeqIO.write([], outf, "fasta")
            return
        # Keep the input handle in a `with` so it is closed once the lazy
        # generator has been fully consumed by SeqIO.write.
        with utils.openr(par['inp_f']) as inpf:
            res = (_transform(r) for r in SeqIO.parse(inpf, "fasta"))
            SeqIO.write(res, outf, "fasta")
def parse_primersearch( fn ):
    """Parse a primersearch report into a dict of amplimer records.

    The file ``fn`` is opened with ``utils.openr(fn, "U")`` and scanned line
    by line.  A record is stored every time an "Amplimer length" line is
    seen, keyed by the most recent line that starts with "Amplimer" (and is
    not itself the length line).  Each record is a dict with:

    * ``seq``  -- text following "Sequence:"
    * ``fseq`` -- first token of the "hits forward strand at" line
    * ``fs``   -- integer position on the forward strand
    * ``rseq`` -- first token of the "hits reverse strand at" line
    * ``rs``   -- integer position on the reverse strand (the first and last
                  character around the number are dropped before conversion)
    * ``al``   -- integer amplimer length in bp

    Fields not seen since the previous record are left as ``None``.
    """
    # NOTE(review): records are keyed by the raw "Amplimer ..." header line;
    # if the same header text occurs more than once in the report, later
    # records overwrite earlier ones -- confirm against real reports.
    field_names = ('seq', 'fs', 'rs', 'al', 'rseq', 'fseq')
    seqs = {}
    header = None
    record = dict.fromkeys(field_names)

    with utils.openr( fn, "U" ) as inpf:
        for raw in inpf:
            line = raw.strip()
            is_length_line = 'Amplimer length' in line

            if line.startswith("Amplimer") and not is_length_line:
                header = line
            elif line.startswith("Sequence"):
                record['seq'] = line.split("Sequence:")[1].strip()
            elif 'hits forward strand at ' in line:
                record['fseq'] = line.split()[0]
                position = line.split("hits forward strand at ")[1]
                record['fs'] = int(position.split("with")[0])
            elif 'hits reverse strand at ' in line:
                record['rseq'] = line.split()[0]
                position = line.split("hits reverse strand at ")[1]
                # drop the surrounding bracket characters before converting
                record['rs'] = int(position.split("with")[0].strip()[1:-1])
            elif is_length_line:
                length = line.split("Amplimer length: ")[1]
                record['al'] = int(length.split("bp")[0])
                # the length line closes the record: store it and start over
                seqs[header] = record
                header = None
                record = dict.fromkeys(field_names)
    return seqs
def blast_ncbi_outfmt6_screen(par):
    """Filter, optionally sort and truncate a tab-delimited BLAST report.

    ``par`` is a mapping with the keys:

    * ``inp_f`` / ``out_f`` -- input / output file names; a falsy value
      selects ``sys.stdin`` / ``sys.stdout``
    * ``pid_col``, ``length_col``, ``evalue_col``, ``bitscore_col`` --
      1-based column numbers of the respective fields
    * ``pid``, ``length``, ``bitscore`` -- minimum accepted values
    * ``evalue`` -- maximum accepted value
    * ``s`` (optional) -- one of 'pid', 'evalue', 'length', 'bitscore';
      when set, the accepted rows are sorted ascending on that column
    * ``n`` (optional) -- keep only the first ``n`` rows (ignored if
      missing, ``None`` or negative)
    * ``t`` (optional) -- keep at most ``t`` rows per value of the first
      column (ignored if missing, ``None`` or negative)

    Raises ``ValueError`` if ``s`` is set to an unknown field name.
    """
    import itertools

    sortable = ('pid', 'evalue', 'length', 'bitscore')

    def _cap(name):
        # Optional non-negative limit; anything else means "no limit".
        value = par[name] if name in par else None
        if value is None or value < 0:
            return None
        return value

    sort_by = par['s'] if 's' in par else None
    if sort_by and sort_by not in sortable:
        raise ValueError("unknown sort field: %r" % (sort_by,))

    pid_i = par['pid_col'] - 1
    len_i = par['length_col'] - 1
    eva_i = par['evalue_col'] - 1
    bit_i = par['bitscore_col'] - 1

    def _passes(row):
        return (float(row[pid_i]) >= par['pid'] and
                float(row[len_i]) >= par['length'] and
                float(row[eva_i]) <= par['evalue'] and
                float(row[bit_i]) >= par['bitscore'])

    inp = utils.openr(par['inp_f']) if par['inp_f'] else sys.stdin
    try:
        # Blank lines are skipped rather than crashing on a missing column.
        rows = (l.rstrip('\n').split("\t") for l in inp if l.strip())
        hits = (row for row in rows if _passes(row))

        if sort_by:
            # The column is always looked up via "<field>_col" and converted
            # from 1-based to 0-based exactly once.
            # NOTE(review): sorting is ascending for every field, as before;
            # confirm that this is wanted for pid/length/bitscore.
            col = par[sort_by + '_col'] - 1
            hits = sorted(hits, key=lambda row: float(row[col]))

        max_rows = _cap('n')
        if max_rows is not None:
            # islice works on both the sorted list and the lazy generator.
            hits = itertools.islice(hits, max_rows)

        max_per_query = _cap('t')
        seen = collections.defaultdict(int)

        out = utils.openw(par['out_f']) if par['out_f'] else sys.stdout
        try:
            for row in hits:
                if max_per_query is not None:
                    seen[row[0]] += 1
                    if seen[row[0]] > max_per_query:
                        continue
                out.write("\t".join(row) + "\n")
        finally:
            # Never close the process-wide stdout.
            if out is not sys.stdout:
                out.close()
    finally:
        if inp is not sys.stdin:
            inp.close()
def blast_ncbi_outfmt6_screen(par):
    """Filter, optionally sort and truncate a tab-delimited BLAST report.

    ``par`` is a mapping with the keys:

    * ``inp_f`` / ``out_f`` -- input / output file names; a falsy value
      selects ``sys.stdin`` / ``sys.stdout``
    * ``pid_col``, ``length_col``, ``evalue_col``, ``bitscore_col`` --
      1-based column numbers of the respective fields
    * ``pid``, ``length``, ``bitscore`` -- minimum accepted values
    * ``evalue`` -- maximum accepted value
    * ``s`` (optional) -- one of 'pid', 'evalue', 'length', 'bitscore';
      when set, the accepted rows are sorted ascending on that column
    * ``n`` (optional) -- keep only the first ``n`` rows (ignored if
      missing, ``None`` or negative)
    * ``t`` (optional) -- keep at most ``t`` rows per value of the first
      column (ignored if missing, ``None`` or negative)

    Raises ``ValueError`` if ``s`` is set to an unknown field name.
    """
    import itertools

    sortable = ('pid', 'evalue', 'length', 'bitscore')

    def _cap(name):
        # Optional non-negative limit; anything else means "no limit".
        value = par[name] if name in par else None
        if value is None or value < 0:
            return None
        return value

    sort_by = par['s'] if 's' in par else None
    if sort_by and sort_by not in sortable:
        raise ValueError("unknown sort field: %r" % (sort_by,))

    pid_i = par['pid_col'] - 1
    len_i = par['length_col'] - 1
    eva_i = par['evalue_col'] - 1
    bit_i = par['bitscore_col'] - 1

    def _passes(row):
        return (float(row[pid_i]) >= par['pid'] and
                float(row[len_i]) >= par['length'] and
                float(row[eva_i]) <= par['evalue'] and
                float(row[bit_i]) >= par['bitscore'])

    inp = utils.openr(par['inp_f']) if par['inp_f'] else sys.stdin
    try:
        # Blank lines are skipped rather than crashing on a missing column.
        rows = (l.rstrip('\n').split("\t") for l in inp if l.strip())
        hits = (row for row in rows if _passes(row))

        if sort_by:
            # The column is always looked up via "<field>_col" and converted
            # from 1-based to 0-based exactly once.
            # NOTE(review): sorting is ascending for every field, as before;
            # confirm that this is wanted for pid/length/bitscore.
            col = par[sort_by + '_col'] - 1
            hits = sorted(hits, key=lambda row: float(row[col]))

        max_rows = _cap('n')
        if max_rows is not None:
            # islice works on both the sorted list and the lazy generator.
            hits = itertools.islice(hits, max_rows)

        max_per_query = _cap('t')
        seen = collections.defaultdict(int)

        out = utils.openw(par['out_f']) if par['out_f'] else sys.stdout
        try:
            for row in hits:
                if max_per_query is not None:
                    seen[row[0]] += 1
                    if seen[row[0]] > max_per_query:
                        continue
                out.write("\t".join(row) + "\n")
        finally:
            # Never close the process-wide stdout.
            if out is not sys.stdout:
                out.close()
    finally:
        if inp is not sys.stdin:
            inp.close()
import utils

try:
    import argparse as ap
    import bz2
except ImportError:
    sys.stderr.write( "argparse not found" )
    sys.exit(-1)


def read_params( args ):
    """Parse the command line and return the options as a dict.

    ``args`` is the full argument vector (``sys.argv`` style: the program
    name comes first).  When it is ``None`` argparse falls back to
    ``sys.argv``.  The returned dict has the keys ``txt`` (input table) and
    ``ls`` (output libsvm file); both default to ``None``.
    """
    p = ap.ArgumentParser(description='Convert txt files to libsvm\n')

    p.add_argument( 'txt', nargs='?', default=None, type=str,
            help=   "the input txt file [stdin if not present]")
    p.add_argument('ls', nargs='?', default=None, type=str,
            help=   "the output ilibsvm file compressed if fiven with bz2 extension\n"
                    "[stdout if not present]")

    # Honor the argument vector we were given instead of silently ignoring it.
    return vars( p.parse_args( args[1:] if args is not None else None ) )


if __name__ == "__main__":
    args = read_params( sys.argv )

    with utils.openr(args['txt']) as inp:
        rows = [l.strip().split('\t') for l in inp]

    # Transpose the table so that each input column becomes one output line.
    # zip() is wrapped in list() because its result cannot be sliced on
    # Python 3.  As before, zip() truncates to the shortest input row.
    columns = list(zip(*rows))

    with utils.openw(args['ls']) as out:
        # The first input column is skipped; for every other column the
        # first cell is written as-is, followed by "<1-based index>:<value>".
        for col in columns[1:]:
            feats = [str(i + 1) + ":" + v for i, v in enumerate(col[1:])]
            out.write( "\t".join([col[0]] + feats) + "\n" )
arg("-a", default=None, type=int, help="number of char after the match to report") arg("-n", default=None, type=int, help="number of matching primers") parser.add_argument("-s", metavar="Subsequene to look for", required=True, type=str) return vars(parser.parse_args()) if __name__ == "__main__": par = read_params(sys.argv) ss = par["s"].lower() ssr = Seq(par["s"]).reverse_complement().lower() f = os.path.basename(par["inp_f"]).split(".")[0] with utils.openw(par["out_f"]) as outf: for r in SeqIO.parse(utils.openr(par["inp_f"]), "fasta"): rl = r.seq.lower() if ss in rl or ssr in rl: if par["a"]: if ss in rl: i = str(rl).index(str(ss)) subs = rl[i : i + len(ss) + par["a"]] if i + len(ss) + par["a"] < len(rl) else rl[i:] else: i = str(rl).index(str(ssr)) subs = rl[i : i + len(ssr) + par["a"]] if i + len(ssr) + par["a"] < len(rl) else rl[i:] outf.write(f + "\t" + str(r.id) + "\t" + str(subs) + "\n") else: if par["n"]: n = str(rl).count(str(ss)) + str(rl).count(str(ssr)) outf.write(f + "\t" + str(r.id) + "\t" + str(n) + "\n") else:
uc2cl = collections.defaultdict( set ) if not args['g2t'] and not args['t2g']: sys.stdout.write("Error one of --t2g and --g2t must be provided\n") sys.exit(0) g2t = {} if args['g2t']: with open( args['g2t'] ) as inp: g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp)) elif args['t2g']: with open( args['t2g'] ) as inp: for ll in (l.strip().split('\t') for l in inp): for g in ll[1:]: g2t[int(g)] = int(ll[0]) with utils.openr( args['ctxt'] ) as inp: valin = (l.strip().split('\t') for l in inp) g2c = collections.defaultdict( set ) if args['b6o']: inp_mat = ((int(a),int(b)) for a,b in (l.rstrip('\n').split("\t")[:2] for l in utils.openr(args['b6o']))) #all_targets = set() for fr,to in inp_mat: #all_targets.add( to ) if fr != to: g2c[fr].add( to ) n = args['n'] # if args['n'] else len(all_targets) n = float(n)
'--txt', required=True, default=None, type=str, help= "the table of the samples to profiles [tab-delimited, columns ID are profileName]" ) p.add_argument('--nmiss', default=0, type=int) return vars(p.parse_args()) if __name__ == "__main__": args = read_params(sys.argv) fna = SeqIO.to_dict(SeqIO.parse(utils.openr(args['fna']), "fasta")) fna_out = [] profiles = {} mlst_names = [] with utils.openr(args['txt']) as inp: for i, line in enumerate(inp): if i == 0: mlst_names = line.strip().split('\t')[1:] continue l = line.strip().split('\t') profiles[l[0]] = dict([(na, l[n + 1]) for n, na in enumerate(mlst_names)]) for s, p in profiles.items(): seq = "" skip = 0
def sss(par):
    """Select, subsample, split and/or shuffle the records of a FASTA file.

    ``par`` is a mapping with the keys:

    * ``inp_f``     -- input FASTA, opened with ``utils.openr``
    * ``out_f``     -- output name; falsy means write to ``sys.stdout``
    * ``split``     -- number of output files; when it is not 1 the files
      are named ``out_f`` + zero-padded index + ".fna" (+ ".bz2" if
      ``out_f`` ends with ".bz2")
    * ``min_len`` / ``max_len`` -- optional sequence length bounds
    * ``select``    -- truthy to filter records by id using ``ids``
    * ``ids``       -- either the path of an existing file whose first
      tab-separated column holds the ids, or a ":::"-separated list of ids
      (for items containing "$" the part after the first "$" is used)
    * ``reverse``   -- with ``select``: drop the listed ids instead of
      keeping them
    * ``subsample`` -- probability of keeping each record (falsy disables)
    * ``randomize`` -- truthy to shuffle the kept records before writing

    Records are distributed round-robin over the output streams.
    """
    subsample = bool(par['subsample'])
    select = bool(par['select'])
    randomize = bool(par['randomize'])
    p = par['subsample']
    lmin, lmax = par['min_len'], par['max_len']

    es = set()
    if select:
        if os.path.exists(par['ids']):
            with utils.openr(par['ids']) as ids_f:
                es = set(s.strip().split('\t')[0] for s in ids_f)
        else:
            es = set((s.split("$")[1] if s.count("$") else s)
                     for s in par['ids'].split(":::"))

    if par['out_f']:
        n = par['split']
        if n == 1:
            out_stream = [utils.openw(par['out_f'])]
        else:
            width = len(str(n))
            suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            out_stream = [utils.openw(par['out_f'] + str(r).zfill(width) + suffix)
                          for r in range(n)]
    else:
        out_stream = [sys.stdout]  # larger buffer?
    nstreams = len(out_stream)

    try:
        all_reads = []
        cind = 0
        with utils.openr(par['inp_f']) as inp:
            for r in SeqIO.parse(inp, "fasta"):
                if lmin and len(r.seq) < lmin:
                    continue
                if lmax and len(r.seq) > lmax:
                    continue
                if select:
                    if par['reverse']:
                        if r.id in es:
                            continue
                    elif r.id not in es:
                        continue
                if subsample and rnd.random() > p:
                    continue
                if randomize:
                    # shuffling needs every kept record in memory
                    all_reads.append(r)
                    continue
                SeqIO.write(r, out_stream[cind], "fasta")
                cind = (cind + 1) % nstreams

        if randomize:
            rnd.shuffle(all_reads)
            # Round-robin over the shuffled reads.  The previous code called
            # the SeqIO module itself and computed `i % step`, which divides
            # by zero when there are fewer reads than output streams.
            for i, r in enumerate(all_reads):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        for o in out_stream:
            if o is sys.stdout:
                # never close the process-wide stdout
                o.flush()
            else:
                o.close()
p = ap.ArgumentParser(description='Convert core gene files to core gene summaries\n') p.add_argument( 'cg', nargs='?', default=None, type=str, help= "the input cg file [stdin if not present]") p.add_argument('cgs', nargs='?', default=None, type=str, help= "the output summary file\n" "[stdout if not present]") return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) gid2cores = collections.defaultdict( set ) #with (open(args['uc']) if args['uc'] else sys.stdin) as inp: with utils.openr(args['cg']) as inp: for line in (l.split('\t') for l in inp): if int(line[0]) > 0: gid,clade,ncore,ngenomes,pv = line[:5] else: gid,clade,ncore,ngenomes,pv = line[1:6] gid2cores[gid].add( (clade,ncore,ngenomes,pv) ) clades2cores = collections.defaultdict( set ) for k,v in gid2cores.items(): if len(v) > 1: continue clades2cores[list(v)[0][0]].add( k ) #openw = bz2.BZ2File if args['txt'].endswith(".bz2") else open with utils.openw(args['cgs']) as out:
p.add_argument( 'fna', nargs='?', default=None, type=str, help= "the input uc file [stdin if not present]") p.add_argument('rxl', nargs='?', default=None, type=str, help= "the output txt file compresse if fiven with bz2 extension\n" "[stdout if not present]") """ p.add_argument('--subsample', metavar="Subsampling rate", default=1.0, type=float ) p.add_argument('-n', metavar="Minimum number of matching taxa", default=0, type=int ) p.add_argument('-p', metavar="Prefix for taxon names", default="", type=str ) """ return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) fna = SeqIO.to_dict(SeqIO.parse( utils.openr(args['fna']), "fasta")) with utils.openw(args['rxl']) as out: n = len(fna.values()[0]) out.write( str(len(fna))+" "+str(n)+"\n" ) for k,v in fna.items(): if len(k) > 14: k = k[:14] out.write( str(k)+" "*(15-len(str(k)[1:]))+str(v.seq) +"\n" )
def sss( par ):
    """Select, subsample, split and/or shuffle the records of a FASTA file.

    ``par`` is a mapping with the keys:

    * ``inp_f``     -- input FASTA, opened with ``utils.openr``
    * ``out_f``     -- output name; falsy means write to ``sys.stdout``
    * ``split``     -- number of output files; when it is not 1 the files
      are named ``out_f`` + zero-padded index + ".fna" (+ ".bz2" if
      ``out_f`` ends with ".bz2")
    * ``min_len`` / ``max_len`` -- optional sequence length bounds
    * ``select``    -- truthy to filter records by id using ``ids``
    * ``ids``       -- either the path of an existing file whose first
      tab-separated column holds the ids, or a ":::"-separated list of ids
      (for items containing "$" the part after the first "$" is used)
    * ``reverse``   -- with ``select``: drop the listed ids instead of
      keeping them
    * ``subsample`` -- probability of keeping each record (falsy disables)
    * ``randomize`` -- truthy to shuffle the kept records before writing

    Records are distributed round-robin over the output streams.
    """
    subsample = bool(par['subsample'])
    select = bool(par['select'])
    randomize = bool(par['randomize'])
    p = par['subsample']
    lmin, lmax = par['min_len'], par['max_len']

    es = set()
    if select:
        if os.path.exists(par['ids']):
            with utils.openr(par['ids']) as ids_f:
                es = set(s.strip().split('\t')[0] for s in ids_f)
        else:
            es = set((s.split("$")[1] if s.count("$") else s)
                     for s in par['ids'].split(":::"))

    if par['out_f']:
        n = par['split']
        if n == 1:
            out_stream = [utils.openw( par['out_f'] )]
        else:
            width = len(str(n))
            suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            out_stream = [utils.openw( par['out_f'] + str(r).zfill(width) + suffix )
                          for r in range(n)]
    else:
        out_stream = [sys.stdout]  # larger buffer?
    nstreams = len( out_stream )

    try:
        all_reads = []
        cind = 0
        with utils.openr(par['inp_f']) as inp:
            for r in SeqIO.parse( inp, "fasta" ):
                if lmin and len(r.seq) < lmin:
                    continue
                if lmax and len(r.seq) > lmax:
                    continue
                if select:
                    if par['reverse']:
                        if r.id in es:
                            continue
                    elif r.id not in es:
                        continue
                if subsample and rnd.random() > p:
                    continue
                if randomize:
                    # shuffling needs every kept record in memory
                    all_reads.append( r )
                    continue
                SeqIO.write(r, out_stream[cind], "fasta")
                cind = (cind + 1) % nstreams

        if randomize:
            rnd.shuffle(all_reads)
            # Round-robin over the shuffled reads.  The previous code called
            # the SeqIO module itself and computed `i % step`, which divides
            # by zero when there are fewer reads than output streams.
            for i, r in enumerate(all_reads):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        for o in out_stream:
            if o is sys.stdout:
                # never close the process-wide stdout
                o.flush()
            else:
                o.close()
def __init__( self, fn, min_len = None, max_len = None ):
    """Initialise the reader state.

    ``fn`` is the input file name, opened with ``utils.openr``; a falsy
    value makes the reader use ``sys.stdin`` instead.  ``min_len`` and
    ``max_len`` are stored unchanged on the instance.
    """
    self.ret = False
    self.min_len = min_len
    self.max_len = max_len
    if fn:
        self.inp = utils.openr(fn)
    else:
        self.inp = sys.stdin
    # `read` is defined elsewhere in this file; `cr` starts as a fresh one.
    self.cr = read( )
if par['r'].count(":"): rn, par['r'] = par['r'].split(":") rn += ":" if par['l'].count(":"): ln, par['l'] = par['l'].split(":") ln += ":" ne = str(par['e']) c_r = par['r'].lower() c_r_rev = Seq(par['r']).reverse_complement().lower() c_l = par['l'].lower() c_l_rev = Seq(par['l']).reverse_complement().lower() f = os.path.basename(par['inp_f']).split(".")[0] with utils.openw(par['out_f']) as outf: for seq in SeqIO.parse(utils.openr(par['inp_f']), "fasta"): seql = str(seq.seq.lower()) r_rev, l_rev, r, l = c_r_rev, c_l_rev, c_r, c_l rr = regex.findall("(" + r + "){e<=" + ne + "}", seql) lr = regex.findall("(" + l + "){e<=" + ne + "}", seql) r_revr = regex.findall("(" + str(r_rev) + "){e<=" + ne + "}", seql) l_revr = regex.findall("(" + str(l_rev) + "){e<=" + ne + "}", seql) if len(rr) > 1: outf.write(str(rr) + " unspecific 1\n") if len(lr) > 1: outf.write(str(lr) + " unspecific 2\n") if len(r_revr) > 1: outf.write(str(r_revr) + " unspecific 3\n") if len(l_revr) > 1:
lt,last = a[0],(a[3:] if "?" not in a else last) v += arr[-2:] return v if __name__ == "__main__": args = read_params( sys.argv ) uc2cl = collections.defaultdict( set ) tax_lev = "dpcofgs" tax_lev_exp = ['Domain','Phylum','Class','Order','Family','Genus','Species'] fp = tempfile.TemporaryFile() if args['corrections']: with utils.openr(args['corrections']) as inp: frto = {} frtoid = {} for pat in (l.split('\t') for l in inp): if len(pat) == 2: frto[pat[0].strip()] = pat[1].strip() else: frtoid[pat[0].strip()] = (pat[1].strip(),pat[2].strip()) with utils.openr(args['img'],"rU") as inpf: nfa = [] for l in inpf: nf = l for f,t in frto.items(): nf = nf.replace(f,t) for i,(f,t) in frtoid.items(): if l.startswith(i+"\t"):
return vars(p.parse_args()) if __name__ == "__main__": args = read_params(sys.argv) uc2cl = collections.defaultdict(set) gint = str if args['sk'] else int if not args['g2t'] and not args['t2g']: sys.stdout.write("Error one of --t2g and --g2t must be provided\n") sys.exit(0) g2t = {} if args['g2t']: with utils.openr(args['g2t']) as inp: #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp)) for l in inp: f, t = l.strip().split('\t') g2t[gint(f)] = gint(t) elif args['t2g']: with utils.openr(args['t2g']) as inp: for ll in (l.strip().split('\t') for l in inp): for g in ll[1:]: g2t[gint(g)] = gint(ll[0]) with utils.openw(args['txt']) as out: with utils.openr(args['ctxt']) as inp: for l in inp: valin = [gint(a) for a in l.strip().split('\t')]
uc2cl = collections.defaultdict(set) if not args['g2t'] and not args['t2g']: sys.stdout.write("Error one of --t2g and --g2t must be provided\n") sys.exit(0) g2t = {} if args['g2t']: with open(args['g2t']) as inp: g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp)) elif args['t2g']: with open(args['t2g']) as inp: for ll in (l.strip().split('\t') for l in inp): for g in ll[1:]: g2t[int(g)] = int(ll[0]) with utils.openr(args['ctxt']) as inp: valin = (l.strip().split('\t') for l in inp) g2c = collections.defaultdict(set) if args['b6o']: inp_mat = ((int(a), int(b)) for a, b in (l.rstrip('\n').split("\t")[:2] for l in utils.openr(args['b6o']))) #all_targets = set() for fr, to in inp_mat: #all_targets.add( to ) if fr != to: g2c[fr].add(to)
if par['r'].count(":"): rn,par['r'] = par['r'].split(":") rn+=":" if par['l'].count(":"): ln,par['l'] = par['l'].split(":") ln+=":" ne = str(par['e']) c_r = par['r'].lower() c_r_rev = Seq(par['r']).reverse_complement().lower() c_l = par['l'].lower() c_l_rev = Seq(par['l']).reverse_complement().lower() f = os.path.basename(par['inp_f']).split(".")[0] with utils.openw( par['out_f'] ) as outf: for seq in SeqIO.parse( utils.openr(par['inp_f']), "fasta"): seql = str(seq.seq.lower()) r_rev,l_rev,r,l = c_r_rev,c_l_rev,c_r,c_l rr = regex.findall( "("+r+"){e<="+ne+"}", seql ) lr = regex.findall( "("+l+"){e<="+ne+"}", seql ) r_revr = regex.findall( "("+str(r_rev)+"){e<="+ne+"}", seql ) l_revr = regex.findall( "("+str(l_rev)+"){e<="+ne+"}", seql ) if len(rr) > 1: outf.write( str(rr) +" unspecific 1\n" ) if len(lr) > 1: outf.write( str(lr) +" unspecific 2\n" ) if len(r_revr) > 1: outf.write( str(r_revr) +" unspecific 3\n" ) if len(l_revr) > 1:
def __init__(self, fn, min_len=None, max_len=None):
    """Initialise the reader state.

    ``fn`` is the input file name, opened with ``utils.openr``; a falsy
    value makes the reader use ``sys.stdin`` instead.  ``min_len`` and
    ``max_len`` are stored unchanged on the instance.
    """
    self.ret = False
    self.min_len = min_len
    self.max_len = max_len
    if fn:
        self.inp = utils.openr(fn)
    else:
        self.inp = sys.stdin
    # `read` is defined elsewhere in this file; `cr` starts as a fresh one.
    self.cr = read()
def read_params(args):
    """Parse the command line and return the options as a dict.

    ``args`` is the full argument vector (``sys.argv`` style: the program
    name comes first).  When it is ``None`` argparse falls back to
    ``sys.argv``.  The returned dict has the keys ``inp_f`` (required) and
    ``out_f`` (``None`` when not given).
    """
    parser = argparse.ArgumentParser(
        description='List the genes in the genome file')
    arg = parser.add_argument
    arg('inp_f', metavar='INPUT_FILE', default=None, type=str,
        help="the input fna file")
    arg('out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
        help="the output txt file [stdout if not present]")
    # Honor the argument vector we were given instead of silently ignoring it.
    return vars(parser.parse_args(args[1:] if args is not None else None))


def genome_id(fn):
    """Return the negated integer prefix of the file name, as a string.

    The prefix is the part of the base name before the first "."; it must
    be parseable by ``int`` (``ValueError`` otherwise).
    """
    return str(-int(os.path.basename(fn).split(".")[0]))


if __name__ == '__main__':
    par = read_params(sys.argv)
    # Close the input handle as soon as all record ids have been collected.
    with utils.openr(par['inp_f']) as inp:
        ids = [r.id for r in SeqIO.parse(inp, "fasta")]
    # One output line: the genome id followed by every record id.
    with utils.openw(par['out_f']) as out:
        out.write("\t".join([genome_id(par['inp_f'])] + ids) + "\n")
return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) uc2cl = collections.defaultdict( set ) gint = str if args['sk'] else int if not args['g2t'] and not args['t2g']: sys.stdout.write("Error one of --t2g and --g2t must be provided\n") sys.exit(0) g2t = {} if args['g2t']: with utils.openr( args['g2t'] ) as inp: #g2t = dict(([int(a) for a in l.strip().split('\t')] for l in inp)) for l in inp: f,t = l.strip().split('\t') g2t[gint(f)] = gint(t) elif args['t2g']: with utils.openr( args['t2g'] ) as inp: for ll in (l.strip().split('\t') for l in inp): for g in ll[1:]: g2t[gint(g)] = gint(ll[0]) with utils.openw(args['txt']) as out: with utils.openr( args['ctxt'] ) as inp: for l in inp: valin = [gint(a) for a in l.strip().split('\t')]
arg( 'inp_f', metavar='INPUT_FILE', nargs='?', default=None, type=str, help="the input fna file [stdin if not present]") arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str, help="the output fna file [stdout if not present]") parser.add_argument('--extract_targets', action='store_true', help="Select fna entries\n") parser.add_argument('-i', action='store_true', help="Add hit stats to fna entries\n") parser.add_argument('--bo6', metavar='Bo6 file', required=True, type = str ) return vars(parser.parse_args()) if __name__ == '__main__': par = read_params(sys.argv) inp_mat = (l.rstrip('\n').split("\t") for l in (utils.openr(par['bo6']))) if par['extract_targets']: toextr = ((l[1], l[2], l[3], l[11], int(l[8]), int(l[9])) for l in inp_mat) else: toextr = ((l[0], l[2], l[3], l[11], int(l[6]), int(l[7])) for l in inp_mat) inpfasta = SeqIO.to_dict(SeqIO.parse( utils.openr(par['inp_f']), "fasta")) out_seqs = [] for n,pid,l,bit,fr,to in toextr: n = inpfasta[n][min(fr,to):max(fr,to)] if par['i']: p = "_pid"+pid.strip()+"_l"+l.strip()+"_bs"+bit.strip() else: p = ""
import os
import textwrap
from collections import namedtuple as nt
import random as rnd
rnd.seed(1982)
import utils
from Bio import SeqIO


def read_params(args):
    """Parse the command line and return the options as a dict.

    ``args`` is the full argument vector (``sys.argv`` style: the program
    name comes first).  When it is ``None`` argparse falls back to
    ``sys.argv``.  The returned dict has the keys ``inp_f`` (required) and
    ``out_f`` (``None`` when not given).
    """
    parser = argparse.ArgumentParser(description='List the genes in the genome file')
    arg = parser.add_argument
    arg( 'inp_f', metavar='INPUT_FILE', default=None, type=str,
         help="the input fna file")
    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
         help="the output txt file [stdout if not present]")
    # Honor the argument vector we were given instead of silently ignoring it.
    return vars(parser.parse_args(args[1:] if args is not None else None))


def genome_id( fn ):
    """Return the negated integer prefix of the file name, as a string.

    The prefix is the part of the base name before the first "."; it must
    be parseable by ``int`` (``ValueError`` otherwise).
    """
    return str(-int(os.path.basename(fn).split(".")[0]))


if __name__ == '__main__':
    par = read_params(sys.argv)
    # Close the input handle as soon as all record ids have been collected.
    with utils.openr(par['inp_f']) as inp:
        ids = [r.id for r in SeqIO.parse( inp, "fasta" )]
    # One output line: the genome id followed by every record id.
    with utils.openw( par['out_f'] ) as out:
        out.write( "\t".join( [genome_id(par['inp_f'])] + ids ) + "\n" )
nargs='?', default=None, type=str, help="the output txt file compresse if fiven with bz2 extension\n" "[stdout if not present]") """ p.add_argument('--subsample', metavar="Subsampling rate", default=1.0, type=float ) p.add_argument('-n', metavar="Minimum number of matching taxa", default=0, type=int ) p.add_argument('-p', metavar="Prefix for taxon names", default="", type=str ) """ return vars(p.parse_args()) if __name__ == "__main__": args = read_params(sys.argv) fna = SeqIO.to_dict(SeqIO.parse(utils.openr(args['fna']), "fasta")) with utils.openw(args['rxl']) as out: n = len(fna.values()[0]) out.write(str(len(fna)) + " " + str(n) + "\n") for k, v in fna.items(): if len(k) > 14: k = k[:14] out.write( str(k) + " " * (15 - len(str(k)[1:])) + str(v.seq) + "\n")
def read_params( args ): p = ap.ArgumentParser(description='Create a fasta file with the' 'concatenated mlst sequence from a mlst table and the single sequences') p.add_argument( '--fna', required=True, default=None, type=str, help= "the file with all the MLST profiles [in the format >profilineName_profileID") p.add_argument( '--txt', required=True, default=None, type=str, help= "the table of the samples to profiles [tab-delimited, columns ID are profileName]") p.add_argument( '--nmiss', default = 0, type = int ) return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) fna = SeqIO.to_dict(SeqIO.parse( utils.openr(args['fna']), "fasta")) fna_out = [] profiles = {} mlst_names = [] with utils.openr(args['txt']) as inp: for i,line in enumerate(inp): if i == 0: mlst_names = line.strip().split('\t')[1:] continue l = line.strip().split('\t') profiles[l[0]] = dict([(na,l[n+1]) for n,na in enumerate(mlst_names)]) for s,p in profiles.items(): seq = "" skip = 0 for n in mlst_names:
sys.exit(-1) def read_params( args ): p = ap.ArgumentParser(description='Convert Usearch ".uc" files in tab-delimited' ' files with the seed as first field followed by the other IDs\n') p.add_argument( 'uc', nargs='?', default=None, type=str, help= "the input uc file [stdin if not present]") p.add_argument('txt', nargs='?', default=None, type=str, help= "the output txt file compressed if fiven with bz2 extension\n" "[stdout if not present]") return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) uc2cl = collections.defaultdict( set ) #with (open(args['uc']) if args['uc'] else sys.stdin) as inp: with utils.openr(args['uc']) as inp: for type,cln,seql,pid,strand,ing1,ign2,aln,query,target in (l.split('\t') for l in inp): if type == 'H': uc2cl[target.strip()].add( query ) elif type == 'S' and query not in uc2cl: uc2cl[query] = set() #openw = bz2.BZ2File if args['txt'].endswith(".bz2") else open with utils.openw(args['txt']) as out: for k,v in sorted(uc2cl.items(),key=lambda x:-len(x[1])): out.write( "\t".join([k]+list(v)) +"\n" )
metavar="Prefix for taxon names", default="", type=str) p.add_argument('--sk', action='store_true') return vars(p.parse_args()) if __name__ == "__main__": args = read_params(sys.argv) uc2cl = collections.defaultdict(set) gint = str if args['sk'] else int valin = [] with utils.openr(args['ctxt']) as inp: for l in inp: tset = set([gint(a) for a in l.strip().split('\t')][1:]) if len(tset) < args['n']: continue valin.append(tset) all_t = set() for v in valin: all_t |= v res = {} for t in all_t: #if len(t) < args['n']: # continue res[t] = [int(t in v) for v in valin]
cur,seq,fs,rs,al,rseq,fseq = None, None, None, None, None, None, None return seqs if __name__ == "__main__": args = read_params( sys.argv ) extr = parse_primersearch( args['ps'] ) seqs2extr = {} for k,v in extr.items(): if v['seq'] in seqs2extr: seqs2extr[v['seq']][k] = v else: seqs2extr[v['seq']] = { k: v } with utils.openw( args['out'] ) as outf: for r in SeqIO.parse( utils.openr(args['fna']), "fasta"): if r.id in seqs2extr: for pn,ext in seqs2extr[r.id].items(): sq = SeqRecord( r.id ) sq.id = r.id + " " + pn sq.description = r.description + " " + pn sq.seq = r.seq[ ext['fs']+len(ext['fseq']):len(r.seq)-ext['rs']-len(ext['rseq'])] SeqIO.write(sq, outf, "fasta")