def revcomp(par):
    """Write reversed and/or complemented FASTA records.

    Parameters
    ----------
    par : mapping
        Must provide:
        ``"inp_f"``      input FASTA path (passed to ``utils.openr``),
        ``"out_f"``      output path (passed to ``utils.openw``),
        ``"reverse"``    truthy to reverse each record,
        ``"complement"`` truthy to complement each record.

    Behaviour
    ---------
    * both flags set   -> ``reverse_complement`` with description ``"RC"``
    * only ``reverse``   -> ``reverse`` with description ``"R"``
    * only ``complement``-> ``complement`` with description ``"C"``
    * neither          -> an empty record list is written and the input
      file is never opened.

    Records are streamed lazily, and the input handle is closed as soon as
    writing finishes (the original left it open).
    """
    with utils.openw(par["out_f"]) as outf:
        if par["complement"] and par["reverse"]:
            method, tag = "reverse_complement", "RC"
        elif par["reverse"]:
            method, tag = "reverse", "R"
        elif par["complement"]:
            method, tag = "complement", "C"
        else:
            # Nothing requested: emit no records, do not touch the input.
            SeqIO.write([], outf, "fasta")
            return

        # NOTE(review): Biopython documents SeqRecord.reverse_complement();
        # confirm that the record type in use really offers `reverse` and
        # `complement` with the same keyword arguments.
        with utils.openr(par["inp_f"]) as inp:
            res = (
                getattr(r, method)(id=r.id, description=tag)
                for r in SeqIO.parse(inp, "fasta")
            )
            # The generator must be consumed while `inp` is still open.
            SeqIO.write(res, outf, "fasta")
def revcomp(par):
    """Write reversed and/or complemented FASTA records.

    ``par`` is a mapping with the keys ``'inp_f'`` (input FASTA, opened with
    ``utils.openr``), ``'out_f'`` (output, opened with ``utils.openw``),
    ``'reverse'`` and ``'complement'`` (flags).

    * both flags set     -> ``reverse_complement``, description ``"RC"``
    * only ``reverse``     -> ``reverse``, description ``"R"``
    * only ``complement``  -> ``complement``, description ``"C"``
    * neither            -> nothing is written and the input is not opened.

    The input handle is now closed once all records have been written; the
    previous version never closed it.
    """
    with utils.openw(par['out_f']) as outf:
        if par['complement'] and par['reverse']:
            method, tag = 'reverse_complement', "RC"
        elif par['reverse']:
            method, tag = 'reverse', "R"
        elif par['complement']:
            method, tag = 'complement', "C"
        else:
            # No transformation requested: write an empty record set.
            SeqIO.write([], outf, "fasta")
            return

        # NOTE(review): only SeqRecord.reverse_complement() is documented by
        # Biopython; verify `reverse` / `complement` exist on the records.
        with utils.openr(par['inp_f']) as inp:
            res = (getattr(r, method)(id=r.id, description=tag)
                   for r in SeqIO.parse(inp, "fasta"))
            # Consume the lazy generator before the input handle is closed.
            SeqIO.write(res, outf, "fasta")
def blast_ncbi_outfmt6_screen(par):
    """Filter (and optionally sort / truncate) a tab-separated BLAST table.

    Parameters (keys of the mapping ``par``)
    ----------------------------------------
    inp_f, out_f
        Input / output paths; a falsy value selects ``sys.stdin`` /
        ``sys.stdout`` respectively.
    pid, length, evalue, bitscore
        Thresholds: a row is kept when its pid, length and bitscore are
        ``>=`` the threshold and its evalue is ``<=`` the threshold.
    pid_col, length_col, evalue_col, bitscore_col
        1-based column numbers of those fields.
    s (optional)
        One of ``'pid'``, ``'evalue'``, ``'length'``, ``'bitscore'``; rows are
        sorted in ascending order of that column.
    n (optional)
        Keep only the first ``n`` rows (after sorting) when ``n > -1``.
    t (optional)
        Keep at most ``t`` rows per query (first column) when ``t > -1``.

    Raises
    ------
    ValueError
        If ``s`` names an unknown sort field.

    Fixes over the previous version: the sort column is now the requested
    ``*_col`` (it used the pid *threshold* for ``'pid'`` and was off by one
    for ``'length'`` and ``'bitscore'``); ``n`` works without ``s``; ``None``
    for ``n``/``t`` means "disabled"; blank lines are skipped; the input file
    is closed; ``sys.stdout`` is no longer closed.
    """
    from itertools import islice

    def _passes(row):
        # Thresholds are applied on the user-configured (1-based) columns.
        return (float(row[par['pid_col'] - 1]) >= par['pid']
                and float(row[par['length_col'] - 1]) >= par['length']
                and float(row[par['evalue_col'] - 1]) <= par['evalue']
                and float(row[par['bitscore_col'] - 1]) >= par['bitscore'])

    def _screen(lines):
        rows = (l.rstrip('\n').split("\t") for l in lines if l.strip())
        rows = (r for r in rows if _passes(r))

        if 's' in par and par['s']:
            if par['s'] not in ('pid', 'evalue', 'length', 'bitscore'):
                raise ValueError("unknown sort field: %r" % (par['s'],))
            col = par[par['s'] + '_col'] - 1
            # NOTE(review): ascending order is kept from the original; for
            # pid/length/bitscore the best hits therefore come last.
            rows = sorted(rows, key=lambda x: float(x[col]))

        n = par['n'] if 'n' in par else None
        if n is not None and n > -1:
            # islice works for both the sorted list and the lazy generator.
            rows = islice(rows, n)
        return rows

    def _emit(rows, out_file):
        t = par['t'] if 't' in par else None
        capped = t is not None and t > -1
        per_query = collections.defaultdict(int)
        for row in rows:
            if capped:
                per_query[row[0]] += 1
                if per_query[row[0]] > t:
                    continue
            out_file.write("\t".join(row) + "\n")

    def _run(lines):
        rows = _screen(lines)
        if par['out_f']:
            with utils.openw(par['out_f']) as out_file:
                _emit(rows, out_file)
        else:
            # Do not close the process-wide stdout.
            _emit(rows, sys.stdout)

    if par['inp_f']:
        with utils.openr(par['inp_f']) as inp:
            _run(inp)
    else:
        _run(sys.stdin)
def blast_ncbi_outfmt6_screen(par):
    """Screen a tab-separated BLAST (outfmt 6 style) table by thresholds.

    ``par`` is a mapping with:

    * ``inp_f`` / ``out_f`` -- paths; falsy means ``sys.stdin`` / ``sys.stdout``.
    * ``pid``, ``length``, ``bitscore`` -- minimum accepted values;
      ``evalue`` -- maximum accepted value.
    * ``pid_col``, ``length_col``, ``evalue_col``, ``bitscore_col`` -- 1-based
      column numbers of the corresponding fields.
    * ``s`` (optional) -- ``'pid'``, ``'evalue'``, ``'length'`` or
      ``'bitscore'``: sort the surviving rows ascending on that column.
    * ``n`` (optional) -- when ``> -1`` keep only the first ``n`` rows.
    * ``t`` (optional) -- when ``> -1`` keep at most ``t`` rows per query id
      (first column).

    Raises ``ValueError`` for an unknown ``s``.

    Corrected defects: sorting used the wrong column for every key except
    ``'evalue'`` (for ``'pid'`` it even indexed with the pid threshold);
    ``n`` without ``s`` tried to slice a generator; ``None`` for ``n``/``t``
    was compared with an int; blank lines crashed the filter; the input
    handle leaked and ``sys.stdout`` was closed on exit.
    """
    from itertools import islice

    sort_fields = ('pid', 'evalue', 'length', 'bitscore')

    def _keep(fields):
        # A row survives only if all four thresholds hold.
        return (float(fields[par['pid_col'] - 1]) >= par['pid']
                and float(fields[par['length_col'] - 1]) >= par['length']
                and float(fields[par['evalue_col'] - 1]) <= par['evalue']
                and float(fields[par['bitscore_col'] - 1]) >= par['bitscore'])

    def _select(lines):
        table = (line.rstrip('\n').split("\t") for line in lines if line.strip())
        table = (fields for fields in table if _keep(fields))

        if 's' in par and par['s']:
            if par['s'] not in sort_fields:
                raise ValueError("unknown sort field: %r" % (par['s'],))
            idx = par[par['s'] + '_col'] - 1
            # NOTE(review): order stays ascending as before; confirm whether
            # descending is wanted for pid/length/bitscore.
            table = sorted(table, key=lambda fields: float(fields[idx]))

        limit = par['n'] if 'n' in par else None
        if limit is not None and limit > -1:
            # Works on lists and generators alike (unlike slicing).
            table = islice(table, limit)
        return table

    def _write(table, out_file):
        cap = par['t'] if 't' in par else None
        use_cap = cap is not None and cap > -1
        unique_queries = collections.defaultdict(int)
        for fields in table:
            if use_cap:
                unique_queries[fields[0]] += 1
                if unique_queries[fields[0]] > cap:
                    continue
            out_file.write("\t".join(fields) + "\n")

    def _process(lines):
        table = _select(lines)
        if par['out_f']:
            with utils.openw(par['out_f']) as out_file:
                _write(table, out_file)
        else:
            # Leave the shared stdout open for the rest of the program.
            _write(table, sys.stdout)

    if par['inp_f']:
        with utils.openr(par['inp_f']) as inp:
            _process(inp)
    else:
        _process(sys.stdin)
default=None, type=str, help="the input tree") p.add_argument('out_file', nargs='?', default=None, type=str, help="the output file (b2zipped if ending with '.bz2')\n" "[stdout if not present]") p.add_argument( '-n', action='store_true', help="Distances normalized with respect to the total branch length") return vars(p.parse_args()) if __name__ == "__main__": args = read_params(sys.argv) ppatree = ppa.PpaTree(args['intree']) dists = ppa.dist_matrix(ppatree.tree) tbl = ppatree.tree.total_branch_length() if args['n'] else 1.0 #tbl = ppatree.tree.total_branch_length()-1.0 if args['n'] else 1.0 with utils.openw(args['out_file']) as out: for k1, v1 in dists.items(): for k2, v2 in v1.items(): if k1 < k2: out.write("\t".join([k1, k2, str(v2 / tbl)]) + "\n")
def sss(par):
    """Select, subsample, shuffle and split the reads of a FASTA file.

    Keys read from the mapping ``par``:

    * ``inp_f``      -- input FASTA (opened with ``utils.openr``).
    * ``out_f``      -- output path; falsy writes to ``sys.stdout``.
    * ``split``      -- number of output files when ``out_f`` is set. With a
      value other than 1 the files are named
      ``out_f + <zero-padded index> + ".fna"`` (plus ``".bz2"`` when
      ``out_f`` ends with ``".bz2"``).
    * ``select``, ``ids``, ``reverse`` -- when ``select`` is truthy only reads
      whose id is in ``ids`` are kept (or dropped, if ``reverse`` is truthy).
      ``ids`` is either an existing file (first tab-separated field of each
      line) or a ``":::"``-separated string, where the part after the first
      ``"$"`` of each item is used if a ``"$"`` is present.
    * ``min_len``, ``max_len`` -- optional length bounds (falsy disables).
    * ``subsample``  -- probability of keeping a read (falsy disables).
    * ``randomize``  -- truthy to shuffle all retained reads before writing.

    Reads are dealt to the output streams round-robin. In the randomized
    case this replaces the old chunked scheme, which called the ``SeqIO``
    module itself instead of ``SeqIO.write`` and divided by a zero/float
    ``step``; after shuffling, round-robin gives an equally random and
    evenly balanced split.

    Output files are closed even on error; ``sys.stdout`` is left open. The
    ids file and the input file are closed after reading.
    """
    subsample = bool(par['subsample'])
    select = bool(par['select'])
    randomize = bool(par['randomize'])

    if bool(par['out_f']):
        n = par['split']
        if n == 1:
            out_stream = [utils.openw(par['out_f'])]
        else:
            width = len(str(n))
            suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            out_stream = [utils.openw(par['out_f'] + str(r).zfill(width) + suffix)
                          for r in range(n)]
    else:
        out_stream = [sys.stdout]

    try:
        es = set()
        if select:
            if os.path.exists(par['ids']):
                with utils.openr(par['ids']) as ids_file:
                    es = set(s.strip().split('\t')[0] for s in ids_file)
            else:
                es = set((s.split("$")[1] if s.count("$") else s)
                         for s in par['ids'].split(":::"))

        all_reads = []
        nstreams = len(out_stream)
        p = par['subsample']
        cind = 0
        lmin, lmax = par['min_len'], par['max_len']

        with utils.openr(par['inp_f']) as inp:
            for r in SeqIO.parse(inp, "fasta"):
                if lmin and len(r.seq) < lmin:
                    continue
                if lmax and len(r.seq) > lmax:
                    continue
                if select:
                    if par['reverse']:
                        if r.id in es:
                            continue
                    elif r.id not in es:
                        continue
                if subsample and rnd.random() > p:
                    continue
                if randomize:
                    # Shuffling needs every retained read in memory first.
                    all_reads.append(r)
                    continue
                SeqIO.write(r, out_stream[cind], "fasta")
                cind = (cind + 1) % nstreams

        if randomize:
            rnd.shuffle(all_reads)
            for i, r in enumerate(all_reads):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        for o in out_stream:
            if o is not sys.stdout:
                o.close()
cur,seq,fs,rs,al,rseq,fseq = None, None, None, None, None, None, None return seqs if __name__ == "__main__": args = read_params( sys.argv ) extr = parse_primersearch( args['ps'] ) seqs2extr = {} for k,v in extr.items(): if v['seq'] in seqs2extr: seqs2extr[v['seq']][k] = v else: seqs2extr[v['seq']] = { k: v } with utils.openw( args['out'] ) as outf: for r in SeqIO.parse( utils.openr(args['fna']), "fasta"): if r.id in seqs2extr: for pn,ext in seqs2extr[r.id].items(): sq = SeqRecord( r.id ) sq.id = r.id + " " + pn sq.description = r.description + " " + pn sq.seq = r.seq[ ext['fs']+len(ext['fseq']):len(r.seq)-ext['rs']-len(ext['rseq'])] SeqIO.write(sq, outf, "fasta")
import utils
try:
    import argparse as ap
    import bz2
except ImportError:
    sys.stderr.write( "argparse not found" )
    sys.exit(-1)


def read_params( args ):
    """Parse the command line of the txt -> libsvm converter.

    ``args`` is accepted for symmetry with the call site but is not used:
    ``parse_args()`` reads ``sys.argv`` itself.

    Returns a dict with the keys ``'txt'`` (input path or ``None``) and
    ``'ls'`` (output path or ``None``).
    """
    p = ap.ArgumentParser(description='Convert txt files to libsvm\n')

    p.add_argument( 'txt', nargs='?', default=None, type=str,
            help=   "the input txt file [stdin if not present]")
    p.add_argument('ls', nargs='?', default=None, type=str,
            help=   "the output ilibsvm file compressed if fiven with bz2 extension\n"
                    "[stdout if not present]")

    return vars( p.parse_args() )


if __name__ == "__main__":
    args = read_params( sys.argv )

    with utils.openr(args['txt']) as inp:
        # Transpose the tab-separated table. list() is required because
        # zip() returns a non-subscriptable iterator on Python 3 and the
        # result is sliced below.
        data = list(zip(*[l.strip().split('\t') for l in inp]))

    # Skip the first transposed row; for every other row keep its first
    # field as-is and number the remaining fields "1:", "2:", ...
    outd = [[d[0]]+[str(i+1)+":"+dd for i,dd in enumerate(d[1:])] for d in data[1:]]

    with utils.openw(args['ls']) as out:
        for o in outd:
            out.write( "\t".join(o) +"\n" )
import os
import textwrap
from collections import namedtuple as nt
import random as rnd
rnd.seed(1982)
import utils
from Bio import SeqIO


def read_params(args):
    """Parse the command line of the gene-listing script.

    ``args`` is not used; ``parse_args()`` reads ``sys.argv`` directly.
    Returns a dict with ``'inp_f'`` (required input path) and ``'out_f'``
    (output path, ``None`` when omitted).
    """
    parser = argparse.ArgumentParser(description='List the genes in the genome file')
    arg = parser.add_argument
    arg( 'inp_f', metavar='INPUT_FILE', default=None, type=str,
         help="the input fna file")
    arg( 'out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
         help="the output txt file [stdout if not present]")
    return vars(parser.parse_args())


def genome_id( fn ):
    """Return the negated integer prefix of a file name, as a string.

    The part of the base name before the first ``"."`` is parsed as an
    integer, negated and converted back to ``str``
    (``"/x/123.fna"`` -> ``"-123"``). Raises ``ValueError`` when that part
    is not an integer.
    """
    return str(-int(os.path.basename(fn).split(".")[0]))


if __name__ == '__main__':
    par = read_params(sys.argv)
    # Read all record ids up front so the input handle can be closed
    # immediately (it was previously left open).
    with utils.openr(par['inp_f']) as inp:
        ids = [r.id for r in SeqIO.parse(inp, "fasta")]
    with utils.openw( par['out_f']) as out:
        out.write( "\t".join( [genome_id(par['inp_f'])]+ids ) + "\n" )
nargs='?', default=None, type=str, help="the output txt file compresse if fiven with bz2 extension\n" "[stdout if not present]") """ p.add_argument('--subsample', metavar="Subsampling rate", default=1.0, type=float ) p.add_argument('-n', metavar="Minimum number of matching taxa", default=0, type=int ) p.add_argument('-p', metavar="Prefix for taxon names", default="", type=str ) """ return vars(p.parse_args()) if __name__ == "__main__": args = read_params(sys.argv) fna = SeqIO.to_dict(SeqIO.parse(utils.openr(args['fna']), "fasta")) with utils.openw(args['rxl']) as out: n = len(fna.values()[0]) out.write(str(len(fna)) + " " + str(n) + "\n") for k, v in fna.items(): if len(k) > 14: k = k[:14] out.write( str(k) + " " * (15 - len(str(k)[1:])) + str(v.seq) + "\n")
parser.add_argument('--bo6', metavar='Bo6 file', required=True, type = str ) return vars(parser.parse_args()) if __name__ == '__main__': par = read_params(sys.argv) inp_mat = (l.rstrip('\n').split("\t") for l in (utils.openr(par['bo6']))) if par['extract_targets']: toextr = ((l[1], l[2], l[3], l[11], int(l[8]), int(l[9])) for l in inp_mat) else: toextr = ((l[0], l[2], l[3], l[11], int(l[6]), int(l[7])) for l in inp_mat) inpfasta = SeqIO.to_dict(SeqIO.parse( utils.openr(par['inp_f']), "fasta")) out_seqs = [] for n,pid,l,bit,fr,to in toextr: n = inpfasta[n][min(fr,to):max(fr,to)] if par['i']: p = "_pid"+pid.strip()+"_l"+l.strip()+"_bs"+bit.strip() else: p = "" n.id = n.id+"_"+str(fr)+"_"+str(to)+p out_seqs.append( n ) SeqIO.write(out_seqs, utils.openw(par['out_f']), "fasta")
def read_params(args):
    """Parse the command line of the gene-listing script.

    ``args`` is accepted but unused: ``parse_args()`` reads ``sys.argv``.

    Returns a dict with the keys ``'inp_f'`` (required input FASTA path) and
    ``'out_f'`` (optional output path, ``None`` when not given).
    """
    parser = argparse.ArgumentParser(
        description='List the genes in the genome file')
    arg = parser.add_argument
    arg('inp_f', metavar='INPUT_FILE', default=None, type=str,
        help="the input fna file")
    arg('out_f', metavar='OUTPUT_FILE', nargs='?', default=None, type=str,
        help="the output txt file [stdout if not present]")
    return vars(parser.parse_args())


def genome_id(fn):
    """Return the negated integer prefix of ``fn``'s base name as a string.

    The text before the first ``"."`` of the base name is converted to an
    integer, negated and stringified, e.g. ``"/x/123.fna"`` -> ``"-123"``.
    Raises ``ValueError`` if that text is not an integer.
    """
    return str(-int(os.path.basename(fn).split(".")[0]))


if __name__ == '__main__':
    par = read_params(sys.argv)
    # Collect the ids inside a `with` so the input handle is closed instead
    # of being leaked as before.
    with utils.openr(par['inp_f']) as inp:
        ids = [r.id for r in SeqIO.parse(inp, "fasta")]
    with utils.openw(par['out_f']) as out:
        # One line: genome id followed by every record id, tab-separated.
        out.write("\t".join([genome_id(par['inp_f'])] + ids) + "\n")
p.add_argument( 'fna', nargs='?', default=None, type=str, help= "the input uc file [stdin if not present]") p.add_argument('rxl', nargs='?', default=None, type=str, help= "the output txt file compresse if fiven with bz2 extension\n" "[stdout if not present]") """ p.add_argument('--subsample', metavar="Subsampling rate", default=1.0, type=float ) p.add_argument('-n', metavar="Minimum number of matching taxa", default=0, type=int ) p.add_argument('-p', metavar="Prefix for taxon names", default="", type=str ) """ return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) fna = SeqIO.to_dict(SeqIO.parse( utils.openr(args['fna']), "fasta")) with utils.openw(args['rxl']) as out: n = len(fna.values()[0]) out.write( str(len(fna))+" "+str(n)+"\n" ) for k,v in fna.items(): if len(k) > 14: k = k[:14] out.write( str(k)+" "*(15-len(str(k)[1:]))+str(v.seq) +"\n" )
def sss( par ):
    """Select, subsample, shuffle and split FASTA reads.

    ``par`` is a mapping providing:

    * ``inp_f`` -- input FASTA, opened through ``utils.openr``.
    * ``out_f`` -- output path, or a falsy value for ``sys.stdout``.
    * ``split`` -- number of output files (only read when ``out_f`` is set);
      if it is not 1, file ``r`` is named
      ``out_f + zero-padded r + ".fna"`` and gets an extra ``".bz2"`` when
      ``out_f`` ends with ``".bz2"``.
    * ``select`` / ``ids`` / ``reverse`` -- id based filtering. ``ids`` is a
      path (first tab-separated column of each line is an id) or a
      ``":::"``-separated list in which the text after the first ``"$"`` of
      an item is taken when a ``"$"`` is present. ``reverse`` inverts the
      selection.
    * ``min_len`` / ``max_len`` -- length limits, ignored when falsy.
    * ``subsample`` -- keep probability per read, ignored when falsy.
    * ``randomize`` -- shuffle the retained reads before writing them.

    Retained reads are distributed over the outputs round-robin. The
    previous randomized branch could never run: it called ``SeqIO(...)``
    (a module) rather than ``SeqIO.write(...)`` and computed ``i % step``
    with a ``step`` that is zero for few reads. Round-robin over the
    shuffled list yields the same kind of random, balanced split.

    All opened files are closed, also on failure, but ``sys.stdout`` is not.
    """
    do_subsample = bool(par['subsample'])
    do_select = bool(par['select'])
    do_randomize = bool(par['randomize'])

    if bool(par['out_f']):
        n = par['split']
        if n == 1:
            out_stream = [utils.openw( par['out_f'])]
        else:
            pad = len(str(n))
            ext = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            out_stream = [utils.openw( par['out_f'] + str(r).zfill(pad) + ext )
                          for r in range(n)]
    else:
        out_stream = [sys.stdout]

    try:
        wanted = set()
        if do_select:
            if os.path.exists(par['ids']):
                with utils.openr(par['ids']) as ids_file:
                    wanted = set(s.strip().split('\t')[0] for s in ids_file)
            else:
                wanted = set((s.split("$")[1] if s.count("$") else s)
                             for s in par['ids'].split(":::"))

        shuffled_pool = []
        nstreams = len( out_stream )
        keep_prob = par['subsample']
        lmin, lmax = par['min_len'], par['max_len']
        cind = 0

        with utils.openr(par['inp_f']) as inp:
            for r in SeqIO.parse( inp, "fasta"):
                if lmin and len(r.seq) < lmin:
                    continue
                if lmax and len(r.seq) > lmax:
                    continue
                if do_select:
                    if par['reverse']:
                        if r.id in wanted:
                            continue
                    elif r.id not in wanted:
                        continue
                if do_subsample and rnd.random() > keep_prob:
                    continue
                if do_randomize:
                    # Writing is deferred until everything can be shuffled.
                    shuffled_pool.append( r )
                    continue
                SeqIO.write(r, out_stream[cind], "fasta")
                cind = (cind + 1) % nstreams

        if do_randomize:
            rnd.shuffle(shuffled_pool)
            for i, r in enumerate(shuffled_pool):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        for o in out_stream:
            if o is not sys.stdout:
                o.close()
g2c = collections.defaultdict( set ) if args['b6o']: inp_mat = ((int(a),int(b)) for a,b in (l.rstrip('\n').split("\t")[:2] for l in utils.openr(args['b6o']))) #all_targets = set() for fr,to in inp_mat: #all_targets.add( to ) if fr != to: g2c[fr].add( to ) n = args['n'] # if args['n'] else len(all_targets) n = float(n) with utils.openw(args['mtxt']) as out: last,lastv = "",[] outbuf = [] gt = None for v in valin: gt = int(v[0]) if last == gt: lastv = "" continue if lastv: outbuf.append( lastv ) last = gt lastv = v if last and last != gt: outbuf.append( lastv ) for v in outbuf:
genomes = set([g2t[g] for g in cscores]) with open( args['g2c'] ) as inp: for l in inp: line = list(l.strip().split('\t')) #if int(line[0]) not in genomes: # continue #vals = [int(a) for a in line if utils.is_number(a)] vals = [a for a in line] if len(vals) > 1: g2c[int(vals[0])] = vals[1:] for g,c in g2c.items(): for cc in c: c2g[cc] = g with utils.openw( args['out'] ) as out: for gene_seed,cscores_t in cscores.items(): taxa = g2t[gene_seed] for clade, n, n_tot, coreness in cscores_t: out.write( "\t".join(["CSCORE",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness)]) +"\n" ) # anche sotto ??? if gene_seed in fwmarkers: taxa_id, clade, n, n_tot, coreness, n_ext_seeds, n_ext_taxa, uniqueness = fwmarkers[gene_seed] if uniqueness < 0.01: out.write( "\t".join(["FWMARKER",str(gene_seed),str(taxa),clade,str(n), str(n_tot), str(coreness), str(n_ext_seeds), str(n_ext_taxa), str(1.0-uniqueness)]) +"\n" ) if gene_seed in maps: ext_tax = set([(c2g[s] if s in c2g else 0) for s in maps[gene_seed]])
help= "the input tree [stdin if not present]") p.add_argument('outfile', nargs='?', default=None, type=str, help= "the output core file [stdout if not present]") p.add_argument('-f', metavar="File containing sets of taxa", default=None, type=str ) p.add_argument('-e', metavar="Error rate [def 0.95]", default=0.95, type=float ) p.add_argument('-s', metavar="Subtree of interest", default=None, type=str ) p.add_argument('--skip_qm', metavar="Whether to skip question mark clades or not", default=1, type=int ) return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) tree = ppa.PpaTree( args['intree'] ) cores = tree.find_cores(args['f'], error_rate = args['e'], subtree = args['s'], skip_qm = args['skip_qm']) with utils.openw( args['outfile'] ) as outf: for k,v in sorted(cores.items(),key=lambda x:x[0]): for vv in v: outf.write( "\t".join( [str(s) for s in [k]+list(vv)]) +"\n" ) #ctree.export_cores( args['outfile'] ) """ ctree.reroot( strategy = args['s'], tf = args['f'] ) ctree.export( args['outtree'] ) """
) arg("-a", default=None, type=int, help="number of char after the match to report") arg("-n", default=None, type=int, help="number of matching primers") parser.add_argument("-s", metavar="Subsequene to look for", required=True, type=str) return vars(parser.parse_args()) if __name__ == "__main__": par = read_params(sys.argv) ss = par["s"].lower() ssr = Seq(par["s"]).reverse_complement().lower() f = os.path.basename(par["inp_f"]).split(".")[0] with utils.openw(par["out_f"]) as outf: for r in SeqIO.parse(utils.openr(par["inp_f"]), "fasta"): rl = r.seq.lower() if ss in rl or ssr in rl: if par["a"]: if ss in rl: i = str(rl).index(str(ss)) subs = rl[i : i + len(ss) + par["a"]] if i + len(ss) + par["a"] < len(rl) else rl[i:] else: i = str(rl).index(str(ssr)) subs = rl[i : i + len(ssr) + par["a"]] if i + len(ssr) + par["a"] < len(rl) else rl[i:] outf.write(f + "\t" + str(r.id) + "\t" + str(subs) + "\n") else: if par["n"]: n = str(rl).count(str(ss)) + str(rl).count(str(ssr)) outf.write(f + "\t" + str(r.id) + "\t" + str(n) + "\n")
def read_params(args):
    """Parse the command line of the txt -> libsvm converter.

    ``args`` is accepted for symmetry with the call site but is not used:
    ``parse_args()`` reads ``sys.argv`` itself.

    Returns a dict with the keys ``'txt'`` (input path or ``None``) and
    ``'ls'`` (output path or ``None``).
    """
    p = ap.ArgumentParser(description='Convert txt files to libsvm\n')

    p.add_argument('txt', nargs='?', default=None, type=str,
                   help="the input txt file [stdin if not present]")
    p.add_argument(
        'ls', nargs='?', default=None, type=str,
        help="the output ilibsvm file compressed if fiven with bz2 extension\n"
             "[stdout if not present]")

    return vars(p.parse_args())


if __name__ == "__main__":
    args = read_params(sys.argv)

    with utils.openr(args['txt']) as inp:
        # Transpose the tab-separated table. zip() is an iterator on
        # Python 3, so it is materialised before being sliced below.
        data = list(zip(*[l.strip().split('\t') for l in inp]))

    # Drop the first transposed row; every other row keeps its first field
    # and gets the remaining fields numbered "1:", "2:", ...
    outd = [[d[0]] + [str(i + 1) + ":" + dd for i, dd in enumerate(d[1:])]
            for d in data[1:]]

    with utils.openw(args['ls']) as out:
        for o in outd:
            out.write("\t".join(o) + "\n")
valin = [] with utils.openr(args['ctxt']) as inp: for l in inp: tset = set([gint(a) for a in l.strip().split('\t')][1:]) if len(tset) < args['n']: continue valin.append(tset) all_t = set() for v in valin: all_t |= v res = {} for t in all_t: #if len(t) < args['n']: # continue res[t] = [int(t in v) for v in valin] with utils.openw(args['txt']) as out: n = len(res.values()[0]) n_s = int(float(n) * args['subsample']) out.write(str(len(res)) + " " + str(n_s) + "\n") indok = set(random.sample(list(range(n)), n_s)) for k, v in res.items(): if isinstance(k, basestring) and len(k) > 15: k = k[:14] out.write(args['p'] + str(k) + " " * (15 - len(str(k)[1:])) + "".join([str(s) for i, s in enumerate(v) if i in indok]) + "\n")
except ImportError: sys.stderr.write( "argparse not found" ) sys.exit(-1) def read_params( args ): p = ap.ArgumentParser(description='Convert core gene txt file' ' substituting gene IDs with genomes IDs\n') p.add_argument( 't2g', nargs='?', default=None, type=str, help= "") p.add_argument('g2t', nargs='?', default=None, type=str, help= "") return vars( p.parse_args() ) if __name__ == "__main__": args = read_params( sys.argv ) uc2cl = collections.defaultdict( set ) g2t = {} with utils.openr( args['t2g'] ) as inp: for ll in (l.strip().split('\t') for l in inp): to = int(ll[0]) for g in ll[1:]: g2t[int(g)] = to with utils.openw(args['g2t']) as out: for g,t in g2t.iteritems(): out.write( "\t".join([str(g),str(t)]) +"\n" )