def random_selecet(a):
    """Print one randomly chosen record from a gzipped FASTQ file.

    Records are visited in file order. For each record a value is drawn
    uniformly from ``[0, length)``, where ``length`` is the number of lines
    in the file. The first record whose draw falls below ``number`` is
    printed and the scan stops; nothing is printed if no draw qualifies.

    Args:
        a: A ``(fastq, number)`` pair. ``fastq`` is the path of a
            gzip-compressed FASTQ file; ``number`` is the threshold each
            per-record draw is compared against.
    """
    fastq, number = a

    # Count lines in-process. The previous shell pipeline could not run:
    # os.pipe() takes no arguments, and str.format() choked on the awk
    # braces. Reading the file directly also avoids passing the path
    # through a shell.
    with gzip.open(fastq, "rb") as raw:
        length = sum(1 for _ in raw)

    # Text mode is required for the FASTQ parser on Python 3.
    with gzip.open(fastq, "rt") as text:
        for record in SeqIO.parse(text, "fastq"):
            # random() scaled by length gives a uniform draw in [0, length).
            if random.random() * length < number:
                print(record)
                break
# coding: utf-8
# Compute sequence features for every peptide in a FASTA file, save them as
# a CSV table indexed by record id, and show a 2-D t-SNE scatter plot of the
# feature matrix.
from Bio import SeqIO
import pandas as pd
from protein_sequence_features import protein_features
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns  # imported in the original session; `sns` is not referenced below

# Materialise the records: they are iterated twice (features and index).
peptides = list(SeqIO.parse('Data/anti_microbial_peptide.fasta', 'fasta'))

# Lists rather than map(): a map object cannot be indexed and would be
# exhausted after its first pass on Python 3.
features = [protein_features(seq) for seq in peptides]
peptides_dataset = pd.DataFrame(features, index=[seq.id for seq in peptides])
peptides_dataset.to_csv('Data/anti_microbial_peptide_features.csv')

# fit_transform() fits the model itself, so no separate fit() call is needed.
tsne = TSNE()
X = tsne.fit_transform(peptides_dataset.values)

# One point per peptide, using the two embedding columns as x and y.
plt.scatter(X[:, 0], X[:, 1])
plt.show()
def sss(par):
    """Select, subsample, shuffle and/or split FASTA records.

    Records are read from ``par['inp_f']`` via ``utils.openr`` and filtered
    in this order: length bounds, id selection, random subsampling.
    Surviving records are written in FASTA format either immediately
    (round-robin over the output streams) or, when randomizing, after all of
    them have been collected and shuffled (again round-robin).

    Keys read from ``par``:
        inp_f: Input passed to ``utils.openr``.
        out_f: Output path; when falsy, records go to ``sys.stdout``.
        split: Number of output files (only read when ``out_f`` is set).
            When it is not 1, each file is named ``out_f`` + zero-padded
            index + ".fna" (+ ".bz2" if ``out_f`` ends with ".bz2").
        min_len, max_len: Bounds on ``len(record.seq)``; a falsy bound is
            not applied.
        select: Truthy to filter by record id using ``ids``.
        ids: Either an existing path whose lines carry the id in the first
            tab-separated column, or a ":::"-separated string of ids (for
            an entry containing "$", its second "$"-separated field is
            used). Only read when ``select`` is truthy.
        reverse: With ``select``, drop the listed ids instead of keeping
            only them. Only read when ``select`` is truthy.
        subsample: Falsy to disable; otherwise a record is dropped when
            ``rnd.random()`` is greater than this value.
        randomize: Truthy to shuffle the kept records before writing.
    """
    subsample = bool(par['subsample'])
    select = bool(par['select'])
    randomize = bool(par['randomize'])

    if par['out_f']:
        n = par['split']
        if n == 1:
            out_stream = [utils.openw(par['out_f'])]
        else:
            width = len(str(n))
            suffix = ".fna" + (".bz2" if par['out_f'].endswith(".bz2") else "")
            out_stream = [
                utils.openw(par['out_f'] + str(r).zfill(width) + suffix)
                for r in range(n)
            ]
        owned_streams = out_stream
    else:
        out_stream = [sys.stdout]
        # stdout was not opened here, so it must not be closed here either.
        owned_streams = []

    try:
        es = set()
        reverse = False
        if select:
            reverse = bool(par['reverse'])
            if os.path.exists(par['ids']):
                es = {s.strip().split('\t')[0] for s in utils.openr(par['ids'])}
            else:
                es = {
                    (s.split("$")[1] if s.count("$") else s)
                    for s in par['ids'].split(":::")
                }

        p = par['subsample']
        lmin, lmax = par['min_len'], par['max_len']
        nstreams = len(out_stream)
        all_reads = []
        cind = 0

        for r in SeqIO.parse(utils.openr(par['inp_f']), "fasta"):
            if lmin and len(r.seq) < lmin:
                continue
            if lmax and len(r.seq) > lmax:
                continue
            # Normal mode drops unlisted ids; reverse mode drops listed ones.
            if select and (r.id in es) == reverse:
                continue
            if subsample and rnd.random() > p:
                continue
            if randomize:
                # Shuffling needs every kept record, so defer the writing.
                all_reads.append(r)
                continue
            SeqIO.write(r, out_stream[cind], "fasta")
            cind = (cind + 1) % nstreams

        if randomize:
            rnd.shuffle(all_reads)
            # Round-robin over the shuffled reads keeps the streams balanced
            # to within one record. The earlier step-based chunking divided
            # by zero when there were fewer reads than streams and, with true
            # division, sent nearly every read to a single stream.
            for i, r in enumerate(all_reads):
                SeqIO.write(r, out_stream[i % nstreams], "fasta")
    finally:
        for o in owned_streams:
            o.close()
        if not owned_streams:
            sys.stdout.flush()