def split_seq_file(fa):
    """Split a gzipped FASTA file into chunks written under ``partials/``.

    Records are read with ``fasta_iter`` and written to successive files
    named ``partials/<basename>_block_<NNNN>.faa.gz`` (or ``.fna.gz``,
    matching the input suffix). A new chunk is started whenever the current
    one holds ``MAX_SEQ_CHUNKS`` records.

    Parameters
    ----------
    fa : str
        Path to the input file; must end in ``.faa.gz`` or ``.fna.gz``.

    Returns
    -------
    list of str
        Paths of the chunk files, in creation order (empty if the input
        contains no records).

    Raises
    ------
    ValueError
        If ``fa`` has neither of the expected suffixes.
    """
    from os import makedirs, path
    from fasta import fasta_iter
    import gzip

    makedirs('partials', exist_ok=True)

    # The suffix does not change between records, so decide it once.
    if fa.endswith('.faa.gz'):
        suffix = '.faa.gz'
    elif fa.endswith('.fna.gz'):
        suffix = '.fna.gz'
    else:
        raise ValueError(f'Unexpected file name: {fa}')

    basename = path.basename(fa)
    partials = []
    out = None
    cur_n = 0
    try:
        for h, seq in fasta_iter(fa):
            if out is None or cur_n >= MAX_SEQ_CHUNKS:
                # Close the finished chunk before starting the next one so
                # that handles do not pile up on large inputs.
                if out is not None:
                    out.close()
                partials.append(
                    f'partials/{basename}_block_{len(partials):04}{suffix}')
                out = gzip.open(partials[-1], compresslevel=1, mode='wt')
                cur_n = 0
            out.write(f'>{h}\n{seq}\n')
            cur_n += 1
    finally:
        # `out` stays None when the input has no records.
        if out is not None:
            out.close()
    return partials
def dedup_fasta(infile):
    """Deduplicate the sequences of a ``.faa.gz`` file and delete the input.

    Every distinct sequence is kept once, together with the header of its
    first occurrence and the number of times it was seen. Two gzip files
    are written next to the input, both ordered by sequence:

    * ``*.raw_number.tsv.gz`` -- ``<count>\\t<sequence>`` per line
    * ``*.dedup.faa.gz``      -- FASTA with one record per distinct sequence

    The input file is removed once both outputs have been written.

    Parameters
    ----------
    infile : str
        Path to the input file; must contain ``.faa.gz``.

    Returns
    -------
    tuple of (str, str)
        ``(count_table_path, dedup_fasta_path)``.

    Raises
    ------
    ValueError
        If ``infile`` does not contain ``.faa.gz``. Without this check both
        output names would equal ``infile``, which would then be overwritten
        and deleted.
    """
    from fasta import fasta_iter
    import gzip
    import os

    if '.faa.gz' not in infile:
        raise ValueError(f'Unexpected file name: {infile}')

    print("start dedup")
    fasta = {}
    for ID, seq in fasta_iter(infile):
        if seq in fasta:
            fasta[seq][1] += 1
        else:
            fasta[seq] = [ID, 1]

    outfile1 = infile.replace('.faa.gz', '.raw_number.tsv.gz')
    outfile2 = infile.replace('.faa.gz', '.dedup.faa.gz')
    with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
            gzip.open(outfile2, "wt", compresslevel=1) as out2:
        print("start sort")
        for seq, (ID, count) in sorted(fasta.items()):
            out1.write(f"{count}\t{seq}\n")
            out2.write(f">{ID}\n{seq}\n")

    # Only drop the input after both outputs were closed successfully.
    os.unlink(infile)
    print("finish dedup and sort")
    return (outfile1, outfile2)
def features(ifile):
    """Compute the feature table for every sequence in ``ifile``.

    For each record yielded by ``fasta_iter`` a single trailing ``'*'`` is
    removed from the sequence, then three groups of columns are computed:
    the output of ``amino_acid_composition``, a set of descriptors evaluated
    in R on the full list of sequences, and the output of ``ctdd`` using the
    groups in ``GROUPS_SA + GROUPS_HB``.

    Returns a ``pandas.DataFrame`` indexed by header, with a leading
    ``sequence`` column, a ``group`` column filled with ``'Unk'``, and the
    feature columns after them.
    """
    group_sets = [set(members) for members in (GROUPS_SA + GROUPS_HB)]

    headers = []
    seqs = []
    aaComp = []
    encodings = []
    for header, sequence in fasta_iter(ifile):
        if sequence[-1] == '*':
            sequence = sequence[:-1]
        seqs.append(sequence)
        headers.append(header)
        encodings.append(ctdd(sequence, group_sets))
        aaComp.append(amino_acid_composition(sequence))

    # The R step could run per sequence inside the loop, which would avoid
    # holding every sequence in memory, but that is much slower.
    rpy2.robjects.globalenv['seq'] = seqs
    rfeatures = r('''
    ch <- charge(seq=seq, pH=7, pKscale="EMBOSS")
    pI <- pI(seq=seq, pKscale="EMBOSS")
    aIndex <- aIndex(seq=seq)
    instaIndex <- instaIndex(seq=seq)
    boman <- boman(seq=seq)
    hydrophobicity <- hydrophobicity(seq=seq, scale="Eisenberg")
    hmoment <- hmoment(seq=seq, angle=100, window=11)
    cbind(ch, pI, aIndex, instaIndex, boman, hydrophobicity, hmoment)
    ''')

    column_names = [
        "tinyAA",
        "smallAA",
        "aliphaticAA",
        "aromaticAA",
        "nonpolarAA",
        "polarAA",
        "chargedAA",
        "basicAA",
        "acidicAA",
        "charge",
        "pI",
        "aindex",
        "instaindex",
        "boman",
        "hydrophobicity",
        "hmoment",
        "SA.Group1.residue0",
        "SA.Group2.residue0",
        "SA.Group3.residue0",
        "HB.Group1.residue0",
        "HB.Group2.residue0",
        "HB.Group3.residue0",
    ]
    matrix = np.hstack([aaComp, rfeatures, encodings])
    table = pd.DataFrame(matrix, index=headers, columns=column_names)
    # Each insert goes to position 0, so 'sequence' ends up before 'group'.
    table.insert(0, 'group', 'Unk')
    table.insert(0, 'sequence', seqs)
    return table
def mergeseq(outfile):
    """Merge all ``*.dedup.faa.gz`` files in the current directory.

    The inputs are combined with ``heapq.merge`` keyed on
    ``(sequence, header)``, so each input is expected to be already ordered
    by that key. Consecutive records sharing a sequence are collapsed: only
    the first one is written to ``outfile``.

    Parameters
    ----------
    outfile : str
        Path of the gzip-compressed FASTA file to write.
    """
    import heapq
    import gzip
    from glob import glob
    from fasta import fasta_iter

    with gzip.open(outfile, compresslevel=1, mode='wt') as output:
        inputs = [fasta_iter(f) for f in glob('*.dedup.faa.gz')]
        merged = heapq.merge(*inputs, key=lambda h_seq: (h_seq[1], h_seq[0]))
        # None can never equal a real sequence, unlike a string sentinel,
        # so the first record is always written.
        preseq = None
        for h, seq in merged:
            if seq != preseq:
                output.write(f'>{h}\n{seq}\n')
                preseq = seq
def read_fasta_sequences(filename):
    """
    Parse the given FASTA file into a screed database.

    The file is opened in binary mode and handed to ``fasta.fasta_iter``;
    the resulting iterator is passed to ``create_db`` together with
    ``fasta.FieldTypes``. Returns ``ScreedDB(filename)``.

    Raises whatever ``open`` raises when the file cannot be opened (for
    example if it does not exist).
    """
    # The context manager closes the handle even if create_db fails.
    with open(filename, "rb") as theFile:
        # Set up the record iterator
        iterfunc = fasta.fasta_iter(theFile)

        # Create the screed db
        create_db(filename, fasta.FieldTypes, iterfunc)

    return ScreedDB(filename)
def splitseq(infile):
    """Distribute the records of ``infile`` over 256 gzip FASTA files.

    Each record is written to ``split/submetag_<NNN>.faa.gz`` where ``NNN``
    is the first byte of the SHA-256 digest of its sequence, so identical
    sequences always end up in the same file. The ``split`` directory is
    not created here and must already exist.

    Parameters
    ----------
    infile : str
        Path of the FASTA file to read with ``fasta_iter``.

    Returns
    -------
    list of str
        The 256 output paths, in bucket order.
    """
    from contextlib import ExitStack

    print("start splitseq")
    outputlist = [f'split/submetag_{ix:03}.faa.gz' for ix in range(256)]
    # ExitStack closes every handle opened so far, even when a later open
    # or a write fails part-way through.
    with ExitStack() as stack:
        outputfiles = [
            stack.enter_context(gzip.open(oname, compresslevel=1, mode='wt'))
            for oname in outputlist
        ]
        for ID, seq in fasta_iter(infile):
            # First digest byte == int(hexdigest()[:2], 16), range 0..255.
            ix = hashlib.sha256(seq.encode('ascii')).digest()[0]
            outputfiles[ix].write(f'>{ID}\n{seq}\n')
    print("finish splitseq")
    return outputlist
def calseq(infile, outfile1, outfile2):
    """Count duplicate sequences in ``infile`` and write deduplicated output.

    Every distinct sequence is kept once, with the header of its first
    occurrence and the number of times it was seen. Records are written in
    order of first appearance.

    Parameters
    ----------
    infile : str
        FASTA file read with ``fasta_iter``.
    outfile1 : str
        Gzip TSV to write, one ``<count>\\t<sequence>`` line per sequence.
    outfile2 : str
        Gzip FASTA to write, one record per distinct sequence.
    """
    from fasta import fasta_iter
    import gzip

    fasta = {}
    for ID, seq in fasta_iter(infile):
        if seq in fasta:
            fasta[seq][1] += 1
        else:
            fasta[seq] = [ID, 1]

    # Context managers close both outputs even if a write fails.
    with gzip.open(outfile1, "wt", compresslevel=1) as out1, \
            gzip.open(outfile2, "wt", compresslevel=1) as out2:
        for seq, (ID, count) in fasta.items():
            out1.write(f"{count}\t{seq}\n")
            out2.write(f">{ID}\n{seq}\n")
def extract_seq(infile1, infile2, outfile1, outfile2):
    """Split a FASTA file by whether each sequence is listed as repeated.

    ``infile1`` is a gzip tab-separated file; for every line whose first
    field is not ``"1"``, the second field is remembered as a sequence.
    Records of ``infile2`` whose sequence was remembered are written to
    ``outfile1``; all other records are written to ``outfile2``. Both
    outputs are gzip-compressed FASTA.
    """
    from fasta import fasta_iter
    import gzip

    selected = set()
    with gzip.open(infile1, "rt") as table:
        for row in table:
            fields = row.strip().split("\t")
            if fields[0] == "1":
                continue
            selected.add(fields[1])

    with gzip.open(outfile1, "wt", compresslevel=1) as listed_out, \
            gzip.open(outfile2, "wt", compresslevel=1) as other_out:
        for ID, seq in fasta_iter(infile2):
            target = listed_out if seq in selected else other_out
            target.write(f'>{ID}\n{seq}\n')
def splitseq(infile, X, outfile):
    '''Split ``infile`` into gzip FASTA chunks of at most ``X`` sequences.

    Parameters
    ----------
    infile : str
        FASTA file read with ``fasta_iter``.
    X : int
        Maximum number of sequences per chunk.
    outfile : str
        Format template for the chunk paths; it is expanded with
        ``outfile.format(ix=<chunk index>)``, starting at index 0.

    Chunk 0 is always created, even when the input has no records.
    '''
    import gzip
    from fasta import fasta_iter

    ix = 0
    n = 0
    out = gzip.open(outfile.format(ix=ix), "wt", compresslevel=1)
    try:
        for ID, seq in fasta_iter(infile):
            if n >= X:
                # Current chunk is full: close it before opening the next
                # one so finished handles are released immediately.
                out.close()
                ix += 1
                n = 0
                out = gzip.open(outfile.format(ix=ix), "wt", compresslevel=1)
            n += 1
            out.write(f'>{ID}\n{seq}\n')
            # Kept from the original: stop after a record with an empty
            # header. NOTE(review): presumably an end-of-input guard --
            # confirm whether it is still needed.
            if not ID:
                break
    finally:
        out.close()