def test_using_stdin(self):
    """Simple alignment using stdin.

    The records of ``Fasta/f002`` are piped to MUSCLE as FASTA, the Clustal
    output is parsed from the child's stdout, and every aligned row (with
    gaps removed) must equal the input sequence that has the same id.
    """
    input_file = "Fasta/f002"
    self.assertTrue(os.path.isfile(input_file))
    records = list(SeqIO.parse(input_file, "fasta"))
    # Prepare the command... use Clustal output (with a MUSCLE header)
    cline = MuscleCommandline(muscle_exe, clw=True)
    self.assertEqual(str(cline).rstrip(),
                     _escape_filename(muscle_exe) + " -clw")
    # repr() of the command line object must round-trip through eval()
    self.assertEqual(str(eval(repr(cline))), str(cline))
    child = subprocess.Popen(str(cline),
                             stdin=subprocess.PIPE,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE,
                             universal_newlines=True,
                             shell=(sys.platform != "win32"))
    try:
        SeqIO.write(records, child.stdin, "fasta")
        child.stdin.close()
        # Alignment will now run...
        align = AlignIO.read(child.stdout, "clustal")
        align.sort()
        records.sort(key=lambda rec: rec.id)
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            self.assertEqual(str(new.seq).replace("-", ""), str(old.seq))
        self.assertEqual(0, child.wait())
    finally:
        # Release the pipes even when one of the assertions above fails.
        if not child.stdin.closed:
            child.stdin.close()
        child.stdout.close()
        child.stderr.close()
def loop(self, filename, format):
    """Round-trip the records of *filename* through BioSQL and GenBank.

    The records are parsed from *filename* (in *format*), loaded into a
    new BioSQL namespace, looked up again by name, written as GenBank to
    an in-memory handle and re-parsed.  Each stage is compared with the
    original records.
    """
    with open(filename, "rU") as in_handle:
        original_records = list(SeqIO.parse(in_handle, format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    try:
        db_name = "test_loop_%s" % filename  # new namespace!
        db = server.new_database(db_name)
        count = db.load(original_records)
        self.assertEqual(count, len(original_records))
        server.commit()
        # Now read them back...
        biosql_records = [db.lookup(name=rec.name)
                          for rec in original_records]
        # And check they agree
        self.assertTrue(compare_records(original_records, biosql_records))
        # Now write to a handle...
        handle = StringIO()
        SeqIO.write(biosql_records, handle, "gb")
        # Now read them back...
        handle.seek(0)
        new_records = list(SeqIO.parse(handle, "gb"))
        # And check they still agree
        self.assertEqual(len(new_records), len(original_records))
        for old, new in zip(original_records, new_records):
            # TODO - remove this hack because we don't yet write these (yet):
            for key in ["comment", "references", "db_source"]:
                if key in old.annotations and key not in new.annotations:
                    del old.annotations[key]
            self.assertTrue(compare_record(old, new))
    finally:
        # Done - close the connection even if an assertion failed
        server.close()
def test_fasta_out(self):
    """Check FASTQ to FASTA output"""
    buffer = StringIO()
    # Convert the FASTQ example to FASTA in memory
    SeqIO.write(SeqIO.parse("Quality/example.fastq", "fastq"), buffer, "fasta")
    converted = buffer.getvalue()
    with open("Quality/example.fasta") as reference:
        reference_text = reference.read()
    self.assertEqual(converted, reference_text)
def write_fasta(filename, data):
    """Write the sequences in *data* to *filename* in FASTA format.

    *data* maps a record id to its sequence string; each entry becomes one
    record with an empty description.
    """
    seq_list = [SeqRecord(Seq(sequence), id=name, description="")
                for name, sequence in data.items()]
    # The context manager closes the file even if writing fails
    with open(filename, "w") as fd:
        SeqIO.write(seq_list, fd, "fasta")
def test_generated(self):
    """Write and read back odd SeqRecord objects"""
    record1 = SeqRecord(Seq("ACGT"*500, generic_dna), id="Test",
                        description="Long "*500,
                        letter_annotations={"phred_quality": [40, 30, 20, 10]*500})
    record2 = SeqRecord(MutableSeq("NGGC"*1000), id="Mut",
                        description="very "*1000+"long",
                        letter_annotations={"phred_quality": [0, 5, 5, 10]*1000})
    record3 = SeqRecord(UnknownSeq(2000, character="N"), id="Unk",
                        description="l"+("o"*1000)+"ng",
                        letter_annotations={"phred_quality": [0, 1]*1000})
    record4 = SeqRecord(Seq("ACGT"*500), id="no_descr",
                        description="", name="",
                        letter_annotations={"phred_quality": [40, 50, 60, 62]*500})
    record5 = SeqRecord(Seq("", generic_dna), id="empty_p",
                        description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality": []})
    record6 = SeqRecord(Seq(""), id="empty_s",
                        description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality": []})
    record7 = SeqRecord(Seq("ACNN"*500), id="Test_Sol",
                        description="Long "*500,
                        letter_annotations={"solexa_quality": [40, 30, 0, -5]*500})
    record8 = SeqRecord(Seq("ACGT"), id="HighQual",
                        description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality": [0, 10, 100, 1000]})
    # TODO - Record with no identifier?
    records = [record1, record2, record3, record4,
               record5, record6, record7, record8]
    # simplefilter() inserts at the FRONT of warnings.filters, so undoing it
    # with filters.pop() would drop an unrelated filter from the end of the
    # list; catch_warnings() restores the exact previous state instead.
    with warnings.catch_warnings():
        # TODO - Have a Biopython defined "DataLossWarning?"
        warnings.simplefilter('ignore', BiopythonWarning)
        # TODO - Include phd output?
        for fmt in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
            handle = StringIO()
            SeqIO.write(records, handle, fmt)
            handle.seek(0)
            compare_records(records,
                            list(SeqIO.parse(handle, fmt)),
                            truncation_expected(fmt))
def blastclust_to_fasta(infname, seqfname, outdir):
    """Write one FASTA file per BLASTCLUST cluster into a subdirectory.

    Every line of the BLASTCLUST ``.lst`` file *infname* lists the IDs of
    one cluster.  The matching records are looked up in *seqfname* and
    written together to their own FASTA file under
    ``<outdir>/blastclust_OTUs``.

    Returns a tuple of the output directory and the list of files written.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    with open(infname, 'r') as fh:
        # Clusters are numbered from 1 in file order
        for otu_id, line in enumerate(fh, 1):
            cluster_ids = line.split()
            outfname = os.path.join(outdirname,
                                    "blastclust_OTU_%06d.fasta" % otu_id)
            SeqIO.write((seqdict[key] for key in cluster_ids),
                        outfname, 'fasta')
            outfnames.append(outfname)
    return (outdirname, outfnames)
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln: MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc: Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot: Protein sequences in FASTA format (.fasta)
    @return: Name of the codon alignment file written by PAL2NAL
             (``-output paml``), ``homologs.codon.aln``
    """
    import subprocess

    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln, which
    # was reordered due to CLUSTALW2.  Note that the first protein in each of
    # these files remains the same as at the start, however; this first protein
    # is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    # Proteins and nucleotides are paired by position in their files
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))

    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")

    fname_codon = "homologs.codon.aln"
    # Argument list + explicit stdout redirection: no shell is involved, so
    # file names containing spaces or shell metacharacters are passed verbatim.
    cmd = ["%s/pal2nal.pl" % bin_dir(), fname_aln, fname_nuc2,
           "-output", "paml"]
    with open(fname_codon, "w") as out:
        try:
            subprocess.call(cmd, stdout=out)
        except OSError as err:
            # os.system() never raised when the script was missing; keep the
            # best-effort behaviour but report the problem.
            sys.stderr.write("pal2nal.pl could not be run: %s\n" % err)
    return fname_codon
def CutOutDomain(coords, filename, header=False, column_id=0, column_start=8, column_stop=9):
    """Cut a coordinate-defined region out of each FASTA record.

    *coords* is a comma-separated file; for every row the value in column
    *column_id* is a sequence id and columns *column_start* / *column_stop*
    hold 1-based, inclusive coordinates.  Only the first row seen for an id
    is used.  Each record of the FASTA file *filename* whose id is listed is
    sliced to that region, renamed ``<id>_<start>_<stop>`` and written to
    ``CutOutdomain_<filename>``.

    If *header* is True the first line of *coords* is skipped.
    """
    from Bio import SeqIO

    CoordIDDic = {}
    with open(coords) as fh:
        if header == True:
            print('header set to True, first line of %s will be ignored' % coords)
            fh.readline()
        else:
            print('header not set to True, first line of %s will be processed' % coords)
        for unformatedLine in fh:
            # '\xa0' (non-breaking space) is stripped before splitting
            fields = unformatedLine.replace('\xa0', '').strip().split(',')
            if fields[column_id] not in CoordIDDic:
                CoordIDDic[fields[column_id]] = fields[column_start], fields[column_stop]

    Towrite = []
    with open(filename) as seqfile:
        for s in SeqIO.parse(seqfile, 'fasta'):
            if s.id in CoordIDDic:
                first, last = CoordIDDic[s.id]
                start = int(first) - 1
                stop = int(last)
                s.id = s.id + '_%s_%s' % ((start + 1), stop)
                Towrite.append(s[start:stop])

    # Closing the output explicitly guarantees the records are flushed
    with open('CutOutdomain_%s' % filename, 'w') as Output:
        SeqIO.write(Towrite, Output, 'fasta')
def needle_score(seq1, seq2, verbose=False, keep=False):
    """Return the Needleman-Wunsch score for aligning two sequences.

    Both records are written to temporary FASTA files and aligned with the
    external ``needle`` program (gap open and gap extend set to 0).  The
    score is parsed from the ``# Score:`` line of its report.

    seq1, seq2 -- records accepted by ``SeqIO.write``
    verbose    -- print the command line before running it
    keep       -- keep the two temporary input files on disk

    Returns the score as a float, or 0 if the report has no score line.
    Raises ``subprocess.CalledProcessError`` if ``needle`` fails.
    """
    ntf = tempfile.NamedTemporaryFile
    # Text mode so SeqIO.write and the str regex below also work on Python 3;
    # devnull must be writable because it receives the child's stderr.
    with ntf(mode='w+', prefix='seq1', delete=not keep) as fh1, \
            ntf(mode='w+', prefix='seq2', delete=not keep) as fh2, \
            ntf(mode='w+', prefix='align_out') as outfile, \
            open(os.devnull, 'w') as dn:
        SeqIO.write(seq1, fh1, 'fasta')
        fh1.flush()
        SeqIO.write(seq2, fh2, 'fasta')
        fh2.flush()

        cmd = ['needle', '-gapopen', '0', '-gapextend', '0',
               '-outfile', outfile.name, fh1.name, fh2.name]
        if verbose:
            print(' '.join(cmd))
        subprocess.check_call(cmd, stderr=dn)
        result = outfile.read()

    score = re.search(r'# Score: (.*)', result)
    if score is not None:
        return float(score.group(1))
    return 0
def frameshift_writer(contigs, file):
    """Write every contig flagged 'majority_frameshift' to *file* as FASTA.

    *contigs* is a mapping whose values provide ``seq``, ``id``,
    ``description`` and an ``annotation`` mapping.  *file* is closed
    when writing is done.
    """
    sys.stderr.write("[predict] writing frameshifts...")
    shifted = []
    for contig in contigs.values():
        if contig.annotation['majority_frameshift']:
            shifted.append(SeqRecord(seq=contig.seq, id=contig.id,
                                     description=contig.description))
    SeqIO.write(shifted, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
def no_relatives_writer(contigs, file):
    """Write every contig whose 'num_relatives' is 0 to *file* as FASTA.

    *contigs* is a mapping whose values provide ``seq``, ``id``,
    ``description`` and an ``annotation`` mapping.  *file* is closed
    when writing is done.
    """
    sys.stderr.write("[predict] writing contigs with no relatives...")
    orphans = []
    for contig in contigs.values():
        if contig.annotation['num_relatives'] == 0:
            orphans.append(SeqRecord(seq=contig.seq, id=contig.id,
                                     description=contig.description))
    SeqIO.write(orphans, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
    """Write the est2genome-matched regions of one contig as FASTA records.

    The second tab-separated field of *log_line* is a directory path whose
    fourth '/'-separated component is the contig name.  The contig's GFF
    file under ``<refseq_obj.id>.velvet_contigs.maker.output/`` is scanned
    for ``est2genome`` lines of type ``expressed_sequence_match``; for each
    one the region (GFF columns 4-5, reverse-complemented on the '-'
    strand) is cut out of the contig in *velvet_file* and written to
    *est2genome_handle*.
    """
    seq_dir = log_line.split("\t")[1]
    contig_name = seq_dir.split("/")[3]  # hardcoded in this position
    tmp_refseq = contig_name.replace(".", "%2E")
    gff_file = (refseq_obj.id + ".velvet_contigs.maker.output/" +
                seq_dir + "/" + tmp_refseq + ".gff")
    contigs = None  # the contig FASTA is parsed once, on the first match
    with open(gff_file, 'r') as gff_handle:
        for gff_line in gff_handle:
            # The original pattern was misspelled "est2gneome" and could
            # never match a GFF source column.
            if ("est2genome" not in gff_line or
                    "\texpressed_sequence_match\t" not in gff_line):
                continue
            fields = gff_line.split("\t")
            curr_start = int(fields[3])
            curr_stop = int(fields[4])
            curr_strand = fields[6]
            if contigs is None:
                with open(velvet_file, 'r') as tmp_handle:
                    contigs = SeqIO.to_dict(SeqIO.parse(tmp_handle, "fasta"))
            if contig_name not in contigs:
                continue
            new_seq = contigs[contig_name].seq[curr_start - 1:curr_stop]
            if curr_strand == "-":
                new_seq = new_seq.reverse_complement()
            # NOTE(review): `seqname` is not defined in this function; it is
            # presumably a module-level name - confirm before relying on it.
            new_record = SeqRecord(new_seq, id=seqname, name=seqname,
                                   description="")
            # The record argument was missing from the original call.
            SeqIO.write(new_record, est2genome_handle, "fasta")
def __format__(self, format_spec):
    """Return the record as a string in the given file format.

    Supports the built-in format() function (Python 2.6/3.0 and later).
    *format_spec* should be a lower case format name that Bio.SeqIO can
    write.  An empty *format_spec* falls back to ``str(self)``.

    See also the SeqRecord's format() method.
    """
    if not format_spec:
        # Follow python convention and default to using __str__
        return str(self)
    from Bio import SeqIO
    handle = None
    if format_spec in SeqIO._BinaryFormats:
        # Return bytes on Python 3
        try:
            # This is in Python 2.6+, but we need it on Python 3
            from io import BytesIO
        except ImportError:
            # Must be on Python 2.5 or older - use StringIO below
            pass
        else:
            handle = BytesIO()
    if handle is None:
        from StringIO import StringIO
        handle = StringIO()
    SeqIO.write(self, handle, format_spec)
    return handle.getvalue()
def splitFastaFile(infile, informat, outdir):
    """Split *infile* into one FASTA file per record.

    Each record parsed from *infile* (in *informat*) is written to
    ``<outdir>/<record id>.fasta``.  *outdir* is created when the first
    record is written, if it does not exist yet.
    """
    with open(infile) as in_handle:
        for record in SeqIO.parse(in_handle, informat):
            if not os.path.exists(outdir):
                os.mkdir(outdir)
            f_out = os.path.join(outdir, record.id + '.fasta')
            # Close each output right away instead of leaking one handle
            # per record.
            with open(f_out, 'w') as out_handle:
                SeqIO.write([record], out_handle, "fasta")
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    """Check that parse/write and convert fail with the same ValueError.

    The file is first converted with SeqIO.parse + SeqIO.write, then with
    SeqIO.convert.  Both must raise ValueError and both messages must be
    identical; an AssertionError is raised otherwise.
    """
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        # catch_warnings() restores the filter list even when write() raises
        # (the expected path); filters.pop() was skipped then, and would have
        # removed the wrong (last) entry anyway.
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.write(records, handle, out_format)
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        with warnings.catch_warnings():
            if qual_truncate:
                warnings.simplefilter('ignore', UserWarning)
            SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
            "Different failures, parse/write:\n%s\nconvert:\n%s" \
            % (err1, err2)
def getFasta(sequences, header):
    """Copy the records of *sequences* whose id equals *header* to 'query.fa'.

    'query.fa' is created (or truncated) in the current directory even when
    no record matches.
    """
    with open('query.fa', 'w') as out_handle:
        with open(sequences, 'rU') as in_handle:
            for candidate in SeqIO.parse(in_handle, 'fasta'):
                if candidate.id != header:
                    continue
                SeqIO.write(candidate, out_handle, 'fasta')
def compress(self, filename, cd, pos):
    """Reduce a sequence file to one letter per accepted oligo.

    The record is read from ``filename.get_name()`` while *filename* is set
    to depth ``cd - 1`` and type ``pos`` minus its last character; the result
    is written to ``filename.get_name()`` after depth and type are set to
    *cd* and *pos*.  The sequence is walked in steps of
    ``self.compopt['compstep']``; for every chunk found (lower-cased) in the
    list filled by ``self.complete_oligolist`` the letter at index
    ``int(pos[-1])`` is kept.

    Returns the compressed sequence.
    """
    # Select the file to read from
    filename.compdeep = cd - 1
    filename.comptype = pos[:-1]
    in_format = "genbank" if filename.ext == '.gb' else "fasta"
    rec = SeqIO.read(filename.get_name(), in_format)
    ln = len(rec.seq)

    # Select the file to write to
    filename.compdeep = cd
    filename.comptype = pos
    numpos = int(pos[-1])
    compstep = self.compopt['compstep']
    resseq = Seq('', rec.seq.alphabet)
    res = open(filename.get_name(), 'w')

    oligolist = []
    self.complete_oligolist(oligolist, '', compstep)
    # Only whole chunks are considered; a trailing partial chunk is dropped
    for offset in xrange(0, ln - ln % compstep, compstep):
        chunk = rec.seq[offset:offset + compstep]
        if str(chunk).lower() in oligolist:
            resseq += chunk[numpos]

    rec.seq = resseq
    out_format = "genbank" if filename.ext == '.gb' else "fasta"
    SeqIO.write(rec, res, out_format)
    res.close()
    return resseq
def gbk_to_fasta(genbank, fasta):
    '''
    Write every record of the GenBank input *genbank* to *fasta* in
    FASTA format using BioPython.
    '''
    SeqIO.write(SeqIO.parse(genbank, "genbank"), fasta, "fasta")
def output(target, option):
    """Write the selected sequences to the ``output/`` directory.

    *target* is an iterable of pairs of records.

    With ``option == '1'`` each pair produces its own file
    ``output/<argv[2]>-<gene>.fasta`` holding the second record's sequence,
    where *gene* is the last '|'-separated part of the first record's id.

    Otherwise every record of the FASTA file ``sys.argv[1]`` whose id matches
    the first record of some pair is copied to
    ``output/<argv[1]>-filtered.fasta``.
    """
    if option == '1':
        for record in target:
            gene = record[0].id.split(sep='|')[-1]
            output_file = ''.join([
                'output/', sys.argv[2], '-', gene, '.fasta'
            ])
            rename_seq = SeqRecord(
                seq=record[1].seq,
                id='|'.join([gene, sys.argv[1], record[1].id]),
                description=''
            )
            SeqIO.write(rename_seq, output_file, 'fasta')
    else:
        contig_id = {i[0].id for i in target}
        # The handle used to be left open, so buffered records could be lost
        with open('output/' + sys.argv[1] + '-filtered.fasta', 'w') as output_file:
            for record in SeqIO.parse(sys.argv[1], 'fasta'):
                if record.id in contig_id:
                    SeqIO.write(record, output_file, 'fasta')
def write_selected_pfam_genes(options, annot_genes_all):
    '''
    For each Protein Family write a second multiple alignment file with
    just the annotated genes.

    Families are taken from the global queue ``q`` until it stays empty for
    0.1 s.  With ``options.type == "pf"`` the alignment is copied unchanged;
    otherwise only records whose id (up to the first "/") is in
    *annot_genes_all* are written, with an empty description.
    '''
    global q
    while True:
        try:
            pf = q.get(block=True, timeout=0.1)
        except Empty:
            break
        source = options.dbdir + "align/" + pf.upper() + ".fasta"
        target = options.outdir + pf + "/" + pf.upper() + ".fasta"
        try:
            if options.type == "pf":
                shutil.copy(source, target)
            else:
                with open(source, "r") as handle:
                    with open(target, "w") as handle_out:
                        for nuc_rec in SeqIO.parse(handle, "fasta"):
                            if nuc_rec.id[0:nuc_rec.id.find("/")] in annot_genes_all:
                                SeqIO.write(SeqRecord(seq=nuc_rec.seq,
                                                      id=nuc_rec.id,
                                                      description=""),
                                            handle_out, "fasta")
        finally:
            # Always acknowledge the item, otherwise q.join() waits forever
            # after a failure.
            q.task_done()
def check_seq_between(gb, insertion, start, end, name, temp):
    '''
    BLAST the sequence between two ends against the IS query.

    The region ``[start:end]`` of the GenBank record *gb* is written to
    ``<temp><name>.fasta`` and passed to doBlast together with *insertion*.
    Returns ``[column 4, coverage]`` taken from the first line of the BLAST
    output, where coverage is column 5 / column 6 * 100 (1-based columns),
    or an empty list if the output has no lines.
    '''
    fasta_path = temp + name + '.fasta'
    blast_path = temp + name + '_out.txt'

    # Turn the sequence between the left and right ends into a fasta file
    genbank = SeqIO.read(gb, 'genbank')
    region = SeqRecord(Seq(str(genbank.seq[start:end]), generic_dna), id=name)
    SeqIO.write(region, fasta_path, 'fasta')

    # Perform the BLAST
    doBlast(fasta_path, blast_path, insertion)

    with open(blast_path) as summary:
        # Only the top hit is wanted, so return on the first line
        for line in summary:
            info = line.strip().split('\t')
            coverage = float(info[4]) / float(info[5]) * 100
            return [info[3], coverage]
    # If there is no hit, just return an empty list
    return []
def save_seqs_to_file(self):
    """Dump the database sequences of the selected genes to a FASTA file.

    Only acts when ``self.blast_type`` is 'new'.  Sets ``self.seq_file``
    to ``<cwd>/db/<gene codes joined by '_'>_seqs.fas`` and writes there
    every non-empty sequence (after ``strip_question_marks``) with id
    ``<code_id>|<gene_code>``.  All sequences are used when
    ``self.gene_codes`` is empty.
    """
    if self.blast_type != 'new':
        return

    self.seq_file = os.path.join(self.cwd,
                                 'db',
                                 '_'.join(self.gene_codes) + "_seqs.fas")
    if self.gene_codes:
        # OR the per-gene filters together.
        # Taken from http://stackoverflow.com/a/1239602
        Qr = None
        for gene_code in self.gene_codes:
            q = Q(gene_code=gene_code)
            Qr = (Qr | q) if Qr else q
        queryset = Sequences.objects.filter(Qr)
    else:
        queryset = Sequences.objects.all()

    my_records = []
    for entry in queryset:
        item_id = entry.code_id + '|' + entry.gene_code
        seq = self.strip_question_marks(entry.sequences)
        if seq == '':
            continue
        my_records.append(SeqRecord(Seq(seq), id=item_id))
    SeqIO.write(my_records, self.seq_file, "fasta")
def as_refpkg(sequences, name='temp.refpkg', threads=FASTTREE_THREADS):
    """Context manager yielding a temporary reference package for a
    collection of aligned sequences.

    Builds a tree with FastTree, creates a reference package, yields.

    sequences -- iterable of aligned records; it is materialised into a
                 list because it is written three times
    name      -- name passed to ``refpkg_dir`` to locate the package
    threads   -- forwarded to ``fasttree``
    """
    sequences = list(sequences)
    with ntf(prefix='fasttree-', suffix='.log') as log_fp, \
            ntf(prefix='fasttree-', suffix='.tre') as tree_fp, \
            tempdir(prefix='refpkg') as refpkg_dir:
        # Only the log's *name* is handed to fasttree, so the handle is
        # closed first.
        # NOTE(review): presumably `ntf` keeps the file on disk after
        # close(), since it is referenced by name below - confirm.
        log_fp.close()
        fasttree(sequences, log_path=log_fp.name, output_fp=tree_fp, gtr=True, threads=threads)
        # Close the tree file before it is registered by name
        tree_fp.close()
        rp = Refpkg(refpkg_dir(name), create=True)
        rp.update_metadata('locus', '')
        rp.update_phylo_model('FastTree', log_fp.name)
        rp.update_file('tree', tree_fp.name)
        # FASTA and Stockholm alignment
        with ntf(suffix='.fasta') as f:
            SeqIO.write(sequences, f, 'fasta')
            # Closed before update_file() is given the file by name
            f.close()
            rp.update_file('aln_fasta', f.name)
        with ntf(suffix='.sto') as f:
            SeqIO.write(sequences, f, 'stockholm')
            f.close()
            rp.update_file('aln_sto', f.name)
        logging.debug("Reference package written to %s", rp.path)
        # Yield inside the outer `with`, so the temporary resources are only
        # released once the caller's block has finished.
        yield rp
def getTree(cdhitProc, rrnaFile):
    """Build a tree from the rRNA records of the clustered sequences.

    The accession of every member of every cluster in
    ``cdhitProc.clusters`` is looked up in the FASTA file *rrnaFile*.
    The records found (each accession once) are written to
    ``<basename>_filtered.fasta``, which UnAlignedFastTree aligns and turns
    into ``<basename>_filtered.tree``.

    Returns the name of the tree file.
    """
    with open(rrnaFile, 'r') as rrna_handle:
        record_dict = SeqIO.to_dict(SeqIO.parse(rrna_handle, "fasta"))
    rrnas = []
    seen = set()
    for cluster in cdhitProc.clusters:
        for mem in cluster.seqs:
            # Obtain accession IDs from cdhitProc
            acc = acc_reg.findall(mem)[0]
            if acc in seen:
                continue
            # Obtain corresponding 16SRNAs
            try:
                record = record_dict[acc]
            except KeyError as k:
                print('Accession missing %s' % k)
                continue
            rrnas.append(record)
            seen.add(acc)
    basename, _ = os.path.splitext(rrnaFile)
    tmp_rrna = "%s_filtered.fasta" % basename
    tree = "%s_filtered.tree" % basename
    # The file must be closed (flushed) before the external tools read it
    with open(tmp_rrna, 'w') as out_handle:
        SeqIO.write(rrnas, out_handle, "fasta")
    # Run FastTree
    ft = UnAlignedFastTree(tmp_rrna, tree)
    ft.align()    # Run multiple sequence alignment and spit out aligned fasta file
    ft.run()      # Run fasttree on multiple alignment and spit out newick tree
    ft.cleanUp()  # Clean up!
    return tree
def write_full_delta_files(self, deltaFileList):
    """Concatenate the FASTA files in *deltaFileList* into one file.

    The output is ``./bstrap/bstrp_iteration_delta_<self.p_iterN>.fasta``;
    the ``bstrap`` directory must already exist.
    """
    outFile = "./bstrap/bstrp_iteration_delta_" + str(self.p_iterN) + ".fasta"
    # The context manager closes the output even if a delta file fails to
    # parse part-way through.
    with open(outFile, "w") as ofile:
        for dfile in deltaFileList:
            for record in SeqIO.parse(dfile, "fasta"):
                SeqIO.write(record, ofile, "fasta")
def test_long(self):
    """Simple muscle call using long file."""
    # Create a large input file by converting some of another example file
    temp_large_fasta_file = "temp_cw_prot.fasta"
    with open("NBRF/Cw_prot.pir", "rU") as pir_handle:
        records = list(SeqIO.parse(pir_handle, "pir"))[:40]
    with open(temp_large_fasta_file, "w") as handle:
        SeqIO.write(records, handle, "fasta")
    try:
        # Prepare the command...
        cmdline = MuscleCommandline(muscle_exe)
        cmdline.set_parameter("in", temp_large_fasta_file)
        # Preserve input record order
        cmdline.set_parameter("stable", True)  # Default None treated as False!
        # Use fast options
        cmdline.set_parameter("maxiters", 1)
        cmdline.set_parameter("diags", True)  # Default None treated as False!
        # Use clustal output
        cmdline.set_parameter("clwstrict", True)  # Default None treated as False!
        # Shoudn't need this, but just to make sure it is accepted
        cmdline.set_parameter("maxhours", 0.1)
        # No progress reports to stderr
        cmdline.set_parameter("quiet", True)  # Default None treated as False!
        self.assertEqual(str(cmdline).rstrip(), muscle_exe +
                         " -in temp_cw_prot.fasta -diags -maxhours 0.1" +
                         " -maxiters 1 -clwstrict -stable -quiet")
        self.assertEqual(str(eval(repr(cmdline))), str(cmdline))
        result, out_handle, err_handle = generic_run(cmdline)
        align = AlignIO.read(out_handle, "clustal")
        self.assertEqual(len(records), len(align))
        for old, new in zip(records, align):
            self.assertEqual(old.id, new.id)
            self.assertEqual(str(new.seq).replace("-", ""), str(old.seq))
        # See if quiet worked:
        self.assertEqual("", err_handle.read().strip())
    finally:
        # Remove the temporary input even when an assertion above fails
        os.remove(temp_large_fasta_file)
def extract_seq_from_file(seq_file, coords_file, output_file):
    """Extract the segments listed in *coords_file* from a FASTA file.

    Each line of *coords_file* is split on whitespace into
    ``name start stop strand``; *start* and *stop* are 1-based and
    inclusive.  Lines with fewer than three fields are skipped with a
    warning.  For every reference sequence in *seq_file* its segments are
    cut out (reverse-complemented when the strand is '-') and written to
    *output_file* as FASTA with id ``name:start..stop:strand``.

    A segment that cannot be processed (for example a line without a strand
    column) is logged as an error and skipped.
    """
    # Names of the reference sequences (a set gives O(1) membership tests)
    chrs = set(seq_record.id for seq_record in SeqIO.parse(seq_file, 'fasta'))
    # Segments grouped by reference sequence name
    chr_seg = {}
    # Number of coordinate lines read
    cnt = 0
    with open(coords_file, 'r') as f:
        for line in f:
            cnt += 1
            regions = re.split(r'\s+', line.strip('\n'))
            if regions[0] not in chrs:
                log.warning('{0} not in reference sequence'.format(regions[0]))
            if len(regions) < 3:
                log.warning('The numbers of this line are less than 3(required)')
                continue
            chr_seg.setdefault(regions[0], []).append(regions)
    log.info('Summary: {0} chromosomes, {1} segments processed'.format(len(chr_seg), cnt))

    with open(output_file, 'w') as res_file_handle:
        # Walk through the reference sequences
        for seq_record in SeqIO.parse(seq_file, 'fasta'):
            if seq_record.id not in chr_seg:
                # Logged for references that have no segment requested
                log.warning(seq_record.id + ' not exists in reference sequences')
                continue
            for seg in chr_seg[seq_record.id]:
                try:
                    # Build the SeqRecord for this segment
                    tmp_seq = SeqRecord.SeqRecord(
                        seq=(seq_record.seq)[(int(seg[1]) - 1):int(seg[2])],
                        id='{0}:{1}..{2}:{3}'.format(seg[0], seg[1], seg[2], seg[3]))
                    # Reverse-complement when the strand is '-'
                    if seg[3] == '-':
                        tmp_seq = tmp_seq.reverse_complement(
                            id=True, name=True, description='reverse_complement')
                    SeqIO.write(tmp_seq, res_file_handle, 'fasta')
                except Exception as e:
                    # Deliberately best-effort: report and go on
                    log.error(e)
def stitch_scaffolds(fa, outFile, len_limit=200000000, dist=500):
    """
    Merge multiple scaffolds together to form longer sequences.

    Scaffolds are concatenated in file order, separated by *dist* 'N'
    characters, and a merged sequence is written (as ``chr1``, ``chr2``, ...)
    as soon as its length reaches *len_limit*.  The remaining scaffolds are
    written as a last sequence.  The length of every written sequence is
    printed at the end.

    * fa: str. Reference fa file name
    * outFile: str. Filename output to the file.
    * len_limit: int. Maximum length of each merged scaffold.
    * dist: int. Distance between each scaffold.
    """
    gap = 'N' * dist
    pieces = []      # scaffolds of the sequence being built
    merged_len = 0   # its length, counting the gaps added so far
    n = 1
    with open(fa, 'r') as in_handle:
        with open(outFile, 'w') as out_handle:
            for record in SeqIO.parse(in_handle, 'fasta'):
                piece = str(record.seq)
                pieces.append(piece)
                merged_len += len(piece)
                if merged_len >= len_limit:
                    item = SeqRecord(Seq(gap.join(pieces)),
                                     id='chr' + str(n), description="")
                    SeqIO.write(item, out_handle, 'fasta')
                    pieces = []
                    merged_len = 0
                    n += 1
                else:
                    merged_len += dist
            if pieces:
                # output the last one
                item = SeqRecord(Seq(gap.join(pieces)),
                                 id='chr' + str(n), description="")
                SeqIO.write(item, out_handle, 'fasta')
    # The output is closed (flushed) before it is read back
    with open(outFile) as handle:
        for record in SeqIO.parse(handle, 'fasta'):
            print(len(record.seq))
def generate_random_fasta(path):
    """Write 1000 random 200 bp genome fragments to ``<path>/random.fa``.

    Chromosome lengths are read from the tab-separated genome table
    ``__genome_table__`` (name, length) and sequences from the FASTA file
    ``__genome__``.  Each fragment is taken from a random chromosome
    ``chr1`` .. ``chr22`` and gets the id ``<chr>_<start>_-10``.

    Returns the path of the file written.
    Raises KeyError if a drawn chromosome is missing from the genome FASTA
    or from the genome table.
    """
    chr_map = {}
    with open(__genome_table__) as f:
        for line in f:
            s = line.strip().split('\t')
            chr_map[s[0]] = int(s[1])
    with open(__genome__, 'r') as genome_handle:
        records = dict((r.id, r) for r in SeqIO.parse(genome_handle, 'fasta'))
    seqs = []
    for _ in range(0, 1000):
        chr_id = 'chr{}'.format(randint(1, 22))
        # Look the record up by id: indexing the record list with the
        # chromosome number picked an unrelated sequence.
        record = records[chr_id]
        ##We use limit from genome_table!
        ##I think it ignores N
        limit = chr_map[chr_id]
        start = randint(0, limit - 200)
        end = start + 200
        data = record.seq[start:end]
        seqs.append(SeqRecord(data, '{}_{}_-10'.format(chr_id, start), '', ''))
    random_fasta = os.path.join(path, 'random.fa')
    with open(random_fasta, 'w') as output_handle:
        logging.info('###########GeneratingRandomFA Start########################')
        SeqIO.write(seqs, output_handle, 'fasta')
        logging.info('###########GeneratingRandomFA End########################')
    return random_fasta
def split_query_fasta(options, n_seq):
    '''
    Split "query.fasta" for multithreading use of PfamScan.

    The records of ``<options.outdir>query.fasta`` are distributed over
    ``query_temp0.fasta``, ``query_temp1.fasta``, ... in the same directory.
    Half of ``options.threads`` (at least 1) chunks are targeted and *n_seq*
    is the number of records in the query.

    Returns a tuple of the index of the last file written and the number of
    threads used for the split.
    '''
    # Floor division keeps the integer arithmetic on Python 3 as well, and
    # the lower bound avoids a division by zero when threads < 2.
    n_threads = max(1, int(options.threads // 2))
    n_seq_split = (n_seq // n_threads) + 1
    i = 0
    n_seq_temp = 0
    with open(options.outdir + "query.fasta", "r") as handle_in:
        handle_out = open(options.outdir + "query_temp" + str(i) + ".fasta", "w")
        try:
            for record in SeqIO.parse(handle_in, "fasta"):
                if n_seq_temp > n_seq_split:
                    # Current chunk is full: start the next file
                    handle_out.close()
                    i = i + 1
                    n_seq_temp = 0
                    handle_out = open(options.outdir + "query_temp" + str(i) + ".fasta", "w")
                SeqIO.write(SeqRecord(seq=record.seq, id=record.id,
                                      description=record.description),
                            handle_out, "fasta")
                n_seq_temp = n_seq_temp + 1
        finally:
            handle_out.close()
    return i, n_threads
# reads commands
files = sys.argv[1]
Npercent = float(sys.argv[2])

# prepare the output file
outname = files + ".pN" + str(int(Npercent)) + ".fasta"

# Number of N in every sequence that was written
countN = []

# NOTE(review): `cutoff` (the minimum quality kept) is not defined in the
# visible part of this script; presumably it is set earlier - confirm.

# prepare both (fasta and qual) input files indexing
with open(files + ".fasta") as fasta_handle:
    with open(files + ".qual") as qual_handle:
        with open(outname, "w") as output_handle:
            records = PairedFastaQualIterator(fasta_handle, qual_handle)
            for record in records:
                s = list(record)
                # Mask every base whose quality is below the cutoff
                for i, quality in enumerate(record.letter_annotations['phred_quality']):
                    if quality < cutoff:
                        s[i] = "N"
                # Leading and trailing N are trimmed
                snew = "".join(s).strip("N")
                if snew == "":
                    continue
                nbN = snew.count("N")
                # Keep the sequence only if its share of N is below Npercent
                if (float(nbN) / len(snew)) < (Npercent / 100):
                    countN.append(nbN)
                    newrecord = SeqRecord(Seq(snew,), id=record.id,
                                          description="length=" + str(len(snew)))
                    SeqIO.write(newrecord, output_handle, "fasta")

print("New fasta written in " + outname)
if countN:
    print("This file contains " + str(sum(countN) // len(countN)) + "N in average within the sequences")
else:
    # Nothing was written, so there is no average to report
    print("This file contains no sequence")
phylome_dict = fasta2dict(os.path.join(phylomedb_path, phylo_ID + '.raw.fasta')) full_region_dict, match_list = first_round_alignments2(phylome_dict, phylo_ID, paralog_ID, alignment_dict, chain_dict) record_list = fasta2list(os.path.join(phylomedb_path, phylo_ID + '.raw.fasta')) # Need to check if the paralog is in the phylome record_2 = phylome_dict.get(phylo_ID_2, -1) if type(record_2) != SeqRecord: record_2 = proteome_dict[paralog_ID_2] record_list.append(record_2) # Write the records and align outpath = output_009_phylo_PDB_aln + '/' + P1 + '_' + P2 + '.fasta' SeqIO.write(record_list, outpath, 'fasta') pair_align_dict = align_seqs(outpath) final_pair_dict = OrderedDict() for key, value in pair_align_dict.items(): if key in (phylo_ID, paralog_ID_2): final_pair_dict[key] = value out_files = save_final_sequences2(final_pair_dict, full_region_dict, paralog_ID_2, phylo_ID, aa2three_letter) same_phylome = 0 else: final_pair_dict = OrderedDict()
start_time = time.time()
print("\nEntering python script: 'parse_for_barcode_qual.py'")

# inputs
fastq_file = sys.argv[1]  # fastq file to parse
qual_threshold = int(sys.argv[2])  # int for the quality desired
output_file = sys.argv[3]  # file to be written

# results are appended to the output file
filtered_fastq = open(sys.argv[3], "a")

# keep only the reads whose first three qualities reach the threshold
for read in SeqIO.parse(fastq_file, 'fastq'):
    qualities = read.letter_annotations["phred_quality"]
    if (qualities[0] < qual_threshold
            or qualities[1] < qual_threshold
            or qualities[2] < qual_threshold):
        continue
    SeqIO.write(read, filtered_fastq, 'fastq')

# Close file
filtered_fastq.close()

print("...time elapsed: " + str(time.time() - start_time) + " seconds")
print("...exiting python script: 'parse_for_barcode_qual.py'")
filter_out = "/staton/projects/chestnut/psudochro/analysis_081718_annotation/12k_RNA_Qrobur_081718/8_fixInternalStops/3_renameGenes/genes_to_filter.txt"
final_annotation = "/staton/projects/chestnut/psudochro/analysis_081718_annotation/12k_RNA_Qrobur_081718/8_fixInternalStops/3_renameGenes/Castanea_mollissima_scaffolds_v3.4_HQcds.fna"

# Create a list with every gene you want to filter out.
with open(filter_out) as m:
    merged_list = [line.rstrip() for line in m]
# Set copy for constant-time membership tests in the loop below
genes_to_drop = set(merged_list)

# Use this list to filter out any sequences with a matching record ID:
count = 0
# The original `inhandle.close` / `outhandle.close` lacked parentheses and
# never closed the files; the context managers do.
with open(input_annotations) as inhandle:
    with open(final_annotation, "w") as outhandle:
        for record in SeqIO.parse(inhandle, "fasta"):
            if record.id not in genes_to_drop:
                SeqIO.write(record, outhandle, "fasta")
            else:
                count += 1

print("%d genes were filtered out" % count)
rec = list(SeqIO.parse(StringIO(out), "fasta"))[0] print rec metadata = find_sample(sample['sample_id']) print metadata """ {u'pregnancy_week': u'', u'municipality': u'murici', u'patient_sex': u'male', u'host_species': u'human', u'lab_internal_sample_id': u'', u'sample_id': u'ZBRD103', u'minion_barcodes': u'', u'ct': u'29.09', u'lab_id_lacen': u'150101004197', u'collection_date': u'2015-08-20', u'amplicon_concentration_pool_1': u'', u'pregnancy_trimester': u'', u'sample_number': u'103', u'symptoms': u'', u'creation_persistent_id': u'9EDCA6E1F234B3A6E160D5E819D8918D', u'state': u'alagoas', u'extraction_date': u'2016-06-13', u'creation_host_timestamp': u'09/08/2016 21:06:44', u'rt_positive': u'1', u'patient_age': u'25', u'modification_account_name': u'Admin', u'modification_persistent_id': u'9EDCA6E1F234B3A6E160D5E819D8918D', u'lab': u'lacen_maceio', u'onset_date': u'2015-08-18', u'microcephaly': u'', u'sample_type': u'', u'creation_account_name': u'Admin', u'modification_host_timestamp': u'', u'country': u'brazil', u'notes': u'', u'pregnant': u''} """ rec.id = "%s|%s|%s|%s|%s|%s" % ( metadata['lab_id_lacen'], metadata['sample_id'], run_name, metadata['municipality'], metadata['state'], metadata['collection_date']) if rec.seq.count('N') < 3000: SeqIO.write([rec], goodfh, "fasta") elif rec.seq.count('N') < 5500: SeqIO.write([rec], partialfh, "fasta") else: SeqIO.write([rec], badfh, "fasta") """ con = sqlite3.connect(sys.argv[1]) con.row_factory = sqlite3.Row cur = con.cursor() def lookup_sample(sample): cur.execute("select * from samples, runs where runs.Batch = ? and runs.sample_fk = samples.rowid", (sample,)) row = cur.fetchone() return row for rec in SeqIO.parse(sys.stdin, "fasta"):
recs = [rec for rec in SeqIO.parse(gbk, "genbank")] return (recs[0]) genomes = folder_list() # read and modify gbk files for genome in genomes: list_of_files = file_parser(genome) print('Parsing genome ', str(genome)) for i in range(len(list_of_files)): record = read_gbk(list_of_files[i]) record.id = genome path_to_save = os.path.join(path, genome, genome) + '.region00' + str(i + 1) + '.gbk' SeqIO.write(record, path_to_save, format='genbank') # move created files into the right folder to be analysed target_path = '/home/dani/Documents/MRC_postdoc/Pangenomic/phylo/original_data/bigscape_results/gbks' for genome in genomes: list_of_files = file_parser_NT(genome) for file in list_of_files: file_n = file.split('/')[-1] if 'NT' in file_n and 'region' in file_n: # condition to move files os.rename(file, os.path.join(target_path, file_n)) else: pass n = 0 for genome in genomes: if len(file_parser(genome)) > 0:
import sys
from Bio import SeqIO


# Argparse type: accept the argument only if it names an existing file.
def File(MyFile):
    if os.path.isfile(MyFile):
        return MyFile
    raise argparse.ArgumentTypeError(MyFile + ' does not exist or is not a file.')


# Set up the arguments for this script
# NOTE(review): as written the second replace() swaps a space for a space;
# collapsing double spaces may have been intended - confirm.
ExplanatoryMessage = ExplanatoryMessage.replace('\n', ' ').replace(' ', ' ')
parser = argparse.ArgumentParser(description=ExplanatoryMessage)
parser.add_argument('FastaFile', type=File)
args = parser.parse_args()

# Keep every sequence that has at least one base other than "?", "-" or "N"
OutSeqs = []
for seq in SeqIO.parse(open(args.FastaFile), 'fasta'):
    if any(base not in ["?", "-", "N"] for base in str(seq.seq)):
        OutSeqs.append(seq)

SeqIO.write(OutSeqs, sys.stdout, "fasta")
#Isolates a chromosome from an SGA file
import pandas as pd
from Bio import SeqIO

chromosome = 1
annotations_file = "data/human_complete.sga"
promoters_file = "data/human_complete.fa"

annotations = pd.read_csv(
    annotations_file,
    sep='\t',
    names=["Id", "Type", "Position", "Strand", "Chromosome", "Gene"])
# The chromosome number is taken from characters 7-8 of the Id column
annotations['Chromosome'] = annotations.Id.str[7:9].astype(int)
on_chromosome = annotations['Chromosome'] == chromosome
isolated_promoters = annotations[on_chromosome].Gene.tolist()

# Keep the records whose second description word is one of the genes
with open(promoters_file, 'r') as handle:
    record_list = [
        record for record in SeqIO.parse(handle, "fasta")
        if record.description.split(' ')[1] in isolated_promoters
    ]

output_file = "data/blast/genome_database/human_promoters_chr{0}.fa".format(
    chromosome)
with open(output_file, "w+") as output_handle:
    SeqIO.write(record_list, output_handle, "fasta")
## present in summary_complete. If it is not it is bad and should not ## be used. Also add a reasonable taxonomic name for each reference sequence ## to summary_complete as tax_id. with open(ref_dir_domain + 'combined_18S.' + domain + '.tax.fasta', 'w') as good_fasta_18S: for record in SeqIO.parse(ref_dir_domain + 'combined_18S.unique.fasta', 'fasta'): tax_name = str(record.id) genome = tax_name.split('|')[0] if genome in summary_complete.index: summary_complete.loc[genome, 'tax_name'] = tax_name keep = True kept_genomes.append(genome) SeqIO.write(record, good_fasta_18S, 'fasta') summary_complete = summary_complete[summary_complete.index.isin( kept_genomes)] ## Write out summary_complete and exit. summary_complete.to_csv(ref_dir_domain + 'genome_data.csv.gz') quit() ## For bacteria and archaea, find 16S rRNA genes in fna files. Get some paramenters on the genome; number of ## 16S genes, number of elements, size of genome, and add these to summary_complete. ## Generate two fasta files of the 16S rRNA genes. One will be used later to build ## the reference tree and has sensible taxonomic names. One is used to calculate ## the phi values and is named by assembly.
def identify_link_subclass(input_fastq,
                           output_prefix,
                           ambiguous_ok=False,
                           find_prs_max=50):
    """Classify reads by primer, extract UMI/insert, and write per-type output.

    Pass 1 scans every read in ``input_fastq`` for one of the primer
    sequences in ``prs_bcr`` (on the read or on its reverse complement)
    within the first ``find_prs_max`` bases.  Pass 2 orients each read that
    had a primer hit, locates ``LINKER`` and, when the linker position is in
    ``[UMI_LEN, 3 * UMI_LEN)``, records the UMI / insert / primer fields.

    Files written (all prefixed with ``output_prefix``):
      * ``_<name>.determined.fq``  - one consensus entry per (UMI, type)
      * ``.info.csv``              - tab-separated per-read fields
      * ``.cluster_info.csv``      - read ids grouped per (UMI, type) tag
      * ``.undetermined.fq``       - reads without a usable linker position

    :param input_fastq: path of the FASTQ file to read.
    :param output_prefix: prefix for every output file.
    :param ambiguous_ok: also try the variants in ``bcr_mutation_dict`` when
        no exact primer match is found.
    :param find_prs_max: number of leading bases searched for the primer.
    """
    semi_good = []
    still_bad = []
    # Use a context manager so the input handle is closed (it used to leak).
    with open(input_fastq) as fastq_handle:
        for r in SeqIO.parse(fastq_handle, 'fastq'):
            s = str(r.seq)
            s2 = str(r.seq.reverse_complement())
            i, j = -1, -1
            for x, name in prs_bcr.items():
                i = s[:find_prs_max].find(x)
                if i > 0:
                    print("precise found for ", x, " on + strand at pos", i)
                    break
                j = s2[:find_prs_max].find(x)
                if j > 0:
                    break
            # NOTE(review): at this point `name` is the primer the loop
            # stopped on, so without an exact hit only the variants of the
            # last primer iterated are consulted - confirm this is intended.
            if i < 0 and j < 0 and ambiguous_ok:
                for mutseq in bcr_mutation_dict[name]:
                    i = s[:find_prs_max].find(mutseq)
                    if i > 0:
                        print("ambiguous found for ", mutseq,
                              " on + strand at pos", i)
                        break
            if i < 0 and j < 0 and ambiguous_ok:
                for mutseq in bcr_mutation_dict[name]:
                    j = s2[:find_prs_max].find(mutseq)
                    if j > 0:
                        print("ambiguous found for ", mutseq,
                              " on + strand at pos", j)
                        break
            # A hit at position 0 is treated as "not found" (strict > 0).
            if i > 0:
                semi_good.append((r, len(r.seq) - i - 6, name, 's1'))
            elif j > 0:
                semi_good.append((r, len(r.seq) - j - 6, name, 's2'))
            else:
                still_bad.append((r, i))
            #if len(semi_good)>=1000: break

    seen = defaultdict(list)  # (umi,type) --> list of CCS id
    seen_debug = defaultdict(Counter)  # (umi,type) --> (insert) --> count
    f_by_type = {}
    for rep_seq, rep_name in prs_bcr.items():
        f_by_type[rep_name] = open(
            "{o}_{n}.determined.fq".format(o=output_prefix, n=rep_name), 'w')
    f2 = open(output_prefix + '.info.csv', 'w')
    f3 = open(output_prefix + '.cluster_info.csv', 'w')
    f3.write("tag\tcount\tmembers\n")
    fu = open(output_prefix + '.undetermined.fq', 'w')
    writer = DictWriter(f2, fieldnames=INFO_FIELDNAMES, delimiter='\t')
    writer.writeheader()

    #
    # [r2 should mostly be blank] -- [12bp UMI] -- [insert] --- [primer] --- [r1, actually "C" region]
    #

    # for debugging
    linker_pos = []
    for p in semi_good:
        info = {
            'id': p[0].id,
            'strand': p[-1],
            'type': p[2],
            # Length of THIS read; the previous code used the leftover loop
            # variable from pass 1 and so reported the last read's length.
            'len': len(p[0].seq),
            'ilen': 'NA',
            'umi': 'NA',
            'primer': 'NA',
            'r2': 'NA',
            'insert': 'NA',
            'r1': 'NA'
        }
        # Orient the read: 's2' hits keep the read as is, 's1' hits are
        # reverse-complemented.
        if p[-1] == 's2':
            s = str(p[0].seq)
        else:
            s = str(p[0].seq.reverse_complement())
        i = s.find(LINKER)
        if i > 0:
            linker_pos.append(i)
        # NOTE(review): reads whose linker is missing or outside the accepted
        # range are sent to the undetermined file - confirm this matches the
        # intended pairing of the `else` branch.
        if UMI_LEN <= i < UMI_LEN * 3:
            insert = s[i:p[1]]
            ilen = p[1] - i
            info['umi'] = s[i - 12:i]
            info['r2'] = s[:i - 12]
            info['primer'] = s[p[1]:p[1] + 6]
            info['r1'] = s[p[1] + 6:]
            info['insert'] = insert
            info['ilen'] = ilen
            tag = info['umi'] + '-' + info['type']
            writer.writerow(info)
            seen[tag].append(p[0].id)
            seen_debug[tag][insert] += 1
        else:
            SeqIO.write(p[0], fu, 'fastq')

    # now for each (umi,type), output the most common sequence
    umi_index = 0
    for umi_type in seen_debug:
        umi_index += 1
        # Split on the first '-' only: the UMI is a plain sequence slice, so
        # any further hyphen belongs to the type name.
        umi, link_type = umi_type.split('-', 1)
        major_seq, major_count = seen_debug[umi_type].most_common(1)[0]
        total_count = sum(seen_debug[umi_type].values())
        f_by_type[link_type].write(
            "@pacbio.{0} UMI:{1}:{2} type:{3} mcount:{4} count:{5}\n".format(
                umi_index, umi, 'G' * len(umi), link_type, major_count,
                total_count))
        f_by_type[link_type].write("{0}\n+\n{1}\n".format(
            major_seq, 'I' * len(major_seq)))

    for f in f_by_type.values():
        f.close()
    f2.close()
    fu.close()

    for k, v in seen.items():
        f3.write("{0}\t{1}\t{2}\n".format(k, len(v), ",".join(v)))
    f3.close()

    linker_pos = np.array(linker_pos)
    print("DEBUG: # of linkers found", len(linker_pos))
    print("DEBUG: # of linkers found at 12bp:", sum(linker_pos == 12))
    print("DEBUG: # of linkers found > 12bp:", sum(linker_pos > 12))
    print("DEBUG: # of linkers found < 12bp:", sum(linker_pos < 12))
def create_fasta_file(file_address, corpus, label):
    """Write the sequences in ``corpus`` to ``file_address`` in FASTA format.

    Record ids have the form ``<1-based position>.<label[position]>`` and the
    description is left empty.
    """
    seq_recs = []
    for position, sequence in enumerate(corpus):
        record_id = '.'.join([str(position + 1), label[position]])
        seq_recs.append(
            SeqRecord(Seq(sequence, generic_dna),
                      id=record_id,
                      description=''))
    SeqIO.write(seq_recs, file_address, "fasta")
metavar='<ncRNA file>', required=True) #Getting arguments args = parser.parse_args() transcriptome_file = args.transcriptome_file protein_file = args.protein_file ncRNA_file = args.ncRNA_file #Loading transcriptomes seq ID and proteins features transcripts = list(SeqIO.parse(transcriptome_file, "fasta")) dict_proteins = SeqIO.to_dict(SeqIO.parse(protein_file, "fasta")) #Generate list with ncRNA sequences id ncRNA_list = [] transcriptome_index = transcriptome_file[:-5] + "index" for i in range(0, len(transcripts)): if transcripts[i].id not in dict_proteins.keys(): ncRNA_list.append(transcripts[i].id) #Generate index database for transcriptome (store sequence features) transcriptome_db = SeqIO.index_db(transcriptome_index, transcriptome_file, "fasta") #Generate ncRNA file with open(ncRNA_file, "w") as ncRNA_output: for i in ncRNA_list: if i in transcriptome_db: SeqIO.write(transcriptome_db[i], ncRNA_output, "fasta")
def multi_to_single(genbank, name, output):
    '''
    Converts a multi entry genbank (where each entry is a contig)
    into a single entry genbank, preserving all annotations.

    genbank: path of the multi-entry GenBank file to read.
    name:    name/id of the merged record (cut to 9 characters when it is
             10 characters or longer).
    output:  destination passed to SeqIO.write for the merged record.

    Each contig is marked by a "fasta_record" feature whose 'colour'
    qualifier alternates between '11' and '10'; every original feature is
    copied with its coordinates shifted by the contig offset and its
    locus_tag replaced by a running counter.

    Raises ValueError when the input contains no records.
    '''
    # make header genbank format friendly
    if len(name) >= 10:
        name = name[:9]

    # NOTE(review): the "rU" mode is kept from the original; it is rejected
    # by Python 3.11+, switch to "r" when this file moves to Python 3.
    with open(genbank, "rU") as handle:
        records = list(SeqIO.parse(handle, "genbank"))
    if not records:
        raise ValueError("No GenBank records found in %s" % genbank)

    # total bases seen so far, i.e. the offset of the current contig
    total = 0
    feature_count = 0
    newrecord = None

    for contig_index, r in enumerate(records):
        length = len(r)
        seq = r.seq
        seq.alphabet = generic_dna

        if newrecord is None:
            # first sequence, initialise seqrecord.  Testing for None (rather
            # than total > 0) keeps the merged record intact even when the
            # first contig is empty.
            newrecord = SeqRecord(seq=seq, name=name, id=name)
        else:
            newrecord.seq = newrecord.seq + seq

        # create feature for contig, alternating the colour
        colour = '11' if contig_index % 2 == 0 else '10'
        newrecord.features.append(
            SeqFeature(FeatureLocation(total, total + length),
                       type="fasta_record",
                       qualifiers={
                           'note': [r.name],
                           'colour': colour
                       }))

        # copy CDS features
        for f in r.features:
            feature_count += 1
            f.qualifiers["locus_tag"] = str(feature_count)
            newrecord.features.append(
                SeqFeature(FeatureLocation(f.location.nofuzzy_start + total,
                                           f.location.nofuzzy_end + total),
                           strand=f.strand,
                           type=f.type,
                           qualifiers=f.qualifiers))
        total += length

    #write out new single entry genbank
    SeqIO.write(newrecord, output, "genbank")
action="store_true") in_args = parser.parse_args() in_file = os.path.abspath(in_args.in_file) prot_seqs = [] with open(in_file, "r") as ifile: dna_seqs = SeqIO.parse(ifile, "fasta") for seq in dna_seqs: if in_args.strip_description: seq.description = "" seq.alphabet = IUPAC.protein seq.seq = seq.seq.translate(to_stop=True) prot_seqs.append(seq) tmp_file = MyFuncs.TempFile() with open(tmp_file.file, "w") as ofile: SeqIO.write(prot_seqs, ofile, "fasta") if not in_args.out_file: with open(tmp_file.file, "r") as ifile: print(ifile.read()) else: out_file = os.path.abspath(in_args.out_file) if os.path.exists(out_file) and not in_args.over_write: print( "Error: The outfile you've specified already exists. Use the -ow flag if you want to over-write it." ) else: shutil.move(tmp_file.file, out_file)
# Hand-rolled option parsing: each recognised flag takes its value from the
# argument that immediately follows it.
# NOTE(review): a flag given as the last argument raises IndexError, and any
# option not supplied here must already be defined earlier in the file.
for i, arg in enumerate(sys.argv):
    if arg == "-f":
        filepath = sys.argv[i + 1]
    elif arg == "-w":
        window_size = int(sys.argv[i + 1])
    elif arg == "-c":
        cutoff = int(sys.argv[i + 1])
    elif arg == "-k":
        ksize = int(sys.argv[i + 1])
    elif arg == "-o":
        outpath = sys.argv[i + 1]

# -f is either a directory (every entry in it is used) or a comma-separated
# list of file paths.
if os.path.isdir(filepath):
    filepaths = [os.path.join(filepath, fn) for fn in os.listdir(filepath)]
else:
    filepaths = list(filepath.split(","))

haplotypes, haplo_freqs = predict_haplotypes(filepaths=filepaths,
                                             window_size=window_size,
                                             ksize=ksize,
                                             cutoff=cutoff)

# One record per haplotype; the haplotype's list position (as a string) is
# passed as the second SeqRecord argument.
sequences = [
    SeqRecord(Seq(haplo), str(i)) for i, haplo in enumerate(haplotypes)
]
SeqIO.write(sequences, outpath, "fasta")

# The frequencies are written as a single comma-separated line to
# "<outpath>.freqs.txt".
freq_path = outpath + ".freqs.txt"
with open(freq_path, "w") as freq_file:
    freq_file.writelines([",".join(haplo_freqs.astype(str))])
def main():
    """Run the whole pipeline for every sample and every query sequence.

    For each read set returned by ``read_file_sets`` and each FASTA record in
    ``args.queries`` the function:

      1. maps the reads to the query with ``bwa mem`` and extracts the
         flanking and soft-clipped reads for the left and right ends;
      2. writes a "No hits found" table and moves on when either set of
         flanking reads is empty;
      3. in "improvement" mode maps the flanking reads to the sample's
         assembly, in "typing" mode to ``args.typingRef``, builds the
         coverage/merged BED files and calls the table-creation script;
      4. removes the temp folder and BAM files as requested by ``args.temp``
         and ``args.bam``.

    Exits with status -1 when ``args.runtype`` is neither "improvement" nor
    "typing".
    """
    start_time = time.time()

    args = parse_args()
    samtools_runner = RunSamtools()

    # If the user gave an output directory and it doesn't already exist,
    # create it.
    if args.directory and not os.path.exists(args.directory):
        os.makedirs(args.directory)

    # Set up logfile
    if args.log is True:
        if args.output != '':
            logfile = os.path.join(args.directory, args.output + ".log")
        else:
            # come up with a different prefix
            logfile = os.path.join(
                args.directory,
                time.strftime("%d%m%y_%H%M", time.localtime()) + '.log')
    else:
        logfile = None
    logging.basicConfig(filename=logfile,
                        level=logging.DEBUG,
                        filemode='w',
                        format='%(asctime)s %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S')
    logging.info('program started')
    logging.info('command line: {0}'.format(' '.join(sys.argv)))

    # Checks that the correct programs are installed
    check_command(['bwa'], 'bwa')
    #check_command(['samtools'], 'samtools')
    check_command(['makeblastdb'], 'blast')
    check_command(['bedtools'], 'bedtools')

    # Checks to make sure the runtype is valid and provides an error if not
    if args.runtype != "improvement" and args.runtype != "typing":
        logging.info('Invalid runtype selected: {}'.format(args.runtype))
        logging.info(
            'Runtype should be improvement or typing (see instructions for further details)'
        )
        exit(-1)

    # Get feature types in correct format
    args.cds = ' '.join(args.cds)
    args.trna = ' '.join(args.trna)
    args.rrna = ' '.join(args.rrna)

    # Gather together the reads in pairs with their corresponding
    # assemblies (if required)
    fileSets = read_file_sets(args)

    # Start analysing each read set specified
    for sample in fileSets:
        forward_read = fileSets[sample][0]
        reverse_read = fileSets[sample][1]
        try:
            assembly = fileSets[sample][2]
        except IndexError:
            # No assembly was supplied for this sample.
            pass

        # Read in the queries
        query_records = SeqIO.parse(args.queries, 'fasta')
        # Cycle through each query on its own before moving onto the next one
        for query in query_records:
            # get the name of the query to set up file names
            query_name = query.id

            # Create the output file and folder names,
            # make the folders where necessary
            if args.directory == '':
                current_dir = os.getcwd() + '/'
            else:
                current_dir = args.directory
                if current_dir[-1] != '/':
                    current_dir = current_dir + '/'
            temp_folder = current_dir + sample + '_' + query_name + '_temp/'
            output_sam = temp_folder + sample + '_' + query_name + '.sam'
            left_bam = temp_folder + sample + '_' + query_name + '_left.bam'
            right_bam = temp_folder + sample + '_' + query_name + '_right.bam'
            left_reads = temp_folder + sample + '_' + query_name + '_left.fastq'
            right_reads = temp_folder + sample + '_' + query_name + '_right.fastq'
            left_clipped_reads = temp_folder + sample + '_' + query_name + '_left_clipped.fastq'
            right_clipped_reads = temp_folder + sample + '_' + query_name + '_right_clipped.fastq'
            final_left_reads = temp_folder + sample + '_' + query_name + '_LeftFinal.fastq'
            final_right_reads = temp_folder + sample + '_' + query_name + '_RightFinal.fastq'
            no_hits_table = current_dir + sample + '_' + query_name + '_table.txt'
            make_directories([temp_folder])

            # need to write out each query to a temp file
            # otherwise it can't be indexed etc
            query_tmp = temp_folder + query_name + '.fasta'
            SeqIO.write(query, query_tmp, 'fasta')

            # Index the IS query for BWA
            bwa_index(query_tmp)

            # Map to IS query
            run_command([
                'bwa', 'mem', '-t', args.t, query_tmp, forward_read,
                reverse_read, '>', output_sam
            ],
                        shell=True)

            # Pull unmapped reads flanking IS
            run_command(samtools_runner.view(left_bam, output_sam, smallF=36),
                        shell=True)
            run_command(samtools_runner.view(right_bam,
                                             output_sam,
                                             smallF=4,
                                             bigF=40),
                        shell=True)

            # Turn bams to reads for mapping
            run_command(
                ['bedtools', 'bamtofastq', '-i', left_bam, '-fq', left_reads],
                shell=True)
            run_command([
                'bedtools', 'bamtofastq', '-i', right_bam, '-fq', right_reads
            ],
                        shell=True)

            # Add corresponding clipped reads to their respective left and right ends
            # (print is called as a function so the module parses under both
            # Python 2 and Python 3; the output is unchanged)
            print('Usage before extracting soft-clipped reads')
            print('Memory usage: %s (kb)' %
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            logging.info(
                'Extracting soft clipped reads, selecting reads that are <= ' +
                str(args.max_clip) + 'bp and >= ' + str(args.min_clip) + 'bp')
            extract_clipped_reads(output_sam, args.min_clip, args.max_clip,
                                  left_clipped_reads, right_clipped_reads)
            print('Usage after reads written out, before concatentation')
            print('Memory usage: %s (kb)' %
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
            run_command(
                ['cat', left_clipped_reads, left_reads, '>', final_left_reads],
                shell=True)
            run_command([
                'cat', right_clipped_reads, right_reads, '>', final_right_reads
            ],
                        shell=True)
            print('Usage after reads concatenated onto previous reads')
            print('Memory usage: %s (kb)' %
                  resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

            # Create BLAST database for IS query
            check_blast_database(query_tmp)

            # Index 6 of the os.stat result is the file size in bytes.
            if os.stat(final_left_reads)[6] == 0 or os.stat(
                    final_right_reads)[6] == 0:
                logging.info(
                    'One or both read files are empty. This is probably due to no copies of the IS of interest being present in this sample. Program quitting.'
                )
                with open(no_hits_table, 'w') as f:
                    if args.runtype == 'typing':
                        header = [
                            "region", "orientation", "x", "y", "gap", "call",
                            "%ID", "%Cov", "left_gene", "left_strand",
                            "left_distance", "right_gene", "right_strand",
                            "right_distance", "functional_prediction"
                        ]
                        f.write('\t'.join(header) + '\nNo hits found')
                    else:
                        header = ['contig', 'end', 'x', 'y']
                        f.write('\t'.join(header) + '\nNo hits found')
                remove_temp_directory(args.temp, temp_folder)
                continue

            # Improvement mode
            if args.runtype == "improvement":

                # Get prefix for output filenames
                left_header = sample + '_left'
                right_header = sample + '_right'
                left_to_ref_sam = temp_folder + left_header + '_' + query_name + '.sam'
                right_to_ref_sam = temp_folder + right_header + '_' + query_name + '.sam'
                left_to_ref_bam = temp_folder + left_header + '_' + query_name + '.bam'
                right_to_ref_bam = temp_folder + right_header + '_' + query_name + '.bam'
                left_bam_sorted = current_dir + left_header + '_' + query_name + '.sorted'
                right_bam_sorted = current_dir + right_header + '_' + query_name + '.sorted'
                left_cov_bed = temp_folder + left_header + '_' + query_name + '_cov.bed'
                right_cov_bed = temp_folder + right_header + '_' + query_name + '_cov.bed'
                left_final_cov = current_dir + left_header + '_' + query_name + '_finalcov.bed'
                right_final_cov = current_dir + right_header + '_' + query_name + '_finalcov.bed'
                left_merged_bed = current_dir + left_header + '_' + query_name + '_merged.sorted.bed'
                right_merged_bed = current_dir + right_header + '_' + query_name + '_merged.sorted.bed'
                final_genbankSingle = current_dir + sample + '_' + query_name + '_annotatedSingle.gbk'

                # create fasta file from genbank if required.
                # The converted path is kept in its own variable: overwriting
                # `assembly` made the next query of the same sample treat the
                # temp FASTA of this query as the GenBank assembly.
                mapping_assembly = assembly
                if args.extension == '.gbk':
                    assembly_gbk = assembly
                    (file_path, file_name_before_ext,
                     full_ext) = get_readFile_components(assembly_gbk)
                    assembly_fasta = os.path.join(
                        temp_folder, file_name_before_ext) + '.fasta'
                    gbk_to_fasta(assembly, assembly_fasta)
                    mapping_assembly = assembly_fasta

                # Map ends back to contigs
                bwa_index(mapping_assembly)
                if args.a == True:
                    # '-a' needs its leading dash; a bare 'a' is read by bwa
                    # as a positional argument.
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        mapping_assembly, final_left_reads, '>',
                        left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        mapping_assembly, final_right_reads, '>',
                        right_to_ref_sam
                    ],
                                shell=True)
                else:
                    run_command([
                        'bwa', 'mem', '-t', args.t, mapping_assembly,
                        final_left_reads, '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-t', args.t, mapping_assembly,
                        final_right_reads, '>', right_to_ref_sam
                    ],
                                shell=True)
                run_command(samtools_runner.view(left_to_ref_bam,
                                                 left_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.view(right_to_ref_bam,
                                                 right_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.sort(left_bam_sorted,
                                                 left_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.sort(right_bam_sorted,
                                                 right_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.index(left_bam_sorted),
                            shell=True)
                run_command(samtools_runner.index(right_bam_sorted),
                            shell=True)

                # Create BED file with coverage information
                run_command([
                    'bedtools', 'genomecov', '-ibam', left_bam_sorted + '.bam',
                    '-bg', '>', left_cov_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'genomecov', '-ibam',
                    right_bam_sorted + '.bam', '-bg', '>', right_cov_bed
                ],
                            shell=True)
                filter_on_depth(left_cov_bed, left_final_cov, args.cutoff)
                filter_on_depth(right_cov_bed, right_final_cov, args.cutoff)
                run_command([
                    'bedtools', 'merge', '-i', left_final_cov, '-d',
                    args.merging, '>', left_merged_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-i', right_final_cov, '-d',
                    args.merging, '>', right_merged_bed
                ],
                            shell=True)

                # Create table and genbank
                if args.extension == '.fasta':
                    run_command([
                        args.path + 'create_genbank_table.py', '--left_bed',
                        left_merged_bed, '--right_bed', right_merged_bed,
                        '--assembly', assembly, '--type fasta', '--output',
                        current_dir + sample + '_' + query_name
                    ],
                                shell=True)
                elif args.extension == '.gbk':
                    run_command([
                        args.path + 'create_genbank_table.py', '--left_bed',
                        left_merged_bed, '--right_bed', right_merged_bed,
                        '--assembly', assembly_gbk, '--type genbank',
                        '--output', current_dir + sample + '_' + query_name
                    ],
                                shell=True)

                #create single entry genbank
                # NOTE(review): the input path here is relative (no
                # current_dir prefix) - confirm where create_genbank_table.py
                # writes the annotated file.
                multi_to_single(sample + '_' + query_name + '_annotated.gbk',
                                sample, final_genbankSingle)

            # Typing mode
            if args.runtype == "typing":

                # Get prefix of typing reference for output filenames
                (file_path, file_name) = os.path.split(args.typingRef)
                typingName = file_name.split('.g')[0]
                typingRefFasta = temp_folder + typingName + '.fasta'
                # Create reference fasta from genbank
                gbk_to_fasta(args.typingRef, typingRefFasta)
                # Create bwa index file for typing reference
                bwa_index(typingRefFasta)

                # Set up file names for output files
                left_header = sample + '_left_' + typingName
                right_header = sample + '_right_' + typingName
                left_to_ref_sam = temp_folder + left_header + '_' + query_name + '.sam'
                right_to_ref_sam = temp_folder + right_header + '_' + query_name + '.sam'
                left_to_ref_bam = temp_folder + left_header + '_' + query_name + '.bam'
                right_to_ref_bam = temp_folder + right_header + '_' + query_name + '.bam'
                left_bam_sorted = current_dir + left_header + '_' + query_name + '.sorted'
                right_bam_sorted = current_dir + right_header + '_' + query_name + '.sorted'
                left_cov_bed = temp_folder + left_header + '_' + query_name + '_cov.bed'
                right_cov_bed = temp_folder + right_header + '_' + query_name + '_cov.bed'
                left_cov_merged = temp_folder + left_header + '_' + query_name + '_cov_merged.sorted.bed'
                right_cov_merged = temp_folder + right_header + '_' + query_name + '_cov_merged.sorted.bed'
                left_final_cov = current_dir + left_header + '_' + query_name + '_finalcov.bed'
                right_final_cov = current_dir + right_header + '_' + query_name + '_finalcov.bed'
                left_merged_bed = current_dir + left_header + '_' + query_name + '_merged.sorted.bed'
                right_merged_bed = current_dir + right_header + '_' + query_name + '_merged.sorted.bed'
                bed_intersect = current_dir + sample + '_' + typingName + '_' + query_name + '_intersect.bed'
                bed_closest = current_dir + sample + '_' + typingName + '_' + query_name + '_closest.bed'
                bed_unpaired_left = current_dir + sample + '_' + typingName + '_' + query_name + '_left_unpaired.bed'
                bed_unpaired_right = current_dir + sample + '_' + typingName + '_' + query_name + '_right_unpaired.bed'

                # Map reads to reference, sort
                if args.a == True:
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        typingRefFasta, final_left_reads, '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-a', '-T', args.T, '-t', args.t,
                        typingRefFasta, final_right_reads, '>',
                        right_to_ref_sam
                    ],
                                shell=True)
                else:
                    run_command([
                        'bwa', 'mem', '-t', args.t, typingRefFasta,
                        final_left_reads, '>', left_to_ref_sam
                    ],
                                shell=True)
                    run_command([
                        'bwa', 'mem', '-t', args.t, typingRefFasta,
                        final_right_reads, '>', right_to_ref_sam
                    ],
                                shell=True)
                run_command(samtools_runner.view(left_to_ref_bam,
                                                 left_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.view(right_to_ref_bam,
                                                 right_to_ref_sam),
                            shell=True)
                run_command(samtools_runner.sort(left_bam_sorted,
                                                 left_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.sort(right_bam_sorted,
                                                 right_to_ref_bam),
                            shell=True)
                run_command(samtools_runner.index(left_bam_sorted),
                            shell=True)
                run_command(samtools_runner.index(right_bam_sorted),
                            shell=True)

                # Create BED files with coverage information
                run_command([
                    'bedtools', 'genomecov', '-ibam', left_bam_sorted + '.bam',
                    '-bg', '>', left_cov_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'genomecov', '-ibam',
                    right_bam_sorted + '.bam', '-bg', '>', right_cov_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    left_cov_bed, '>', left_cov_merged
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    right_cov_bed, '>', right_cov_merged
                ],
                            shell=True)

                # Filter coveraged BED files on coverage cutoff (so only take
                # high coverage regions for further analysis)
                filter_on_depth(left_cov_bed, left_final_cov, args.cutoff)
                filter_on_depth(right_cov_bed, right_final_cov, args.cutoff)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    left_final_cov, '>', left_merged_bed
                ],
                            shell=True)
                run_command([
                    'bedtools', 'merge', '-d', args.merging, '-i',
                    right_final_cov, '>', right_merged_bed
                ],
                            shell=True)

                # Find intersects and closest points of regions
                run_command([
                    'bedtools', 'intersect', '-a', left_merged_bed, '-b',
                    right_merged_bed, '-wo', '>', bed_intersect
                ],
                            shell=True)
                # if one or more of the bed files are empty, then closestBed returns an error
                # that needs to be caught
                try:
                    run_command([
                        'closestBed', '-a', left_merged_bed, '-b',
                        right_merged_bed, '-d', '>', bed_closest
                    ],
                                shell=True)
                except BedtoolsError:
                    with open(no_hits_table, 'w') as f:
                        header = [
                            "region", "orientation", "x", "y", "gap", "call",
                            "%ID", "%Cov", "left_gene", "left_strand",
                            "left_distance", "right_gene", "right_strand",
                            "right_distance", "functional_prediction"
                        ]
                        f.write('\t'.join(header) + '\nNo hits found')
                    continue

                # Create all possible closest bed files for checking unpaired hits
                # If any of these fail, just make empty unapired files to pass to create_typing_out
                try:
                    run_command([
                        'closestBed', '-a', left_merged_bed, '-b',
                        right_cov_merged, '-d', '>', bed_unpaired_left
                    ],
                                shell=True)
                except BedtoolsError:
                    if not os.path.isfile(bed_unpaired_left) or os.stat(
                            bed_unpaired_left)[6] == 0:
                        open(bed_unpaired_left, 'w').close()
                try:
                    run_command([
                        'closestBed', '-a', left_cov_merged, '-b',
                        right_merged_bed, '-d', '>', bed_unpaired_right
                    ],
                                shell=True)
                except BedtoolsError:
                    if not os.path.isfile(bed_unpaired_right) or os.stat(
                            bed_unpaired_right)[6] == 0:
                        open(bed_unpaired_right, 'w').close()

                # Create table and annotate genbank with hits
                if args.igv:
                    igv_flag = '1'
                else:
                    igv_flag = '0'
                run_command([
                    args.path + 'create_typing_out.py', '--intersect',
                    bed_intersect, '--closest', bed_closest, '--left_bed',
                    left_merged_bed, '--right_bed', right_merged_bed,
                    '--left_unpaired', bed_unpaired_left, '--right_unpaired',
                    bed_unpaired_right, '--seq', query_tmp, '--ref',
                    args.typingRef, '--temp', temp_folder, '--cds', args.cds,
                    '--trna', args.trna, '--rrna', args.rrna, '--min_range',
                    args.min_range, '--max_range', args.max_range, '--output',
                    current_dir + sample + '_' + query_name, '--igv', igv_flag,
                    '--chr_name', args.chr_name
                ],
                            shell=True)

            # remove temp folder if required
            remove_temp_directory(args.temp, temp_folder)
            remove_bams(args.bam, left_bam_sorted, right_bam_sorted)

    total_time = time.time() - start_time
    time_mins = float(total_time) / 60
    logging.info('ISMapper finished in ' + str(time_mins) + ' mins.')
# Load the tab-separated BLAST result for this record.
blast_out_cog = open("%s/cog_%s.blast" % (seq_record.id,seq_record.id))
blast_lines_cog = blast_out_cog.readlines()
blast_out_cog.close()
# Keep, for every query id (column 0), the subject (column 1) of the first
# line in which that id appears. The scan starts at index 1, so the first
# line of the file is not examined.
j=1
best_blast_lines_cog = {}
while j < len(blast_lines_cog):
    id = blast_lines_cog[j].split("\t")[0]
    best_blast_lines_cog[ id ] = blast_lines_cog[j].split("\t")[1].strip()
    # Skip the remaining consecutive lines that belong to the same query id.
    while j < len(blast_lines_cog) and id == blast_lines_cog[j].split("\t")[0] :
        j+=1
j=1
# Write every database sequence that was selected as a best hit above.
for cogseq in cog_db_iterator:
    if cogseq.id in best_blast_lines_cog.values():
        SeqIO.write(cogseq, fasta_cog, "fasta")
fasta_cog.close()
fasta_cog_db.close()
blastout = 'tmpbia/bbhcog_%s.blast' % seq_record.id
# Build the combined protein database only once; later runs reuse the file.
if not os.path.exists("tempgenoma"):
    getoutput("cat */orfs_*.faa > tempgenoma")
    print getoutput("formatdb -i tempgenoma " )
# Search the selected sequences back against the combined database.
getoutput("blastall -p blastp -i '%s/cog_%s.faa' -d tempgenoma -e %s -a 1 -v 30 -b 30 -o %s" % (seq_record.id,seq_record.id,params["e"],blastout))
# Parse the generated BLAST file
fileoutHandler = open(blastout)
#! /usr/bin/env python
"""Split a FASTA file into numbered chunks of at most N sequences.

Usage: <script> NAME.fasta N

Chunk k is written to NAME-%04d.fasta (k starting at 1).
"""
import sys
from Bio import SeqIO

name = sys.argv[1]
n = sys.argv[2]
# The output names are derived by replacing ".fasta" in the input name.
# Without that exact substring the output path would equal the input path
# and opening it for writing would truncate the file being read.
if ".fasta" not in name:
    sys.exit("bad name")
try:
    n = int(n)
except ValueError:
    sys.exit("bad number")

fcount = 0  # number of chunk files opened so far
scount = 0  # number of sequences written to the current chunk
fh = None
for record in SeqIO.parse(name, "fasta"):
    # Start a new chunk for the first record and whenever the current
    # chunk already holds n sequences.
    if fcount == 0 or scount == n:
        if fcount > 0:
            fh.close()
        scount = 0
        fcount += 1
        fh = open(name.replace(".fasta", "-%04d.fasta" % (fcount)), "w")
    scount += 1
    SeqIO.write(record, fh, "fasta")
# An input without records never opens a chunk file.
if fh is not None:
    fh.close()
# were specified).
if not args.match_start:
    SeqsNotFound = [seq for seq in args.SequenceName
                    if seq not in AllSeqNamesEncountered]
    if SeqsNotFound:
        print('The following sequences were not found in', args.FastaFile+':',
              ' '.join(SeqsNotFound) +'\nQuitting.', file=sys.stderr)
        exit(1)

# Apply the requested window to every selected sequence and, if asked,
# remove the gap characters "-" and "?" from the trimmed result.
if args.window is not None:
    LeftCoord, RightCoord = args.window
    for seq in SeqsWeWant:
        SeqLength = len(seq.seq)
        if RightCoord > SeqLength:
            print('A window', LeftCoord, '-', RightCoord, 'was specified but',
                  seq.id, 'is only', SeqLength, 'bases long. Quitting.',
                  file=sys.stderr)
            exit(1)
        # Window coordinates are 1-based and inclusive.
        TrimmedSeq = seq.seq[LeftCoord - 1:RightCoord]
        if args.gap_strip:
            TrimmedSeq = TrimmedSeq.ungap("-").ungap("?")
        seq.seq = TrimmedSeq

# Drop sequences that contain nothing but gap characters, if desired
if args.skip_blanks:
    SeqsWeWant = [seq for seq in SeqsWeWant
                  if len(seq.seq.ungap("-").ungap("?")) != 0]

SeqIO.write(SeqsWeWant, sys.stdout, "fasta")
'Cond_B_CPM_media' ])
df_final = pd.merge(df, df_media)
# Five rows with the largest mean CPM for each condition.
asc_A = pd.DataFrame(df_final.nlargest(5, 'Cond_A_CPM_media'))
asc_B = pd.DataFrame(df_final.nlargest(5, 'Cond_B_CPM_media'))
# Gene ids of both selections concatenated (condition A first).
id_gene = asc_A['gene_id'].append(asc_B['gene_id'])
gene_cond = list(id_gene)
# Write each matching sequence of `dd` to its own FASTA file named
# "gene_<running number>"; the number increases with every file written.
count = 1
for i in dd:
    for e in gene_cond:
        if i.id == e:
            arg = "gene_" + count.__str__()
            arquivo = SeqIO.write(i, arg, "fasta")
            count = count + 1
        else:
            continue
# NOTE(review): the loop above writes relative to the working directory and
# one file per match, while the list and path below assume exactly ten files
# under a fixed absolute Windows directory - confirm both agree.
arquivo_genes = [
    "gene_1", "gene_2", "gene_3", "gene_4", "gene_5", "gene_6", "gene_7",
    "gene_8", "gene_9", "gene_10"
]
for e in arquivo_genes:
    refArquivo = SeqIO.read(
        f"C:\\Users\\bia_g\\PycharmProjects\\pythonProject\\{e}", "fasta")
    # NOTE(review): `query` receives the parsed record object, not a file
    # path - verify this is what the command-line wrapper expects.
    comand_line = NcbiblastxCommandline(cmd=blastx_path,
                                        query=refArquivo,
                                        subject=dm,
                                        out=meuOutput,
# Run the Shasta assembler with the current minimum read length and
# --Align.maxTrim value, writing into "<tempo_out>Shastarun".
shasta = pipenv + "/" + shasta_os + " --input " + cible_shasta + " --Reads.minReadLength " + str(
    tailleread) + " --Align.maxTrim " + str(
        trim) + " --output " + tempo_out + "Shastarun"
os.system(shasta)
# Keep only Assembly.fasta and delete the rest of the run directory.
mv = "mv " + tempo_out + "Shastarun/Assembly.fasta " + tempo_out
os.system(mv)
rm = "rm" + " -r " + tempo_out + "Shastarun"
os.system(rm)
assembly = tempo_out + "Assembly.fasta"
# Number of contigs and total assembled length.
nbcontig = len([x for x in SeqIO.parse(assembly, "fasta")])
longueur_A = sum([len(x.seq) for x in SeqIO.parse(assembly, "fasta")])
# Only a two-contig assembly that is longer than the current reference
# length is examined further.
# NOTE(review): the nesting of the statements below was reconstructed from
# an unindented copy of the code - verify against the original file.
if nbcontig == 2 and longueur_A > longueur_ref:
    # Write each contig to its own file: "<tempo_out>1.fasta", "2.fasta".
    num_contig = 1
    for rec in SeqIO.parse(assembly, "fasta"):
        nom = tempo_out + str(num_contig) + '.fasta'
        SeqIO.write(rec, nom, "fasta")
        num_contig = num_contig + 1
    # BLAST the file named by `deux` against a database built from `un`
    # (presumably the two contig files written above - TODO confirm).
    os.system("""makeblastdb -in """ + un + """ -out """ + tempo_out +
              """target -dbtype 'nucl'""")
    os.system(
        """blastn -query """ + deux + """ -db """ + tempo_out +
        """target -out """ + tempo_out +
        """contig.fasta -outfmt "10 sstrand sseqid" -evalue 0.01""")
    # Despite its name, contig.fasta holds the BLAST hit table.
    tailletest = open(tempo_out + "contig.fasta", "r")
    tailletest2 = tailletest.readlines()
    tailletest.close()
    trim = trim + 1
    # No hit lines: the assembly length becomes the new reference length.
    if len(tailletest2) == 0:
        ref_ok = True
        longueur_ref = longueur_A
        cp1 = "cp " + un + " " + tempo_out + "1f.fasta"
def direct_blast(infile, database, outfile=None, in_type="fasta", cores=1, patent_db=None, min_aa_size=0,
                 psi_blast=False, min_bitscore=50):
    """
    direct_blast is the worker function which takes an input filename and blasts it against a target database,
    then calculates identity against the query and against some other database

    Every record of the input is handled on its own: DNA-looking records are translated, the protein is written
    to a temporary FASTA file, BLAST is run on it, the HSPs are filtered and the remaining hits are processed
    in a multiprocessing pool. One tab-separated line per processed hit is written to the output file.

    Positional Arguments:

    :param infile: str
        Input file path
    :param database: str
        Input database path. Goes directly into command line blast, requires the database name as well.

    Keyword Arguments:

    :param outfile: str
        Output file path
    :param in_type: str
        Input file type. Takes anything SeqIO can take.
    :param cores: int
        Passed to blast command line as num_threads, and number of pool processes to spawn to analyze hits
    :param patent_db: str
        Screening database path. Goes directly into command line blast, requires the database name as well.
    :param min_aa_size: int
        Minimum size of the protein in amino acids
    :param psi_blast: bool
        Flag to use psi-blast instead of blastp
    :param min_bitscore: int
        The minimum required bitscore of a BLAST high scoring pair. HSPs with lower scores will be filtered

    :raises FileNotFoundError: when ``database`` or ``patent_db`` has no ``.phd`` / ``.00.phd`` file
    """

    # Use a generic name for the outfile if it hasn't been explicitly set
    if outfile is None:
        outfile = infile + ".db.out"

    # Make sure the databases exist
    # Easier then catching it way down the line
    if not (os.path.isfile(database + ".phd") or os.path.isfile(database + ".00.phd")):
        raise FileNotFoundError("BLAST database {} not located".format(database))
    if patent_db is not None and not (os.path.isfile(patent_db + ".phd") or os.path.isfile(patent_db + ".00.phd")):
        # Report the database that is actually missing (this used to name `database`).
        raise FileNotFoundError("BLAST database {} not located".format(patent_db))

    # Set the BLAST hsp minimums
    hsp_filter = lambda hsp: hsp.aln_span > 50 and hsp.bitscore > min_bitscore

    # Open and read in the query file as SeqRecords
    # (plain "r": the "U" flag was removed in Python 3.11 and universal newlines are the default)
    with open(outfile, mode="w") as out_fh, open(infile, mode="r") as in_fh:
        for query_sequence in SeqIO.parse(in_fh, format=in_type):

            # An empty record cannot be classified or searched (and would divide by zero below)
            if len(query_sequence) == 0:
                print("Input Sequence {} is empty; skipped".format(query_sequence.id))
                continue

            # Count the DNA bases to do a lazy job determining if this is a protein or a DNA sequence
            dna = 0
            for base in ["A", "T", "G", "C", "N"]:
                dna += str(query_sequence.seq).upper().count(base)

            # Decide which blast command line arguments to use for the query sequence
            # Also set sequence alphabet

            # If the sequence looks like protein
            if dna / len(query_sequence) < 0.95:
                query_sequence.seq.alphabet = generic_protein
            # If it's probably DNA
            else:
                # Translate the sequence if it looks like DNA and use the translated sequence for downstream
                try:
                    query_sequence.seq = query_sequence.seq.translate(cds=True)
                    query_sequence.seq.alphabet = generic_protein
                except TranslationError as trans_err:
                    print("Input Sequence {} Not a CDS: {}".format(infile, trans_err.args))
                    try:
                        query_sequence.seq = query_sequence.seq.translate()
                        query_sequence.seq.alphabet = generic_protein
                    except TranslationError:
                        print("Translation Error")
                        exit(0)

            # Write the (translated) protein to a temp file to use as the query.
            # Each record gets its own single-record FASTA query, and `infile` is left untouched so
            # later messages still name the real input file.
            query_fd, query_path = tempfile.mkstemp(suffix=".fasta")
            blast_fd, blast_path = tempfile.mkstemp(suffix=".blast.xml")
            # BLAST writes its output by path, so the descriptor is not needed
            os.close(blast_fd)
            try:
                with open(query_fd, mode="w") as blast_temp_fh:
                    SeqIO.write(query_sequence, blast_temp_fh, format="fasta")

                # Run psiblast if the psi flag is set otherwise blastp
                if psi_blast:
                    blast_cmd = ["psiblast", "-db", database, "-query", query_path, "-outfmt", "5", "-out",
                                 blast_path, "-num_threads", str(cores), "-max_target_seqs", str(5000)]
                else:
                    blast_cmd = ["blastp", "-db", database, "-query", query_path, "-outfmt", "5", "-out",
                                 blast_path, "-num_threads", str(cores), "-task", "blastp", "-max_target_seqs",
                                 str(5000)]

                subprocess.call(blast_cmd)

                # Read in the blast output file as a QueryResult
                with open(blast_path, mode="r") as blast_fh:
                    try:
                        query = SearchIO.read(blast_fh, format='blast-xml')
                    except ParseError:
                        print("BLAST Command Failed")
                        exit(0)

                # Preprocess the BLAST query result with the hsp filter object
                filter_query = query.hsp_filter(hsp_filter)
                print("{} BLAST results [{} filtered]".format(len(filter_query), len(query) - len(filter_query)))

                # Pass the control arguments to the hit processor and then multiprocess hits through mp.Pool
                heavy_hitter = HitProcess(database, query_sequence, min_aa=min_aa_size, patentdb=patent_db)

                print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format("Query ID", "Hit ID", "Hit Description", "Hit Identity",
                                                          "Hit Similarity", "Patent DB Identity", "Hit Sequence"),
                      file=out_fh)

                # The pool is scoped to this query so its workers are released when the hits are done
                with multiprocessing.Pool(processes=cores) as pool:
                    # Iterate through the processing results and print them
                    for hit_id, hit_seq, hit_ident, hit_simil, patent_ident in pool.imap_unordered(
                            heavy_hitter.process_hits, filter_query):
                        if hit_ident is not None:
                            print("{}\t{}\t{}\t{}\t{}\t{}\t{}".format(query_sequence.id, hit_id,
                                                                      hit_seq.description, hit_ident, hit_simil,
                                                                      patent_ident, str(hit_seq.seq)),
                                  file=out_fh)
            finally:
                # Remove both temp files, also when the run is aborted
                for temp_path in (query_path, blast_path):
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
def write_unlabelled_seqs(seq_dict, outdir):
    """Drain ``seq_dict`` into ``<outdir>/silix/seqs.fasta``.

    Every value is removed from ``seq_dict`` and passed to ``SeqIO.write``
    in FASTA format, following the dictionary's iteration order.

    Args:
        seq_dict: Mapping whose values are handed to ``SeqIO.write``; it is
            emptied in place.
        outdir: Directory that must already contain a ``silix`` sub-directory.

    Returns:
        The same ``seq_dict`` object, with all written entries removed.
    """
    fasta_path = '{}/silix/seqs.fasta'.format(outdir)
    with open(fasta_path, 'w') as fasta_handle:
        # Snapshot the keys first: entries are popped while we walk them.
        pending_ids = tuple(seq_dict.keys())
        for seq_id in pending_ids:
            record = seq_dict.pop(seq_id)
            SeqIO.write(record, fasta_handle, 'fasta')
    return seq_dict
print("Low confidence viral predictions by VirFinder identified") return (HC_viral_predictions, LC_viral_predictions, prophage_predictions) if __name__ == "__main__": parser = argparse.ArgumentParser(description="Write fasta files with predicted viral contigs sorted in categories and putative prophages") parser.add_argument("-a", "--assemb", dest="assemb", help="Metagenomic assembly fasta file", required=True) parser.add_argument("-f", "--vfout", dest="finder", help="Absolute or relative path to VirFinder output file", required=True) parser.add_argument("-s", "--vsdir", dest="sorter", help="Absolute or relative path to directory containing VirSorter output", required=True) parser.add_argument("-o", "--outdir", dest="outdir", help="Absolute or relative path of directory where output viral prediction files should be stored (default: cwd)", default=".") if len(sys.argv) == 1: parser.print_help() else: args = parser.parse_args() viral_predictions = virus_parser(assembly_file=args.assemb, VF_output=args.finder, VS_output=args.sorter) if sum([len(x) for x in viral_predictions]) > 0: if len(viral_predictions[0]) > 0: SeqIO.write(viral_predictions[0], os.path.join(args.outdir, "High_confidence_putative_viral_contigs.fna"), "fasta") if len(viral_predictions[1]) > 0: SeqIO.write(viral_predictions[1], os.path.join(args.outdir, "Low_confidence_putative_viral_contigs.fna"), "fasta") if len(viral_predictions[2]) > 0: SeqIO.write(viral_predictions[2], os.path.join(args.outdir, "Putative_prophages.fna"), "fasta") else: print("Overall, no putative viral contigs or prophages were detected in the analysed metagenomic assembly")
help='Replace all invalid bases (not A,C,T,G,a,c,t,g,-) with "N"' ) parser = argparse.ArgumentParser() add_arguments(parser) args = parser.parse_args() #_________________________________________________ # Insert a function here, which replaces all invalid characters with N's def replace_bad_chars(alignment): '...your code here...' #_________________________________________________ # !!!!!!!!!!!!NOTE:!!!!!!!!!!!!!! You can call the input variables in the following manner: args.name_of_variable, # e.g. args.input, args.input_format, etc. # 1. read the alignment, using the AlignIO.read() function alignment=AlignIO.read() # 2. apply the function you defined above, in case the fix_invalid_characters option is activated if args.fix_invalid_characters: replace_bad_chars(alignment) # 3. write the alignment to the defined output file (args.output) using the SeqIO.write() function SeqIO.write() print('\n\nNew alignment written to file %s\n\n' %args.output)
def handle(self, *args, **options):
    """Export the entries matching a query as FASTA records or as a name list.

    Options read: ``dbname``, ``query``, ``format``, ``start`` (1-based) and
    ``end``. With a ``dbname`` the query runs against ``Bioentry``; without
    one it runs against ``Biodatabase``. Progress messages go to
    ``self.stderr`` and the exported data to ``self.stdout``.

    Raises:
        CommandError: if ``start`` is below 1 or ``end`` is negative.
    """
    # NOTE(review): block nesting was reconstructed from a flattened source
    # line -- confirm against the original layout.
    dbname = options['dbname']
    query = options['query']
    out_format = options['format']

    if options['start'] < 1:
        raise CommandError(f'start must be greater than 0, "{options["start"]}" passed')
    if options['end'] and options['end'] < 0:
        # Fix: report the offending ``end`` value (previously echoed ``start``).
        raise CommandError(f'end must be greater than 0, "{options["end"]}" passed')
    # Convert the 1-based start into a 0-based slice offset.
    options['start'] = options['start'] - 1

    # NOTE(review): any explicit format (including "fasta") ends up as "list"
    # here; only "auto" combined with a dbname yields FASTA -- confirm intent.
    if (out_format == "auto") and dbname:
        out_format = "fasta"
    else:
        out_format = "list"

    if query:
        try:
            query = json.loads(query)
        except JSONDecodeError:
            # Not JSON: treat the raw text as a name / accession lookup.
            if dbname:
                query = {"name": query}
            else:
                query = {"accession": query}

    if dbname:
        query["biodatabase__name"] = dbname
        query_manager = Bioentry.objects
    else:
        query_manager = Biodatabase.objects

    if not options['end']:
        # List output is capped at 10 entries when no end was supplied.
        if (out_format == "list") and (options['end'] is None):
            options['end'] = 10

    self.stderr.write(f"quering... {json.dumps(query)}")
    qs = query_manager.filter(**query)
    self.stderr.write(f"retreived sequences: {qs.count()}")

    seqstore = SeqStore.instance()
    seqtype = "genome"
    if dbname.endswith(Biodatabase.PROT_POSTFIX):
        # Strip the protein postfix and switch to the proteome store.
        dbname = dbname[:-len(Biodatabase.PROT_POSTFIX)]
        seqtype = "proteome"
    seq_qs = seqstore.qs(dbname, seqtype)

    qs2 = qs
    if options['start'] and options['end']:
        qs2 = qs2[options['start']:options['end']]
    elif options['start']:
        qs2 = qs2[options['start']:]
    elif options['end']:
        qs2 = qs2[:options['end']]

    # The progress total is the unsliced count, as in the original code.
    for be in self.tqdm(qs2, total=qs.count()):
        if out_format == "fasta":
            r = be.to_seq_record(seq_qs)
            bpio.write(r, self.stdout, out_format)
        else:
            self.stdout.write(be.name)

    if not dbname:
        if not options["end"]:
            options["end"] = qs.count()
        self.stderr.write(f'exported from {options["start"]} to {options["end"]} of {qs.count()}')
    self.stderr.write("finished!")
return args if __name__ == '__main__': args = parse_args() file, seq_format, fh = args.infile, args.format, None, if file: if not seq_format: found = re.search(r'(?i)(fasta|fa|fastq|fq)(.gz)?$', file) if not found: print("invalid file name suffix.\nfile name should like this: infile.[fasfa|fa|fastq|fq][.gz]", file=sys.stderr) sys.exit(1) seq_format, is_gz = found.groups() if seq_format == 'fa': seq_format = 'fasta' if seq_format == 'fq': seq_format = 'fastq' fh = gzip.open(file, 'rt') if file.endswith('.gz') else open(file, 'r') else: fh = sys.stdin seq_format = args.format for seq in SeqIO.parse(fh, seq_format): SeqIO.write([SeqRecord(seq.seq.translate(table=args.table), id=seq.id, description='')], sys.stdout, 'fasta') fh.close()
def write_clusters(eggnog_dict, seq_dict, outdir):
    """Write one FASTA file per key of ``eggnog_dict`` into ``outdir``.

    For each key, ``<outdir>/<key>.fasta`` is created and every identifier
    listed under that key is popped from ``seq_dict`` and passed to
    ``SeqIO.write`` in FASTA format.

    Args:
        eggnog_dict: Mapping of cluster name to an iterable of ``seq_dict`` keys.
        seq_dict: Mapping of identifier to the object written by ``SeqIO.write``;
            written entries are removed in place.
        outdir: Existing directory that receives the per-cluster files.

    Returns:
        The same ``seq_dict`` object, holding only the entries not written.
    """
    for nog, member_ids in eggnog_dict.items():
        cluster_path = "{}/{}.fasta".format(outdir, nog)
        with open(cluster_path, 'w') as cluster_handle:
            for member_id in member_ids:
                record = seq_dict.pop(member_id)
                SeqIO.write(record, cluster_handle, 'fasta')
    return seq_dict
df = df.append({ "id": name, "seq": sequence, "length": length }, ignore_index=True) #sort table by sequence length df.sort_values(by=['length'], ascending=False) # Save sequences remaining = [] pooled = [] for index, row in df.iterrows(): # each sequence above threshold into one file if row['length'] >= length_threshold and index + 1 <= max_sequences: print("write " + out_base + "." + str(index + 1) + ".fa") out = (SeqRecord(Seq(row['seq'], generic_dna), id=row['id'])) SeqIO.write(out, out_base + "." + str(index + 1) + ".fa", "fasta") # contigs to retain elif row['length'] >= min_length_to_retain_contig: pooled.append(SeqRecord(Seq(row['seq'], generic_dna), id=row['id'])) # remaining sequences else: remaining.append(SeqRecord(Seq(row['seq'], generic_dna), id=row['id'])) print("write " + out_base + ".pooled.fa") SeqIO.write(pooled, out_base + ".pooled.fa", "fasta") print("write " + out_base + ".remaining.fa") SeqIO.write(remaining, out_base + ".remaining.fa", "fasta")
#!/usr/bin/env python3
"""Split a Trinity FASTA file into chunk files of ``batch_size`` records."""

from Bio import SeqIO
import os

# IO
output_dir = 'output/fasta_chunks'
trinity_fasta = 'data/Trinity.fasta'

# Make sure the destination exists; otherwise the first write below fails
# with FileNotFoundError on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)

# build an index of the fasta file
record_index = SeqIO.index(trinity_fasta, 'fasta')
record_keys = list(record_index.keys())
number_of_records = len(record_index)

# write batch_size records to each fasta file; chunk numbering starts at 1
batch_size = 2000
for i, start in enumerate(range(0, number_of_records, batch_size), start=1):
    end = min(number_of_records, start + batch_size)
    file_name = ('trinity_chunk%(num)03i.fasta' % {'num': i})
    file_path = os.path.join(output_dir, file_name)
    keys_to_write = record_keys[start:end]
    # Generator: records are pulled from the index only as they are written.
    records_to_write = (record_index[x] for x in keys_to_write)
    SeqIO.write(sequences=records_to_write,
                handle=file_path,
                format='fasta')