def gather_est2genome_seqs(refseq_obj, est2genome_handle, log_line, velvet_file):
    seq_dir = log_line.split("\t")[1]
    tmp_refseq = seq_dir.split("/")[3].replace(".", "%2E")  # hardcoded in this position
    gff_file = refseq_obj.id + ".velvet_contigs.maker.output/" + seq_dir + "/" + tmp_refseq + ".gff"
    gff_handle = open(gff_file, 'r')
    for gff_line in gff_handle:
        if (re.search("est2genome", gff_line) and
                re.search("\texpressed_sequence_match\t", gff_line)):
            curr_start = int(gff_line.split("\t")[3])
            curr_stop = int(gff_line.split("\t")[4])
            curr_strand = gff_line.split("\t")[6]
            tmp_handle = open(velvet_file, 'r')
            tmp_fasta = SeqIO.to_dict(SeqIO.parse(tmp_handle, "fasta"))
            tmp_handle.close()
            if seq_dir.split("/")[3] in tmp_fasta:
                curr_record = tmp_fasta[seq_dir.split("/")[3]]
            else:
                continue
            new_seq = curr_record.seq[curr_start - 1:curr_stop]
            if curr_strand == "-":
                new_seq = curr_record.seq[curr_start - 1:curr_stop].reverse_complement()
            seqname = curr_record.id  # assumption: name the match after its source contig
            new_record = SeqRecord(new_seq, id=seqname, name=seqname, description="")
            SeqIO.write(new_record, est2genome_handle, "fasta")
def standard_test_procedure(self, cline):
    """Standard testing procedure used by all tests."""
    # Overwrite existing files.
    cline.force = True
    # Mark output files for later cleanup.
    self.add_file_to_clean(cline.outfile)
    if cline.guidetree_out:
        self.add_file_to_clean(cline.guidetree_out)

    input_records = SeqIO.to_dict(SeqIO.parse(cline.infile, "fasta"))
    self.assertEqual(str(eval(repr(cline))), str(cline))
    output, error = cline()
    self.assertTrue(not output or output.strip().startswith("CLUSTAL"))

    # Test if ClustalOmega executed successfully.
    self.assertTrue(error.strip() == ""
                    or error.startswith("WARNING: Sequence type is DNA.")
                    or error.startswith("WARNING: DNA alignment is still experimental."))

    # Check the output...
    align = AlignIO.read(cline.outfile, "clustal")
    output_records = SeqIO.to_dict(SeqIO.parse(cline.outfile, "clustal"))
    self.assertEqual(len(set(input_records.keys())), len(set(output_records.keys())))
    for record in align:
        self.assertEqual(str(record.seq), str(output_records[record.id].seq))

    # TODO - Try and parse this with Bio.Nexus?
    if cline.guidetree_out:
        self.assertTrue(os.path.isfile(cline.guidetree_out))
def load_examples_from_fasta(signal, org, data_path):
    """Load positive and negative examples for a signal from FASTA files."""
    fn_pos = "%s/%s_sig_%s_example.fa" % (data_path, signal, "pos")
    fn_neg = "%s/%s_sig_%s_example.fa" % (data_path, signal, "neg")
    print "loading: \n %s \n %s" % (fn_pos, fn_neg)

    # parse the files
    xt_pos = [str(rec.seq) for rec in SeqIO.parse(fn_pos, "fasta")]
    xt_neg = [str(rec.seq) for rec in SeqIO.parse(fn_neg, "fasta")]

    labels = [+1] * len(xt_pos) + [-1] * len(xt_neg)
    examples = xt_pos + xt_neg

    print ("organism: %s, signal %s,\t num_labels: %i,\t num_examples %i,\t num_positives: %i,\t num_negatives: %i"
           % (org, signal, len(labels), len(examples), len(xt_pos), len(xt_neg)))

    examples_shuffled, labels_shuffled = helper.coshuffle(examples, labels)
    ret = {"examples": numpy.array(examples_shuffled),
           "labels": numpy.array(labels_shuffled)}
    return ret
def filter_reads_by_length(fq1, fq2, quality_format, min_length=20):
    """
    Remove reads from a pair of fastq files that are shorter than a minimum
    length. Both ends of a read pair are removed if one end falls below the
    threshold, while the order of the reads is maintained.
    """
    logger.info("Removing reads in %s and %s that "
                "are less than %d bases." % (fq1, fq2, min_length))
    fq1_out = utils.append_stem(fq1, ".fixed")
    fq2_out = utils.append_stem(fq2, ".fixed")
    fq1_single = utils.append_stem(fq1, ".singles")
    fq2_single = utils.append_stem(fq2, ".singles")
    if all(map(utils.file_exists, [fq1_out, fq2_out, fq1_single, fq2_single])):
        return [fq1_out, fq2_out]

    fq1_in = SeqIO.parse(fq1, quality_format)
    fq2_in = SeqIO.parse(fq2, quality_format)

    with open(fq1_out, 'w') as fq1_out_handle, open(fq2_out, 'w') as fq2_out_handle, \
            open(fq1_single, 'w') as fq1_single_handle, open(fq2_single, 'w') as fq2_single_handle:
        for fq1_record, fq2_record in izip(fq1_in, fq2_in):
            if len(fq1_record.seq) >= min_length and len(fq2_record.seq) >= min_length:
                fq1_out_handle.write(fq1_record.format(quality_format))
                fq2_out_handle.write(fq2_record.format(quality_format))
            else:
                # Keep a passing end as a single if its mate was too short.
                if len(fq1_record.seq) >= min_length:
                    fq1_single_handle.write(fq1_record.format(quality_format))
                if len(fq2_record.seq) >= min_length:
                    fq2_single_handle.write(fq2_record.format(quality_format))
    return [fq1_out, fq2_out]
def loop(self, filename, format):
    original_records = list(SeqIO.parse(open(filename, "rU"), format))
    # now open a connection to load the database
    server = BioSeqDatabase.open_database(driver=DBDRIVER,
                                          user=DBUSER, passwd=DBPASSWD,
                                          host=DBHOST, db=TESTDB)
    db_name = "test_loop_%s" % filename  # new namespace!
    db = server.new_database(db_name)
    count = db.load(original_records)
    self.assertEqual(count, len(original_records))
    server.commit()
    # Now read them back...
    biosql_records = [db.lookup(name=rec.name)
                      for rec in original_records]
    # And check they agree
    self.assertTrue(compare_records(original_records, biosql_records))
    # Now write to a handle...
    handle = StringIO()
    SeqIO.write(biosql_records, handle, "gb")
    # Now read them back...
    handle.seek(0)
    new_records = list(SeqIO.parse(handle, "gb"))
    # And check they still agree
    self.assertEqual(len(new_records), len(original_records))
    for old, new in zip(original_records, new_records):
        # TODO - remove this hack once these annotations are written too:
        for key in ["comment", "references", "db_source"]:
            if key in old.annotations and key not in new.annotations:
                del old.annotations[key]
        self.assertTrue(compare_record(old, new))
    # Done
    server.close()
def test_fastq_1000(self):
    """Read and write back simple example with mixed case 1000bp read"""
    data = "@%s\n%s\n+\n%s\n" \
           % ("id descr goes here", "ACGTNncgta" * 100, "abcd!!efgh" * 100)
    handle = StringIO()
    self.assertEqual(1, SeqIO.write(SeqIO.parse(StringIO(data), "fastq"),
                                    handle, "fastq"))
    self.assertEqual(data, handle.getvalue())
def test_generated(self):
    """Write and read back odd SeqRecord objects"""
    record1 = SeqRecord(Seq("ACGT" * 500, generic_dna), id="Test",
                        description="Long " * 500,
                        letter_annotations={"phred_quality": [40, 30, 20, 10] * 500})
    record2 = SeqRecord(MutableSeq("NGGC" * 1000), id="Mut",
                        description="very " * 1000 + "long",
                        letter_annotations={"phred_quality": [0, 5, 5, 10] * 1000})
    record3 = SeqRecord(UnknownSeq(2000, character="N"), id="Unk",
                        description="l" + ("o" * 1000) + "ng",
                        letter_annotations={"phred_quality": [0, 1] * 1000})
    record4 = SeqRecord(Seq("ACGT" * 500), id="no_descr", description="", name="",
                        letter_annotations={"phred_quality": [40, 50, 60, 62] * 500})
    record5 = SeqRecord(Seq("", generic_dna), id="empty_p",
                        description="(could have been trimmed lots)",
                        letter_annotations={"phred_quality": []})
    record6 = SeqRecord(Seq(""), id="empty_s",
                        description="(could have been trimmed lots)",
                        letter_annotations={"solexa_quality": []})
    record7 = SeqRecord(Seq("ACNN" * 500), id="Test_Sol",
                        description="Long " * 500,
                        letter_annotations={"solexa_quality": [40, 30, 0, -5] * 500})
    record8 = SeqRecord(Seq("ACGT"), id="HighQual",
                        description="With very large qualities that even Sanger FASTQ can't hold!",
                        letter_annotations={"solexa_quality": [0, 10, 100, 1000]})
    # TODO - Record with no identifier?
    records = [record1, record2, record3, record4,
               record5, record6, record7, record8]
    # TODO - Have a Biopython defined "DataLossWarning"?
    warnings.simplefilter('ignore', BiopythonWarning)
    # TODO - Include phd output?
    for format in ["fasta", "fastq", "fastq-solexa", "fastq-illumina", "qual"]:
        handle = StringIO()
        SeqIO.write(records, handle, format)
        handle.seek(0)
        compare_records(records,
                        list(SeqIO.parse(handle, format)),
                        truncation_expected(format))
    warnings.filters.pop()
def test_fasta_out(self):
    """Check FASTQ to FASTA output"""
    records = SeqIO.parse("Quality/example.fastq", "fastq")
    h = StringIO()
    SeqIO.write(records, h, "fasta")
    with open("Quality/example.fasta") as expected:
        self.assertEqual(h.getvalue(), expected.read())
def test_fastq_2000(self):
    """Read and write back simple example with upper case 2000bp read"""
    data = "@%s\n%s\n+\n%s\n" \
           % ("id descr goes here", "ACGT" * 500, "!@a~" * 500)
    handle = StringIO()
    self.assertEqual(1, SeqIO.write(SeqIO.parse(StringIO(data), "fastq"),
                                    handle, "fastq"))
    self.assertEqual(data, handle.getvalue())
def blastclust_to_fasta(infname, seqfname, outdir):
    """Converts input BLASTCLUST output list to a subdirectory of FASTA files.

    Each individual FASTA file contains all sequences from a single cluster.
    The sequences matching the IDs listed in the BLASTCLUST output .lst file
    should all be found in the same file.

    Returns the output directory and a list of the files, as a tuple.
    """
    outdirname = os.path.join(outdir, "blastclust_OTUs")
    if not os.path.exists(outdirname):
        os.makedirs(outdirname)
    seqdict = SeqIO.index(seqfname, 'fasta')
    outfnames = []
    with open(infname, 'r') as fh:
        otu_id = 0
        for line in fh:
            otu_id += 1
            outfname = os.path.join(outdirname,
                                    "blastclust_OTU_%06d.fasta" % otu_id)
            SeqIO.write((seqdict[key] for key in line.split()),
                        outfname, 'fasta')
            outfnames.append(outfname)
    return (outdirname, outfnames)
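# A minimal usage sketch for blastclust_to_fasta above; the input names
# ("clusters.lst", "seqs.fasta", "results") are hypothetical placeholders.
otu_dir, otu_files = blastclust_to_fasta("clusters.lst", "seqs.fasta", "results")
print("Wrote %d cluster FASTA files to %s" % (len(otu_files), otu_dir))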
def run_pal2nal(fname_aln, fname_nuc, fname_prot):
    """
    Generate a codon alignment via PAL2NAL.

    @param fname_aln:   MSA of protein sequences in CLUSTAL format (.aln)
    @param fname_nuc:   Nucleotide sequences in FASTA format (.fasta)
    @param fname_prot:  Protein sequences in FASTA format (.fasta)
    @return: Codon alignment in CLUSTAL format (.aln), suitable for codeml
    """
    sys.stderr.write("\nSTEP: run_pal2nal(%s, %s)\n" % (fname_aln, fname_nuc))

    # Reorder fname_nuc according to the order of the proteins in fname_aln,
    # which was reordered due to CLUSTALW2. Note that the first protein in
    # each of these files remains the same as at the start; this first
    # protein is our original query protein.
    nuc_records = list(SeqIO.parse(fname_nuc, "fasta"))
    prot_records = list(SeqIO.parse(fname_prot, "fasta"))
    records_map = dict((pr.id, nr) for pr, nr in zip(prot_records, nuc_records))
    fname_nuc2 = "homologs_ordered.dna.fasta"
    with open(fname_nuc2, "w") as f:
        for record in SeqIO.parse(fname_aln, "clustal"):
            SeqIO.write(records_map[record.id], f, "fasta")

    fname_codon = "homologs.codon.aln"
    # TODO: use subprocess
    os.system("%s/pal2nal.pl %s %s -output paml > %s"
              % (bin_dir(), fname_aln, fname_nuc2, fname_codon))
    return fname_codon
def main(gbdir, outdir):
    os.makedirs(gbdir, exist_ok=True)
    os.makedirs(outdir, exist_ok=True)
    tempq = 'tempquery.fasta'
    tempdb = 'tempdb.fasta'
    for org in tqdm(Organism.objects.all()):
        # get genbank and convert to fasta
        fpath = os.path.join(gbdir, '{}.gb'.format(org.accession))
        if not os.path.isfile(fpath):
            print('\nFetching {} with accession {}'.format(
                org.name, org.accession))
            fetch(fpath)
        SeqIO.convert(fpath, 'genbank', tempdb, 'fasta')

        # get spacers of organism and convert to fasta
        spacers = Spacer.objects.filter(loci__organism=org)
        fastatext = ''.join(['>{}\n{}\n'.format(spacer.id, spacer.sequence)
                             for spacer in spacers])
        with open(tempq, 'w') as f:
            f.write(fastatext)

        # run blast and save output
        outpath = os.path.join(outdir, '{}.json'.format(org.accession))
        commandargs = ['blastn', '-query', tempq, '-subject', tempdb,
                       '-out', outpath, '-outfmt', '15']
        subprocess.run(commandargs, stdout=subprocess.DEVNULL)

    os.remove(tempq)
    os.remove(tempdb)
def needle_score(seq1, seq2, verbose=False, keep=False):
    """Get the Needleman-Wunsch score for aligning two sequences."""
    ntf = tempfile.NamedTemporaryFile
    with ntf(prefix='seq1', delete=not keep) as fh1, \
            ntf(prefix='seq2', delete=not keep) as fh2, \
            ntf(prefix='align_out') as outfile, \
            open(os.devnull) as dn:
        SeqIO.write(seq1, fh1, 'fasta')
        fh1.flush()
        SeqIO.write(seq2, fh2, 'fasta')
        fh2.flush()
        cmd = ['needle',
               '-gapopen', '0',
               '-gapextend', '0',
               '-outfile', outfile.name,
               fh1.name, fh2.name]
        if verbose:
            print(' '.join(cmd))
        subprocess.check_call(cmd, stderr=dn)
        result = outfile.read()

    pattern = re.compile(r'# Score: (.*)')
    score = pattern.search(result)
    if score is not None:
        return float(score.group(1))
    return 0
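# A minimal sketch of calling needle_score above, assuming the EMBOSS
# `needle` binary is on the PATH; the two records are invented for illustration.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec_a = SeqRecord(Seq("ACGTACGTACGT"), id="a", description="")
rec_b = SeqRecord(Seq("ACGTTCGTACGA"), id="b", description="")
print(needle_score(rec_a, rec_b, verbose=True))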
def cluster_pid(folder):
    result = []
    f_name = folder.split("/")[-1]
    try:
        genes = pd.read_csv(folder + "/report/" + f_name + "_genes.csv")
        genes = genes.loc[~(genes['cluster'].isin(['na', '0', 0])) &
                          (genes['species'] == 'Homo sapiens')]
        if genes.shape[0] > 0:
            # Parse the sequence file once, up front, instead of re-reading
            # it for every pair of accessions.
            all_seqs = list(SeqIO.parse("../cgpf_ncbi/all_seqs.fa", 'fasta'))
            for cluster in set(genes['cluster']):
                pids = []
                accs = genes.loc[genes['cluster'] == cluster, 'prot_acc'].values
                for seq1 in accs:
                    for seq2 in accs:
                        seq_1 = [x.seq for x in all_seqs
                                 if x.name.split("|")[2] == seq1]
                        seq_2 = [x.seq for x in all_seqs
                                 if x.name.split("|")[2] == seq2]
                        aln = pairwise2.align.globalxx(seq_1[0], seq_2[0])[0]
                        mean_len = (len(aln[0]) + len(aln[1])) / 2
                        pids.append(aln[2] / mean_len)
                n_genes = len(pids)
                mean_pid = np.mean(pids)
                sd_id = np.std(pids)
                result.append((cluster, n_genes, mean_pid, sd_id))
                print(cluster)
            return result
    except OSError:
        return None
def setUp(self):
    self.aln_file = [TEST_ALIGN_FILE1, TEST_ALIGN_FILE2, TEST_ALIGN_FILE3,
                     TEST_ALIGN_FILE4, TEST_ALIGN_FILE5, TEST_ALIGN_FILE6]
    alns = []
    for i in self.aln_file:
        if i[1] == 'parse':
            nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl,
                                        alphabet=codonalign.default_codon_alphabet)
        elif i[1] == 'index':
            nucl = SeqIO.index(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl,
                                        alphabet=codonalign.default_codon_alphabet,
                                        max_score=20)
        elif i[1] == 'id':
            nucl = SeqIO.parse(i[0][0], 'fasta', alphabet=IUPAC.IUPACUnambiguousDNA())
            prot = AlignIO.read(i[0][1], 'clustal', alphabet=IUPAC.protein)
            with open(i[0][2]) as handle:
                id = dict((line.split()[0], line.split()[1]) for line in handle)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore')
                caln = codonalign.build(prot, nucl, corr_dict=id,
                                        alphabet=codonalign.default_codon_alphabet)
        alns.append(caln)
        nucl.close()  # Close the indexed FASTA file
    self.alns = alns
def _validate_fasta(self, text):
    try:
        SeqIO.parse(text, 'fasta').next()
        return text
    except StopIteration:
        raise argparse.ArgumentTypeError(
            "{0} is not a FASTA file".format(text))
def main_build_markov(promotor_filename="promotor.fa",
                      genome_filename="genom.fa",
                      symbol_length=2,
                      load_cached=False,
                      save_cache=True):
    '''Builds a Markov model from the files with promoter sequences and the genome.'''
    promotor_sequences = [x for x in SeqIO.parse(promotor_filename, "fasta")]
    genome = [x for x in SeqIO.parse(genome_filename, "fasta")]

    if not load_cached:
        promotor_freqs = calc_symbol_freq(promotor_sequences)
        genome_freqs = calc_symbol_freq(genome)
        if save_cache:
            dump_obj(promotor_freqs, Dumpfiles.promotor_freq)
            dump_obj(genome_freqs, Dumpfiles.genome_freq)
    else:
        promotor_freqs = load_obj(Dumpfiles.promotor_freq)
        genome_freqs = load_obj(Dumpfiles.genome_freq)

    promotor_counts = calc_counts(promotor_sequences)
    genome_counts = calc_counts(genome)
    print promotor_counts
    promotor_freqs = fold_and_normalize(promotor_freqs[symbol_length],
                                        symbol_length,
                                        promotor_counts[symbol_length])
    genome_freqs = fold_and_normalize(genome_freqs[symbol_length],
                                      symbol_length,
                                      genome_counts[symbol_length])

    # Both frequency tables must cover exactly the same symbols.
    for k in promotor_freqs:
        assert(k in genome_freqs)
    for k in genome_freqs:
        assert(k in promotor_freqs)
    print promotor_freqs

    (markov, states) = build_markov(genome_freqs, promotor_freqs)
    return (markov, states)
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database,
                                          user=args.user, host=args.host,
                                          passwd=args.password)
    if args.database_name not in server.keys():
        server.new_database(args.database_name)
    db = server[args.database_name]

    gen = []
    if args.fasta is not None:
        for rec in SeqIO.parse(args.fasta, 'fasta'):
            gen.append(rec.name)
    elif args.genbank is not None:
        for rec in SeqIO.parse(args.genbank, 'genbank'):
            gen.append(rec.name)
    elif args.input is not None:
        with open(args.input) as fp:
            for line in fp:
                gen.append(line.rstrip())

    if args.remove:
        taxon_id = None
    else:
        taxon_id = add_new_taxonomy(server, args.new_taxons, args.taxid)

    for rec in gen:
        server.adaptor.execute(
            'update bioentry set taxon_id = %s where bioentry_id = %s',
            (taxon_id, db.adaptor.fetch_seqid_by_display_id(db.dbid, rec)))
    server.commit()
def count_overlap(filename):
    # Parse once up front; comparing every record against every other
    # only needs a single pass over the file.
    records = list(SeqIO.parse(filename, "fasta"))
    for seq_record in records:
        for seq_record_1 in records:
            s1 = seq_record.seq
            s2 = seq_record_1.seq
            if s1 != s2 and s1[-3:] == s2[0:3]:
                print(seq_record.id + " " + seq_record_1.id)
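# Example invocation of count_overlap, assuming a hypothetical reads file
# "reads.fasta"; prints one "id1 id2" pair per 3-base suffix/prefix overlap
# (Rosalind GRPH-style output).
count_overlap("reads.fasta")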
def _get_seq_dict(self):
    """Internal reusable function to get the sequence dictionary."""
    seq_handle = open(self._test_seq_file)
    seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    seq_handle.close()
    return seq_dict
def illumina2sangerFq(inputfile):
    help(SeqIO.convert)  # debugging aid: show the convert() docs
    filename = inputfile[:-3] + '.fastq'  # swap the .fq extension
    SeqIO.convert(inputfile, "fastq-illumina", filename, "fastq")
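# Example, assuming a hypothetical "reads.fq" in Illumina 1.3+ encoding:
# this writes "reads.fastq" alongside it with Sanger-scaled qualities.
illumina2sangerFq("reads.fq")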
def CutOutDomain(coords, filename, header=False, column_id=0, column_start=8, column_stop=9):
    """Cut the domains defined by the start/stop coordinates in `coords` out of
    the matching records in `filename`, and write them to a new FASTA file."""
    from Bio import SeqIO
    fh = open(coords)
    seqfile = open(filename)
    Towrite = []
    CoordIDDic = {}
    if header == True:
        print 'header set to True, first line of %s will be ignored' % coords
        skip_header = fh.readline()
    else:
        print 'header not set to True, first line of %s will be processed' % coords

    # First collect the coordinates for each ID...
    for unformatedLine in fh:
        l = unformatedLine.replace('\xa0', '').strip().split(',')
        if l[column_id] not in CoordIDDic:
            CoordIDDic[l[column_id]] = l[column_start], l[column_stop]

    # ...then make a single pass over the sequence file.
    for s in SeqIO.parse(seqfile, 'fasta'):
        if s.id in CoordIDDic:
            start = (int(CoordIDDic.get(s.id)[0]) - 1)
            stop = int(CoordIDDic.get(s.id)[1])
            s.id = s.id + '_%s_%s' % ((start + 1), stop)
            Towrite.append(s[start:stop])

    Output = open('CutOutdomain_%s' % filename, 'w')
    SeqIO.write(Towrite, Output, 'fasta')
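# Hypothetical call to CutOutDomain: "domains.csv" would hold comma-separated
# rows with the sequence ID in column 0 and 1-based start/stop coordinates in
# columns 8 and 9, matching the defaults above.
CutOutDomain("domains.csv", "proteins.fasta", header=True)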
def frameshift_writer(contigs, file):
    sys.stderr.write("[predict] writing frameshifts...")
    seqs = [SeqRecord(seq=c.seq, id=c.id, description=c.description)
            for c in contigs.values() if c.annotation['majority_frameshift']]
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
def no_relatives_writer(contigs, file):
    sys.stderr.write("[predict] writing contigs with no relatives...")
    seqs = [SeqRecord(seq=c.seq, id=c.id, description=c.description)
            for c in contigs.values() if c.annotation['num_relatives'] == 0]
    SeqIO.write(seqs, file, "fasta")
    file.close()
    sys.stderr.write("\tdone.\n")
def __format__(self, format_spec):
    """Returns the record as a string in the specified file format.

    This method supports the python format() function added in
    Python 2.6/3.0. The format_spec should be a lower case string
    supported by Bio.SeqIO as an output file format. See also the
    SeqRecord's format() method.
    """
    if not format_spec:
        # Follow python convention and default to using __str__
        return str(self)
    from Bio import SeqIO
    if format_spec in SeqIO._BinaryFormats:
        # Return bytes on Python 3
        try:
            # This is in Python 2.6+, but we need it on Python 3
            from io import BytesIO
            handle = BytesIO()
        except ImportError:
            # Must be on Python 2.5 or older
            from StringIO import StringIO
            handle = StringIO()
    else:
        from StringIO import StringIO
        handle = StringIO()
    SeqIO.write(self, handle, format_spec)
    return handle.getvalue()
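# How the __format__ hook above is exercised via the built-in format();
# this mirrors Biopython's documented SeqRecord behaviour.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

rec = SeqRecord(Seq("ACGT"), id="demo", description="")
print(format(rec, "fasta"))  # '>demo\nACGT\n'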
def main():
    if len(sys.argv) != 4:
        print "Please provide a file, the file format, and the desired file format"
        sys.exit(1)
    else:
        f = sys.argv[1]
        fout = "".join(f.split('.')[:-1])
        formatin = sys.argv[2]
        formatout = sys.argv[3]
        if formatout == 'nexus':
            AlignIO.convert(f, formatin, fout + '.' + formatout, formatout,
                            alphabet=IUPAC.ambiguous_dna)
        elif formatout == 'mega':
            handle = open(f, "rU")
            record_dict = SeqIO.to_dict(SeqIO.parse(handle, "phylip-relaxed"))
            handle.close()
            outfile = open(fout + '.' + formatout, 'w')
            outfile.write('#mega' + "\n")
            outfile.write('!Title Mytitle;' + "\n")
            outfile.write('!Format DataType=DNA indel=-;' + "\n\n")
            for n in record_dict:
                outfile.write('#' + n + "\n")
                newseq = wrap(str(record_dict[n].seq), 60)
                for s in newseq:
                    outfile.write(s + "\n")
            outfile.close()
        else:
            AlignIO.convert(f, formatin, fout + '.' + formatout, formatout)
def not_t_full_celegans(self):
    """Test the full C. elegans chromosome and GFF files.

    This is used to test GFF on large files and is not run as a
    standard test. You will need to download the files and adjust
    the paths to run this.
    """
    # read the sequence information
    seq_file = os.path.join(self._full_dir, "c_elegans.WS199.dna.fa")
    gff_file = os.path.join(self._full_dir, "c_elegans.WS199.gff3")
    seq_handle = open(seq_file)
    seq_dict = SeqIO.to_dict(SeqIO.parse(seq_handle, "fasta"))
    seq_handle.close()
    #with open(gff_file) as gff_handle:
    #    possible_limits = feature_adder.available_limits(gff_handle)
    #    pprint.pprint(possible_limits)
    rnai_types = [('Orfeome', 'PCR_product'),
                  ('GenePair_STS', 'PCR_product'),
                  ('Promoterome', 'PCR_product')]
    gene_types = [('Non_coding_transcript', 'gene'),
                  ('Coding_transcript', 'gene'),
                  ('Coding_transcript', 'mRNA'),
                  ('Coding_transcript', 'CDS')]
    limit_info = dict(gff_source_type=rnai_types + gene_types)
    for rec in GFF.parse(gff_file, seq_dict, limit_info=limit_info):
        pass
def splitFastaFile(infile, informat, outdir):
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    for record in SeqIO.parse(open(infile), informat):
        iid = record.id
        f_out = os.path.join(outdir, iid + '.fasta')
        SeqIO.write([record], open(f_out, 'w'), "fasta")
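# Example: split every record of a hypothetical "contigs.fasta" into one
# FASTA file per record under "split/".
splitFastaFile("contigs.fasta", "fasta", "split")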
def check_convert_fails(in_filename, in_format, out_format, alphabet=None):
    qual_truncate = truncation_expected(out_format)
    # We want the SAME error message from parse/write as convert!
    err1 = None
    try:
        records = list(SeqIO.parse(in_filename, in_format, alphabet))
        handle = StringIO()
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.write(records, handle, out_format)
        if qual_truncate:
            warnings.filters.pop()
        handle.seek(0)
        assert False, "Parse or write should have failed!"
    except ValueError as err:
        err1 = err
    # Now do the conversion...
    try:
        handle2 = StringIO()
        if qual_truncate:
            warnings.simplefilter('ignore', UserWarning)
        SeqIO.convert(in_filename, in_format, handle2, out_format, alphabet)
        if qual_truncate:
            warnings.filters.pop()
        assert False, "Convert should have failed!"
    except ValueError as err2:
        assert str(err1) == str(err2), \
            "Different failures, parse/write:\n%s\nconvert:\n%s" \
            % (err1, err2)
def test_acba_annot(self):
    replicon_filename = 'acba.007.p01.13'
    replicon_id = 'ACBA.007.P01_13'
    command = "integron_finder --outdir {out_dir} --func-annot --path-func-annot {annot_bank} --promoter-attI " \
              "--gbk --keep-tmp " \
              "{replicon}".format(out_dir=self.out_dir,
                                  annot_bank=self.resfams_dir,
                                  replicon=self.find_data(
                                      os.path.join('Replicons', '{}.fst'.format(replicon_filename))))
    with self.catch_io(out=True, err=False):
        main(command.split()[1:], loglevel='WARNING')
    result_dir = os.path.join(self.out_dir,
                              'Results_Integron_Finder_{}'.format(replicon_filename))

    gbk = '{}.gbk'.format(replicon_id)
    expected_gbk = self.find_data(
        os.path.join('Results_Integron_Finder_{}.annot'.format(replicon_filename), gbk))
    gbk_test = os.path.join(result_dir, gbk)
    expected_gbk = SeqIO.read(expected_gbk, 'gb')
    gbk_test = SeqIO.read(gbk_test, 'gb')
    self.assertSeqRecordEqual(expected_gbk, gbk_test)

    output_filename = '{}.integrons'.format(replicon_filename)
    expected_result_path = self.find_data(
        os.path.join('Results_Integron_Finder_{}.annot'.format(replicon_filename), output_filename))
    test_result_path = os.path.join(result_dir, output_filename)
    self.assertIntegronResultEqual(expected_result_path, test_result_path)

    output_filename = os.path.join('tmp_{}'.format(replicon_id),
                                   replicon_id + '_Resfams_fa_table.res')
    expected_result_path = self.find_data(
        os.path.join('Results_Integron_Finder_{}.annot'.format(replicon_filename), output_filename))
    test_result_path = os.path.join(result_dir, output_filename)
    self.assertHmmEqual(expected_result_path, test_result_path)
def Screen_seqs_against_fasta(record_fasta, ref_fasta, word_size=17, allowed_hits=0,
                              exclude_names=_adaptor_site_names,
                              check_rc=True, save=False, save_folder=None,
                              save_name=None, overwrite=False,
                              return_kept_flag=False, verbose=True):
    """Function to screen sequences against a given fasta file
    Inputs:
        record_fasta: fasta filename or list of SeqRecord, str or list
        ref_fasta: filename for reference fasta file to screen against, string of file path
        word_size: word_size used for probe screening, int (default: 17)
        allowed_hits: allowed hits for one probe in the fasta, int (default: 0)
        exclude_names: list of names to be excluded, list (default: _adaptor_site_names)
        check_rc: whether check reverse-complement of the probe, bool (default: True)
        save: whether save result probe reports, bool (default: False)
        save_folder: folder to save selected probes, string of path (default: None, which means +'_filtered')
        overwrite: whether overwrite existing result probe reports, bool (default: False)
        return_kept_flag: whether return flags for whether keeping the record, bool (default: False)
        verbose: say something!, bool (default: True)
    """
    ## Check inputs
    if verbose:
        print(f"- Screen sequences against given fasta file:{ref_fasta}")
    # load record-fasta
    if isinstance(record_fasta, str):
        with open(record_fasta, 'r') as _handle:
            _records = []
            for _record in SeqIO.parse(_handle, "fasta"):
                _records.append(_record)
    elif isinstance(record_fasta, list):
        _records = record_fasta
    if verbose:
        print(f"-- {len(_records)} sequences loaded.")
    if not os.path.isfile(ref_fasta):
        raise IOError(f"Reference fasta:{ref_fasta} is not a file.")
    word_size = int(word_size)
    allowed_hits = int(allowed_hits)
    if save_folder is None:
        if isinstance(record_fasta, str):
            save_folder = os.path.dirname(record_fasta)
        else:
            save_folder = os.path.dirname(ref_fasta)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
        if verbose:
            print(f"-- create {save_folder} to store filtered probes")

    ## construct table for ref_fasta
    if verbose:
        print(f"-- constructing reference table for fasta file")
    _ref_names, _ref_seqs = ld.fastaread(ref_fasta, force_upper=True)
    # filter sequences by given reference name
    _kept_ref_seqs = []
    for _n, _s in zip(_ref_names, _ref_seqs):
        if _n.split(' ')[0] in exclude_names:
            continue
        else:
            _kept_ref_seqs.append(_s)
    _ref_table = ld.OTmap(_kept_ref_seqs, word_size, use_kmer=True)

    ## filter records
    if check_rc:
        _hits = [_ref_table.get(str(_r.seq), rc=True)
                 + _ref_table.get(str(_r.seq), rc=False) for _r in _records]
    else:
        _hits = [_ref_table.get(str(_r.seq), rc=False) for _r in _records]
    # filter
    _kept_records = [_r for _r, _h in zip(_records, _hits) if _h <= allowed_hits]
    if return_kept_flag:
        _kept_flags = [_h <= allowed_hits for _h in _hits]
    if verbose:
        print(f"-- {len(_kept_records)} sequences kept by allowing hits:{allowed_hits}")

    ## Save
    if save:
        if save_name is None and not isinstance(record_fasta, str):
            print(f"Save name not given in either save_name kwd or record_fasta, skip.")
        else:
            if save_name is None:
                save_name = os.path.basename(record_fasta)
            if '.fasta' not in save_name:
                save_name += '.fasta'
            save_filename = os.path.join(save_folder, save_name)
            with open(save_filename, 'w') as _output_handle:
                if verbose:
                    print(f"-- saving {len(_kept_records)} kept records in file:{save_filename}")
                SeqIO.write(_kept_records, _output_handle, "fasta")

    if return_kept_flag:
        return _kept_records, np.array(_kept_flags, dtype=bool)
    else:
        return _kept_records
import os
import random
import sys

from Bio import SeqIO

file_in = "all_4k_prediction.out"
result = open(file_in, 'r')
contig_list = set()
for line in result:
    line = line.rstrip()
    fields = line.split()
    if float(fields[2]) >= 0.99995837748:
        contig_list.add(fields[0])

input_seq_iterator = SeqIO.parse(open("all_4k_contigs.fa", "rU"), "fasta")
long_list = []
for record in input_seq_iterator:
    # print record.name
    # print record.id
    # exit()
    if record.name in contig_list:
        long_list.append(record)

output_handle = open("all_4k_contigs_top1k.fa", "w")
SeqIO.write(long_list, output_handle, "fasta")
output_handle.close()
# Takes a FASTA file and the corresponding FASTA from RNAcentral and merges them into a bank file
from Bio import SeqIO
import sys

rna_path = sys.argv[1]
central_path = sys.argv[2]
no_black = True

records = list(SeqIO.parse(rna_path, "fasta"))
seconds = list(SeqIO.parse(central_path, "fasta"))
used_seqs = set()
for id in range(len(records)):
    is_black = True
    record = records[id]
    l = len(record.seq)
    if no_black:
        for el in seconds[id].seq[l:l + l]:
            if el != ".":
                is_black = False
    if (record.seq not in used_seqs) and not (no_black and is_black):
        # print(id)
        used_seqs.add(record.seq)
        print("# File", record.id)
        print("# External source : RNACentral")
        print("# Type : ?")
        print("# Length :", l)
        print("# Description :", ' '.join(record.description.split(" ")[1:]))
        print()
        print(record.seq.transcribe())
        print(seconds[id].seq[l:l + l])
        print()
def getRevComp(file):
    count = 0
    for record in SeqIO.parse(file, 'fasta'):
        if record.seq.reverse_complement() == record.seq:
            count += 1
    print(count)
import sys

from Bio import SeqIO

_p = "/home/ksimmon/reference/ard/"

sys.stderr.write("Retrieving antibiotic resistance genes\n")
_l = set([])
descriptions = {}
for i in open(_p + "categories.txt"):
    v = i.strip().split("\t")
    name = ".".join(v[0].split(".")[:-1])
    descriptions.update({name: v})

aro_tags = {}
for i in open(_p + "AROtags.txt"):
    v = i.strip().split("\t")
    #print v
    aro_tags.update({v[2]: v[1]})
print aro_tags

for s in SeqIO.parse(_p + "ARmeta-genes.fa", "fasta"):
    id = s.description.split(" ")[0]
    species = s.description[s.description.rfind("[") + 1:s.description.rfind("]")]
    aro_tag = [i.split(" ")[0] for i in s.description.split(". ")
               if "ARO:" in i and "ARO:1000001" not in i]
    print id, species, descriptions[id][1], ",".join([aro_tags[tag] for tag in aro_tag])
# Converting hits to nucleotide coordinates
nucleotide_blast = {x: [] for x in gene_ids if x not in blast_missed}
for gene_id in nucleotide_blast:
    nucleotide_blast[gene_id] = \
        convert_coord_dict(features[gene_id], coordinate_sets[gene_id])

# Extract the gene sequences, if args.f is set, and export data
if args.f:
    print('Loading sequences...', file=stderr)
    # Import Biopython only here, so the script still runs without it
    # when no FASTA is supplied.
    from Bio import SeqIO
    processed = set()
    with open(args.f + '.genes', mode='w+') as gene_fasta:
        tsv = open(args.e, mode='w')
        for record in SeqIO.parse(open(args.f), 'fasta'):
            if record.id in features_by_source:
                for feature in features_by_source[record.id]:
                    processed.add(feature.get_id_prefix())
                    segment = record[feature.start - args.flank_size:
                                     feature.end + args.flank_size]
                    if feature.strand == '-':
                        segment = segment.reverse_complement()
                    segment.id = feature.get_id_prefix()
                    segment.description = ''
                    try:
                        blast_regions = deepcopy(
                            nucleotide_blast[feature.get_id_prefix()])
                    except KeyError:
                        continue
def main(args):
    server = BioSeqDatabase.open_database(driver=args.driver, db=args.database,
                                          user=args.user, host=args.host,
                                          passwd=args.password)
    tax_name = False
    try:
        ncbi_tax = int(args.taxid)
    except ValueError:
        tax_name = True

    if not tax_name:
        print("interpreting as an NCBI taxon ID...", file=sys.stderr)
        taxon_id_lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
            "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
            "(SELECT DISTINCT include.taxon_id FROM taxon "\
            "INNER JOIN taxon as include ON (include.left_value "\
            "BETWEEN taxon.left_value AND taxon.right_value) "\
            "WHERE taxon.ncbi_taxon_id = %s AND include.right_value = include.left_value + 1)"
        rows = server.adaptor.execute_and_fetchall(taxon_id_lookup_sql, (ncbi_tax,))
    else:
        print("interpreting as a taxon name...", file=sys.stderr)
        taxon_name_lookup_sql = "SELECT bioentry_id, taxon_id, biodatabase.name FROM bioentry JOIN "\
            "biodatabase USING(biodatabase_id) WHERE taxon_id IN "\
            "(SELECT DISTINCT include.taxon_id FROM taxon "\
            "INNER JOIN taxon as include ON (include.left_value "\
            "BETWEEN taxon.left_value AND taxon.right_value) "\
            "WHERE taxon.taxon_id IN (SELECT taxon_id FROM taxon_name "\
            "WHERE name like %s) AND include.right_value = include.left_value + 1)"
        rows = server.adaptor.execute_and_fetchall(taxon_name_lookup_sql, (args.taxid,))

    if args.feature_type is not None:
        types = args.feature_type
    elif args.output_format == 'feat-prot':
        types = ['CDS']
    elif args.output_format == 'feat-nucl':
        types = ['CDS', 'rRNA', 'tRNA']

    dbids = {}
    for row in rows:
        dbids[(row[0], row[2])] = row[1]

    files = {}
    taxid_to_dbids = {}
    if args.split_species:
        taxon_file_mapping = {}
        for k, v in dbids.items():
            tname = server.adaptor.execute_and_fetch_col0(
                "SELECT name from taxon_name where taxon_id = %s and name_class = %s",
                (v, 'scientific name'))[0]
            tname = tname.replace(' ', '_')
            if args.output_format == 'gb':
                tname += '.gb'
            elif args.output_format == 'feat-prot':
                tname += '.faa'
            else:
                tname += '.fna'
            files[v] = tname
            taxid_to_dbids.setdefault(v, []).append(k)

    if args.split_species:
        # got to save all of the records before printing them out
        outdata = {}
        for taxid, dbid_list in taxid_to_dbids.items():
            for dbid, dbname in dbid_list:
                db = server[dbname]
                seq_rec = db[dbid]
                outdata.setdefault(taxid, []).append(seq_rec)

        for taxid, dbrecs in outdata.items():
            with open(files[taxid], 'w') as fp:
                if 'feat' in args.output_format:
                    for dbrec in dbrecs:
                        extract_feature(dbrec, args.output_format, fp)
                else:
                    SeqIO.write(dbrecs, fp, args.output_format)
    else:
        if args.output_format == 'feat-prot':
            extract_feature_sql(server,
                                get_seqfeature_ids_for_bioseqs(server, [x[0] for x in dbids.keys()]),
                                type=types, translate=True)
        elif args.output_format == 'feat-nucl':
            extract_feature_sql(server,
                                get_seqfeature_ids_for_bioseqs(server, [x[0] for x in dbids.keys()]),
                                type=types)
        else:
            for (dbid, dbname), taxid in dbids.items():
                db = server[dbname]
                try:
                    dbrec = db[dbid]
                    SeqIO.write(dbrec, sys.stdout, args.output_format)
                except KeyError:
                    pass
def Check_adaptors_against_fasta(readout_fasta, adaptor_site_fasta, ref_fasta,
                                 word_size=11, allowed_hits=0,
                                 exclude_names=_adaptor_site_names,
                                 check_rc=True, save=False,
                                 save_folder=_readout_folder, save_name=None,
                                 save_postfix='_kept', save_adaptors=False,
                                 overwrite=False, verbose=True):
    """Function to check adaptors against a list of fasta files, until satisfying matches are found
    Inputs:
        readout_fasta, adaptor_site_fasta, ref_fasta plus screening and saving options as above
    Outputs:
        _kept_readouts: list of SeqRecords of the readouts that were kept
    """
    ## check inputs
    if verbose:
        print(f"- Check readouts->adaptors against fasta")
    # readout_fasta
    if not isinstance(readout_fasta, str):
        raise TypeError(f"Wrong input type of readout_fasta:{readout_fasta}")
    elif not os.path.isfile(readout_fasta):
        raise IOError(f"Input file readout_fasta:{readout_fasta} not exist, exit!")
    # adaptor_site_fasta
    if not isinstance(adaptor_site_fasta, str):
        raise TypeError(f"Wrong input type of adaptor_site_fasta:{adaptor_site_fasta}")
    elif not os.path.isfile(adaptor_site_fasta):
        raise IOError(f"Input file adaptor_site_fasta:{adaptor_site_fasta} not exist, exit!")
    # ref_fasta
    if isinstance(ref_fasta, str):
        ref_fasta = [ref_fasta]
    if not isinstance(ref_fasta, list):
        raise TypeError(f"ref_fasta should be either one filename or list of filenames")
    for _fl in ref_fasta:
        if not os.path.isfile(_fl):
            raise IOError(f"input ref_fasta file:{_fl} not exist, exit.")
    # save etc.
    if save_name is None:
        save_name = os.path.basename(readout_fasta).replace('.fasta', save_postfix + '.fasta')
    save_filename = os.path.join(save_folder, save_name)

    # load readouts
    with open(readout_fasta, 'r') as _handle:
        readouts = []
        for _record in SeqIO.parse(_handle, "fasta"):
            readouts.append(_record)
    if verbose:
        print(f"-- {len(readouts)} readouts loaded")

    # initialize adaptor selection flags; iterate until no readout is dropped
    _adaptor_flags = []
    while len(_adaptor_flags) != len(readouts):
        # generate current adaptors
        _adaptors = Generate_adaptors(readouts, adaptor_site_fasta)
        # update whether each adaptor is kept
        _adaptor_flags = np.ones(len(_adaptors), dtype=bool)
        for _fl in ref_fasta:
            _, _fl_kept = Screen_seqs_against_fasta(_adaptors, _fl,
                                                    word_size=word_size,
                                                    allowed_hits=allowed_hits,
                                                    return_kept_flag=True,
                                                    verbose=False)
            _adaptor_flags *= _fl_kept
        readouts = [_r for _r, _f in zip(readouts, _adaptor_flags) if _f]
    if verbose:
        print(f"-- {len(readouts)} readouts are kept.")

    if save:
        with open(save_filename, 'w') as _output_handle:
            if verbose:
                print(f"-- saving filtered readouts to file: {save_filename}")
            SeqIO.write(readouts, _output_handle, "fasta")
        if save_adaptors:
            _adaptors = Generate_adaptors(readouts, adaptor_site_fasta)
            adaptor_save_filename = save_filename.replace('.fasta', '_adaptor.fasta')
            if verbose:
                print(f"-- saving corresponding adaptors to file: {adaptor_save_filename}")
            with open(adaptor_save_filename, 'w') as _output_handle:
                SeqIO.write(_adaptors, _output_handle, "fasta")

    return readouts
            continue
        ctg1 = sline[0].split("_")[0].strip("+").strip("-")
        ctg2 = sline[0].split("_")[1].strip("+").strip("-")
        ori1 = sline[0].split("_")[0][0]
        ori2 = sline[0].split("_")[1][0]
        distance = float(sline[1])
        dist = int(distance)

if args.blacklistfile:
    blacklist = {}
    with open(args.blacklistfile) as f:
        for line in f:
            sline = line.split()
            blacklist[sline[0]] = sline[1]

for read in SeqIO.parse(args.contigfile, "fasta"):
    contigs[read.id] = len(read.seq)
print("Nr. of scaffolds: " + str(len(contigs)))


class Node:
    segments = {}  # id1: (lc1, rc1)
    sizes_nc = []  # sizes of contigs that determine the average (that were Not Cut)

    def get_right_nb(self):
        pass

    def get_distance_table(self):
        pass
def Search_Candidates(source_readout_file, total_cand=200,
                      existing_readout_file='cand_readouts.fasta',
                      readout_folder=_readout_folder,
                      GC_percent=[0.4, 0.6], max_consecutive=4,
                      max_rep=6, C_percent=[0.2, 0.28],
                      blast_hsp_thres=10.0,
                      save_name='selected_candidates.fasta',
                      verbose=True):
    """Function to search readout sequences in a given pool compared with existing readouts
    Inputs:
        source_readout_file: filename for readout sequence pool, string (should be .fasta)
        total_cand: number of candidates we hope to generate, int (default: 200)
        existing_readout_file: filename for existing readouts, string (should be .fasta)
        readout_folder: folder to store readout information, string (default: globally given)
        GC_percent: whether check GC content, list of two values or False ([0.4, 0.6])
        max_consecutive: maximum allowed consecutive bases, int (4)
        max_rep: maximum replicated sequence in a readout allowed, int (6)
        C_percent: percentage of base C, list of two values or False ([0.2, 0.28])
        blast_hsp_thres: threshold for blast hsp, no hsp larger than this allowed, int (10)
        blast_ref: file basename for fasta file of existing readouts in readout_folder, used for blast
        verbose: say something!, bool (default: True)
    Outputs:
        _cand_records: list of Bio.SeqRecord.SeqRecord objects
    """
    ## check input files
    if not os.path.isfile(source_readout_file):
        source_readout_file = os.path.join(readout_folder, source_readout_file)
        if not os.path.isfile(source_readout_file):
            raise IOError(f"Wrong input source readout file:{source_readout_file}, not exist.")
    elif '.fasta' not in source_readout_file:
        raise IOError(f"Wrong input file type for {source_readout_file}")
    if not os.path.isfile(existing_readout_file):
        existing_readout_file = os.path.join(readout_folder, existing_readout_file)
        if not os.path.isfile(existing_readout_file):
            raise IOError(f"Wrong input source readout file:{existing_readout_file}, not exist.")
    elif '.fasta' not in existing_readout_file:
        raise IOError(f"Wrong input file type for {existing_readout_file}")

    # load candidate sequences, filter, and start looping
    if verbose:
        print(f"- Start selecting readout candidates from {source_readout_file},\n\tfiltering with {existing_readout_file}")
    _cand_records = []
    _ct = 0
    with open(source_readout_file, "rU") as _handle:
        for _record in SeqIO.parse(_handle, "fasta"):
            if len(_cand_records) >= total_cand:
                if verbose:
                    print(f"-- {total_cand} new candidates acquired, stop iteration.")
                break
            if verbose:
                print(f"--- processing: {_record.seq}")
            for i in range(32):
                _new_seq = Extend_Readout(_record.seq)
                _keep = Filter_Readout(_new_seq, GC_percent=GC_percent,
                                       max_consecutive=max_consecutive,
                                       max_rep=max_rep, C_percent=C_percent,
                                       blast_hsp_thres=blast_hsp_thres,
                                       readout_folder=readout_folder,
                                       blast_ref=os.path.basename(existing_readout_file),
                                       verbose=False)
                if _keep:
                    _kept_record = SeqRecord(_new_seq, id='cand_' + str(_ct + 1),
                                             description='30mer_candidate')
                    _cand_records.append(_kept_record)
                    if verbose:
                        print(f"--- candidate:{_ct} {_new_seq} saved")
                    # Append to the existing readouts so later screens see it too.
                    with open(existing_readout_file, "a") as _output_handle:
                        SeqIO.write(_kept_record, _output_handle, "fasta")
                    _ct += 1
                    break
                else:
                    break

    # after selection, save selected candidates
    _save_filename = os.path.join(readout_folder, save_name)
    with open(_save_filename, 'w') as _output_handle:
        if verbose:
            print(f"-- saving candidate readouts into file: {_save_filename}")
        SeqIO.write(_cand_records, _output_handle, "fasta")
    return _cand_records
from Bio import SeqIO

records = list(SeqIO.parse("sequence.fasta", "fasta"))
seq = str(records[0].seq[0:])
seqOcur = {}
i = 0
j = 37
seqLen = len(seq)

# Build a dictionary keyed by subsequence, with values counting how many
# times each subsequence appears.
while j <= seqLen:
    if seq[i:j] in seqOcur:
        seqOcur[seq[i:j]] += 1
    else:
        seqOcur[seq[i:j]] = 1
    i += 1
    j += 1

# Print the first 100 different sequences with their respective counts.
print(len(seqOcur), "different subsequences of size 37 were found. \n\n")
limit = 1
print("The first 100 sequences with their respective count:")
for key, value in seqOcur.items():
    if limit == 101:
        break
    print("Sequence:", key, "Count:", value)
    limit += 1
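# The same 37-mer tally as a compact alternative using collections.Counter;
# equivalent to the manual dictionary built above.
from collections import Counter

kmers = Counter(seq[i:i + 37] for i in range(seqLen - 37 + 1))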
elif output_entry != 'X':
    output_figure = output_entry + '_output_figure'
    output_sequence = output_entry + '_output_clean.fasta'
    output_rejected = output_entry + '_output_rejected.fasta'
    output_tabular = output_entry + '_output_analysis.csv'
    output_full_table = output_entry + '_full_comparison_table.csv'

# Start timer
start_time = time.time()

# Initialize
alignment_record_name_list = []
for record in SeqIO.parse(input_sequence, "fasta"):
    alignment_record_name_list.append(record.name)
depth_of_alignment = len(alignment_record_name_list)
record_sequence_trial_results = pd.DataFrame(alignment_record_name_list,
                                             columns=['Accession'])
if number_in_small_test == 0:
    number_in_small_test = depth_of_alignment
if number_in_small_test == depth_of_alignment:
    min_trials_for_each_sequence = 1

print("Analyzing '" + input_sequence + "'.")
print('Flags are --IQR_coefficient: ' +
import gzip
import sys

from Bio import SeqIO

fasta = gzip.open(sys.argv[1], 'rt')  # text mode, so SeqIO gets strings
out_file = sys.argv[2]
o = open(out_file, 'w')

for i in SeqIO.parse(fasta, 'fasta'):
    name = i.id
    o.write(str(name) + '\n')

fasta.close()
o.close()

# usage example: python get_ids.py calJac3.fa.masked.gz calJac3.fa.ids.txt
    np.save(outlabel, train_label)


if __name__ == '__main__':
    infile = sys.argv[1]
    itr1 = int(sys.argv[2])
    outpath = sys.argv[3]
    if outpath[-1] == "/":
        outpath = outpath[:-1]

    ###### read fasta file #######
    dataset = []
    out = ""
    for record in SeqIO.parse(infile, "fasta"):
        id_part = record.id
        id_parts = id_part.split(",")
        seq_part = str(record.seq.upper())
        # dataset: [[gene name, gene label (mi=0, sno=1, t=2), sequence], ...]
        dataset = dataset + [[id_parts[0], int(id_parts[1]), seq_part]]
        out += id_parts[0] + ":" + id_parts[1] + "\n"
    f = open(outpath + "/genelabel.txt", "w")
    f.write(out)
    f.close()
    ##############################
    make_pairFASTA(dataset, itr1, outpath)
# Python code to count the number of reverse-complement palindromes in a FASTA file
from Bio import SeqIO
from Bio.Seq import Seq

seq = []
cnt = 0
response = 0


def reverse_comp(nucleotide):
    """Return 1 if the sequence equals its own reverse complement, else 0."""
    flag = 0
    revcomp = nucleotide[::-1].complement()
    if revcomp == nucleotide:
        flag = 1
    return flag


for record in SeqIO.parse('data/rosalind_rvco.txt', 'fasta'):
    seq.append(record.seq)

for i in range(0, len(seq)):
    response = reverse_comp(seq[i])
    if response == 1:
        cnt += 1

print(cnt)
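# The same count written as a single generator expression over the parsed
# records, relying on Seq.reverse_complement() as above.
from Bio import SeqIO

n = sum(1 for record in SeqIO.parse('data/rosalind_rvco.txt', 'fasta')
        if record.seq.reverse_complement() == record.seq)
print(n)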
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# required module imports
import httplib2, urllib, json, sys, StringIO
from Bio import SeqIO

# setup http object
http = httplib2.Http(".cache")

iterations = 1
if len(sys.argv) == 2:
    iterations = int(sys.argv[1])

for x in range(0, iterations):
    print str(x)
    # Get gene genomic seq
    resp, content = http.request("http://127.0.0.1:3000/sequence/id/ENSG00000139618.fasta",
                                 method="GET",
                                 headers={"Content-Type": "text/plain"})
    # check response ok
    if not resp.status == 200:
        print "Invalid response: ", resp.status
        sys.exit()
    io = StringIO.StringIO(content)
    for record in SeqIO.parse(io, "fasta"):
        print "definition :" + record.id
        print "nalen :" + str(len(record.seq))
from Bio import SeqIO

input_filename = 'source_downloads/hgb_ref_HetGla_female_1.0_chrUn.fa'

for seq in SeqIO.parse(open(input_filename), 'fasta'):
    split_desc = seq.description.split('|')
    print '>{} {}\n{}'.format(split_desc[3], seq.description, str(seq.seq))
def retrieve_seq(acc_num):
    gb_file = "{}/{}.gb".format(acc_num, acc_num)
    entry = SeqIO.read(open(gb_file, "r"), "genbank")
    return entry
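# Hypothetical layout for retrieve_seq: a GenBank file saved as
# ./NC_005816/NC_005816.gb would be loaded like this.
entry = retrieve_seq("NC_005816")
print(entry.id, len(entry.seq))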
def produce_glocks_method(fn_fasta, fn_assays_list):
    '''
    USED TO EFFICIENTLY SELECT GBLOCKS, GIVEN A FASTA FILE OF FULL LENGTH
    SEQUENCE AND A PRIMER FILE OF THE EXPECTED FORMAT, LIKE THIS:

    >ACF24861.1_1091_spec_F_1344
    TGGCAGGCGGATAAATTCTT
    >ACF24861.1_1091_spec_R_1437
    CGGCACTGTCAAACCCATAA

    Returns: A gblock fasta file
    '''
    import sys
    from Bio import SeqIO
    handle = open(fn_fasta, "rU")
    record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
    handle.close()

    # MAKE
    fh = open(fn_assays_list, 'r')
    L1 = []
    GL = []
    Ref_L = list()
    D = dict()
    import re
    for line in fh:
        line = line.strip().split('\t')
        m0 = re.match('>([a-zA-Z0-9]+\.[0-9]+)', line[0])
        ref_seq_temp = m0.group(1)
        if ref_seq_temp not in D.keys():
            D[ref_seq_temp] = list()
            Ref_L.append(ref_seq_temp)
        #print line[0]
        m1 = re.match('>[a-zA-Z0-9]+\.[0-9]+_.*F_([0-9]+)', line[0])  # >AAC60788.1_1_spec_F_1373
        #print line[2]
        m2 = re.match('>[a-zA-Z0-9]+\.[0-9]+_.*R_([0-9]+)', line[2])  # >AAC60788.1_1_spec_R_1457
        #print str(m1.group(1)) + "\t" + str(m2.group(1))
        D[ref_seq_temp].append([int(m1.group(1)), int(m2.group(1))])

    oh = open(fn_assays_list + ".gblocks", 'w')
    for r in Ref_L:
        print "#######%s#######" % (r)
        L1 = list(D[r])
        print L1
        print
        captured_list = list()
        best_count = 0
        store_best = [0, 0]
        for i in range(1800):
            gblock_start = i
            gblock_end = i + 490
            captured_count = 0
            for k in range(len(L1)):
                if int(L1[k][0]) - 20 > gblock_start and int(L1[k][1]) + 20 < gblock_end:
                    captured_count = captured_count + 1
            if captured_count > best_count:
                #print captured_count
                best_count = int(captured_count)
                store_best[0] = gblock_start
                store_best[1] = gblock_end
        L2 = list()
        for j in range(len(L1)):
            if int(L1[j][0]) < store_best[0] or int(L1[j][1]) > store_best[1]:
                L2.append(L1[j])
        print store_best
        seq = str(record_dict[r].seq)
        sub_seq = seq[store_best[0]:store_best[1]]  # TAKE THE INTERVAL
        oh.write('>' + record_dict[r].description + "_gBlock1_" + str(store_best[0]) +
                 "_" + str(store_best[1]) + "_captures_" + str(len(L1) - len(L2)) + "\n")
        oh.write(sub_seq + "\n")
        print str(len(L1) - len(L2)) + " SEQUENCES WERE CAPTURED"

        if len(L2) > 0:  # YOU FAILED TO CAPTURE ALL THE SEQUENCES IN ONE GBLOCK
            captured_list = list()
            best_count = 0
            store_best = [0, 0]
            for i in range(1800):
                gblock_start = i
                gblock_end = i + 490
                captured_count = 0
                for k in range(len(L2)):
                    if int(L2[k][0]) - 20 > gblock_start and int(L2[k][1]) + 20 < gblock_end:
                        captured_count = captured_count + 1
                if captured_count > best_count:
                    #print captured_count
                    best_count = int(captured_count)
                    store_best[0] = gblock_start
                    store_best[1] = gblock_end
            L3 = list()
            for j in range(len(L2)):
                if int(L2[j][0]) < store_best[0] or int(L2[j][1]) > store_best[1]:
                    L3.append(L2[j])
            print store_best
            print str(len(L2) - len(L3)) + " SEQUENCES WERE CAPTURED"
            seq = str(record_dict[r].seq)
            sub_seq = seq[store_best[0]:store_best[1]]  # TAKE THE INTERVAL
            oh.write('>' + record_dict[r].description + "_gBlock2_" + str(store_best[0]) +
                     "_" + str(store_best[1]) + "_captures_" + str(len(L2) - len(L3)) + "\n")
            oh.write(sub_seq + "\n")

            if len(L3) > 0:  # YOU FAILED TO CAPTURE ALL THE SEQUENCES IN TWO GBLOCKS
                captured_list = list()
                best_count = 0
                store_best = [0, 0]
                for i in range(1800):
                    gblock_start = i
                    gblock_end = i + 490
                    captured_count = 0
                    for k in range(len(L3)):
                        if int(L3[k][0]) - 20 > gblock_start and int(L3[k][1]) + 20 < gblock_end:
                            captured_count = captured_count + 1
                    if captured_count > best_count:
                        #print captured_count
                        best_count = int(captured_count)
                        store_best[0] = gblock_start
                        store_best[1] = gblock_end
                non_captured_list = list()
                for j in range(len(L3)):
                    if int(L3[j][0]) < store_best[0] or int(L3[j][1]) > store_best[1]:
                        non_captured_list.append(L3[j])
                print store_best
                print str(len(L3) - len(non_captured_list)) + " SEQUENCES WERE CAPTURED"
                seq = str(record_dict[r].seq)
                sub_seq = seq[store_best[0]:store_best[1]]  # TAKE THE INTERVAL
                oh.write('>' + record_dict[r].description + "_gBlock3_" + str(store_best[0]) +
                         "_" + str(store_best[1]) + "_captures_" + str(len(L3) - len(non_captured_list)) + "\n")
                oh.write(sub_seq + "\n")
import sys
from Bio import SeqIO
import glob
import os.path
from collections import defaultdict

lookup = {}
for rec in SeqIO.parse(open(sys.argv[1]), "fasta"):
    lookup[rec.id] = rec.description

results = defaultdict(list)
for sample in sys.argv[2:]:
    for fn in glob.glob(sample + '/clusters/*.cluster.fasta'):
        refid = os.path.basename(fn)
        hits = len(list(SeqIO.parse(open(fn), 'fasta')))
        tofind = refid.replace('.cluster.fasta', '')
        results[sample].append((hits, lookup[tofind]))

for sample, hits in results.iteritems():
    total_hits = sum([h[0] for h in hits])
    for h in hits:
        taxonomy = h[1].split(" ")
        # final field assumed to be total_hits, which is computed above but otherwise unused
        print "%s\t%s\t%s %s\t%s\t%s\t%s" % (sample, taxonomy[1],
                                             taxonomy[1], taxonomy[2],
                                             h[1], h[0], total_hits)
quiverStr = Tools.ExtractSeq((quiverName, quiverStart, quiverEnd), tFile, tFai)
print "length of : " + targetName + " " + str(len(genomeStr))
quiverName = targetName + " quiver"
print "length of : " + quiverName + " " + str(len(quiverStr))
# quiverStr = targetDict[targetName][tRegionStart:tRegionEnd]

genomeFileName = tempfile.mktemp(suffix=".fasta", dir=".")
quiverFileName = tempfile.mktemp(suffix=".fasta", dir=".")
tmpFileNames.append(genomeFileName)
tmpFileNames.append(quiverFileName)

genomeFile = open(genomeFileName, 'w')
quiverFile = open(quiverFileName, 'w')
genomeName = "{}:{}-{}".format(chrom, start, end)
quiverName = "{}:{}-{} quiver".format(chrom, start, end)
SeqIO.write(SeqRecord.SeqRecord(seq=Seq.Seq(genomeStr), id=genomeName,
                                name="", description=""), genomeFile, "fasta")
SeqIO.write(SeqRecord.SeqRecord(seq=Seq.Seq(quiverStr), id=quiverName,
                                name="", description=""), quiverFile, "fasta")
genomeFile.close()
quiverFile.close()

command = "/net/eichler/vol5/home/mchaisso/projects/PacBioSequencing/scripts/DotPlot.py " \
          "--query {} --target {} --savefig {}_{}_{}.pdf --matches dot:11".format(
              quiverFileName, genomeFileName, chrom, start, end)
print "running " + command
subprocess.call(command.split())

for tmpfile in tmpFileNames:
    command = "/bin/rm -f " + tmpfile
    subprocess.call(command.split())

# chr10 102356490 102357412 chr10:100000000-110000000|quiver_2300000_2350000/0_50000 interior
def main():
    usage = "\n%prog [options]\nNeeded software: Biopython, BLAST (2.3.0 for reproducibility), bedtools (2.28 for reproducibility)"
    parser = OptionParser(usage, version="%prog " + __version__)
    parser.add_option("-f", "--fasta", action="store", dest="fasta",
                      help="Transcript file in FASTA format (required).")
    parser.add_option("-g", "--gtf", action="store", dest="gtf",
                      help="Transcript coordinates in GTF format (required).")
    parser.add_option("-d", "--db", action="store", dest="db",
                      help="FASTA database for species comparison (required).")
    parser.add_option("-o", "--out", action="store", dest="out",
                      help="Output unique name (required).")
    parser.add_option("-m", "--mask", action="store", dest="mask", default="none",
                      help="GTF file for masking (f.e. pseudogenes).")
    parser.add_option("-O", "--orthologs", action="store", dest="ort", default="none",
                      help="Additional list of known gene orthologs (f.e. Ensembl Compara).")
    blast_path = 'blastn'
    bedtools_path = 'bedtools'
    (opt, args) = parser.parse_args()

    check_arg(opt.fasta, "--fasta")
    check_arg(opt.gtf, "--gtf")
    check_arg(opt.db, "--db")
    check_arg(opt.out, "--out")
    check_file(opt.fasta)
    check_file(opt.gtf)

    if not os.path.exists('out'):
        os.makedirs('out')
    if not os.path.exists('tmp'):
        os.makedirs('tmp')

    seqs = SeqIO.index(opt.fasta, "fasta")
    # BLAST 2.3.0
    exc = exclude(seqs)
    run_blast(opt.fasta, opt.db, blast_path, '1', opt.out)
    print('Parsing')
    regions = parse_blast("tmp/blast_" + opt.out + ".xml", exc)
    coords_t = parse_gtf(opt.gtf, opt.out)
    print('Writing main regions')
    write_main_regions(regions, opt.gtf, coords_t, exc, opt.out,
                       opt.mask, opt.ort, bedtools_path)  # BEDtools 2.28
    print('Clustering regions from same genes')
    clusters = cluster_regions2(opt.out)
    print('Writing final output')
    biotype = getBiotypes(opt.gtf)
    write_final_regions(opt.out, clusters, biotype)

    r = glob.glob('tmp/*')
    for i in r:
        if i.startswith('tmp/' + opt.out):
            os.remove(i)
from Bio import SeqIO
import argparse

parser = argparse.ArgumentParser(
    description='Process some fasta file to return the stats on the matches at some e-value')
parser.add_argument('-i', '--input', type=str,
                    help="This is the fasta file to get the stats on")
parser.add_argument('-r', '--reference', type=str,
                    help="This is the list of files to compare the stats on")
args = parser.parse_args()
input_file = args.input
ref_file = args.reference

# Parse and count the reads from each of the genomes
counts = {}  # reads per genome prefix
fasta_sequences = SeqIO.parse(open(input_file), 'fasta')
for fasta in fasta_sequences:
    name, sequence = fasta.id, str(fasta.seq)
    split = name.split('__')
    if split[0] in counts.keys():
        counts[split[0]] += 1
    else:
        counts[split[0]] = 1

# Construct a list of them all, and print those that have more than one hit
hit_list = []
multi_hits = []
print("+++ MULTI MATCHES +++")
for item in counts:
    hit_list.append(item)
    if counts[item] > 1:
        multi_hits.append(item)
                    continue
                else:
                    names.append(newname)
                    break
            else:
                names.append(name)
                break

        pathstofiles = []
        path = tempfile.mkdtemp(dir=self.tmpdir)
        for name, seq in zip(names, seqs):
            whole_path = os.path.join(path, name) + ".gb"
            seq.write(whole_path)
            pathstofiles.append('"{}"'.format(whole_path))
        p = subprocess.Popen("{} {}".format(self.path_to_editor, " ".join(pathstofiles)),
                             shell=True,
                             stdout=tempfile.TemporaryFile(),
                             stderr=tempfile.TemporaryFile()).pid
        time.sleep(0.5)
        #shutil.rmtree(path)
        #for name in names:
        #    print os.path.join(path, name)+".gb"


if __name__ == "__main__":
    from Bio import SeqIO
    sr1 = SeqIO.parse("../tests/pUC19.gb", "gb").next()
    sr2 = SeqIO.parse("../tests/pCAPs.gb", "gb").next()
    aperunner = Ape("tclsh /home/bjorn/.ApE/apeextractor/ApE.vfs/lib/app-AppMain/AppMain.tcl")
    aperunner.open(sr1, sr2)