def read_single_with_titles(filename, alphabet):
    """Parse *filename* and assert it contains exactly one FASTA record.

    :param filename: path to a FASTA file expected to hold a single entry
    :param alphabet: Biopython alphabet passed through to FastaIterator
    :return: the single SeqRecord, parsed with the title_to_ids title function
    """
    global title_to_ids
    # Context manager so the handle is closed even on parse errors
    # (the original leaked the open file object).
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        # iterator.next() is Python 2 only; the next() builtin works everywhere.
        record = next(iterator)
        try:
            second = next(iterator)
        except StopIteration:
            second = None
    assert record is not None and second is None
    return record
def filter_influenza_fa(in_fasta, out_fasta, pattern, accession_set):
    """Filter influenza FASTA records by accession ID.

    Records whose description lacks a '(' are matched against *pattern*;
    capture group 1 is taken as the accession.  Consecutive duplicates are
    skipped, and records whose accession is in *accession_set* are appended
    to *out_fasta*.

    :param in_fasta: input FASTA path
    :param out_fasta: output FASTA path (opened in append mode)
    :param pattern: regex with at least three capture groups; group 1 is
        the accession ID
    :param accession_set: set of accession IDs that we query the fasta
        header against
    :return: number of records written (the accession list is collected
        internally and could be returned also)
    """
    cache_previous = ()
    count, kept = 0, []
    with open(in_fasta) as handle, open(out_fasta, 'a+') as out:
        for record in FastaIterator(handle):  # [^1]
            # str supports 'in' directly; no need to listify the description.
            if '(' in record.description:
                continue
            match = re.search(pattern, record.description)
            if match is None:
                # The original crashed here with AttributeError on .group();
                # skip records the pattern cannot parse instead.
                continue
            cache_current = match.group(1, 2, 3)
            if cache_current[0] in cache_previous:  # [^2]
                continue
            acc = cache_current[0]
            cache_previous = cache_current
            if acc in accession_set:
                count += 1
                kept.append(acc)
                out.write('>' + acc + '\n')
                out.write(str(record.seq) + '\n')
    return count  # kept (the accession list) could be returned also
def writeClassifiedFastas(classType, Dirr, resultsDir, df):
    """Split FASTA files into per-class FASTA output files.

    For every FASTA file found under *Dirr*, each record whose id appears
    in *df*'s index is appended to ``resultsDir\\<classname>.fasta``, where
    the class name is looked up from *df* and sanitized for use as a
    Windows filename.

    NOTE(review): several spots look suspect -- see inline notes.
    """
    fasta_files_dict = Get_Dirr_All_Fasta(classType, Dirr)
    classDict = {}   # classname -> open output file handle
    writerDict = {}  # classname -> FastaWriter wrapping that handle
    for key, value in fasta_files_dict.items():
        # NOTE(review): one-entry dict; the inner loop runs exactly once
        # per (key, value) pair -- a direct loop would be equivalent.
        files = {key: value}
        for filename, classname in files.items():
            with open(filename) as fasta:
                for record in FastaIterator(fasta):  # SeqIO.SimpleFastaParser(fasta):
                    # NOTE(review): record is a SeqRecord, so record[0] is a
                    # one-letter sub-record, not a title string -- this looks
                    # like leftover code from SimpleFastaParser's
                    # (title, seq) tuples; title.split() below would fail if
                    # actually executed as-is.  Confirm against callers.
                    title = record[0]
                    seq_id = title.split(None, 1)[0]  # NOTE(review): unused
                    if (record.id in df.index):
                        # NOTE(review): df[record.id] indexes *columns*, not
                        # rows; df.loc[record.id] may be intended -- confirm
                        # the shape of df.
                        classname = df[record.id]
                        if (classname not in writerDict):
                            # Keep only alphanumerics and spaces so the class
                            # name is safe as a filename.
                            classname = "".join([c for c in classname if c.isalpha() or c.isdigit() or c == ' ']).rstrip()
                            file = resultsDir + '\\' + classname + '.fasta'
                            classHandle = open(file, "w")
                            classDict[classname] = classHandle
                            myWriter = FastaWriter(classDict[classname])
                            myWriter.write_header()
                            writerDict[classname] = myWriter
                        writerDict[classname].write_record(record)
    # Finalize and close every per-class writer/handle.
    for classname, classHandle in classDict.items():
        writerDict[classname].write_footer()
        classDict[classname].close()
def cut_fasta_by_len(fa_file, len_cutoff, outdir, prefix, suffix):
    """Write records of *fa_file* with length >= *len_cutoff* to a new file.

    The output path is ``outdir/<prefix>.ge<len_cutoff><suffix>``; if it
    already exists and is non-empty it is returned unchanged (resume
    support).  Gzipped input (``.gz``) is handled transparently.

    :param fa_file: input FASTA path, optionally gzip-compressed
    :param len_cutoff: minimum record length to keep
    :param outdir: output directory, created if missing
    :param prefix: output file name prefix
    :param suffix: output file name suffix (extension)
    :return: path of the length-filtered FASTA file
    """
    # exist_ok=True defeats the race condition when another thread creates
    # the path (replaces the manual errno.EEXIST dance).
    os.makedirs(outdir, exist_ok=True)
    cut_fa_file = os.path.join(outdir, prefix + ".ge" + str(len_cutoff) + suffix)
    if os.path.exists(cut_fa_file) and (os.path.getsize(cut_fa_file) > 0):
        return cut_fa_file
    if fa_file.endswith(".gz"):
        in_h = gzip.open(fa_file, 'rt')
    else:
        in_h = open(fa_file, 'r')
    # FastaIterator + FastaWriter is used instead of the simpler
    # SeqIO.parse()/SeqIO.write() API (kept from the original).
    try:
        with open(cut_fa_file, 'w') as out_h:
            writer = FastaWriter(out_h)
            writer.write_header()
            for rec in FastaIterator(in_h):
                if len(rec) >= len_cutoff:
                    writer.write_record(rec)
            writer.write_footer()
    finally:
        # Close the input even if writing raises (the original leaked it).
        in_h.close()
    return cut_fa_file
def align(fh, transl=True):
    """Translate and align a pangenome cluster FASTA file with MUSCLE.

    :param fh: open handle to the cluster FASTA file
    :param transl: NOTE(review): currently unused -- translation always
        happens regardless of this flag; confirm intended semantics.
    :return: CLUSTAL-formatted alignment text read from MUSCLE's stdout
    """
    muscle_cmd = MuscleCommandline(
        r'C:\Users\matthewwhiteside\workspace\b_ecoli\muscle\muscle3.8.31_i86win32.exe',
        clwstrict=True)
    # Run MUSCLE over stdin/stdout so no temporary files are needed.
    proc = subprocess.Popen(
        str(muscle_cmd),
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        universal_newlines=True,
        shell=False)
    # Build the translated FASTA payload fed to MUSCLE's stdin.
    chunks = []
    for record in FastaIterator(fh):
        protein = str(record.translate(table="Bacterial").seq)
        chunks.append(">" + record.id + "\n" + protein + "\n")
    aligned, _err = proc.communicate(input="".join(chunks))
    return aligned
def parse(fasta_file):
    """Build a RefProtFastaFile populated with one entry per FASTA record."""
    result = RefProtFastaFile(fasta_file)
    with open(result.filename) as handle:
        for rec in FastaIterator(handle):
            entry = RefProtFastaEntry.parse_fasta_record(rec, result.taxon_id)
            result.add_entry(entry)
    return result
def fasta_reader(filename):
    """Read a FASTA file into a list of [id, sequence] string pairs.

    :param filename: path to the FASTA file
    :return: list of two-element lists ``[record_id, sequence]``
    """
    from Bio.SeqIO.FastaIO import FastaIterator
    # Comprehension instead of an append loop; also avoids shadowing the
    # builtin ``input`` the original used as its accumulator name.
    with open(filename) as handle:
        return [[str(record.id), str(record.seq)]
                for record in FastaIterator(handle)]
def generate_fake_genome(sample: str, reference: Path, vcf_path: Path,
                         ploidy_dict: Dict[str, int]
                         ) -> Generator[SeqRecord, None, None]:
    """Generate a fake genome given a VCF, a reference, and a ploidy dict.

    A fasta record is yielded for each chromosome allele.

    :param sample: The name in the sample of the VCF to use
    :param reference: The reference fasta file to use
    :param vcf_path: The path to the VCF
    :param ploidy_dict: A dictionary containing the ploidies for each contig.
    :return: A Generator that creates the chromosomes one by one.
    """
    mutations_dict = vcf_to_mutations(str(vcf_path), sample)
    with reference.open("rt") as reference_h:
        for contig in FastaIterator(reference_h):
            # Contigs absent from ploidy_dict are assumed diploid.
            for allele_no in range(ploidy_dict.get(contig.id, 2)):
                # Default to an empty list if no mutations were listed
                # for this contig/allele pair.
                allele_mutations = (mutations_dict.get(contig.id, {})
                                    .get(allele_no, []))
                mutated = sequence_with_mutations(
                    sequence=str(contig.seq),
                    mutations=allele_mutations)
                allele_id = contig.id + "_" + str(allele_no)
                yield SeqRecord(
                    Seq(mutated, contig.seq.alphabet),
                    id=allele_id, name=allele_id, description=allele_id)
def add_gc_content(df, df_sub, args):
    """Fill a 'gc_content' column of *df_sub* in place.

    For ``args.feature == 'gene'`` the GC content is computed over the
    concatenated exon sequences looked up from *args.fasta* (keyed as
    ``seqname:start-end`` with 0-based start); for ``'transcript'`` it is
    computed per transcript FASTA record.

    :param df: GTF-like DataFrame with gene_id/feature/seqname/start/end
        (and transcript_id for the transcript branch)
    :param df_sub: DataFrame indexed by gene or transcript id; modified
        in place and returned
    :param args: namespace with ``feature`` and an open ``fasta`` handle
    :return: *df_sub* with 'gc_content' filled where possible
    :raises ValueError: if args.feature is neither 'gene' nor 'transcript'
    """
    if args.feature == 'gene':
        fasta = {c[0]: c[1] for c in SimpleFastaParser(args.fasta)}
        gene2exon_indx = df.groupby(['gene_id', 'feature'])
        exons = defaultdict(str)
        gene_ids = set(df.gene_id)
        for gene in gene_ids:
            idx = gene2exon_indx.groups[(gene, 'exon')]
            # (dropped the unused `nn` counter from the original)
            for _, row in df.iloc[idx, :].iterrows():
                # GTF coordinates are 1-based; the fasta keys use 0-based starts.
                exon_key = '{}:{}-{}'.format(row['seqname'], row['start'] - 1, row['end'])
                # NOTE(review): fasta.get() returns None for a missing exon
                # key, which raises TypeError on the += below; confirm every
                # exon key is present in the FASTA.
                seq = fasta.get(exon_key)
                exons[gene] += seq
        for gene_id in df_sub.index:
            if gene_id in exons:
                df_sub.at[gene_id, 'gc_content'] = GC(exons[gene_id])
            else:
                print("missing gene_id in exons dict:")
                print(gene_id)
    elif args.feature == 'transcript':
        tx_ids = set(df['transcript_id'].values)
        for rec in FastaIterator(args.fasta):
            if rec.id in tx_ids:
                df_sub.loc[rec.id, 'gc_content'] = GC(rec.seq)
            else:
                print(rec.id)
    else:
        raise ValueError('check feature type!')
    return df_sub
def readFasta(fastaFile):
    """ Reads a FASTA file and parses contigs for GC content.

    Args:
        fastaFile: The path to the FASTA file.
    Returns:
        contigs A dictionary mapping contigIDs to sidr.common.Contig
        objects with GC content as a variable.
    """
    # Support .fa.gz files in a seamless (if slow) way.
    opener = gzip.open if ".gz" in fastaFile else open
    contigs = []
    with opener(fastaFile) as data:
        click.echo("Reading %s" % fastaFile)
        with click.progressbar(FastaIterator(data)) as records:
            for record in records:  # TODO: conditional formatting
                contig_id = record.id.split(' ')[0]
                contigs.append(
                    common.Contig(contig_id, variables={"GC": GC(record.seq)}))
    # Duplicate contig IDs would silently collide in the result dict,
    # so fail loudly instead.
    if len(contigs) != len({c.contigid for c in contigs}):
        raise ValueError("Input FASTA contains duplicate contigIDs, exiting")
    return {c.contigid: c for c in contigs}
def main():
    """Parse CLI arguments and print the mutated FASTA record to stdout."""
    args = argument_parser().parse_args()
    source = Position.from_string(args.source)
    target = Position.from_string(args.target)
    with open(args.fasta, "rt") as fasta_h:
        mutated = mutate(FastaIterator(fasta_h), source, target)
    # No trailing newline: the formatted record already ends with one.
    print(mutated.format("fasta"), end='')
def load_files():
    '''Load all files in to an arrary, unshuffled'''
    data = []
    # Each record is labelled with its file's position in FILES.
    for label, filename in enumerate(FILES):
        with open("data/" + filename) as handle:
            data.extend((record, label) for record in FastaIterator(handle))
    return data
def _fasta_reader(filename: str) -> Iterator:
    """ FASTA file reader as iterator.

    Yields one SeqRecord per FASTA entry.  The previous ``-> SeqRecord``
    annotation was wrong for a generator function; ``Iterator`` matches
    the sibling reader in this file.
    """
    with open(filename) as handle:
        yield from FastaIterator(handle)
def fastarename(input, relabel, output):
    """Rewrite a FASTA file, renaming every record to ``<relabel><counter>``.

    :param input: path of the FASTA file to read
    :param relabel: prefix for the new sequential record names (1-based)
    :param output: path of the renamed FASTA file to write
    """
    from Bio.SeqIO.FastaIO import FastaIterator
    # The original passed a bare open(input) that was never closed;
    # both handles are now managed by the with-statement.
    with open(input) as infile, open(output, 'w') as outfile:
        for counter, record in enumerate(FastaIterator(infile), start=1):
            newName = relabel + str(counter)
            outfile.write(">%s\n%s\n" % (newName, record.seq))
def _fasta_reader(filename: str) -> Iterator:
    """ Read FASTA file content including multifasta format """
    with open(filename) as handle:
        yield from FastaIterator(handle)
def get_base(fasta: str, chromosome: str, start: int, end: Optional[int]):
    """Return the sequence slice ``[start:end)`` of *chromosome* in *fasta*.

    When *end* is None a single base at *start* is returned.

    :raises ValueError: if the chromosome is absent from the FASTA file.
    """
    stop = start + 1 if end is None else end
    with open(fasta, "rt") as fasta_handle:
        for record in FastaIterator(fasta_handle):
            if record.id == chromosome:
                return record[start:stop].seq
    # Exhausting the iterator means the chromosome was not there.
    raise ValueError(f"{chromosome} not found in {fasta}")
def fasta_reader(filename):
    """ Read a multi or single fasta file.
    Inputs:
        filename - string that represents a name of the file or a path
        to the file.
    Outputs:
        A generator object containing a Seq and ID biopython objects.
    """
    # One code path for both plain and gzipped input removes the
    # duplicated parsing loop of the original.
    opener = gzip.open if filename.endswith('.gz') else open
    with opener(filename, 'rt') as handle:
        for record in FastaIterator(handle):
            yield str(record.id), str(record.seq)
def multi_check(self, filename):
    """Test parsing multi-record FASTA files."""
    # Name the input file in assertion failures: the previous f-string
    # had no placeholder and always said "(unknown)".
    msg = f"Test failure parsing file {filename}"
    re_titled = list(FastaIterator(filename, title2ids=title_to_ids))
    default = list(SeqIO.parse(filename, "fasta"))
    self.assertEqual(len(re_titled), len(default), msg=msg)
    for old, new in zip(default, re_titled):
        idn, name, descr = title_to_ids(old.description)
        self.assertEqual(new.id, idn, msg=msg)
        self.assertEqual(new.name, name, msg=msg)
        self.assertEqual(new.description, descr, msg=msg)
        self.assertEqual(new.seq, old.seq, msg=msg)
def multi_check(self, filename, alphabet):
    """Basic test for parsing multi-record FASTA files."""
    # Use context managers: the original passed bare open() results
    # that were never closed.
    with open(filename) as handle:
        re_titled = list(FastaIterator(handle, alphabet, title_to_ids))
    with open(filename) as handle:
        default = list(SeqIO.parse(handle, "fasta", alphabet))
    self.assertEqual(len(re_titled), len(default))
    for old, new in zip(default, re_titled):
        idn, name, descr = title_to_ids(old.description)
        self.assertEqual(new.id, idn)
        self.assertEqual(new.name, name)
        self.assertEqual(new.description, descr)
        self.assertEqual(str(new.seq), str(old.seq))
        self.assertEqual(new.seq.alphabet, old.seq.alphabet)
def read_single_with_titles(filename, alphabet):
    """Parse *filename* and assert it contains exactly one FASTA record.

    :param filename: path to a FASTA file expected to hold a single entry
    :param alphabet: Biopython alphabet passed through to FastaIterator
    :return: the single SeqRecord, parsed with the title_to_ids function
    """
    global title_to_ids
    # with-statement closes the handle even if parsing raises (the
    # original's explicit close() was skipped on exception).
    with open(filename) as handle:
        iterator = FastaIterator(handle, alphabet, title_to_ids)
        record = next(iterator)
        # Two-argument next() replaces the try/except StopIteration dance.
        second = next(iterator, None)
    assert record is not None and second is None
    return record
def read_single_with_titles(filename, alphabet):
    """Parser wrapper to confirm single entry FASTA file."""
    global title_to_ids
    with open(filename) as handle:
        parser = FastaIterator(handle, alphabet, title_to_ids)
        record = next(parser)
        # A second record would mean the file is not single-entry.
        second = next(parser, None)
    assert record is not None and second is None
    return record
def multi_check(self, filename, alphabet):
    """Test parsing multi-record FASTA files."""
    msg = "Test failure parsing file %s" % filename
    with_titles = list(FastaIterator(filename, alphabet, title_to_ids))
    plain = list(SeqIO.parse(filename, "fasta", alphabet))
    self.assertEqual(len(with_titles), len(plain), msg=msg)
    # Every record must agree field-by-field with the default parser,
    # once the title function has been applied to the description.
    for old, new in zip(plain, with_titles):
        want_id, want_name, want_descr = title_to_ids(old.description)
        self.assertEqual(new.id, want_id, msg=msg)
        self.assertEqual(new.name, want_name, msg=msg)
        self.assertEqual(new.description, want_descr, msg=msg)
        self.assertEqual(str(new.seq), str(old.seq), msg=msg)
        self.assertEqual(new.seq.alphabet, old.seq.alphabet, msg=msg)
def read_fasta(inputfile):
    """Method for loading sequences from a FASTA formatted file and storing
    them into a list of sequences and names.

    :param inputfile: .fasta file with sequences and headers to read
    :return: lists of sequences and names.
    """
    names, sequences = [], []
    with open(inputfile) as handle:
        # Biopython's SeqIO FASTA iterator does the actual parsing.
        for record in FastaIterator(handle):
            names.append(record.description)
            sequences.append(str(record.seq))
    return sequences, names
def parse_file(file_path):
    """Count nucleotide letters per record in a FASTA file.

    For every record id the result maps each IUPAC nucleotide code
    (upper and lower case counted separately) to its occurrence count,
    plus three totals: 'all' (every letter), 'all_small' (lowercase
    letters) and 'all_big' (uppercase letters).

    :param file_path: path to the FASTA file
    :return: dict of record id -> letter-count dict
    :raises KeyError: if a sequence contains a letter outside the IUPAC set
    """
    iupac = 'ACGTYMSRWKNDBHV'
    records_letters = {}
    with open(file_path) as in_handle:
        for record in FastaIterator(in_handle):
            # Build the counter table programmatically instead of spelling
            # out the 33 literal keys of the original.
            counts = dict.fromkeys(iupac, 0)
            counts.update(dict.fromkeys(iupac.lower(), 0))
            counts.update({'all': 0, 'all_small': 0, 'all_big': 0})
            records_letters[record.id] = counts
            for letter in record.seq:
                if letter.islower():
                    counts['all_small'] += 1
                else:
                    counts['all_big'] += 1
                counts[letter] += 1
                counts['all'] += 1
    return records_letters
def reheader_fasta(fa_in, fa_out, header_function, in_gz, gz):
    """Copy *fa_in* to *fa_out*, rewriting headers via *header_function*.

    :param fa_in: input FASTA path (gzip-compressed when in_gz is true)
    :param fa_out: output path (bgzf-compressed when gz is true)
    :param header_function: title2ids-style callable applied to each header
    :param in_gz: whether to read the input through gzip
    :param gz: whether to write bgzf-compressed output
    """
    in_h = gzip.open(fa_in, 'rt') if in_gz else open(fa_in, 'r')
    out_h = bgzf.BgzfWriter(fa_out, 'wb') if gz else open(fa_out, 'w')
    writer = FastaWriter(out_h)
    writer.write_header()
    for record in FastaIterator(in_h, title2ids=header_function):
        writer.write_record(record)
    writer.write_footer()
    out_h.close()
    in_h.close()
def create_rs(self, file):
    """Create a randomized companion FASTA for *file*.

    Every record of the input genome gets a randomized sequence (via
    ``self.generate_rs``) written to ``<name>_random<ext>`` inside a
    folder created next to the input file.

    :param file: path of the input genome FASTA
    :return: path of the generated random-genome FASTA
    """
    newpath = Cf().create_file_folder(file=file)
    filename, extension = os.path.splitext(os.path.basename(file))
    random_genome_file = os.path.join(
        newpath, os.path.normpath(os.path.join(filename + '_random' + extension)))
    # 'rU' mode was removed in Python 3.11; plain 'r' already does
    # universal newlines on Python 3.  The redundant RgFile.close()
    # inside the with-block is also dropped.
    with open(file, 'r') as GenomeFile:
        with open(random_genome_file, 'w') as RgFile:
            for record in FastaIterator(handle=GenomeFile):
                print('Creating random record for: ' + record.id)
                created_random_seq = self.generate_rs(str(record.seq))
                random_record = SeqRecord(
                    BioPythonSeq(created_random_seq),
                    id=record.id + '_random_',
                    name=record.name + '_random_',
                    description=record.description + '_random_')
                SeqIO.write(random_record, RgFile, 'fasta')
    return random_genome_file
def parse_file(self, file_path):
    """Parse each record of *file_path* in windows of ``self.window_size``.

    :param file_path: path to the FASTA file to analyse
    :return: dict mapping record id -> {window start index ->
        ``self.parse_sequence`` result}; the trailing (possibly shorter)
        window runs up to ``len(seq) - 1``.
    """
    data = {}
    print("Analysing: " + file_path)
    with open(file_path) as file:
        for record in FastaIterator(file):
            windows = {}
            sequence = record.seq
            last = len(sequence) - 1
            pos = 0
            # Full-size windows first...
            while pos + self.window_size < last:
                windows[pos] = self.parse_sequence(
                    sequence[pos:pos + self.window_size])
                pos += self.window_size
            # ...then the trailing partial window up to (len - 1).
            windows[pos] = self.parse_sequence(sequence[pos:last])
            data[record.id] = windows
    return data
def simple_check(self, filename):
    """Test parsing single record FASTA files."""
    # Name the input file in assertion failures: the previous f-string
    # had no placeholder and always said "(unknown)".
    msg = f"Test failure parsing file {filename}"
    title, seq = read_title_and_seq(filename)  # crude parser
    idn, name, descr = title_to_ids(title)
    # First check using Bio.SeqIO.FastaIO directly with title function.
    records = FastaIterator(filename, title2ids=title_to_ids)
    record = next(records)
    with self.assertRaises(StopIteration):
        next(records)
    self.assertEqual(record.id, idn, msg=msg)
    self.assertEqual(record.name, name, msg=msg)
    self.assertEqual(record.description, descr, msg=msg)
    self.assertEqual(record.seq, seq, msg=msg)
    # Now check using Bio.SeqIO (default settings)
    record = SeqIO.read(filename, "fasta")
    self.assertEqual(record.id, title.split()[0], msg=msg)
    self.assertEqual(record.name, title.split()[0], msg=msg)
    self.assertEqual(record.description, title, msg=msg)
    self.assertEqual(record.seq, seq, msg=msg)
def FindGene(PATRICID, Header):
    """Look up gene annotation and protein sequence for a PATRIC feature.

    Merges the row matching *Header* (a patric_id) from the genome's
    specialty-gene table and its feature table, then attaches the
    amino-acid sequence found under the same id in the genome's protein
    FASTA.

    :param PATRICID: PATRIC genome identifier (used to build file paths)
    :param Header: patric_id queried in both tables and the FASTA
    :return: dict of gene/product(/property/function) values, when found,
        plus 'translation' with the protein sequence
    """
    OUT = dict()
    # NOTE(review): data locations are hard-coded to a cluster filesystem;
    # consider making the base path a parameter.
    SPGENE = pd.read_csv('/pylon5/br5phhp/tv349/AMR/PATRIC/SPGENE/' + PATRICID + '.PATRIC.spgene.tab', sep='\t')
    LocalPos = SPGENE.index[SPGENE['patric_id'] == Header].tolist()
    # if the sequence exists here:
    OUTSPGENE = dict()
    if len(LocalPos) == 1:
        OUTSPGENE = (SPGENE.loc[LocalPos, ['gene', 'product', 'property', 'function']]).to_dict('records')[0]
    FEATURES = pd.read_csv('/pylon5/br5phhp/tv349/AMR/PATRIC/FEATURES/' + PATRICID + '.PATRIC.features.tab', sep='\t')
    LocalPos = FEATURES.index[FEATURES['patric_id'] == Header].tolist()
    OUTFEATURES = dict()
    if len(LocalPos) == 1:
        OUTFEATURES = (FEATURES.loc[LocalPos, ['gene', 'product']]).to_dict('records')[0]
    # Specialty-gene values win over feature-table values for shared keys
    # ('gene', 'product') because OUTSPGENE is unpacked last.
    OUT = {**OUTFEATURES, **OUTSPGENE}
    # Get sequence
    with open("/pylon5/br5phhp/tv349/AMR/PATRIC/PROTEIN/" + PATRICID + ".PATRIC.faa") as handle:
        for record in FastaIterator(handle):
            if record.id == Header:
                AAseq = str(record.seq)
    # NOTE(review): AAseq is unbound (NameError) if Header never matches a
    # FASTA record -- confirm every queried Header is guaranteed present.
    OUT['translation'] = AAseq
    return OUT
FNULL = open(os.devnull, 'w')
pid = os.getpid()
# Reverse-complement the reverse primer so both primers are searched in
# the forward orientation.
ForPrimer = args.fwdprimer
RevPrimer = revcomp_lib.RevComp(args.revprimer)
# print() function calls: the original Python 2 print statements are
# syntax errors under Python 3.
print('Loading ' + '{0:,}'.format(amptklib.countfasta(args.input)) + ' sequence records')
print('Searching for forward primer: %s, and reverse primer: %s' % (ForPrimer, RevPrimer))
print('Requiring reverse primer match with at least %i mismatches' % args.primer_mismatch)
# Loop through seqs, remove primer if found, and truncate to length.
truncated = 'bold2amptk_' + str(pid) + '.truncate.tmp'
with open(truncated, 'w') as output:
    for record in FastaIterator(open(args.input)):
        Seq = str(record.seq)
        StripSeq = ''
        ForCutPos = amptklib.findFwdPrimer(ForPrimer, Seq, args.primer_mismatch, amptklib.degenNucSimple)
        RevCutPos = amptklib.findRevPrimer(RevPrimer, Seq, args.primer_mismatch, amptklib.degenNucSimple)
        if ForCutPos and RevCutPos:
            # Both primers found: keep the region between them.
            StripSeq = Seq[ForCutPos:RevCutPos]
        elif not ForCutPos and RevCutPos:
            # Only the reverse primer found: trim the tail.
            StripSeq = Seq[:RevCutPos]
        if len(StripSeq) >= args.minlen:
            output.write('>%s\n%s\n' % (record.description, StripSeq))
def PairedFastaQualIterator(fasta_handle, qual_handle,
                            alphabet=single_letter_alphabet, title2ids=None):
    """Iterate over matched FASTA and QUAL files as SeqRecord objects.

    For example, consider this short QUAL file::

        >EAS54_6_R1_2_1_413_324
        26 26 18 26 26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26
        26 26 26 23 23
        >EAS54_6_R1_2_1_540_792
        26 26 26 26 26 26 26 26 26 26 26 22 26 26 26 26 26 12 26 26
        26 18 26 23 18
        >EAS54_6_R1_2_1_443_348
        26 26 26 26 26 26 26 26 26 26 26 24 26 22 26 26 13 22 26 18
        24 18 18 18 18

    And a matching FASTA file::

        >EAS54_6_R1_2_1_413_324
        CCCTTCTTGTCTTCAGCGTTTCTCC
        >EAS54_6_R1_2_1_540_792
        TTGGCAGGCCAAGGCCGATGGATCA
        >EAS54_6_R1_2_1_443_348
        GTTGCTTCTGGCGTGGGTGGGGGGG

    You can parse these separately using Bio.SeqIO with the "qual" and
    "fasta" formats, but then you'll get a group of SeqRecord objects with
    no sequence, and a matching group with the sequence but not the
    qualities.  Because it only deals with one input file handle, Bio.SeqIO
    can't be used to read the two files together - but this function can!
    For example,

    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta"),
    ...                                    open("Quality/example.qual"))
    >>> for record in rec_iter:
    ...     print("%s %s" % (record.id, record.seq))
    EAS54_6_R1_2_1_413_324 CCCTTCTTGTCTTCAGCGTTTCTCC
    EAS54_6_R1_2_1_540_792 TTGGCAGGCCAAGGCCGATGGATCA
    EAS54_6_R1_2_1_443_348 GTTGCTTCTGGCGTGGGTGGGGGGG

    As with the FASTQ or QUAL parsers, if you want to look at the
    qualities, they are in each record's per-letter-annotation dictionary
    as a simple list of integers:

    >>> print(record.letter_annotations["phred_quality"])
    [26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 24, 26, 22, 26, 26, 13, 22, 26, 18, 24, 18, 18, 18, 18]

    If you have access to data as a FASTQ format file, using that directly
    would be simpler and more straight forward.

    Note that you can easily use this function to convert paired FASTA and
    QUAL files into FASTQ files:

    >>> from Bio import SeqIO
    >>> rec_iter = PairedFastaQualIterator(open("Quality/example.fasta"),
    ...                                    open("Quality/example.qual"))
    >>> out_handle = open("Quality/temp.fastq", "w")
    >>> SeqIO.write(rec_iter, out_handle, "fastq")
    3
    >>> out_handle.close()

    And don't forget to clean up the temp file if you don't need it
    anymore:

    >>> import os
    >>> os.remove("Quality/temp.fastq")
    """
    from Bio.SeqIO.FastaIO import FastaIterator
    fasta_iter = FastaIterator(fasta_handle, alphabet=alphabet,
                               title2ids=title2ids)
    qual_iter = QualPhredIterator(qual_handle, alphabet=alphabet,
                                  title2ids=title2ids)
    # Stepping both iterators manually (rather than zip) lets us detect
    # extra records found in only one of the two files.
    while True:
        # next(it, None) replaces the Python-2-only .next() method calls.
        f_rec = next(fasta_iter, None)
        q_rec = next(qual_iter, None)
        if f_rec is None and q_rec is None:
            # End of both files
            break
        if f_rec is None:
            raise ValueError("FASTA file has more entries than the QUAL file.")
        if q_rec is None:
            raise ValueError("QUAL file has more entries than the FASTA file.")
        if f_rec.id != q_rec.id:
            raise ValueError("FASTA and QUAL entries do not match (%s vs %s)."
                             % (f_rec.id, q_rec.id))
        if len(f_rec) != len(q_rec.letter_annotations["phred_quality"]):
            raise ValueError("Sequence length and number of quality scores disagree for %s"
                             % f_rec.id)
        # Merge the data....
        f_rec.letter_annotations["phred_quality"] = q_rec.letter_annotations["phred_quality"]
        yield f_rec