def fetch(args):
    """Write the requested regions of an indexed FASTA file to stdout.

    Regions come from ``args.regions`` plus, when ``args.list`` is set, one
    region per line of that open file object.  A region is either ``name``
    (whole record) or ``name:start-end`` (1-based start, inclusive end).
    Anything that cannot be parsed as ``start-end`` falls back to the whole
    record.

    Reads ``args.fasta``, ``args.regions``, ``args.list``, ``args.complement``,
    ``args.reverse`` and ``args.name``.
    """
    fasta = Fasta(args.fasta)
    try:
        # Work on a copy so the caller's argument list is not mutated.
        regions = list(args.regions or [])
        if args.list:
            with args.list as listfile:
                regions.extend(entry.rstrip() for entry in listfile)

        for region in regions:
            fields = region.split()
            if not fields:
                # Blank entry (e.g. empty line in the list file): nothing to fetch.
                continue
            region = fields[0]

            try:
                rname, interval = region.split(':')
            except ValueError:
                rname = region
                interval = None

            try:
                start, end = interval.split('-')
                sequence = fasta[rname][int(start) - 1:int(end)]
            except (AttributeError, ValueError):
                # No interval given, or it was not of the form "start-end".
                sequence = fasta[rname][:]

            if args.complement:
                sequence = sequence.complement
            if args.reverse:
                sequence = sequence.reverse

            # Re-use the line width recorded in the index for this record.
            line_len = fasta[rname]._fa.faidx.index[rname]['lenc']
            if args.name:
                sys.stdout.write('>' + sequence.name + '\n')
            for line in wrap_sequence(line_len, sequence.seq):
                sys.stdout.write(line)
    finally:
        # Release the underlying file handle even when a lookup fails.
        fasta.close()
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Yield the FASTA-formatted lines for one record or sub-range.

    Args:
        args: options object; reads ``auto_strand``, ``complement``,
            ``reverse``, ``no_output``, ``no_names`` and ``no_coords``.
        fasta: indexed FASTA object supporting ``fasta[name][start:end]`` and
            ``fasta.faidx.index[name].lenc``.
        name: record name to fetch.
        start, end: optional slice bounds passed straight to the record.

    Yields:
        A ``>header`` line (unless ``args.no_names``) followed by the wrapped
        sequence lines.  Yields nothing when the record is missing (a warning
        is written to stderr) or when ``args.no_output`` is set.
    """
    try:
        line_len = fasta.faidx.index[name].lenc
        # Test for None *before* comparing: ordering None against None/int
        # raises TypeError on Python 3.
        flip = (args.auto_strand
                and start is not None
                and end is not None
                and start > end)
        if flip:
            # flip (0, 1] coordinates
            sequence = fasta[name][end - 1:start + 1]
            sequence = sequence.reverse.complement
        else:
            sequence = fasta[name][start:end]
    except KeyError:
        sys.stderr.write(
            "warning: {name} not found in file\n".format(name=name))
        return

    if args.complement:
        sequence = sequence.complement
    if args.reverse:
        sequence = sequence.reverse
    if args.no_output:
        return

    if not args.no_names:
        if (start or end) and not args.no_coords:
            yield ''.join(['>', sequence.fancy_name, '\n'])
        else:
            yield ''.join(['>', sequence.name, '\n'])
    for line in wrap_sequence(line_len, sequence.seq):
        yield line
def splitGenome(self):
    """Write every record of ``self.fastaFileName`` to its own ``.fa`` file.

    Each output file is named ``STANDARD_GENOME_PATH + <record name> + '.fa'``
    and holds a header line followed by the sequence wrapped at 70 columns.
    """
    genome = Fasta(self.fastaFileName)
    for record in genome:
        target = '{}{}.fa'.format(STANDARD_GENOME_PATH, record.name)
        with open(target, 'w') as handle:
            handle.write('>{}\n'.format(record.name))
            handle.writelines(wrap_sequence(70, str(record)))
    print("<<<<<<<Splitted>>>>>>")
def make_long_fasta(filename, nrecs=250, seqlen=SEQLEN):
    """Create a FASTA file of ``nrecs`` identical records and return their headers.

    Every record is named ``header<i>`` and its sequence is the 10-base unit
    ``ACTGACTGAC`` repeated ``seqlen // 10`` times, wrapped at 80 columns.
    """
    unit = "ACTGACTGAC"
    names = []
    with open(filename, 'w') as out:
        for idx in range(nrecs):
            name = "header%i" % idx
            names.append(name)
            out.write('>' + name + '\n')
            out.writelines(pyfaidx.wrap_sequence(80, unit * (seqlen // 10)))
    return names
def Split_chromosome():
    """Prompt for a FASTA base name and split it into one file per record.

    The name read from stdin gets ``.fa`` appended.  Each record is written to
    ``<record name>.fa`` in the current directory, wrapped at 70 columns.
    """
    print("Enter the FASTA file :")
    fasta_path = '{}.{}'.format(input(), 'fa')
    genome = Fasta(fasta_path)
    print("Splitting chromosomes..........")
    for record in genome:
        with open('{}.fa'.format(record.name), 'w') as handle:
            handle.write('>{}\n'.format(record.name))
            handle.writelines(wrap_sequence(70, str(record)))
    print("<<<<<<<Splitted>>>>>>")
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Yield FASTA-formatted lines for ``fasta[name][start:end]``.

    Reads ``args.complement``, ``args.reverse`` and ``args.no_names``.  Unless
    names are suppressed, the first yielded item is a ``>`` header: the
    record's ``longname`` when a start or end was given, else its ``name``.
    The sequence is wrapped at the line width stored in the index.
    """
    width = fasta.faidx.index[name].lenc
    record = fasta[name][start:end]
    if args.complement:
        record = record.complement
    if args.reverse:
        record = record.reverse

    if not args.no_names:
        label = record.longname if (start or end) else record.name
        yield '>' + label + '\n'

    for chunk in wrap_sequence(width, record.seq):
        yield chunk
def bed_to_introns(bed_in, fasta_in, fasta_out):
    """Extract the sequence of every interval in a 4-column BED file.

    Args:
        bed_in: path to a whitespace-separated file with columns
            ``ref start stop genename``.
        fasta_in: path to the reference FASTA (record keys are the first
            whitespace-delimited token of each header).
        fasta_out: path of the FASTA file to write; each record is named
            ``ref:start-end`` and wrapped at 60 columns.

    Duplicate intervals, intervals whose start exceeds their stop, and
    zero-length intervals are logged as warnings and skipped.
    """
    logging.info("Opening FASTA: {0}".format(fasta_in))
    logging.info("Note: will take a while the first time it is opened.")
    fasta = Fasta(fasta_in,
                  key_function=lambda key: key.split()[0],
                  strict_bounds=True)

    seen_keys = set()
    output_seq = []
    count = 0
    # Context manager so the BED handle is closed even if a lookup raises.
    with open(bed_in, 'r') as bed_h:
        for line in bed_h:
            if not line.strip():
                continue  # tolerate blank lines
            if count % 10000 == 0 and count > 0:
                logging.info("On intron: {0}".format(count))
            ref, start, stop, genename = line.split()

            key = ref + ':' + start + '-' + stop
            if key in seen_keys:
                logging.warning("ERROR: {0} appears once already".format(key))
                continue
            seen_keys.add(key)

            begin, finish = int(start), int(stop)
            # Validate before fetching so reversed coordinates take this
            # warning path instead of reaching the slice.
            if begin > finish:
                logging.warning(
                    "Intron coords greater than reference: {0}:{1}-{2}".format(
                        ref, start, stop))
                logging.warning(
                    "Reference length: {0}".format(len(fasta[ref])))
                continue

            seq = fasta[ref][begin:finish]
            if len(seq) == 0:
                logging.warning("Intron length is 0? {0}:{1}-{2}".format(
                    ref, start, stop))
                continue
            output_seq.append(seq)
            count += 1

    with open(fasta_out, 'w') as outf:
        logging.info("Writing intron sequences out to {0}".format(fasta_out))
        for rec in output_seq:
            # Header uses a 0-based start (rec.start - 1) and rec.end as-is.
            outf.write('>{rname}:{start}-{end}\n'.format(
                rname=rec.name, start=str(rec.start - 1), end=str(rec.end)))
            for line in wrap_sequence(60, rec.seq):
                outf.write(line)
def fetch_sequence(args, fasta, name, start=None, end=None):
    """Yield FASTA-formatted lines for ``fasta[name][start:end]``.

    Reads ``args.complement``, ``args.reverse``, ``args.no_names`` and
    ``args.full_names``.  If ``name`` is not present a warning is written to
    stderr and nothing is yielded.  The header is, in order of precedence:
    omitted (``no_names``), the record's ``long_name`` (``full_names``), the
    slice's ``longname`` (a start or end was given), or the slice's ``name``.
    """
    try:
        width = fasta.faidx.index[name].lenc
        record = fasta[name][start:end]
    except KeyError:
        sys.stderr.write("warning: {name} not found in file\n".format(name=name))
        return

    if args.complement:
        record = record.complement
    if args.reverse:
        record = record.reverse

    if not args.no_names:
        if args.full_names:
            title = fasta[name].long_name
        elif start or end:
            title = record.longname
        else:
            title = record.name
        yield '>' + title + '\n'

    for chunk in wrap_sequence(width, record.seq):
        yield chunk
# Extract each listed interval to OUT (reverse-complemented on the minus
# strand) and collect, per position within the interval, the bases observed.
with open(pos) as P, open(outfa, 'w') as OUT, open(outdisstat, 'w') as DIS:
    tmp = {}
    basedis = {}
    DIS.write('location\tA\tC\tG\tT\n')
    for i in P:
        if i.startswith('PA'):
            continue
        # Coordinates are used with Python slice semantics; per the original
        # note, makecord is expected to have already subtracted one.
        id, st, ed, strand = makecord(i.strip())
        seqname = ':'.join([id, str(st) + '-' + str(ed), strand])
        sequence = fasta[id][st:ed]
        line_len = fasta.faidx.index[id].lenc
        if strand == '-':
            sequence = sequence.complement.reverse
        OUT.write('>' + seqname + '\n')
        for line in wrap_sequence(line_len, sequence.seq):
            OUT.write(line)
        for index, base in enumerate(sequence.seq):
            basedis.setdefault(index, []).append(base)
    for index, base in basedis.items():
        c = Counter(base)
        total = sum(c.values())
        percent = {key: value / total for key, value in c.items()}
def main(progname=None):
    """Build two parental "normal" genomes by applying germline SNPs to a reference.

    Parses the command line, loads the reference FASTA and its ``.fai`` index,
    attaches the VCF variants to the genome profile, creates the output
    directory, and writes ``normal.parental_0.fa`` and ``normal.parental_1.fa``
    into it.

    Args:
        progname: program name shown in usage text; defaults to ``sys.argv[0]``.
    """
    parse = argparse.ArgumentParser(
        description='Build normal genome by integrating germline SNPs from a VCF file.',
        prog=progname if progname else sys.argv[0])
    parse.add_argument('-v', '--vcf',
                       type=check_vcf,
                       required=True,
                       metavar='FILE',
                       help='a VCF file containing germline SNPs')
    parse.add_argument('-r', '--reference',
                       type=str,
                       required=True,
                       metavar='FILE',
                       help='a fasta file of reference genome')
    default = 'normal_fa'
    parse.add_argument('-o', '--output',
                       type=check_output_folder,
                       default=default,
                       metavar='DIR',
                       help='output directory [{}]'.format(default))
    parse.add_argument('-a', '--autosomes',
                       type=check_autosomes,
                       required=True,
                       metavar='STR',
                       help='autosomes of the genome (e.g. 1,2,3,4,5 or 1..4,5)')
    default = None
    parse.add_argument('-s', '--sex_chr',
                       type=check_sex,
                       default=default,
                       metavar='STR',
                       help='sex chromosomes of the genome (separated by comma) [{}]'.format(
                           default))
    args = parse.parse_args()

    # Identity test for None rather than equality.
    args.sex_chr = [] if args.sex_chr is None else args.sex_chr.split(',')
    autosomes = parse_autosomes(args.autosomes)

    # Build the data structure: genome_profile.
    reference = pyfaidx.Fasta(args.reference)
    genome_profile = fai_info(fai=args.reference + '.fai',
                              autosomes=autosomes,
                              sex_chr=args.sex_chr)
    # Fill in the list hap_vars in genome_profile.
    add_vcf_vars(profile=genome_profile, vcf=args.vcf)

    os.mkdir(args.output, mode=0o755)
    for i in range(2):
        with open('{}/normal.parental_{}.fa'.format(args.output, i), 'w') as output:
            for chroms in genome_profile['order']:
                haplotypes = genome_profile[chroms]['hap_vars']
                # Skip chromosomes that have no haplotype with this index.
                if i >= len(haplotypes):
                    continue
                start = 0
                segments = []
                for snp in haplotypes[i]:
                    try:
                        segments.append(
                            reference[chroms][start:(snp[0] - 1)].seq)
                    except ValueError:
                        # Adjacent SNPs (e.g. chr1:45 then chr1:46) request the
                        # empty slice [45:45]; per the original author's note
                        # this raises instead of returning ''.  Only that case
                        # is tolerated.
                        if snp[0] - start != 1:
                            raise
                    segments.append(snp[1])
                    start = snp[0]
                if start < genome_profile[chroms]['length']:
                    segments.append(reference[chroms][start:].seq)
                output.write('>{}\n'.format(chroms))
                for outputline in pyfaidx.wrap_sequence(
                        genome_profile[chroms]['linebases'],
                        ''.join(segments)):
                    output.write(outputline)
def build_fasta(output=None, chain=None, normal_fa=None, width=None):
    """Assemble the two parental FASTA files described by a chain file.

    Args:
        output: directory in which ``<node>.parental_0.fa`` and
            ``<node>.parental_1.fa`` are written.
        chain: path to the chain file.  Header lines look like
            ``>name parental:N`` (N is 0 or 1); every other line is
            ``chrom start end TYPE [form]`` with TYPE one of REF, SNV, DEL.
        normal_fa: comma-separated FASTA paths, indexed by parental number.
        width: line width used when wrapping the output sequences.

    Raises:
        FastaMissingError: a record refers to a parental with no FASTA.
        ChainFileError: a header or record line cannot be interpreted.
    """
    refs = [pyfaidx.Fasta(fa) for fa in normal_fa.split(',')]
    parentalre = re.compile('^parental:[01]$')
    node = os.path.basename(chain).split('.')[0]

    def flush(handle, record_name, pieces):
        # Write one finished record; records with no segments are skipped.
        if not pieces:
            return
        handle.write('>{}\n'.format(record_name))
        for outputline in pyfaidx.wrap_sequence(width, ''.join(pieces)):
            handle.write(outputline)

    outputf = []
    try:
        for parental in 0, 1:
            outputf.append(
                open('{}/{}.parental_{}.fa'.format(output, node, parental),
                     'w'))

        reference = None
        with open(chain) as inputf:
            seq_name = None
            parental = None
            seq = []
            for line in inputf:
                line = line.rstrip()
                if line.startswith('>'):
                    # A new header closes the previous record, if any.
                    if seq:
                        flush(outputf[parental], seq_name, seq)
                    seq_name, parental = line[1:].split()
                    if not parentalre.match(parental):
                        raise ChainFileError(
                            'The format of this line below from the chain file '
                            + '({}) is not correct:\n{}\n'.format(chain, line))
                    parental = int(parental.split(':')[1])
                    try:
                        reference = refs[parental]
                    except IndexError:
                        raise FastaMissingError(
                            'There is no parental {} avalible,\n'.format(
                                parental) +
                            'which is required in the record ({}):\n{}\n'.
                            format(chain, line))
                    seq = []
                else:
                    column = line.split()
                    chroms = column[0]
                    start = int(column[1])
                    end = int(column[2])
                    seq_type = column[3]
                    segment = ''
                    if seq_type == 'REF':
                        segment = reference[chroms][start:end].seq
                    elif seq_type == 'SNV':
                        ref = reference[chroms][start:end].seq
                        m = Mutation(ref=ref, form=column[4])
                        segment = m.alternative
                    elif seq_type == 'DEL':
                        pass  # deletions contribute no sequence
                    else:
                        raise ChainFileError(
                            'Can not recognize the sequence type ({}) '.format(
                                seq_type) +
                            'of the record below from the chain file ({}):\n{}\n'.
                            format(chain, line))
                    seq.append(segment)
            if seq:
                flush(outputf[parental], seq_name, seq)
    finally:
        # Close whatever was opened, including when an error aborts the build.
        for handle in outputf:
            handle.close()
# Import packages
import os
import sys

# Import packages
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import FeatureLocation, CompoundLocation
from Bio import pairwise2
from Bio.Seq import Seq
from pyfaidx import Fasta, wrap_sequence

# Copy the FASTA given as the first command-line argument to
# "<input>.nosynthetic", dropping every record whose description (up to the
# first '>') contains one of the excluded terms.
file = sys.argv[1]
output_file = file
excluded_terms = ('synthetic', 'artificial', 'fragment', 'low quality',
                  'partial')

# The output is opened in append mode, so repeated runs add to the same file.
# The input handle is managed too, so it is closed when parsing ends.
with open(output_file + ".nosynthetic", 'a') as out, open(file) as handle:
    fasta_sequences = SeqIO.parse(handle, 'fasta')
    for fasta in fasta_sequences:
        desc = fasta.description
        # Keep only the text before the first '>' (the whole description when
        # there is none).
        desc_s = desc.split('>')[0]
        if any(term in desc_s for term in excluded_terms):
            continue
        out.write('>{}\n'.format(desc_s))
        for line in wrap_sequence(70, str(fasta.seq)):
            out.write(line)
from os import listdir
from pyfaidx import Fasta, wrap_sequence

# For every *.fasta file in the current directory, write "<stem>S.fasta"
# containing only the records named in q.PAC4GC.hybridum.S.chr.list, wrapped
# at 60 columns.

# The suffix filter alone selects the inputs; no explicit removals needed.
fileList = [fasta for fasta in listdir('.') if fasta.endswith('.fasta')]

with open('q.PAC4GC.hybridum.S.chr.list', 'r') as f:
    chrList = f.read().split('\n')
print(chrList)
# Drop every blank entry; tolerates a missing or repeated trailing newline.
chrList = [chromName for chromName in chrList if chromName]

for file in fileList:
    newfilename = file[:file.find('.')] + 'S.fasta'
    print(newfilename)
    fasta = Fasta(file)
    # Mode 'w' already truncates an existing output file.
    with open(newfilename, 'w') as fOut:
        print(chrList)
        # Single-argument print forms give the same output on Python 2 and 3.
        print('%s %s' % (file, newfilename))
        for chromName in chrList:
            print('>' + chromName)
            fOut.write('>' + chromName + '\n%s\n' % (
                ''.join(wrap_sequence(60, str(fasta[chromName])))))
    fasta.close()