def aa_seq(options):
    """Return the ancestral sequence record selected from a Fasta file.

    Args:
        options: object providing ``ancestralfasta`` (path handed to
            ``Fasta``), ``single_chromosome`` (flag) and, when that flag is
            false, ``header`` (a regex in which ``?`` is replaced by
            ``options.chromosome``).

    Returns:
        The ``Fasta`` entry of the first key when ``single_chromosome`` is
        set; otherwise the entry of the last key that the header regex
        matches (``re.match``, i.e. anchored at the start of the key).

    Raises:
        Exception: if no key matches the header regex.
    """
    f = Fasta(options.ancestralfasta)
    keyz = list(f.keys())
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence;
        # that sequence is the sequence of interest.
        return f[keyz[0]]

    header_regex = options.header.replace('?', options.chromosome)
    match = None
    for key in keyz:
        if re.match(header_regex, key) is not None:
            match = key  # no break: the last matching header wins
    if match is None:
        raise Exception("No match possible is something wrong with the"
                        " regex specified to the program as"
                        "--header-regex")
    # Index with the matched header, not with the loop variable (which is
    # merely the last key of the file once the loop has finished).
    return f[match]
def calc_nuc_counts(fasta_filename, region_size_min, region_size_max, verbose):
    '''
    Count every sub-sequence of each requested length in a fasta file.

    Window lengths run from region_size_min to region_size_max inclusive.
    Windows that would run past the end of a sequence are not counted.
    ``verbose`` is accepted but not used.

    Returns:
        defaultdict mapping window length -> Counter of sub-sequence counts
        (raw counts; nothing is normalised here).
    '''
    counts_by_size = defaultdict(Counter)
    records = Fasta(fasta_filename)
    window_sizes = range(region_size_min, region_size_max + 1)
    for _name, record in records.items():
        for offset, _base in enumerate(record):
            for width in window_sizes:
                window = record[offset:offset + width]
                if len(window) >= width:
                    counts_by_size[width][window] += 1
    return counts_by_size
def _no_empty(self, lista, listb): ''' removes empty entries ''' # check for empty fasta. tmpa = list() tmpb = list() for i in range(len(listb)): # open it. try: z = Fasta(listb[i], record_class=MemoryRecord) # check for empty. if len(z.keys()) == 0: continue # add to temp. tmpa.append(lista[i]) tmpb.append(listb[i]) except: logging.warning("bad fasta file") # sort back. return tmpa, tmpb
def parse_sequences(sites, size, fasta_file):
    """Attach the extended binding-site sequence to every input region.

    For each region dict in ``sites`` the interval ``ext_start``..``ext_end``
    on ``chr`` is fetched from ``fasta_file`` (zero-based coordinates),
    upper-cased, reverse-complemented for '-' strand regions and stored
    under the key ``ext_seq``.

    Args:
        sites: iterable of region dicts with keys ``chr``, ``strand``,
            ``ext_start`` and ``ext_end``; modified in place.
        size: accepted for interface compatibility; not used here.
        fasta_file: path of the genome fasta file.

    Returns:
        ``sites``, with ``ext_seq`` added to every region.
    """
    # Fasta package is needed to fetch sequences from genome fasta file
    from pyfasta import Fasta

    print("INFO: Begin to fetch sequences....")
    f = Fasta(fasta_file, key_fn=lambda key: key.split()[0])
    for reg in sites:
        start = reg["ext_start"]
        end = reg["ext_end"]
        on_minus = reg["strand"] == '-'
        # if motif on negative strand, shift region by +1 to account for
        # zero based half-open intervals
        if on_minus:
            start += 1
            end += 1
        # Note, the 'strand' argument for f.sequence does not work, there
        # seems to be a bug in the pyfasta/fasta.py code, so the reverse
        # complement is taken explicitly below.
        seq = f.sequence({"chr": reg["chr"], "start": start, "stop": end},
                         one_based=False)
        seq = seq.upper()
        if on_minus:
            seq = reverse_complement(seq)
        # add sequence to region dict
        reg["ext_seq"] = seq
    print("INFO: Finished sequences.")
    # The original returned the undefined name `regions` (NameError).
    return sites
def read_fa(fa='/Share/home/zhangqf5/gongjing/Kethoxal_RNA_structure/data/mm10/transcriptome/mm10_transcriptome.fa'):
    """Open a fasta file keyed by the text before the first tab of each header.

    Prints the first three keys as a quick sanity check and logs entry/exit
    through the ``gj`` helpers.

    Returns:
        The ``Fasta`` object for ``fa``.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    fa_dict = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    # list(...) so the slice also works when keys() returns a view (Python 3)
    print(list(fa_dict.keys())[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
def removehost(fasta, bed):
    """Write a copy of ``fasta`` without the sequences named in ``bed``.

    Every line of ``bed`` (trailing whitespace stripped) is treated as a
    sequence name; the whole line is compared against the fasta keys.  The
    result is written next to the input as ``removehost_<basename>``.

    Args:
        fasta: path of the input fasta file.
        bed: path of the file listing the names to drop, one per line.
    """
    import os

    with open(bed) as bedin:
        removeregion = set(line.rstrip() for line in bedin)

    fa = Fasta(fasta)

    # Prefix the file name, not the whole path: 'removehost_' + 'dir/x.fa'
    # would point into a directory that does not exist.  For a bare file
    # name the result is unchanged.
    out_dir, base = os.path.split(fasta)
    outfile = os.path.join(out_dir, 'removehost_' + base)

    with open(outfile, 'w') as outio:
        for seqname in fa.keys():
            if seqname in removeregion:
                continue
            outio.write('>' + seqname + '\n' + str(fa[seqname]) + '\n')
def read_score(score_tab, ref):
    """Build per-transcript reactivity lists from a tab-separated score table.

    Table lines that are blank, start with '@', or have '-' in the second
    column are skipped.  Otherwise column 1 is the transcript name, column 3
    the integer position and column 8 the score.

    Args:
        score_tab: path of the score table.
        ref: path of the reference fasta; headers are cut at the first tab.

    Returns:
        dict: ``{name: {'reactivity_ls': [...]}}`` with one entry per position
        1..len(sequence); positions without a score, or whose score is the
        string '-1', hold 'NULL'.
    """
    fa = Fasta(ref)
    fa_dict = {}
    for header, seq in fa.items():
        fa_dict[header.split('\t')[0]] = seq

    score_dict = nested_dict()
    with open(score_tab, 'r') as TXT:
        for line in TXT:
            line = line.strip()
            if not line or line.startswith('@'):
                continue
            arr = line.split('\t')
            if arr[1] == '-':
                continue
            score_dict[arr[0]][int(arr[2])] = arr[7]
    score_dict = score_dict.to_dict()

    reactivity_dict = nested_dict(2, list)
    for name, scores in score_dict.items():
        # range (not xrange) so this also runs on Python 3
        for p in range(1, len(fa_dict[name]) + 1):
            value = scores.get(p)
            if value is None or value == '-1':
                value = 'NULL'
            reactivity_dict[name]['reactivity_ls'].append(value)
    return reactivity_dict.to_dict()
def create_pyfasta_iterator(self, **kwargs):
    """Index ``kwargs['input']`` with pyfasta and prepare an iterator over it.

    Sets ``self.fasta`` (the Fasta object), ``self.readcount`` (number of
    records), ``self.db_values`` (list of ``(index, key)`` pairs over the
    sorted keys) and ``self.read`` (an iterator over ``db_values``).
    """
    from pyfasta import Fasta

    print("Generating PyFasta sequence index. This may take a moment....")
    self.fasta = Fasta(kwargs['input'])
    self.readcount = len(self.fasta)
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator,
    # so self.read would drain self.db_values.
    self.db_values = list(zip(range(self.readcount),
                              sorted(self.fasta.keys())))
    self.read = iter(self.db_values)
def read_fa(
        fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'
):
    """Load a fasta file into a plain dict keyed by the first header token.

    Each value is the full slice (``record[0:]``) of the corresponding
    record.  The first three keys are printed as a quick sanity check.

    Returns:
        dict: first whitespace-delimited header token -> sliced sequence.
    """
    fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {i.split()[0]: j[0:] for i, j in fa_dict1.items()}
    # dict views cannot be sliced on Python 3, so go through a list
    print(list(fa_dict.keys())[0:3])
    return fa_dict
def aa_seq(options):
    """Return the ancestral sequence record selected from a Fasta file.

    Args:
        options: object providing ``ancestralfasta`` (path handed to
            ``Fasta``), ``single_chromosome`` (flag) and, when that flag is
            false, ``header`` (a regex in which ``?`` is replaced by
            ``options.chromosome``).

    Returns:
        The ``Fasta`` entry of the first key when ``single_chromosome`` is
        set; otherwise the entry of the last key that the header regex
        matches (``re.match``, i.e. anchored at the start of the key).

    Raises:
        Exception: if no key matches the header regex.
    """
    f = Fasta(options.ancestralfasta)
    keyz = list(f.keys())
    if options.single_chromosome:
        # Single chromosome fasta should only have one sequence;
        # that sequence is the sequence of interest.
        return f[keyz[0]]

    header_regex = options.header.replace('?', options.chromosome)
    match = None
    for key in keyz:
        if re.match(header_regex, key) is not None:
            match = key  # no break: the last matching header wins
    if match is None:
        raise Exception("No match possible is something wrong with the"
                        " regex specified to the program as"
                        "--header-regex")
    # Index with the matched header, not with the loop variable (which is
    # merely the last key of the file once the loop has finished).
    return f[match]
def run(self, filename):
    """Locate FR4 for every record of ``filename`` and write region tables.

    For each record whose V gene (the name part before the first '_') has
    known regions and for which ``findFR4`` returns a match, one line is
    written to ``self.result_kabat_file``: the record name, the ten V-gene
    region values, then four values offset by ``vGeneRegions[9]``.
    A progress line is printed every 1000 records and a summary at the end.
    """
    self.openOutFiles(filename)
    try:
        f = Fasta(filename)
        count = len(f)
        self.not_found_in_kabat, self.fr4_not_found, current = (0, 0, 0)
        for name in f.keys():
            current += 1
            if current % 1000 == 0:
                print("All %d. Current: %d" % (count, current))
            # format: vName_jName{frameNumber} or
            #         vName_dName{frameNumber}_jName{frameNumber}
            vGeneName = name.split("_")[0]
            vGeneRegions = self.getVGeneRegions(vGeneName)
            if vGeneRegions is None:
                continue
            # sequence remaining after the last annotated V-gene region
            withoutMarkup = f[name][vGeneRegions[self.kabat.regions_count * 2 - 1]:]
            group = self.findFR4(name, withoutMarkup)
            if group is None:
                continue
            tail = [vGeneRegions[9] + i
                    for i in [1, group.start(), group.start() + 1,
                              len(withoutMarkup)]]
            self.result_kabat_file.write(name)
            self.result_kabat_file.write(("\t%d" * 10) % tuple(vGeneRegions))
            self.result_kabat_file.write(("\t%d" * 4 + "\n") % tuple(tail))
    finally:
        # close the output files even when a record fails half-way
        self.closeOutFiles()
    print("all: {}; not in kabat: {}; without fr4: {}".format(
        current, self.not_found_in_kabat, self.fr4_not_found))
def genome_contenct_stats(fasta_path):
    """Report the number of G-box motifs (CACGTG) in a fasta file.

    Matching is case-insensitive and non-overlapping (``findall``).  The
    total over all records is written to stderr as ``total gboxes:<n>``.
    """
    f = Fasta(fasta_path)
    g_box = re.compile('CACGTG', flags=re.IGNORECASE)
    total = sum(len(g_box.findall(f[seqid][:])) for seqid in f.keys())
    # write() instead of the Python 2-only `print >>` form
    sys.stderr.write("total gboxes:{0}\n".format(total))
def read_fa(fa='/Share/home/zhangqf7/gongjing/zebrafish/data/reference/transcriptome/danRer10.refSeq.transcriptome.fa'):
    """Load a fasta file into a plain dict keyed by the first header token.

    Each value is the full slice (``record[0:]``) of the corresponding
    record.  The first three keys are printed as a quick sanity check and
    entry/exit is logged through the ``gj`` helpers.

    Returns:
        dict: first whitespace-delimited header token -> sliced sequence.
    """
    gj.printFuncRun('read_fa')
    gj.printFuncArgs()
    fa_dict1 = Fasta(fa, key_fn=lambda key: key.split("\t")[0])
    fa_dict = {i.split()[0]: j[0:] for i, j in fa_dict1.items()}
    # dict views cannot be sliced on Python 3, so go through a list
    print(list(fa_dict.keys())[0:3])
    gj.printFuncRun('read_fa')
    return fa_dict
def create_fasta_flat_file(file):
    """Open a fasta file for fast sequence retrieval.

    Headers are keyed by their first whitespace-delimited token.

    Returns:
        tuple: ``(fasta, headers)`` -- the ``Fasta`` object and the set of
        its keys.
    """
    def first_token(header):
        return header.split()[0]

    indexed = Fasta(file, key_fn=first_token)
    return indexed, set(indexed.keys())
def genome_contenct_stats(fasta_path):
    """Report the number of G-box motifs (CACGTG) in a fasta file.

    Matching is case-insensitive and non-overlapping (``findall``).  The
    total over all records is written to stderr as ``total gboxes:<n>``.
    """
    f = Fasta(fasta_path)
    g_box = re.compile("CACGTG", flags=re.IGNORECASE)
    total = sum(len(g_box.findall(f[seqid][:])) for seqid in f.keys())
    # write() instead of the Python 2-only `print >>` form
    sys.stderr.write("total gboxes:{0}\n".format(total))
def getSequence(genome):
    """Fetch the sequence of every peak and save the table as RAD_seq.csv.

    Reads ``../data/input_data/peak.csv`` (columns ``chrom``, ``start``,
    ``end``), looks each interval up in the fasta file ``genome``, stores
    the result in a ``seq`` column, post-processes the rows with ``fuc``
    and writes ``../data/input_data/RAD_seq.csv``.
    """
    genome = Fasta(genome)
    RAD_seq = pd.read_csv('../data/input_data/peak.csv')
    # A real list (not a lazy map object) so the column assignment also
    # works on Python 3.  Each cell is a one-element list, as before.
    RAD_seq['seq'] = [
        [genome.sequence({'chr': chrom, 'start': start, 'stop': stop})]
        for chrom, start, stop in zip(RAD_seq['chrom'],
                                      RAD_seq['start'],
                                      RAD_seq['end'])
    ]
    RAD_seq['seq'] = RAD_seq.apply(fuc, axis=1)
    RAD_seq.to_csv('../data/input_data/RAD_seq.csv', index=False)
    print('getSequence is over,RAD_seq.csv is bulit!')
def check_keyfn2(path, klass, inplace):
    """Check a ``key_fn`` that joins the header's whitespace tokens with '-'.

    Expects the keys 'a-extra', 'b-extra', 'c-extra' and a truthy record for
    'a-extra'; calls ``fix(path)`` afterwards.
    """
    def dash_joined(header):
        return "-".join(header.split())

    f = Fasta(path, record_class=klass, flatten_inplace=inplace,
              key_fn=dash_joined)
    expected = ['a-extra', 'b-extra', 'c-extra']
    assert sorted(f.keys()) == expected, f.keys()
    assert f['a-extra']
    fix(path)
class Alg:
    """Per-column nucleotide statistics and colour codes for an alignment.

    On construction the fasta file is read, per-position nucleotide
    frequencies are accumulated in ``self.pos`` and written to ``freqfn``.
    """

    def __init__(self, fastafn, freqfn, colorfn):
        """Read ``fastafn`` and write the frequency table to ``freqfn``.

        ``colorfn`` is the path that ``do_plot`` later writes colour codes to.
        """
        self.pos = []
        self.init = False
        self.size = 0
        self.fasta = Fasta(fastafn)
        self.colorfn = colorfn
        # nucleotide (lower case) -> integer colour code
        self.conta = {'n': 0, '-': 0, 'a': 1, 'c': 2, 'g': 3, 't': 4,
                      '\n': '\n'}
        self.read_fasta(fastafn)
        self.write_freqs(freqfn)

    def do_plot(self, plot, names = False):
        """Write the colour file; return the colour matrix only if ``plot``."""
        msa = self.seqtocol(self.colorfn, names=names)
        if plot:
            return msa

    def read_fasta(self, fastafn):
        """Accumulate per-position nucleotide counts from ``self.fasta``.

        ``fastafn`` is not used; the already opened ``self.fasta`` is read.
        """
        for entry in self.fasta.keys():
            seq = self.fasta[entry][:]
            if not self.init:
                # this assumes that all the entries in the fasta record are
                # the same size (the default setting for clustalo).
                # TODO add an assertion to verify so
                self.size = len(seq)
                for i in range(0, self.size):
                    self.pos.append(Pos(i))
                self.init = True
            for nt in range(0, self.size):
                self.pos[nt].freq[seq[nt].lower()] += 1

    def seqtocol(self, outfn, names=False):
        """Write ``name,code,code,...`` lines to ``outfn``; return the codes.

        Returns:
            numpy array of colour codes.  Without ``names`` the shape is
            (number of sequences, sequence length).  With ``names`` every
            sequence name precedes its codes and the flat list is reshaped
            to (length + 1, number of sequences).
        """
        colors = []
        # defaults make an empty alignment yield an empty array, not NameError
        row, codes = -1, []
        with open(outfn, 'w') as outf:
            # distinct names for the row index and the nucleotide: reusing
            # `i` for both clobbers the row index on Python 2
            for row, entry in enumerate(self.fasta.keys()):
                codes = [self.conta[nt.lower()] for nt in self.fasta[entry][:]]
                outf.write(entry + ',' + ','.join(str(c) for c in codes) + '\n')
                if names:
                    colors.append(entry)
                colors.extend(codes)
        if names:
            # TODO this is very weird, check why this option returns the
            # transposed shape.  NOTE(review): reshape does not transpose;
            # the (len + 1, n) shape is kept as-is for existing callers.
            colors = np.array(colors).reshape(1 + len(codes), 1 + row)
        else:
            colors = np.array(colors).reshape(1 + row, len(codes))
        return colors

    def write_freqs(self, outfn):
        """Write a tab-separated a/c/t/g count table, one line per position."""
        with open(outfn, 'w') as outf:
            outf.write('\t'.join(['a', 'c', 't', 'g']) + '\n')
            for j in self.pos:
                outf.write('\t'.join([str(j.freq['a']), str(j.freq['c']),
                                      str(j.freq['t']), str(j.freq['g'])])
                           + '\n')
def __init__(self, fastafn, freqfn, colorfn):
    """Open ``fastafn``, tally it with ``read_fasta`` and write ``freqfn``.

    ``colorfn`` is only stored on the instance here.
    """
    # bookkeeping that read_fasta fills in
    self.pos = []
    self.init = False
    self.size = 0

    self.fasta = Fasta(fastafn)
    self.colorfn = colorfn

    # nucleotide (lower case) -> integer colour code
    self.conta = {
        'n': 0,
        '-': 0,
        'a': 1,
        'c': 2,
        'g': 3,
        't': 4,
        '\n': '\n',
    }

    self.read_fasta(fastafn)
    self.write_freqs(freqfn)
def split(args):
    """Command-line handler that splits a fasta file into several files.

    Parses ``args`` with optparse.  Prints the help and exits when no fasta
    file is given or neither ``-n`` nor ``--header`` is supplied.  With
    ``--header`` one file is written per sequence via ``with_header_names``;
    otherwise the new file names come from ``newnames`` and the work is done
    by ``without_kmers`` or ``with_kmers`` depending on ``-k``.
    """
    usage = """\
   split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
   """
    header_help = """this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.: "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file."""
    kmers_help = """\
    split big files into pieces of this size in basepairs. default
    default of -1 means do not split the sequence up into k-mers, just
    split based on the headers. 
 a reasonable value would be 10Kbp"""

    parser = optparse.OptionParser(usage)
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
                      help=header_help, default=None)
    parser.add_option("-n", "--n", type="int", dest="nsplits",
                      help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap",
                      help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                      help=kmers_help)

    options, fasta = parser.parse_args(args)
    split_requested = options.nsplits or options.header
    if not (fasta and split_requested):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    # -1 / 0 are the "not set" sentinels of the two options
    kmer = None if options.kmers == -1 else options.kmers
    overlap = None if options.overlap == 0 else options.overlap

    f = Fasta(fasta)
    if options.header:
        names = {}
        for seqid in f.iterkeys():
            names[seqid] = options.header % dict(fasta=f.fasta_name,
                                                 seqid=seqid)
        return with_header_names(f, names)

    names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap,
                     header=options.header)
    if options.kmers == -1:
        return without_kmers(f, names)
    return with_kmers(f, names, options.kmers, options.overlap)
def mask_to_bed(fasta_file, mask_bed_name):
    """Create a bed-like file with the starts and stops of masked runs.

    Every run of one or more 'X' characters in every record of
    ``fasta_file`` becomes one tab-separated line in ``mask_bed_name``:
    seqid, start, end, ``mask_id <n>``, run length, '+', '.', '.', '.', 1,
    run length + 1, 0.  The id counter is incremented before use, so the
    first run is labelled ``mask_id 2``.
    """
    # Text mode plus a context manager: the rows are str, and the handle
    # was previously never closed.
    with open(mask_bed_name, "w") as mask_bed:
        f = Fasta(fasta_file)
        mask_id = 1
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in re.finditer("X+", seq):
                mask_id = mask_id + 1
                run_length = m.end() - m.start()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, m.start(), m.end(),
                    "mask_id {0}".format(mask_id),
                    run_length, run_length + 1)
                mask_bed.write(w)
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is true, then also save a file `some.fr.fasta` with the
    forward and reverse sequences without conversion.

    Both files live in a `bowtie_index` (or `bowtie_index_colorspace`)
    directory next to the fasta file; the directory is created if needed.
    Existing output is reused instead of being rewritten.

    Returns:
        tuple: (converted file name, unconverted file name).  The second
        name is returned even when that file was not requested.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace:
        d += "_colorspace"
    if not op.exists(d):
        os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname):
        if not unconverted:
            return fname, unconverted_fname
        elif op.exists(unconverted_fname):
            return fname, unconverted_fname

    fasta = Fasta(fasta_name)
    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None
    sys.stderr.write("writing forward and reverse c2t to: %s\n" % (fname,))

    try:
        try:
            for header in fasta.iterkeys():
                seq = str(fasta[header]).upper()
                assert not ">" in seq
                rseq = revcomp(seq)
                # c2t, prefix header with f and write
                c2t_fh.write(">f%s\n" % header)
                c2t_fh.write(seq.replace('C', 'T') + "\n")
                # then r-c, c2t, prefix header with r and write
                c2t_fh.write(">r%s\n" % header)
                c2t_fh.write(rseq.replace('C', 'T') + "\n")
                if unc_fh is not None:
                    unc_fh.write(">f%s\n%s\n" % (header, seq))
                    unc_fh.write(">r%s\n%s\n" % (header, rseq))
        finally:
            # both handles are closed on success and on failure
            c2t_fh.close()
            if unc_fh is not None:
                unc_fh.close()
    except BaseException:
        # Remove partial output, but only files this call created, so that
        # a missing file cannot mask the original error.
        os.unlink(fname)
        if unc_fh is not None:
            os.unlink(unconverted_fname)
        raise
    return fname, unconverted_fname
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is true, then also save a file `some.fr.fasta` with the
    forward and reverse sequences without conversion.

    Both files live in a `bowtie_index` (or `bowtie_index_colorspace`)
    directory next to the fasta file; the directory is created if needed.
    Existing output is reused instead of being rewritten.

    Returns:
        tuple: (converted file name, unconverted file name).  The second
        name is returned even when that file was not requested.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace:
        d += "_colorspace"
    if not op.exists(d):
        os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname):
        if not unconverted:
            return fname, unconverted_fname
        elif op.exists(unconverted_fname):
            return fname, unconverted_fname

    fasta = Fasta(fasta_name)
    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None
    sys.stderr.write("writing forward and reverse c2t to: %s\n" % (fname,))

    try:
        try:
            for header in fasta.iterkeys():
                seq = str(fasta[header]).upper()
                assert not ">" in seq
                rseq = revcomp(seq)
                # c2t, prefix header with f and write
                c2t_fh.write(">f%s\n" % header)
                c2t_fh.write(seq.replace('C', 'T') + "\n")
                # then r-c, c2t, prefix header with r and write
                c2t_fh.write(">r%s\n" % header)
                c2t_fh.write(rseq.replace('C', 'T') + "\n")
                if unc_fh is not None:
                    unc_fh.write(">f%s\n%s\n" % (header, seq))
                    unc_fh.write(">r%s\n%s\n" % (header, rseq))
        finally:
            # both handles are closed on success and on failure
            c2t_fh.close()
            if unc_fh is not None:
                unc_fh.close()
    except BaseException:
        # Remove partial output, but only files this call created, so that
        # a missing file cannot mask the original error.
        os.unlink(fname)
        if unc_fh is not None:
            os.unlink(unconverted_fname)
        raise
    return fname, unconverted_fname
def cut_up_genome(input_files_list, output_folder, region_length):
    """Cut every chromosome of every fasta file into fixed-length regions.

    Chromosomes are processed in sorted key order.  Each one is sliced into
    consecutive pieces of ``region_length`` (the last piece may be shorter)
    and handed to ``write_to_json`` together with the target path
    ``<output_folder>/chr=<chromosome>``.
    """
    for fasta_path in input_files_list:
        genome = Fasta(fasta_path)
        for chromosome in sorted(genome.keys()):
            sequence = genome[chromosome]
            offsets = range(0, len(sequence), region_length)
            regions = [sequence[lo:lo + region_length] for lo in offsets]
            path = os.path.join(output_folder, f'chr={chromosome}')
            write_to_json(path, regions, region_length)
            print(f'{chromosome} is complete!')
def mask_to_bed(fasta_file, mask_bed_name):
    """Create a bed-like file with the starts and stops of masked runs.

    Every run of one or more 'X' characters in every record of
    ``fasta_file`` becomes one tab-separated line in ``mask_bed_name``:
    seqid, start, end, ``mask_id <n>``, run length, '+', '.', '.', '.', 1,
    run length + 1, 0.  The id counter is incremented before use, so the
    first run is labelled ``mask_id 2``.
    """
    # Text mode plus a context manager: the rows are str, and the handle
    # was previously never closed.
    with open(mask_bed_name, "w") as mask_bed:
        f = Fasta(fasta_file)
        mask_id = 1
        for seqid in f.keys():
            seq = f[seqid][:]
            for m in re.finditer("X+", seq):
                mask_id = mask_id + 1
                run_length = m.end() - m.start()
                w = '{0}\t{1}\t{2}\t{3}\t{4}\t+\t.\t.\t.\t1\t{5}\t0\n'.format(
                    seqid, m.start(), m.end(),
                    "mask_id {0}".format(mask_id),
                    run_length, run_length + 1)
                mask_bed.write(w)
def mask(fasta_file, org, cutoff, mask_value='X'):
    """Write a masked copy of ``fasta_file`` based on stored hit counts.

    The output file name is the input name with ``.masked.<cutoff>``
    inserted before the extension.  For every sequence (sorted by id) the
    per-position counts are read from the attribute ``'c' + seqid`` of the
    node returned by ``get_node(org, 'r')``; positions whose count exceeds
    ``cutoff`` are replaced.  Sequences without such an attribute are
    written unchanged and reported on stderr.

    Args:
        fasta_file: path of the fasta file to mask.
        org: passed to ``get_node`` to locate the stored counts.
        cutoff: count threshold; formatted with ``%i``.
        mask_value: replacement character, or 'soft' (any case) to
            lower-case the masked positions of the upper-cased sequence.

    Side effects: writes the masked fasta, writes ``<outfile>.version`` by
    running ``svnversion`` through the shell, prints progress.
    """
    h5, node = get_node(org, 'r')

    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    # NOTE(review): `out` and `h5` are only closed on the success path.
    out = open(outfile ,'w')

    fasta = Fasta(fasta_file)

    # decided once, before mask_value is rebound inside the loop
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        masked = 0  # NOTE(review): assigned but never read
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            # presumably makes slicing return an array instead of a
            # string -- TODO confirm against the record class
            fasta[seqid].tostring = False
            seq = fasta[seqid][:] # a

        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                ' this means that no section of this sequence appeared\n' \
                ' more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        # NOTE(review): the local name `hit_counts` appears inside the
        # numexpr expression string below; presumably numexpr resolves it
        # by name, so do not rename it -- confirm.
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
                              , mask_value, seq).tostring()

        l = len(masked_seq)
        # NOTE(review): in soft mode mask_value is an array at this point,
        # so the %masked figure looks unreliable there -- confirm.
        print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l,
                                   100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    # NOTE(review): shell command built from file names; unsafe if the
    # names can contain shell metacharacters.
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
def main(gff_file, outdir):
    """Extract non-CDS features from a GFF file and write their sequences.

    Features are grouped by the ``parent=`` value of their last column.
    Groups containing at least one CDS line are discarded.  Of the rest,
    features on ChrC/ChrM, exon features and repeated (chromosome, start,
    end) triples are skipped; the survivors are written to
    ``<outdir>/at_non_cds.gff`` with the chromosome upper-cased and stripped
    of "CHR", and the last column replaced by the parent name.  That file
    is re-read with ``read_gff`` and the sequence of every feature is
    written to ``<outdir>/at_rnas.fasta``.
    """
    parent_re = re.compile("parent=([^.;]+)", re.I)
    parents_with_cds = set()
    non_cds_feats = collections.defaultdict(list)
    with open(gff_file) as gff_in:
        for raw in gff_in:
            fields = raw.split("\t")
            match = parent_re.search(fields[-1])
            if not match:
                continue
            parent = match.group(1)
            non_cds_feats[parent].append(fields)
            if fields[2].upper() == "CDS":
                parents_with_cds.add(parent)

    # a single CDS line disqualifies the whole parent group
    for parent in parents_with_cds:
        del non_cds_feats[parent]

    seen = set()
    non_cds_path = outdir + "/at_non_cds.gff"
    with open(non_cds_path, "w") as rna_out:
        for parent, feat_list in sorted(non_cds_feats.items()):
            for feat in feat_list:
                if feat[0] in ("ChrC", "ChrM"):
                    continue
                if feat[2] == "exon":
                    continue
                key = (feat[0], feat[3], feat[4])
                if key in seen:
                    continue
                feat[0] = feat[0].upper().replace("CHR", "")
                seen.add(key)
                # replaces the attribute column (and its trailing newline)
                feat[-1] = parent
                rna_out.write("\t".join(feat) + "\n")

    gff = read_gff(non_cds_path)
    fasta = Fasta("/home/gturco/src/find_cns_gturco/pipeline/data/arabidopsis.fasta")
    with open(outdir + "/at_rnas.fasta", "w") as fa_out:
        # assumes read_gff returns nested dicts (chromosome -> name ->
        # feature); items() works on Python 2 and 3 alike
        for _chrom, feature_list in gff.items():
            for _fname, feature in feature_list.items():
                seq = fasta.sequence(feature)
                # same bytes as `print >> FA, ">", name`: "> name"
                fa_out.write("> %s\n" % (feature["name"],))
                fa_out.write("%s\n" % (seq,))
def mask(fasta_file, org, cutoff, mask_value='X'):
    """Write a masked copy of ``fasta_file`` based on stored hit counts.

    The output file name is the input name with ``.masked.<cutoff>``
    inserted before the extension.  For every sequence (sorted by id) the
    per-position counts are read from the attribute ``'c' + seqid`` of the
    node returned by ``get_node(org, 'r')``; positions whose count exceeds
    ``cutoff`` are replaced.  Sequences without such an attribute are
    written unchanged and reported on stderr.

    Args:
        fasta_file: path of the fasta file to mask.
        org: passed to ``get_node`` to locate the stored counts.
        cutoff: count threshold; formatted with ``%i``.
        mask_value: replacement character, or 'soft' (any case) to
            lower-case the masked positions of the upper-cased sequence.

    Side effects: writes the masked fasta, writes ``<outfile>.version`` by
    running ``svnversion`` through the shell, prints progress.
    """
    h5, node = get_node(org, 'r')

    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    # NOTE(review): `out` and `h5` are only closed on the success path.
    out = open(outfile, 'w')

    fasta = Fasta(fasta_file)

    # decided once, before mask_value is rebound inside the loop
    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        masked = 0  # NOTE(review): assigned but never read
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            # presumably makes slicing return an array instead of a
            # string -- TODO confirm against the record class
            fasta[seqid].tostring = False
            seq = fasta[seqid][:]  # a

        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                ' this means that no section of this sequence appeared\n' \
                ' more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        # NOTE(review): the local name `hit_counts` appears inside the
        # numexpr expression string below; presumably numexpr resolves it
        # by name, so do not rename it -- confirm.
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff),
                              mask_value, seq).tostring()

        l = len(masked_seq)
        # NOTE(review): in soft mode mask_value is an array at this point,
        # so the %masked figure looks unreliable there -- confirm.
        print >> sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (
            seqid, l, 100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    # NOTE(review): shell command built from file names; unsafe if the
    # names can contain shell metacharacters.
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
def check_keyfn(path, klass, inplace):
    """Check keys with and without a first-token ``key_fn``.

    With the key function the keys must be 'a', 'b', 'c'; without it the
    full headers 'a extra', 'b extra', 'c extra'.  ``fix(path)`` is called
    after each check.
    """
    def first_token(header):
        return header.split()[0]

    keyed = Fasta(path, record_class=klass, flatten_inplace=inplace,
                  key_fn=first_token)
    assert sorted(keyed.keys()) == ['a', 'b', 'c'], keyed.keys()
    fix(path)

    plain = Fasta(path, record_class=klass, flatten_inplace=inplace)
    full_headers = ['a extra', 'b extra', 'c extra']
    assert sorted(plain.keys()) == full_headers, (plain.keys(), klass)
    fix(path)
def check_kmer_overlap(f):
    """Check offsets and shared bases of overlapping k-mers on 'chr2'.

    With k=10 and overlap=2 every k-mer but the last must be 10 long and
    start at a multiple of 8.  With overlap=4 the last four bases of each
    k-mer must equal the first four of the next (pairs containing a piece
    shorter than four bases are skipped).
    """
    chr2 = f['chr2']

    stride = 10 - 2
    windows = list(Fasta.as_kmers(chr2, 10, overlap=2))
    for i, window in enumerate(windows[:-1]):
        offset, kmer = window[0], window[1]
        assert len(kmer) == 10
        assert offset == i * stride

    seqs = [window[1] for window in Fasta.as_kmers(chr2, 10, overlap=4)]
    for left, right in zip(seqs[0:-1], seqs[1:]):
        if len(left) >= 4 and len(right) >= 4:
            assert left[-4:] == right[:4]
def spgenome(fafile, outdir, maxsize=1000000000):
    """Split a genome fasta into chunk files of roughly ``maxsize`` bases.

    Chromosomes are appended, in key order, to ``tmpfile<k>.fa`` inside
    ``outdir`` where ``k`` is the cumulative length (including the current
    chromosome) integer-divided by ``maxsize``.

    Returns:
        list: paths of the chunk files created, in creation order; empty
        (after printing a message) when ``fafile`` does not exist.
    """
    spfiles = list()
    if not path.exists(fafile):
        print("Can't find ", fafile)
        return spfiles

    subfiles = dict()
    infa = Fasta(fafile)
    nowlen = 0
    try:
        for chrom in infa.keys():
            nowlen += len(infa[chrom])
            nowsub = int(nowlen // maxsize)
            if nowsub not in subfiles:
                subfile = path.join(outdir, 'tmpfile' + str(nowsub) + '.fa')
                spfiles.append(subfile)
                subfiles[nowsub] = open(subfile, 'w')
            print('>', chrom, sep='', file=subfiles[nowsub])
            print(infa[chrom], file=subfiles[nowsub])
    finally:
        # close every chunk file even if a write fails half-way
        for handle in subfiles.values():
            handle.close()
    return spfiles
def search(self, ref_base, pos, alt_base="X"):
    """Genotype one variant from its probe set.

    The variant name is ``ref_base + str(pos) + alt_base``.  Its probe set
    (fasta text from ``create_variant_probe_set``) is written to a temporary
    file and read back with ``Fasta``; records whose key contains "ref" are
    reference alleles, all others alternate alleles.

    Returns:
        dict: ``{"query": <variant name>, "results": <genotype_alleles(...)>}``.
    """
    var_name = "".join((ref_base, str(pos), alt_base))
    probe_fasta = self.create_variant_probe_set(var_name=var_name)
    with tempfile.NamedTemporaryFile() as fp:
        fp.write(probe_fasta)
        fp.seek(0)
        refs, alts = [], []
        for key, record in Fasta(fp.name).items():
            bucket = refs if "ref" in key else alts
            bucket.append(str(record))
        return {"query": var_name,
                "results": self.genotype_alleles(refs, alts)}
def get_sequence_dict(file_path, upper=True):
    """
    Returns a dictionary of fasta records. If upper is true, all bases will
    be uppercased.

    The fasta file and its ``.gdx`` and ``.flat`` companions must already
    exist: the files need to be flattened in place before the pipeline runs
    because of concurrency issues.

    Raises:
        AssertionError: if any of the three files is missing.  Raised
            explicitly so the check survives ``python -O``.
    """
    flatten_hint = ("We need the fasta files to be "
                    "flattened in place prior to running the pipeline because of concurrency issues.")
    required = (
        (file_path, 'Error: FASTA file {} does not exist'.format(file_path)),
        (file_path + ".gdx", "Error: gdx does not exist for this fasta. " + flatten_hint),
        (file_path + '.flat', "Error: flat file does not exist for this fasta. " + flatten_hint),
    )
    for needed_path, message in required:
        if not os.path.exists(needed_path):
            raise AssertionError(message)
    if upper is True:
        return Fasta(file_path, record_class=UpperNpyFastaRecord)
    return Fasta(file_path)
def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Main consensus alignment function.

    Writes the reference transcript ``gp.name`` to a temporary fasta, aligns
    it with blat against the temporary target prepared by
    ``prepare_tmp_files``, chains the PSL with simpleChain and evaluates the
    result.

    Returns:
        list of str: ``[gp.id, gp.name, best coverage, best identity]``.
    """
    ref_tx_fasta = Fasta(ref_tx_fasta)
    target_genome_fasta = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, target_genome_fasta)
    tx_seq = str(ref_tx_fasta[gp.name])
    fastaWrite(tmp_ref, gp.name, tx_seq)
    system("blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref, tmp_psl))
    r = popenCatch("simpleChain -outPsl {} /dev/stdout".format(tmp_psl))
    # drop the empty string after the final newline
    r = r.split("\n")[:-1]
    best_cov, best_ident = evaluate_blat_results(r)
    # a real list: map() would be a one-shot iterator on Python 3
    return [str(x) for x in (gp.id, gp.name, best_cov, best_ident)]
def test_find_closest_splice_acceptor_plus(self):
    """ Find the closest splice acceptor, which is 17 bp upstream.
        Plus strand."""

    # Reference junction annotation
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt", "scratch/test/TC_tmp/", {"chr1"})

    # Intron bounds of the toy junction
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1",
                                 23071360, 23072140, "+",
                                 genome, sjDict)

    closest_acceptor = TC.find_closest_bound(junction.get_splice_acceptor(),
                                             acceptors)
    assert closest_acceptor.start == 23072122
    assert closest_acceptor.end == 23072123
    assert closest_acceptor.dist == -17
def test_primary_monoexon_read(self):
    """ The supplied read is a primary alignment. This means that a
        transcript object is created, and the logInfo struct notes the
        primary status."""

    with open("input_files/sams/perfectReferenceMatch_noIntrons.sam", 'r') as f:
        sam_line = f.readline().strip()

    genome = Fasta("input_files/hg38_chr1.fa")
    transcript, logInfo = TC.transcript_init(sam_line, genome, set())

    assert transcript.QNAME == "c21031/f2p3/3400"
    assert transcript.FLAG == 0
    assert transcript.CHROM == "chr1"
    assert transcript.POS == 192575775
    assert transcript.CIGAR == "155M"
    assert transcript.MD == "MD:Z:155"
    assert logInfo.Mapping == "primary"

    # a monoexonic perfect match has nothing to correct: every counter is "NA"
    na_fields = (
        "corrected_deletions", "uncorrected_deletions", "variant_deletions",
        "corrected_insertions", "uncorrected_insertions", "variant_insertions",
        "corrected_mismatches", "uncorrected_mismatches",
        "corrected_NC_SJs", "uncorrected_NC_SJs",
    )
    assert all(getattr(logInfo, name) == "NA" for name in na_fields)
def test_find_closest_splice_acceptor_minus(self):
    """ Find the closest splice acceptor, which is 1 bp downstream.
        Minus strand. Note that dist is relative to the genome, not to the
        direction of the transcript."""

    # Reference junction annotation
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt", "scratch/test/TC_tmp/", {"chr1"})

    # Intron bounds of the toy junction
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1",
                                 22071331, 22073331, "-",
                                 genome, sjDict)

    closest_acceptor = TC.find_closest_bound(junction.get_splice_acceptor(),
                                             acceptors)
    assert closest_acceptor.start == 22071329
    assert closest_acceptor.end == 22071330
    assert closest_acceptor.dist == -1
def test_get_depth_info(self):
    """Depth summaries over chr0, with and without duplicates and targets."""
    ref_fasta = Fasta(fasta_dir + 'test/chr0.fa')
    chr0 = ref_fasta['chr0']
    confident_regions = Regions([(0, 10000000)])

    reads = list(self.bam_in)
    # A list, not filter(): the de-duplicated reads are used for two calls
    # and a Python 3 filter object would be empty the second time.
    reads_dd = [x for x in reads if not x.is_duplicate]

    # whole chromosome, all reads
    r = get_depth_info(reads, "chr0", 0, len(chr0), None, confident_regions)
    (depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r

    # whole chromosome, duplicates removed
    r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), None, confident_regions)
    (dd_depth_df, summary_depth_info_deduped, confident_depth_info, target_info, target_cov) = r_dd

    self.assertEqual(summary_depth_info, {0: 10, 1: 10, 2: 10, 3: 10})
    self.assertEqual(summary_depth_info_deduped, {0: 10, 1: 20, 2: 10})
    self.assertEqual(target_info, {})

    # restricted to the target region 5..15, all reads
    r = get_depth_info(reads, "chr0", 0, len(chr0), Regions([(5, 15)]), confident_regions)
    (target_depth_df, summary_depth_info, confident_depth_info, target_info, target_cov) = r

    self.assertEqual(summary_depth_info, {2: 5, 3: 5})
    self.assertEqual(len(target_depth_df), 10)
    self.assertEqual(len(target_cov), 1)
    self.assertEqual(target_cov['mean'][0], 2.5)
    self.assertEqual(sum(target_depth_df.coverage), target_info['on_target_bases'])

    # restricted to the target region, duplicates removed
    r_dd = get_depth_info(reads_dd, "chr0", 0, len(chr0), Regions([(5, 15)]), confident_regions)
    (target_depth_df, summary_depth_info_deduped, confident_depth_info, target_info, target_cov) = r_dd

    self.assertEqual(summary_depth_info_deduped, {1: 5, 2: 5})
def segments(self):
    ''' Generator for Segments

    Chromosomes are visited in the order of the first element of their
    index entry.  Chromosomes before ``self.start_chromosome`` are skipped.
    Each remaining chromosome is cut into pieces of ``self.segment_size``
    and one ``Segment(start, end, sequence, chromosome)`` is yielded per
    piece.

    NOTE(review): ``start`` begins at ``self.start_location`` and keeps
    counting across chromosomes, and ``end`` is always
    ``start + segment_size`` even for a shorter last piece -- confirm this
    is intended.
    '''
    startchr = self.start_chromosome
    start = self.start_location
    chroms = [x[0] for x in sorted(self.fasta.index.items(),
                                   key=lambda a: a[1][0])]
    for chrom in chroms:
        segcount = 0
        if self.verbose:
            print("Reading chr %s" % chrom)
        # Skip forward if a starting chr was defined
        if startchr is not None and startchr != chrom:
            continue
        startchr = None
        for kmer in Fasta.as_kmers(self.fasta[chrom], self.segment_size):
            end = start + self.segment_size
            seg = Segment(start, end, kmer[1], chrom)
            segcount += 1
            if self.verbose and segcount % 1000 == 0:
                print("Read %d segments" % segcount)
            yield seg
            start = end
def segments(self):
    ''' Generator for Segments

    Chromosomes are visited in the order of the first element of their
    index entry.  Chromosomes before ``self.start_chromosome`` are skipped.
    Each remaining chromosome is cut into pieces of ``self.segment_size``
    and one ``Segment(start, end, sequence, chromosome)`` is yielded per
    piece.

    NOTE(review): ``start`` begins at ``self.start_location`` and keeps
    counting across chromosomes, and ``end`` is always
    ``start + segment_size`` even for a shorter last piece -- confirm this
    is intended.
    '''
    startchr = self.start_chromosome
    start = self.start_location
    chroms = [
        x[0]
        for x in sorted(self.fasta.index.items(), key=lambda a: a[1][0])
    ]
    for chrom in chroms:
        segcount = 0
        if self.verbose:
            print("Reading chr %s" % chrom)
        # Skip forward if a starting chr was defined
        if startchr is not None and startchr != chrom:
            continue
        startchr = None
        for kmer in Fasta.as_kmers(self.fasta[chrom], self.segment_size):
            end = start + self.segment_size
            seg = Segment(start, end, kmer[1], chrom)
            segcount += 1
            if self.verbose and segcount % 1000 == 0:
                print("Read %d segments" % segcount)
            yield seg
            start = end
def test_find_closest_splice_donor_minus(self):
    """ For a toy case with multiple donors and acceptors in close
        proximity, test whether TC can find the closest reference donor to
        the supplied intron bound.

        Similar to before, there is an exact match for the donor, located
        at 23071360 in 1-based coordinates and 23071359 in 0-based."""

    # Reference junction annotation
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt", "scratch/test/TC_tmp/", {"chr1"})

    # Intron bounds of the toy junction
    genome = Fasta("input_files/hg38_chr1.fa")
    junction = sj.SpliceJunction("test_read", 0, "chr1",
                                 23070360, 23071360, "-",
                                 genome, sjDict)

    closest_donor = TC.find_closest_bound(junction.get_splice_donor(), donors)
    assert closest_donor.start == 23071359
    assert closest_donor.end == 23071360
    assert closest_donor.dist == 0
def test_fix_donor_case3(self):
    """ Toy transcript with sequence AAGGT|GAA, where the splice motif
        is noncanonical but located 2 bp from a canonical splice donor.
        chr1: 23,071,357 - 23,072,126

        So-called case #3
    """

    # Reference annotation and genome
    donors, acceptors, sjDict = TC.processSpliceAnnotation(
        "input_files/test_junctions.txt", "scratch/test/TC_tmp/", {"chr1"})
    genome = Fasta("input_files/hg38_chr1.fa")

    # Toy read as SAM fields
    sam_fields = [
        "test_read", "0", "chr1", "23071357", "255", "5M762N3M",
        "*", "0", "0", "AAGGTGAA", "*", "NM:i:0", "MD:Z:8",
    ]
    transcript = t2.Transcript(sam_fields, genome, sjDict)
    jnNumber = 0
    donor = transcript.spliceJunctions[jnNumber].bounds[0]

    # Attempt to correct the splice donor side of the junction (left)
    corrected = TC.fix_one_side_of_junction(
        transcript.CHROM, transcript.POS, jnNumber, donor, -2, genome,
        transcript.SEQ, transcript.CIGAR)
    new_seq, new_cigar = corrected

    assert new_seq == "AAGGAA"
    assert new_cigar == "3M764N3M"
def test_classes():
    """Yield every record check for each record class and flatten mode.

    For each combination a ``Fasta`` over ``tests/data/three_chrs.fasta`` is
    opened, the per-instance checks are yielded, the instance is dropped,
    the file-based checks are yielded and ``_cleanup()`` is called.
    """
    # checks that take only the Fasta instance, in the order they must run
    # after check_keys / check_misc
    instance_checks = (
        check_contains, check_shape, check_bounds, check_tostring,
        check_kmers, check_kmer_overlap, check_slice_size, check_slice,
        check_full_slice, check_array_copy, check_array, check_one_based,
    )
    for inplace in (True, False):
        for klass in record_classes:
            f = Fasta('tests/data/three_chrs.fasta', record_class=klass,
                      flatten_inplace=inplace)
            yield check_keys, f
            yield check_misc, f, klass
            for check in instance_checks:
                yield check, f

            fasta_name = f.fasta_name
            del f

            yield check_keyfn, 'tests/data/key.fasta', klass, inplace
            yield check_reload, klass, fasta_name
            yield check_duplicates, klass, inplace
            _cleanup()
def count_freq(blast_file, fasta, org, count_subject=True):
    """Accumulate per-position hit counts from one large blast file.

    Each tab-separated line supplies the query and subject sequence names
    (fields 0 and 1) and the query/subject start and stop positions
    (fields 6-9, 1-based inclusive). For every hit the covered slice of the
    per-sequence count array is incremented; the arrays are then written to
    the ``c<name>`` attributes of the node returned by ``get_node``.

    Args:
        blast_file: path of the tabular blast output to read.
        fasta: path of the fasta file; only used to size new count arrays.
        org: identifier handed to ``get_node(org, 'w')``.
        count_subject: when true, subject coordinates are counted as well.

    Returns:
        None. Returns immediately when ``get_node`` yields ``(None, None)``.
    """
    h5, node = get_node(org, 'w')
    # use existing counts.
    if (h5, node) == (None, None):
        return

    try:
        f = Fasta(fasta)
        print("counting...")
        cache = {}
        # `with` guarantees the blast file is closed, even on a parse error.
        with open(blast_file) as blast_fh:
            for sline in blast_fh:
                line = sline.split("\t")
                qchr, schr = line[:2]
                qstart, qstop, sstart, sstop = map(int, line[6:10])

                if qchr not in cache:
                    update_cache(qchr, node, len(f[qchr]), h5, cache)
                    # NOTE(review): cache_clear is assumed to belong to this
                    # branch (run only when a new entry was added) - confirm.
                    cache_clear(cache, node, qchr, schr)

                # convert to 0-based indexes:
                # 1 8 => 0 8, but a slice doesn't include the upper bound.
                cache[qchr][qstart - 1:qstop] += 1

                if count_subject:
                    # subject hits may be reported high-to-low; order them.
                    if sstart > sstop:
                        sstart, sstop = sstop, sstart
                    if schr not in cache:
                        update_cache(schr, node, len(f[schr]), h5, cache)
                        cache_clear(cache, node, qchr, schr)
                    cache[schr][sstart - 1:sstop] += 1

        # flush whatever is still cached into the node's arrays.
        for achr in cache:
            getattr(node, 'c' + achr)[:] = cache[achr]
    finally:
        # close the h5 handle on every exit path, not only on success.
        h5.close()
def check_kmers(f):
    """Check Fasta.as_kmers on chr2 (k=10) and chr3 (k=1).

    The k-mers must start at offset 0, and concatenating them must
    reproduce the full sequence.
    """
    # chr2, split into 10-mers
    chr2_seq = str(f['chr2'])
    chr2_kmers = list(Fasta.as_kmers(f['chr2'], 10))
    assert (len(chr2_kmers) == len(chr2_seq) / 10)
    assert (chr2_kmers[0] == (0, chr2_seq[:10]))
    chr2_pieces = [piece for _, piece in chr2_kmers]
    assert ("".join(chr2_pieces) == chr2_seq)
    assert (chr2_pieces[-1][-1] == 'T')

    # chr3, split into 1-mers: the third k-mer starts at offset 2
    chr3_seq = str(f['chr3'])
    chr3_kmers = list(Fasta.as_kmers(f['chr3'], 1))
    assert (chr3_kmers[2][0] == 2)
    chr3_pieces = [piece for _, piece in chr3_kmers]
    assert ("".join(chr3_pieces) == chr3_seq)
class Reference(object):
    """Thin wrapper around a pyfasta ``Fasta`` for fetching interval
    sequences."""

    def __init__(self, genome_fasta):
        # @see: https://pypi.python.org/pypi/pyfasta
        # Records are keyed by the first whitespace-delimited token of
        # their header line.
        self.fasta = Fasta(genome_fasta,
                           key_fn=lambda header: header.split()[0])

    def get_sequence_from_iv(self, iv):
        """Return the sequence for interval ``iv``.

        ``iv`` must expose ``chrom``, ``start``, ``end`` and ``strand``;
        they are passed to ``Fasta.sequence`` with ``one_based=False``.
        """
        feature = dict(chr=iv.chrom, start=iv.start, stop=iv.end,
                       strand=iv.strand)
        return self.fasta.sequence(feature, one_based=False)
def read_fasta(ref_files, fasta_header):
    """Find the reference fasta file that contains a given record.

    The files are opened one at a time, in the order given, and the first
    one whose keys include ``fasta_header`` is returned.

    NOTE(review): the previous docstring stated that a newline may only
    appear between header and sequence, not inside a sequence; nothing in
    this function checks that.

    Args:
        ref_files (iterable of str): Paths of the candidate fasta files.
        fasta_header (str): Record key to look for.

    Returns:
        The ``Fasta`` object opened on the first matching file, or ``None``
        when no file contains ``fasta_header`` (including when
        ``ref_files`` is empty).
    """
    for fasta_path in ref_files:
        fasta = Fasta(fasta_path)
        if fasta_header in fasta.keys():
            return fasta
    # Make the "not found" outcome explicit for callers.
    return None
def split_seqs(self, num_jobs, max_ref=5, max_qry=20):
    ''' splits the reference and query fasta files into per-job pieces.

    The number of reference/query pieces is capped by the number of
    sequences available and by ``num_jobs``; pieces that end up empty are
    dropped from ``ref_names``/``ref_files`` and ``qry_names``/``qry_files``.
    '''
    # load both inputs into memory.
    ref = Fasta(self.ref_fasta, record_class=MemoryRecord)
    qry = Fasta(self.qry_fasta, record_class=MemoryRecord)

    # never ask for more pieces than there are sequences; a reference
    # split wider than the job count collapses to a single piece.
    max_ref = min(max_ref, len(ref))
    if max_ref > num_jobs:
        max_ref = 1

    max_qry = min(max_qry, len(qry))
    max_qry = min(max_qry, num_jobs)

    # keep the ref x qry grid within the job budget.
    if max_ref * max_qry > num_jobs:
        max_qry = int(float(num_jobs) / float(max_ref))

    # sequence count of the reference (not used below).
    ref_seq_count = len(ref.keys())

    ## reference ##
    ref_names = []
    ref_files = []
    for idx in range(max_ref):
        name = "ref_%i" % idx
        ref_names.append(name)
        ref_files.append("%s/%s.fasta" % (self.out_dir, name))
    self.ref_names = ref_names
    self.ref_files = ref_files

    pyfasta.split_fasta.without_kmers(ref, self.ref_files)
    self.ref_names, self.ref_files = self._no_empty(self.ref_names,
                                                    self.ref_files)

    ## query ##
    qry_names = []
    qry_files = []
    for idx in range(max_qry):
        name = "qry_%i" % idx
        qry_names.append(name)
        qry_files.append("%s/%s.fasta" % (self.out_dir, name))
    self.qry_names = qry_names
    self.qry_files = qry_files

    pyfasta.split_fasta.without_kmers(qry, self.qry_files)
    self.qry_names, self.qry_files = self._no_empty(self.qry_names,
                                                    self.qry_files)
def generate_corpusfile(fasta_fname, n, corpus_fname):
    '''
    Write an n-gram corpus file built from every record of a fasta file.

    Args:
        fasta_fname: path of the input fasta file
        n: the "n" of "n-gram"; passed to split_ngrams for each sequence
        corpus_fname: path of the corpus file to write (overwritten)

    Description:
        Protvec uses word2vec inside, and it requires a corpus file to be
        loaded to generate the corpus. Each pattern returned by
        split_ngrams is written as one line, its items joined by spaces.
    '''
    # `with` closes the corpus file even if a record fails mid-way.
    with open(corpus_fname, "w") as corpus:
        fasta = Fasta(fasta_fname)
        for record_id in tqdm(fasta.keys(), desc='corpus generation progress'):
            seq = str(fasta[record_id])
            for ngram_pattern in split_ngrams(seq, n):
                corpus.write(" ".join(ngram_pattern) + "\n")
def process_query():
    """Align the "Rattus" query against the sequence library and report.

    Reads the library and query fasta files from the module-level
    ``library_path`` / ``query_path``, submits one ``align`` call per
    library record (at most ``library_process_limit`` records) to a thread
    pool, then prints the 30 best scores relative to the query's
    self-match ("etalone") score, followed by the timer figures divided by
    the CPU count.
    """
    print('Reading sequence library and query sequence')
    library = Fasta(library_path)
    queries = Fasta(query_path)
    query_sequence = str(queries["Rattus"])

    print('Processing')
    progress = progressbar.ProgressBar(max_value=len(library.keys()))
    cpu_count = multiprocessing.cpu_count()

    results = []
    # The context manager shuts the pool down (and joins its threads) even
    # when a future raises; previously the executor was never shut down.
    with ThreadPoolExecutor(max_workers=cpu_count) as executor:
        tasks = []
        for record in list(library.keys())[:library_process_limit]:
            library_sequence = str(library[record])
            future = executor.submit(align, library_sequence, query_sequence)
            tasks.append(AlignmentTask(record, future))

        # Collect in submission order so the progress bar advances steadily.
        for i, task in enumerate(tasks):
            _, _, score = task.future.result()
            results.append(AlignmentResult(title=task.record, score=score))
            progress.update(i)

    # Score of the query aligned against itself: the best achievable score.
    etalone_score = sum(smatrix[(x, x)] for x in query_sequence)

    print("Done")
    print("Etalone score is %d" % etalone_score)
    print("Got %d results, here are top-30 among them:" % len(results))
    print("Score  | Match   | Record")
    for sequence in sorted(results, key=lambda x: x.score, reverse=True)[:30]:
        match = (sequence.score / etalone_score) * 100.0
        print("%6d | %5.3f%% | %s" % (sequence.score, match, sequence.title))

    timer = get_performance_timer()
    for elapsed in [timer.dotplot, timer.regions, timer.align]:
        print(elapsed / cpu_count)
def main(gff_file, fasta_file, parents, children):
    """Print, in fasta format, the concatenated child sequences of each
    parent feature.

    ``parents`` and ``children`` are comma-separated feature types. A gff
    database ``<gff_file>.db`` is created on first use. Parents with no
    matching children get a warning on stderr and are skipped.
    """
    db_file = gff_file + ".db"
    if not op.exists(db_file):
        GFFutils.create_gffdb(gff_file, db_file)

    f = Fasta(fasta_file)
    g = GFFutils.GFFDB(db_file)

    parent_types = set(parents.split(','))
    parent_features = itertools.chain(
        *[g.features_of_type(ftype) for ftype in parent_types])
    children_list = set(children.split(','))

    for feat in parent_features:
        # (sequence, feature) pairs for the wanted first-level children
        pieces = []
        for c in g.children(feat.id, 1):
            if c.featuretype in children_list:
                seq = f.sequence(dict(chr=c.chrom, start=c.start,
                                      stop=c.stop, strand=c.strand))
                pieces.append((seq, c))

        if not pieces:
            sys.stderr.write(
                "[warning] %s has no children with type %s\n"
                % (feat.id, ','.join(children_list)))
            continue

        # incremental position order, flipped for negative-strand parents
        pieces.sort(key=lambda pair: pair[1].start)
        if feat.strand == '-':
            pieces.reverse()

        print(">%s" % feat.id)
        print(''.join(pair[0] for pair in pieces))
class Sequence():
    """Sequence source that exposes ``readcount`` and a ``read`` iterator.

    The backend is chosen by ``engine``: 'mysql' (needs a ``cursor`` kwarg
    and ``function == 'iterator'``), or 'biopython' / 'pyfasta' / 'twobit',
    each of which needs matching ``data_type`` and ``input`` kwargs.
    """

    def __init__(self, engine='mysql', function = 'iterator', **kwargs):
        self.engine = engine
        if self.engine == 'mysql' and function == 'iterator':
            self.create_mysql_iterator(**kwargs)
            return
        # (engine, required data_type, builder); the data_type kwarg is
        # only looked up once the engine name has matched.
        file_backends = (
            ('biopython', 'fasta', self.create_biopython_iterator),
            ('pyfasta', 'fasta', self.create_pyfasta_iterator),
            ('twobit', 'twobit', self.create_twobit_iterator),
        )
        for name, data_type, builder in file_backends:
            if self.engine == name and kwargs['data_type'] == data_type:
                builder(**kwargs)
                break

    def create_mysql_iterator(self, **kwargs):
        """Iterate over (id, record) rows fetched through kwargs['cursor']."""
        cursor = kwargs['cursor']
        cursor.execute('''SELECT id, record FROM sequence WHERE n_count <= 2 AND trimmed_len > 40''')
        self.readcount = cursor.rowcount
        self.read = iter(cursor.fetchall())

    def create_biopython_iterator(self, **kwargs):
        """Index kwargs['input'] with SeqIO and iterate (number, key) pairs."""
        from Bio import SeqIO
        print("Generating BioPython sequence index. This may take a moment....")
        self.fasta = SeqIO.index(kwargs['input'], kwargs['data_type'])
        self.readcount = len(self.fasta)
        names = sorted(self.fasta.keys())
        self.db_values = zip(range(len(self.fasta)), names)
        self.read = iter(self.db_values)

    def create_twobit_iterator(self, **kwargs):
        """Open kwargs['input'] as a 2bit file and iterate (number, key) pairs."""
        import bx.seq.twobit
        self.fasta = bx.seq.twobit.TwoBitFile(open(kwargs['input']))
        self.readcount = self.fasta.seq_count
        names = sorted(self.fasta.keys())
        self.db_values = zip(range(self.fasta.seq_count), names)
        self.read = iter(self.db_values)

    def create_pyfasta_iterator(self, **kwargs):
        """Index kwargs['input'] with pyfasta and iterate (number, key) pairs."""
        from pyfasta import Fasta
        print("Generating PyFasta sequence index. This may take a moment....")
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
        names = sorted(self.fasta.keys())
        self.db_values = zip(range(len(self.fasta)), names)
        self.read = iter(self.db_values)

    def get_pyfasta_reads(self, **kwargs):
        """(Re)open kwargs['input'] with pyfasta and refresh ``readcount``."""
        from pyfasta import Fasta
        self.fasta = Fasta(kwargs['input'])
        self.readcount = len(self.fasta)
def with_kmers(f, names, k, overlap):
    """
    split the sequences in Fasta object `f` into pieces of length `k`
    with the given `overlap`.

    The pieces are written round-robin, as fasta records, to the files
    named in `names`: piece number i goes to names[i % len(names)]. Every
    file in `names` is created (or truncated), even if it receives nothing.
    All files are closed before returning, including on error.
    """
    fhs = []
    try:
        # Open one at a time so a failure part-way through still lets the
        # finally-clause close the handles that were already opened.
        for name in names:
            fhs.append(open(name, 'wb'))

        i = 0
        for seqid in f.keys():
            seq = f[seqid]
            for (start0, subseq) in Fasta.as_kmers(seq, k, overlap=overlap):
                fh = fhs[i % len(fhs)]
                fh.write(">%s\n" % format_kmer(seqid, start0))
                fh.write("%s\n" % subseq)
                i += 1
    finally:
        # Closing flushes buffered records; previously the handles were
        # left for the garbage collector.
        for fh in fhs:
            fh.close()
def main():
    """Convert a fasta alignment into blocks of PHYLIP-style text.

    The output starts with a ``count length`` header line. The first block
    holds, for every sequence, its name padded or cut to CHUNK_LENGTH
    characters followed by the first LINE_LENGTH bases in groups of
    CHUNK_LENGTH. Later blocks hold the following LINE_LENGTH bases of
    every sequence, prefixed with PAD_STRING. Blocks are separated by a
    blank line.

    The output goes to ``args.output_file`` when given; otherwise to a
    ``.phylip`` file next to the input, named after the part of the input's
    base name before its first dot.
    """
    import os  # local import: only needed to derive the default output name

    args = make_parser()
    if args.inplace:
        f = Fasta(args.fasta_file, flatten_inplace=True)
    else:
        f = Fasta(args.fasta_file)

    if args.output_file is not None:
        output_path = args.output_file
    else:
        # Split only the base name at its first dot. Splitting the whole
        # path broke inputs such as "./aln.fasta" (which produced
        # ".phylip") or any directory name containing a dot.
        directory, base_name = os.path.split(args.fasta_file)
        output_path = os.path.join(
            directory, '{0}.phylip'.format(base_name.split('.')[0]))

    # `with` closes the output even if a record fails while being written.
    with open(output_path, 'w') as output:
        sequence_count = len(f.keys())
        # The length of the first record is taken as the alignment length.
        sequence_length = len(f[next(iter(f.keys()))])
        output.write(' {0} {1}\n'.format(sequence_count, sequence_length))

        # First block: names plus the first LINE_LENGTH bases.
        for key in f.keys():
            subseq = []
            # NOTE(review): unlike the later blocks no fill value is passed
            # to grouper here - confirm a short final chunk is handled.
            for chunk in grouper(f[key][:LINE_LENGTH], CHUNK_LENGTH):
                subseq.append(''.join(item[0] for item in chunk))
            subseq = ' '.join(subseq)
            if len(key) < CHUNK_LENGTH:
                key = key.ljust(CHUNK_LENGTH)
            else:
                key = key[:CHUNK_LENGTH]
            output.write('{0} {1}\n'.format(key, subseq))
        sequence_length -= LINE_LENGTH
        start = LINE_LENGTH
        stop = LINE_LENGTH * 2
        output.write('\n')

        # Remaining blocks: the next LINE_LENGTH bases of every sequence.
        while sequence_length > 0:
            for key in f.keys():
                subseq = []
                for chunk in grouper(f[key][start:stop], CHUNK_LENGTH, ' '):
                    subseq.append(''.join(item[0] for item in chunk))
                subseq = ' '.join(subseq)
                output.write('{0} {1}\n'.format(PAD_STRING, subseq))
            sequence_length -= LINE_LENGTH
            start += LINE_LENGTH
            stop += LINE_LENGTH
            output.write('\n')
def align():
    """Globally align the first 10000 bases of each hg19 chromosome with
    the same-named YRI sequence and record the differing columns.

    Reads 'hg19.fa' and 'YRIref.fasta' from the working directory and
    writes 'hg19_YRI_diff.bed': a header line followed by one
    comma-separated line (name, column index, index + 1, hg19 character,
    YRI character) for every alignment column where the two differ.
    Progress and debug information is printed to stdout.
    """
    hg19 = Fasta('hg19.fa')
    print(hg19.keys())
    hg19Chr = sorted(hg19.keys(), reverse=True)

    YRI = Fasta('YRIref.fasta')
    print(YRI.keys())
    YRIChr = sorted(YRI.keys())

    # Debug output (emitted twice, as before).
    print(hg19[hg19Chr[0]][:20])
    print(YRI[YRIChr[0]][:20])
    print(hg19[hg19Chr[0]][:20])
    print(YRI[YRIChr[0]][:20])

    # `with` closes the output even if an alignment or lookup fails.
    with open('hg19_YRI_diff.bed', 'w') as fhout:
        fhout.write('chrom, chromStart, chromEnd, hg19, YRI \n')

        for each in hg19Chr:
            seq1 = hg19[each][:10000]
            seq2 = YRI[each][:10000]
            print('reached 1')
            print('doing alignment for  %s' % (each,))
            alignment = nw.global_align(seq1, seq2, gap=-2, matrix=None,
                                        match=1, mismatch=-1)
            print('reached 2')

            # Walk the two aligned strings column by column. zip stops at
            # the shorter one, so unequal lengths can no longer raise an
            # IndexError as the old range(max(len1, len2)) loop could.
            aligned_pairs = zip(alignment[0], alignment[1])
            for i, (hg19_base, yri_base) in enumerate(aligned_pairs):
                if hg19_base != yri_base:
                    fhout.write(','.join(
                        [each, str(i), str(i + 1), hg19_base, yri_base])
                        + '\n')
''' 检测比对过的fasta文件中所有序列之间是否两两均具有重叠区域 ''' __version__ = "1.0" from pyfasta import Fasta import argparse #命令行选项处理 parser = argparse.ArgumentParser() parser.add_argument("-i", "-in", "--input", metavar="filename", dest="input", type=str , help="fasta file to check") parser.add_argument("-v", "--version", action='version', help="The version of this program.", version = "Version: " + __version__) args = parser.parse_args() f = Fasta(args.input) loci = sorted(f.keys()) for locus1 in loci: for locus2 in loci: flag = 0 sequence1 = f[locus1] sequence2 = f[locus2] i = 0 while i < len(sequence1) and i < len(sequence2): base1 = sequence1[i] base2 = sequence2[i] if base1 != "-" and base2 != "-": flag = 1 break i += 1 if flag == 0: