def get_one_hot_sequence(chrname, start, stop, nuc, HPC_MODE):
    """Return a 0/1 indicator array for one nucleotide over an interval.

    :param chrname: chromosome name without the 'chr' prefix ('MT' is
        mapped to 'M')
    :param start: slice start passed to the 2bit sequence
    :param stop: slice stop passed to the 2bit sequence
    :param nuc: nucleotide to flag; compared case-insensitively
    :param HPC_MODE: truthy to read the hg19 2bit file from the HPC path,
        otherwise the local-machine path is used
    :return: numpy array with 1 where the reference base equals ``nuc``
        and 0 elsewhere
    """
    # Location of the 2bit version of the human reference genome (hg19):
    # HPC cluster vs. local machine.
    genome_path = (
        '/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/genomes/hg19.2bit'
        if HPC_MODE else
        '/Users/lsantuari/Documents/Data/GiaB/reference/hg19.2bit')
    genome = twobit.TwoBitFile(genome_path)

    # The lookup key below is built as 'chr' + name, so 'MT' becomes 'chrM'.
    if chrname == 'MT':
        chrname = 'M'

    bases = genome['chr' + chrname][start:stop]
    return np.array(
        [int(base.lower() == nuc.lower()) for base in bases])
def get_one_hot_sequence_by_list(chrname, positions, HPC_MODE):
    """One-hot encode the reference base at each requested position.

    :param chrname: chromosome name without the 'chr' prefix ('MT' is
        mapped to 'M')
    :param positions: iterable of indices into the chromosome string
    :param HPC_MODE: truthy to read the hg19 2bit file from the HPC path,
        otherwise the local-machine path is used
    :return: ``np.uint32`` array of shape ``(len(positions), 5)`` whose
        columns correspond to A, T, C, G, N (in that order)
    """
    if HPC_MODE:
        # Path on the HPC of the 2bit version of the human reference
        # genome (hg19)
        genome_path = \
            '/hpc/cog_bioinf/ridder/users/lsantuari/Datasets/genomes/hg19.2bit'
    else:
        # Path on the local machine of the 2bit version of the human
        # reference genome (hg19); same location get_one_hot_sequence uses
        # outside HPC mode. Both branches used to point at the HPC path.
        genome_path = \
            '/Users/lsantuari/Documents/Data/GiaB/reference/hg19.2bit'
    genome = twobit.TwoBitFile(genome_path)

    if chrname == 'MT':
        chrname = 'M'
    whole_chrom = str(genome['chr' + chrname])

    nuc_list = ['A', 'T', 'C', 'G', 'N']
    # Look every base up once instead of once per output column.
    bases = [whole_chrom[pos].upper() for pos in positions]

    res = np.zeros(shape=(len(positions), len(nuc_list)), dtype=np.uint32)
    for col, nuc in enumerate(nuc_list):
        res[:, col] = [1 if base == nuc else 0 for base in bases]
    return res
def load_human_genome_sequence(genomic_build):
    """Open the 2bit genome configured for ``genomic_build``.

    The file path is read from ``json_data[genomic_build]["genome"]``.

    :raises Exception: with the ``GENOME_NOT_PRESENT`` message when the
        configured path does not exist
    :return: a ``twobitreader.TwoBitFile`` for the configured path
    """
    genome_file = json_data[genomic_build]["genome"]
    if os.path.exists(genome_file):
        return twobitreader.TwoBitFile(genome_file)

    template = messages['error_messages']['GENOME_NOT_PRESENT']
    raise Exception(template.format(genome_file))
def loadChromInfo(twoBit):
    """Return a dict mapping chromosome names to sizes for a 2bit file.

    The file is closed again before returning, whether or not reading the
    sizes succeeds.
    """
    reader = twobitreader.TwoBitFile(twoBit)
    try:
        chrom_sizes = dict(reader.sequence_sizes())
    finally:
        reader.close()
    return chrom_sizes
def get_regionLevel_simplex_parameters(inputbed, outputbed, plusbw, minusbw,
                                       biasmat, ext, genome2bit):
    """Append cut-weighted bias parameters for each BED region.

    For every line of ``inputbed`` a window of ``ext`` bases on each side
    of the region midpoint is summarised from the plus and minus bigWig
    files. For each position with a positive signal, the surrounding
    ``2 * flank``-mer is mapped through ``permuteSeq8mer.txt`` and
    ``seq2biasParm``, and the result is accumulated weighted by the signal.
    The original columns followed by the plus and minus accumulators are
    written tab-separated to ``outputbed``.

    :param inputbed: BED-like input (chrom, start, end in columns 0-2)
    :param outputbed: path of the file to write
    :param plusbw: bigWig with plus-strand cuts
    :param minusbw: bigWig with minus-strand cuts
    :param biasmat: bias matrix file handed to ``readBG``
    :param ext: half-width of the window around each region midpoint
    :param genome2bit: path of the 2bit genome

    Reads ``permuteSeq8mer.txt`` from the current working directory.
    """
    simplex_code = encoding()
    biasdict, flank = readBG(biasmat)
    B, B0, B1, B2 = paramest(biasdict)

    permuteSeq = {}
    with open("permuteSeq8mer.txt") as inf:
        for line in inf:
            ll = line.split()
            permuteSeq[ll[0]] = ll[1]

    genome = twobitreader.TwoBitFile(genome2bit)
    window = 2 * flank

    with open(plusbw, 'rb') as plus_fh, open(minusbw, 'rb') as minus_fh, \
            open(inputbed) as inf, open(outputbed, 'w') as outf:
        plusBWH = BigWigFile(plus_fh)
        minusBWH = BigWigFile(minus_fh)

        for line in inf:
            ll = line.split()
            chrm = ll[0]
            # Floor division keeps the midpoint an int on Python 2 and 3.
            center = (int(ll[1]) + int(ll[2])) // 2
            start = max(0, center - ext)
            end = center + ext

            plus_summary = plusBWH.summarize(chrm, start, end, end - start)
            minus_summary = minusBWH.summarize(chrm, start, end, end - start)
            # Skip regions without signal. The previous test,
            # ``type(x) == None``, could never be true.
            # NOTE(review): presumably summarize() returns None for
            # chromosomes missing from the bigWig - confirm.
            if plus_summary is None or minus_summary is None:
                continue
            plusSig = plus_summary.sum_data
            minusSig = minus_summary.sum_data
            if plusSig is None or minusSig is None:
                continue

            chrom_seq = genome[chrm]
            plusSequence = chrom_seq[(start - flank):(end + flank)].upper()
            # Minus-strand windows are shifted by one base.
            minusSequence = chrom_seq[(start - flank + 1):
                                      (end + flank + 1)].upper()

            plus_data = numpy.array([0.0] * len(B))
            minus_data = numpy.array([0.0] * len(B))

            for i, pcuts in enumerate(plusSig):
                if pcuts > 0:
                    pseq = plusSequence[i:(i + window)]
                    if "N" not in pseq:
                        p_out = seq2biasParm(permuteSeq[pseq], B,
                                             simplex_code)
                        plus_data += pcuts * p_out

            for i, mcuts in enumerate(minusSig):
                if mcuts > 0:
                    tmpseq = minusSequence[i:(i + window)]
                    if "N" not in tmpseq:
                        mseq = revcomp(tmpseq).upper()
                        m_out = seq2biasParm(permuteSeq[mseq], B,
                                             simplex_code)
                        minus_data += mcuts * m_out

            newll = ll + list(plus_data) + list(minus_data)
            outf.write("\t".join(map(str, newll)) + "\n")
def dupFeature(usename, seq2bit, datatype):
    """Split reads of ``<usename>_uniq.bed`` into four bias groups.

    Each read is scored at both ends by ``PEreads_feature``; the scores are
    compared with a data-type specific cutoff and a 1-bp record at the read
    midpoint is written to one of four files:

    * ``_g1.bed``: left > cutoff and right > cutoff
    * ``_g4.bed``: left > cutoff only
    * ``_g2.bed``: right > cutoff only
    * ``_g3.bed``: neither

    Reads on ``chrM`` and reads for which either score is ``"NA"`` are
    skipped.

    :param usename: prefix of the input and output BED files
    :param seq2bit: path of the 2bit genome
    :param datatype: ``"ATAC"`` or ``"DNase"``; any other value is printed
        and the process exits
    """
    if datatype == "ATAC":
        biasfile = "/scratch/sh8tv/Project/scATAC/Data/Summary_Data/bias_matrix/summary36bp/singleMat/NakedYeast_ATAC_Enc8mer.txt"
        cutoff = 3.5
    elif datatype == "DNase":
        biasfile = "/scratch/sh8tv/Project/scATAC/Data/Summary_Data/bias_matrix/summary36bp/singleMat/NakedIMR90_DNase_Enc8mer.txt"
        cutoff = -2.2
    else:
        print(datatype)
        sys.exit()

    bias = {}
    with open(biasfile) as inf:
        for line in inf:
            if line.startswith("seqtype"):
                continue
            ll = line.split()
            bias[ll[0]] = round(float(ll[2]), 4)
    # Half the k-mer length, taken from the last bias row that was parsed.
    flankLen = len(ll[0]) // 2

    genome = twobitreader.TwoBitFile(seq2bit)

    with open(usename + "_uniq.bed") as inf, \
            open(usename + "_g1.bed", 'w') as outf1, \
            open(usename + "_g2.bed", 'w') as outf2, \
            open(usename + "_g3.bed", 'w') as outf3, \
            open(usename + "_g4.bed", 'w') as outf4:
        for line in inf:
            ll = line.split()
            if ll[0] == "chrM":
                continue
            left, right = PEreads_feature(ll, flankLen, genome, bias,
                                          datatype)
            if left == "NA" or right == "NA":
                continue

            # Floor division keeps BED coordinates integral on Python 3.
            mid = (int(ll[1]) + int(ll[2])) // 2
            newline = "\t".join(
                map(str, [ll[0], mid, mid + 1, ".", ".", "+"])) + "\n"

            if left > cutoff:
                target = outf1 if right > cutoff else outf4
            else:
                target = outf2 if right > cutoff else outf3
            target.write(newline)
def load_genomes(UTRfilestring, twobitfile):
    """Load the UTR dictionary and open the 2bit genome.

    Kept as a separate function so that both only need to be loaded a
    single time.

    :param UTRfilestring: path of the UTR table parsed by
        ``rph.readindict``
    :param twobitfile: path of the 2bit genome
    :return: ``(UTRdict, genome)``
    """
    # Close the table once it has been parsed; the handle used to be left
    # open. Assumes rph.readindict reads the whole file before returning -
    # TODO confirm.
    # NOTE(review): mode "rU" is kept as-is, but the 'U' flag was removed
    # in Python 3.11.
    with open(UTRfilestring, "rU") as utr_handle:
        UTRdict = rph.readindict(utr_handle)
    genome = twobitreader.TwoBitFile(twobitfile)  # do we actually need to load this in here?
    return UTRdict, genome
def __init__(self):
    """Point the fixture at the bundled test data and set module globals.

    Sets ``self.root``, ``self.tbitFile``, ``self.bamFile`` and the two
    chromosome names, then (re)binds the module-level ``debug`` and
    ``global_vars`` using read counts from the BAM file and sequence sizes
    from the 2bit file.
    """
    import os
    global debug
    global global_vars

    here = os.path.dirname(os.path.abspath(__file__))
    self.root = here + "/test/test_corrGC/"
    self.tbitFile = self.root + "sequence.2bit"
    self.bamFile = self.root + "test.bam"
    self.chrNameBam = '2L'
    self.chrNameBit = 'chr2L'

    bam = pysam.Samfile(self.bamFile)
    tbit = twobit.TwoBitFile(self.tbitFile)

    debug = 0
    # 'min_reads' appeared twice with the same value; one entry gives the
    # same dict.
    settings = {}
    settings['2bit'] = self.tbitFile
    settings['bam'] = self.bamFile
    settings['filter_out'] = None
    settings['extra_sampling_file'] = None
    settings['max_reads'] = 5
    settings['min_reads'] = 0
    settings['reads_per_bp'] = 0.3
    settings['total_reads'] = bam.mapped
    settings['genome_size'] = sum(tbit.sequence_sizes().values())
    global_vars = settings
def count_cut_nmers(fp, w_minus, lflank, rflank, single_nmer_cutoff,
                    sequence, offset):
    """Count minus-strand cuts associated with each n-mer.

    For every region in ``fp`` the per-base minus-strand signal is read
    from the bigWig ``w_minus``. Each position is paired with the
    ``lflank + rflank``-mer taken from the genome window (shifted by
    ``offset``); the n-mer is passed through ``rev`` before counting.
    If ``offset`` is 0 the first base of the tag is lined up with the
    n-mer start.

    Only n-mers without ``N`` whose cut value is at most
    ``single_nmer_cutoff`` are counted. Plus-strand counting is not
    performed.

    :param fp: iterable of BED-like lines (chrom, start, end)
    :param w_minus: path of the minus-strand bigWig
    :param lflank: bases taken to the left of the cut
    :param rflank: bases taken to the right of the cut
    :param single_nmer_cutoff: largest per-position cut value counted
    :param sequence: path of the 2bit genome
    :param offset: shift applied to the genome window
    :return: ``(seq_nmer_dict, cut_nmer_dict)`` - occurrences per n-mer and
        summed cuts per n-mer
    """
    nmer_len = lflank + rflank
    # keep count of the number of occurrences of each n-mer
    seq_nmer_dict = {}
    cut_nmer_dict = {}

    # The bigWig handle is closed on exit; it used to be left open.
    with open(w_minus, 'rb') as bw_handle:
        w_minus_H = BigWigFile(bw_handle)
        genome = twobitreader.TwoBitFile(sequence)

        for line in fp:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            nseq = genome[chrm][(start - lflank - offset):
                                (end + rflank - offset)].upper()
            cn = w_minus_H.summarize(chrm, start, end, end - start).sum_data

            for k, n_cut in enumerate(cn):
                # Minus-strand n-mers are read one base downstream.
                n_seq = nseq[(k + 1):(k + nmer_len + 1)]
                if 'N' not in n_seq and n_cut <= single_nmer_cutoff:
                    rev_n_seq = rev(n_seq)
                    # dict.get replaces the bare try/except used as a
                    # default, which could hide unrelated errors.
                    cut_nmer_dict[rev_n_seq] = \
                        cut_nmer_dict.get(rev_n_seq, 0) + n_cut
                    seq_nmer_dict[rev_n_seq] = \
                        seq_nmer_dict.get(rev_n_seq, 0) + 1

    return seq_nmer_dict, cut_nmer_dict
def getSeq(gene_tsv_path, twobitpath):
    """Yield per-gene base and intron/exon label arrays.

    For each row of the gene table the transcribed interval is read from
    the 2bit genome and upper-cased; for ``'-'`` strand genes it is
    reversed and mapped through ``REVERSE_COMPLEMENT_MAP``. Every position
    is labelled ``'I'`` and then overwritten with ``'F'`` (first exon),
    ``'M'`` (middle exon) or ``'L'`` (last exon). A gene with a single
    exon is labelled ``'F'``.

    :param gene_tsv_path: tab-separated gene table read with
        ``GENE_TSV_HEADER`` as column names
    :param twobitpath: path of the 2bit genome
    :return: generator of ``(seq, labels)`` numpy arrays; ``seq`` ends with
        the extra token ``'^'`` and ``labels`` with an extra ``'L'``
    """
    genes_df = pd.read_csv(gene_tsv_path, sep='\t', names=GENE_TSV_HEADER)
    genome_reader = twobitreader.TwoBitFile(twobitpath)

    for _, gene in genes_df.iterrows():
        tx_start = int(gene['startTranscription'])
        tx_end = int(gene['endTranscription'])
        seq = genome_reader[gene['chrom']][tx_start:tx_end]
        seq = np.array(list(seq.upper()))
        if gene['strand'] == '-':
            seq = np.flip(seq)
            seq = np.vectorize(REVERSE_COMPLEMENT_MAP.get)(seq)

        # The exon columns end with a trailing comma, hence [:-1].
        exon_starts = np.array(gene['exonStart'].split(',')[:-1],
                               dtype=np.int64)
        exon_sizes = np.array(gene['exonSize'].split(',')[:-1],
                              dtype=np.int64)

        # NOTE(review): exon offsets index the (possibly
        # reverse-complemented) sequence directly; presumably they are
        # already relative to the transcript orientation - confirm.
        intron_exon_lbl = np.full(len(seq), "I")
        last_exon = len(exon_starts) - 1
        for exon_idx, (start, size) in enumerate(zip(exon_starts,
                                                     exon_sizes)):
            if exon_idx == 0:
                exon_lbl = 'F'
            elif exon_idx == last_exon:
                exon_lbl = 'L'
            else:
                exon_lbl = 'M'
            intron_exon_lbl[start:start + size] = exon_lbl

        # ``array + ['^']`` is element-wise addition (or a TypeError for
        # string arrays), not list concatenation; append the end markers
        # explicitly.
        yield np.append(seq, '^'), np.append(intron_exon_lbl, 'L')
def bias_correct_flank(inbdg, outname, biasMat, Gen, strand):
    """Write a per-base, bias-corrected bedGraph to ``<outname>.bdg``.

    Lines of ``inbdg`` with a zero signal are copied unchanged. Every other
    interval is expanded to one line per base; the signal is multiplied by
    ``BGenc[kmer]`` when the k-mer around the position is present in
    ``BGraw`` and left unchanged otherwise.

    :param inbdg: input bedGraph (chrom, start, end, signal)
    :param outname: output prefix; ``.bdg`` is appended
    :param biasMat: bias matrix file handed to ``readBG``
    :param Gen: path of the 2bit genome
    :param strand: ``"+"`` uses the k-mer centred on the position; any
        other value uses the k-mer shifted by one base and passed through
        ``rev``
    """
    genome = twobitreader.TwoBitFile(Gen)
    BGraw, BGenc, Nmer = readBG(biasMat)
    # Floor division keeps the slice bounds integral on Python 3 as well.
    flank = int(Nmer) // 2

    with open(inbdg) as inf, open(outname + ".bdg", 'w') as outf_encCorrect:
        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            raw_sig = float(ll[3])

            if raw_sig == 0:
                outf_encCorrect.write(line)
                continue

            chrom_seq = genome[chrm]
            for pos in range(start, end):
                if strand == "+":
                    this_seq = chrom_seq[(pos - flank):(pos + flank)].upper()
                else:
                    this_seq = rev(
                        chrom_seq[(pos - flank + 1):(pos + flank + 1)].upper())

                if this_seq in BGraw:
                    enc_correct_sig = raw_sig * BGenc[this_seq]
                else:
                    enc_correct_sig = raw_sig

                newllenc = [chrm, pos, pos + 1, enc_correct_sig]
                outf_encCorrect.write("\t".join(map(str, newllenc)) + "\n")
def _examinePSL(self, querySeq):
    """Pair each aligned query block with its genomic counterpart.

    Reads ``[tStart, tEnd)`` of ``self.Chromosome`` from
    ``<ChromosomesDir>/<Chromosome>.2bit`` and, for every block, records

    * ``[tS, tS + length]`` in ``self.genomicFlats``
    * ``{'Del': query slice, 'Genomic': genomic slice}`` in
      ``self.DelGenomicSeqs`` under ``"<chrom>(<strand>):<tS>-<tS+len>"``;
      the genomic slice is reverse-complemented for ``'-'`` strand hits.

    A block that raises ``ValueError`` is abandoned at that point.
    Finishes by calling ``self.missingRange()``.

    :param querySeq: the query sequence the block coordinates refer to
    """
    twobit_path = os.path.join(self.ChromosomesDir,
                               self.Chromosome + ".2bit")
    bitfile = tbr.TwoBitFile(twobit_path)
    chrom_seq = bitfile[self.Chromosome]
    t_start = int(self.tStart)
    genomicSequence = chrom_seq[t_start:int(self.tEnd)]

    self.genomicFlats = []
    self.DelGenomicSeqs = {}

    for idx, raw_len in enumerate(self.qBlocks):
        try:
            q_begin = int(self.qCuts[idx])
            block_len = int(raw_len)
            t_begin = int(self.tCuts[idx])
            t_stop = t_begin + block_len

            # Block position relative to the slice fetched above.
            rel = t_begin - t_start
            genomic_cut = genomicSequence[rel:(rel + block_len)]
            self.genomicFlats.append([t_begin, t_stop])

            if self.tStrand == "-":
                genomic_part = str(Seq(genomic_cut).reverse_complement())
            else:
                genomic_part = genomic_cut
            query_part = querySeq[q_begin:(q_begin + block_len)]

            location = (self.Chromosome + "(" + self.tStrand + "):" +
                        str(t_begin) + "-" + str(t_stop))
            self.DelGenomicSeqs[location] = {'Del': query_part,
                                             'Genomic': genomic_part}
        except ValueError:
            pass

    # Send all information to the function that will join overlapping
    # ranges and find the deletions in the genomic sequence.
    self.missingRange()
def ATprofile(inputbed, genome2bit):
    """Return the AA/TT/AT/TA dinucleotide frequency at 140 read offsets.

    For every read a 141-base window is taken from the genome: bases
    ``start+4 .. start+144`` for ``'+'`` reads, and bases
    ``end-145 .. end-5`` reversed for all other reads. Reads whose window
    is not exactly 141 bases long are ignored.

    :param inputbed: tab-separated BED with the strand in column 6
    :param genome2bit: path of the 2bit genome
    :return: list of 140 floats; entry ``i`` is the fraction of counted
        reads whose dinucleotide at offset ``i`` is AA, TT, AT or TA.
        All zeros when no read could be counted.
    """
    at_dinucs = ("AA", "TT", "AT", "TA")
    genome = twobitreader.TwoBitFile(genome2bit)
    ATfrac = [0] * 140
    reads_count = 0

    with open(inputbed) as inf:
        for line in inf:
            ll = line.strip().split('\t')
            if ll[5] == "+":
                left = int(ll[1])
                seq = genome[ll[0]][(left + 4):(left + 145)].upper()
            else:
                right = int(ll[2])
                # Reversed only; the dinucleotide set is unchanged by
                # complementing.
                seq = genome[ll[0]][(right - 145):(right - 4)].upper()[::-1]
            if len(seq) != 141:
                continue
            reads_count += 1
            for i in range(140):
                if seq[i:(i + 2)] in at_dinucs:
                    ATfrac[i] += 1

    # Avoid dividing by zero when no read produced a full window.
    if reads_count == 0:
        return [0.0] * 140
    return [(count * 1.0) / (reads_count * 1.0) for count in ATfrac]
def seqbias(out, sequence, rflank, lflank):
    """Count genome-wide occurrences of every (lflank + rflank)-mer.

    Chromosomes whose name contains an underscore are skipped. For each
    remaining chromosome every window of the sequence and of ``rev()`` of
    the sequence is counted, unless the window contains a lower-case
    a/t/c/g/n or an upper-case N. The counts are written to ``out`` as
    ``<nmer>\\t<count>`` lines sorted by n-mer.

    :param out: output file path
    :param sequence: path of the 2bit genome
    :param rflank: number of bases on the right of the cut
    :param lflank: number of bases on the left of the cut
    """
    genome = twobitreader.TwoBitFile(sequence)
    width = lflank + rflank
    # keep count of the number of occurrences of each n-mer
    seq_nmer_dict = make_nmer_dict(width)
    # Any of these characters disqualifies a window.
    rejected = set('atcgnN')

    for chrom in genome.keys():
        if '_' in chrom:
            continue
        forward = genome[chrom][:]
        backward = rev(forward)
        for pos in range(len(forward) - width + 1):
            stop = pos + width
            for nmer in (forward[pos:stop], backward[pos:stop]):
                if rejected.isdisjoint(nmer):
                    seq_nmer_dict[nmer] += 1

    with open(out, 'w') as outf:
        for seqtype in sorted(seq_nmer_dict.keys()):
            outf.write("%s\t%s\n" % (seqtype, seq_nmer_dict[seqtype]))
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,Ipcut,Incut,pspan,tspan,gen,left,right,fetch_length=100):
    """Annotate each input region with cut profiles and bias background.

    For every line of ``inputfile`` the original columns are written to
    ``outputfile`` followed by: TC and FOS (from ``makeTCFOS``), the four
    ``make_cut`` profiles (pcut, ncut, Ipcut, Incut) and the plus/minus
    background values looked up in the tables returned by ``readBG``.

    Lines are skipped when the (strand-swapped) plus profile is ``'NA'`` or
    when the fetched genome sequence contains an ``N``.

    ``pspan`` is reduced by half the width of the first region in the file
    before any region is processed.

    NOTE(review): the four bigWig file objects opened here are never closed
    explicitly.
    """
    # p=BwIO(pcut)
    # chrom_len = {}
    # for i in p.chromosomeTree['nodes']:
    #     chrom_len[i['key']] = i['chromSize']
    genome = twobitreader.TwoBitFile(gen)
    pcutbw = BigWigFile(open(pcut, 'rb'))
    ncutbw = BigWigFile(open(ncut, 'rb'))
    Ipcutbw = BigWigFile(open(Ipcut, 'rb'))
    Incutbw = BigWigFile(open(Incut, 'rb'))
    inf = open(inputfile)
    # Peek at the first line to get the region width, then rewind.
    testll = inf.readline().split()
    ml = int(testll[2]) - int(testll[1])
    # NOTE(review): ``ml/2`` is integer division under Python 2 only; under
    # Python 3 pspan would become a float and the slices below would fail.
    pspan = pspan - ml/2
    inf.seek(0)
    pBG,nBG = readBG(BGmatrix)
    outf = open(outputfile,'w')
    for line in inf:
        ll = line.split()
        chrom = ll[0]
        start = int(ll[1])
        end = int(ll[2])
        strand = ll[5]
        seq = genome[chrom][(start-pspan-left):(end + pspan+right)]
        pout = make_cut(pcutbw,ll,pspan,fetch_length)
        nout = make_cut(ncutbw,ll,pspan,fetch_length)
        Ipout = make_cut(Ipcutbw,ll,pspan,fetch_length)
        Inout = make_cut(Incutbw,ll,pspan,fetch_length)
        # Minus-strand regions swap the plus and minus profiles.
        if strand == "-":
            pout,nout = nout,pout
            Ipout,Inout = Inout,Ipout
        # Only pout is tested for 'NA' (after the swap above).
        if pout == 'NA':
            continue
        if 'N' in seq.upper():
            continue
        #print 1
        # The minus-strand sequence is the plus one shifted by one base.
        pseq = seq[:-1]
        nseq = seq[1:]
        p=[]
        n=[]
        # One background value per (left+right)-mer window.
        for k in range(len(pseq) +1 - left-right):
            p.append(pBG[pseq[k:k+left+right].upper()])
            n.append(nBG[nseq[k:k+left+right].upper()])
        if strand != '-':
            pbglist = p
            nbglist = n
        else:
            # Minus strand: swap the two lists and reverse their order.
            pbglist = n[::-1]
            nbglist = p[::-1]
        TC,FOS = makeTCFOS(pcutbw,ncutbw,ll,tspan,ml)
        newll = ll + [TC,FOS] + pout + nout + Ipout + Inout + pbglist + nbglist
        outf.write("\t".join(map(str,newll))+"\n")
    outf.close()
    inf.close()
def render_indel_html(chrom, pos, ref_seq, alt_seq,
                      twobit_file='ref/mm10.2bit', boundary=20):
    """Render an indel and its flanking reference sequence as HTML.

    :param chrom: chromosome name as stored in the 2bit file
    :param pos: position of the first reference base
        (presumably 1-based, as the slices use ``pos - 1`` - confirm)
    :param ref_seq: reference allele
    :param alt_seq: alternate allele
    :param twobit_file: path of the 2bit reference
    :param boundary: number of flanking bases shown on each side
    :return: ``<pre>`` block holding a size summary line, the REF line and
        the ALT line; the shorter allele is right-padded with ``-``
    """
    import twobitreader

    boundary = boundary + 1
    ref_length = len(ref_seq)
    alt_length = len(alt_seq)

    # is indel a direct sub, a delete or insert?
    # this affects colours rendered in sequence also
    if ref_length > alt_length:
        in_or_del = 'delete'
        bp_text = "<strong class='%s'>%sbp deletion</strong>" \
            % (in_or_del, ref_length - alt_length)
    elif ref_length < alt_length:
        in_or_del = 'insert'
        bp_text = "<strong class='%s'>%sbp insert</strong>" \
            % (in_or_del, alt_length - ref_length)
    else:
        in_or_del = 'sub'
        bp_text = "<strong class='%s'>%sbp substitution</strong>" \
            % (in_or_del, alt_length)

    # Clamp at the chromosome start: a negative slice start would not
    # select the bases directly upstream of ``pos``.
    startpos = max(pos - boundary, 0)
    endpos = pos + boundary

    twobit_ref = twobitreader.TwoBitFile(twobit_file)
    try:
        chrom_seq = twobit_ref[chrom]
        prefix = chrom_seq[startpos:pos - 1]
        suffix = chrom_seq[pos + ref_length - 1:endpos + ref_length - 2]
    finally:
        # The reference file used to be left open.
        twobit_ref.close()

    open_tag = '<strong class="' + in_or_del + '">'
    ref_html = '<strong>REF: </strong>' + prefix + open_tag + \
        ref_seq.ljust(alt_length, '-') + '</strong>' + suffix
    alt_html = '<strong>ALT: </strong>' + prefix + open_tag + \
        alt_seq.ljust(ref_length, '-') + '</strong>' + suffix

    return '<pre>%s<br/>%s<br/>%s</pre>' % (bp_text, ref_html, alt_html)
def seqbias(peak, tag, out, sequence, kmer):
    """Estimate per-k-mer cut bias from tags relative to peak background.

    Background: every 17-base window inside each peak region is reduced to
    a k-mer by ``SOBfetchSEQ`` and counted. Foreground: for every tag a
    17-base window around the 5' end (columns 2/3 depending on strand) is
    reduced the same way and counted. One line per k-mer is written to
    ``out``: k-mer, foreground/background ratio (``-1`` when the background
    count is 0), foreground count and background count.

    :param peak: BED file with the background regions
    :param tag: BED file with the tags; once a line with fewer than six
        columns is seen, that line and all later ones are counted on both
        strands
    :param out: output file path
    :param sequence: path of the 2bit genome
    :param kmer: k-mer specification handed to ``make_nmer_dict`` and
        ``SOBfetchSEQ``
    """
    genome = twobitreader.TwoBitFile(sequence)
    pcut = make_nmer_dict(kmer)
    bgseq = make_nmer_dict(kmer)

    with open(peak) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            seq = genome[ll[0]][int(ll[1]):int(ll[2])]
            # Only centres with 6 bases before and 11 after give a full
            # 17-base window; all other centres were discarded by the
            # length check before.
            for i in range(6, len(seq) - 10):
                subseq = SOBfetchSEQ(seq[(i - 6):(i + 11)], kmer)
                if subseq in bgseq:
                    bgseq[subseq] += 1

    PEtag = 0
    with open(tag) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            if len(ll) < 6:
                PEtag = 1
            if PEtag == 1 or ll[5] in ('+', '.'):
                left = int(ll[1])
                rawseq = genome[ll[0]][(left - 6):(left + 11)].upper()
                # A short window drops the whole tag, including its
                # minus-strand end.
                if len(rawseq) != 17:
                    continue
                seq = SOBfetchSEQ(rawseq, kmer)
                if seq in pcut:
                    pcut[seq] += 1
            if PEtag == 1 or ll[5] in ('-', '.'):
                right = int(ll[2])
                rawseq = rev(genome[ll[0]][(right - 11):(right + 6)].upper())
                if len(rawseq) != 17:
                    continue
                seq = SOBfetchSEQ(rawseq, kmer)
                if seq in pcut:
                    pcut[seq] += 1

    with open(out, 'w') as outf:
        for seqtype in sorted(pcut.keys()):
            if bgseq[seqtype] == 0:
                pbias = -1
            else:
                pbias = float(pcut[seqtype]) / float(bgseq[seqtype])
            outf.write("\t".join(
                map(str, [seqtype, pbias, pcut[seqtype], bgseq[seqtype]])) +
                "\n")
def fetch_subseq(path, chrom, start, end, strand):
    """Return ``chrom[start:end]`` from a 2bit file.

    :param path: path of the 2bit genome
    :param chrom: sequence name inside the 2bit file
    :param start: slice start
    :param end: slice end
    :param strand: ``"-"`` passes the slice through ``reverse``; any other
        value returns it unchanged
    """
    genome = twobitreader.TwoBitFile(path)
    try:
        subseq = genome[chrom].get_slice(start, end)
    finally:
        # The file used to stay open after every call.
        genome.close()
    if strand == "-":
        subseq = reverse(subseq)
    return subseq
def test_twobit_chr1_sequence(self):
    """The full chr1 sequence of the test file round-trips through str()."""
    t = twobitreader.TwoBitFile(self.filename)
    try:
        chr1 = str(t['chr1'])
        self.assertEqual(
            chr1,
            'GAACATGTACAACCTGACCTTCCACgaacatgtacaacctgaccttccacNNNNATGTACAACCTGACCTTCCAC'
        )
    finally:
        # Close the file even when the assertion fails.
        t.close()
def load_genomes(UTRfilestring, firstStopsCSV, twobitfile):
    """Load the UTR dictionary, the first-stop table and the 2bit genome.

    Kept as a separate function so that these only need to be loaded a
    single time.

    :param UTRfilestring: path of the UTR table parsed by
        ``rph.readindict``
    :param firstStopsCSV: CSV read with its first column as the index
    :param twobitfile: path of the 2bit genome
    :return: ``(UTRdict, utr3adj, genome)``
    """
    # Close the table once it has been parsed; the handle used to be left
    # open. Assumes rph.readindict reads the whole file before returning -
    # TODO confirm.
    # NOTE(review): mode "rU" is kept as-is, but the 'U' flag was removed
    # in Python 3.11.
    with open(UTRfilestring, "rU") as utr_handle:
        UTRdict = rph.readindict(utr_handle)
    utr3adj = pd.read_csv(firstStopsCSV, index_col=0)
    genome = twobitreader.TwoBitFile(twobitfile)  # do we actually need to load this in here?
    return UTRdict, utr3adj, genome
def __init__(self, path, chrom, sense, system=None, split_chrom="", **kwargs):
    """Open ``<path>/<system>.2bit`` or fall back to dummy mode.

    ``sense_specific`` is forced to ``False`` before the parent constructor
    runs. On success the sequence sizes are stored on the file object as
    ``chrom_stats`` and, when ``split_chrom`` is given, ``chrom_lookup``
    maps the part of every sequence name before ``split_chrom`` to the
    full name. When the file cannot be opened (``IOError``) the accessor
    is rewired to ``get_dummy`` and ``no_data`` is set.
    """
    kwargs['sense_specific'] = False
    super(TwoBitAccessor, self).__init__(path, chrom, sense, system=system,
                                         **kwargs)
    import twobitreader as TB
    from time import time

    self.system = system
    self.data = None

    # try to access the whole genome, using indexing for fast lookup
    fname = os.path.join(path, "{0}.2bit".format(system))
    self.logger = logging.getLogger(
        'byo.io.TwoBitAccessor({0})'.format(fname))

    started = time()
    try:
        self.data = TB.TwoBitFile(fname)
        opened = time()

        sizes = self.data.sequence_sizes()
        self.data.chrom_stats = sizes

        lookup = {}
        if split_chrom:
            for full_name in sizes.keys():
                lookup[full_name.split(split_chrom)[0]] = full_name
        self.chrom_lookup = lookup

        self.covered_strands = '*'
        self.logger.info("file provides {0} sequences.".format(len(sizes)))
    except IOError:
        opened = time()
        # all fails: return Ns only
        self.logger.warning(
            "Could not access '{0}'. Switching to dummy mode (only Ns)".
            format(fname))
        self.get_data = self.get_dummy
        self.get_oriented = self.get_dummy
        self.covered_strands = [chrom + '+', chrom + '-']
        self.no_data = True

    # TODO: maybe remove this if not needed
    self.get = self.get_oriented

    finished = time()
    self.logger.debug(
        "opening 2bit file took {0:.1f}ms, entire constructor {1:.1f}ms".
        format((opened - started) * 1000., (finished - started) * 1000.))
def main():
    """Score reference vs. mutated CTCF motif sequences per cancer type.

    For every cancer type and binding type ('gained'/'lost') the matching
    mutation table in ``f2_mutation_on_cancer_specific_CTCF`` is read. For
    each row that carries at least one mutation, the 19-base genome window
    ``[mid-10, mid+9)`` and its mutated version are scored with the CTCF
    log2 likelihood-ratio matrix; rows without a mutation get score 0 in
    all four score columns. One CSV per (cancer type, binding type) is
    written to ``f6_mutation_matrix_score``.

    All input locations are hard-coded paths.
    """
    outdir = 'f6_mutation_matrix_score'
    os.makedirs(outdir,exist_ok=True)
    indir = 'f2_mutation_on_cancer_specific_CTCF'
    # CTCF position weight matrix
    pwm_file ='/nv/vol190/zanglab/shared/Motif/pwm/jaspar_vertebrates/CTCF_MA0139.1.txt'
    # ACGT
    pwm = pd.read_csv(pwm_file,header=None,sep='\t')
    pwm.columns=['A','C','G','T']
    # this is only tested bg_var from /nv/vol190/zanglab/shared/Motif/sites/hg38_fimo_jarspar/raw_results/CTCF/fimo.txt
    bg_var=0.025
    # Background frequencies in the same A, C, G, T column order as pwm.
    bg=[0.25+bg_var,0.25-bg_var,0.25-bg_var,0.25+bg_var]
    # The small pseudo-count keeps log2 finite for zero PWM entries.
    log2LikelihoodRatio = np.log2((pwm+0.0000001)/bg)

    # hg38 sequencing
    genome = twobitreader.TwoBitFile('/nv/vol190/zanglab/zw5j/work2017/fusion_analysis/111_fusion_append/4_CTCFpairing_GCskew_RLoop_RNAprotein_GTExFusion/f0_infiles/hg38.2bit')
    # Maps each cell-type label to the prefix used in the mutation file name.
    cancertype_mutation_matchness = {'BRCA':'BRCA','CRC':'COAD','LUAD':'LUAD','PRAD':'PRAD','AML':'AML','PRAD_TissueAdded':'PRAD'}
    cancertypes=['BRCA','CRC','LUAD','AML','PRAD','PRAD_TissueAdded']
    for celltype in cancertypes:
        for binding_type in ['gained','lost']:
            mutation_info_file ='{}/{}_mutation_on_{}_{}_expand9.csv'.format(indir,cancertype_mutation_matchness[celltype],celltype,binding_type)
            df = pd.read_csv(mutation_info_file,sep='\t',index_col=1,low_memory=False)
            # for each binding position with motif, get the DNA sequence
            # NOTE(review): rows are added one at a time via .loc, which
            # fixes the column order and dtypes of the CSV written below.
            matching_score_df = pd.DataFrame()
            # NOTE(review): ``id`` and ``chr`` shadow the builtins inside
            # this loop.
            for id in df.index:
                chr = df.loc[id,'chr']
                mid = df.loc[id,'mid']
                strand = df.loc[id,'strand']
                mutation_info = df.loc[id,'mutation'].split(',')
                mutation_info_len = [len(i) for i in mutation_info]
                # check if there is a mutation
                matching_score_df.loc[id,'chr'] = chr
                matching_score_df.loc[id,'mid'] = mid
                matching_score_df.loc[id,'strand'] = strand
                if sum(mutation_info_len) !=0:
                    # The trailing ``;sequence`` is a no-op expression.
                    sequence = genome[chr][mid-9-1:mid+9];sequence
                    sequence = Seq(sequence).upper()
                    # NOTE(review): this check is skipped under ``python -O``.
                    assert if_all_ATCG(sequence)
                    alt_sequence = get_altered_sequencing(sequence,mutation_info,mutation_info_len)
                    seq_score,rev_seq_score = return_matching_score(log2LikelihoodRatio,sequence)
                    alt_seq_score,rev_alt_seq_score = return_matching_score(log2LikelihoodRatio,alt_sequence)
                    #matching_score_df[id,'sequence'] = sequence
                    # matching_score_df[id,'alt_sequence'] = alt_sequence
                    matching_score_df.loc[id,'seq_score'] = seq_score
                    matching_score_df.loc[id,'alt_seq_score'] = alt_seq_score
                    matching_score_df.loc[id,'rev_seq_score'] = rev_seq_score
                    matching_score_df.loc[id,'rev_alt_seq_score'] = rev_alt_seq_score
                else:
                    matching_score_df.loc[id,'seq_score'] = 0
                    matching_score_df.loc[id,'alt_seq_score'] = 0
                    matching_score_df.loc[id,'rev_seq_score'] = 0
                    matching_score_df.loc[id,'rev_alt_seq_score'] = 0
            matching_score_df.to_csv('{}/{}_{}.csv'.format(outdir,celltype,binding_type))#;exit()
def list_sgrnas(genes_file, input_prefix, GC_cutoff, spacing, guides_per_gene, gecko, sam):
    """
    Returns a list of (ontarget) sgrna sequences using a genome file and
    list of transcription start sites from a .csv file.

    The genome is read from ``<input_prefix>.2bit``. The first row of
    ``genes_file`` is used as the column names; the code reads the
    columns ``chrom``, ``start``, ``end`` and ``name``. For each gene the
    region ``[start, end + 1)`` is fetched; genes whose region contains
    an ``N`` are reported and skipped. Candidate guides come from
    ``get_sorted_guides``; each guide list gets the value that
    ``get_ot_guides`` returns for its spacer appended, and at most
    ``guides_per_gene`` guides with the highest such value are kept.

    When ``gecko`` or ``sam`` is truthy, an oligo (flank + spacer + flank)
    is appended to every kept guide.

    NOTE(review): this block uses Python 2 syntax (``print`` statement,
    ``long``).
    """
    genome_2bit_file = input_prefix + '.2bit'
    tbf = twobitreader.TwoBitFile(genome_2bit_file)
    final_guides = []
    with open(genes_file, 'rb') as gf:
        f = [row for row in csv.reader(gf.read().splitlines())]
    for i, l in enumerate(f):
        # The header row only supplies the column names.
        if i == 0:
            columns = l
            continue
        #fetch the current gene and region
        gene = dict([(columns[i], e) for i, e in enumerate(l)])
        region_bounds = [long(gene["start"]), long(gene["end"]) + 1]
        region = tbf[gene["chrom"]][region_bounds[0]:region_bounds[1]]
        region = region.upper()
        if "N" in region:
            print "found N in target region of", gene["name"]
            continue
        #identify and filter guides that target region
        guides = get_sorted_guides(region, gene, GC_cutoff, spacing, input_prefix)
        if len(guides) == 0:
            continue
        #add offtarget scores to filtered guides and select guides with higher offtarget scores
        ot_guides_sql = get_ot_guides(guides, input_prefix)
        ot_guides_dict = dict(ot_guides_sql)
        for g in guides:
            # g[1] is used as the spacer; the score is appended in place.
            spacer = g[1]
            g.append(ot_guides_dict[spacer])
        #sort and add guides with the highest offtarget scores to final guides
        guides = sorted(guides, key=itemgetter(-1), reverse=True)
        if len(guides) <= guides_per_gene:
            final_guides.extend(guides)
        else:
            final_guides.extend(guides[:guides_per_gene])
    # add gecko or sam flanking sequences to the spacer for the oligo library
    if sam or gecko:
        for guide in final_guides:
            spacer = guide[1]
            # When both flags are set, the sam oligo overwrites the gecko
            # one.
            if gecko:
                oligo = gecko_flank[0] + spacer + gecko_flank[1]
            if sam:
                oligo = sam_flank[0] + spacer + sam_flank[1]
            guide.append(oligo)
    return final_guides
def getTwoBits():
    """Open every ``*.2bit`` file of the MM10 directory.

    :return: dict mapping the fifth dot-separated field of each file name
        to an open ``TwoBitFile``; file names with fewer fields raise
        ``IndexError``
    """
    import twobitreader as tbr

    basedir = "/home/clarkg/MouseGenomes/MM10/"
    genome = {}
    for path in glob.glob(basedir + "*.2bit"):
        filename = path.split("/")[-1]
        chrm = filename.split(".")[4]
        genome[chrm] = tbr.TwoBitFile(path)
    return genome
def _examinePSL(querySeq, blatinfo):
    """Return the deletions found for the first hit that has any.

    For every hit the target interval is read from
    ``<dirset.chromosomes>/<chrom>.2bit``, the genomic span of each block
    is collected, and ``missingRange`` is asked for the gaps between them.
    A non-empty result gets the target strand inserted at index 0.

    :return: the first non-empty deletion list, or ``[]`` when no hit
        produced one

    NOTE(review): the loop iterates over ``results``, which is not defined
    in this function, and rebinds the ``blatinfo`` parameter, so the
    argument passed by the caller is never used - confirm whether
    ``results`` is an intended module-level name.
    """
    multiDels = []
    for blatinfo in results:
        qsize, qCuts, qblocks, tStart, tEnd, tStrand, tCuts, matches, coverage, chromstr = blatinfo
        chromfile = os.path.join(dirset.chromosomes, chromstr + ".2bit")
        bitfile = tbr.TwoBitFile(chromfile)
        genomicSequence = bitfile[chromstr][int(tStart):int(tEnd)]
        #flagged=_initAlign(querySeq,genomicSequence,tStrand)
        #if flagged:
        #    genomeGaps=map(lambda l :int(tStart)+l,flagged)
        #    gapSpans=group_consecutives(genomeGaps)
        #    #print "INSERTIONS INTO QUERY at points:"
        #    for i,gp in enumerate(gapSpans):
        #        if len(gp) == 2:
        #            print "\t\t"+str(i+1)+".)\t"+chromstr+":"+"-".join([str(gp[0]),str(gp[1])])
        #        elif len(gp) == 1:
        #            print "\t\t"+str(i+1)+".)\t"+chromstr+":"+str(gp[0])
        #    qblocks=[]
        genomicFlats = []
        for x in range(len(qblocks)):
            try:
                qB = int(qCuts[x])
                lenB = int(qblocks[x])
                tS = int(tCuts[x])
                # Block position relative to the slice fetched above.
                genomicCut = genomicSequence[(int(tS) - int(tStart)):(int(tS) - int(tStart) + lenB)]
                genomicFlats.append([int(tS), int(tS) + lenB])
                # location, Gseq and Qseq are computed but not used below.
                location = chromstr + "(" + tStrand + "):" + str(tS)
                if tStrand == "-":
                    Gseq = str(Seq(genomicCut).reverse_complement())
                else:
                    Gseq = genomicCut
                Qseq = querySeq[qB:(qB + lenB)]
                # print ">Query\n",Qseq
                # print ">Genom\n",Gseq
            except ValueError:
                # A ValueError raised anywhere in the block is ignored.
                pass
        deletions = missingRange(genomicFlats, tStart, tEnd)
        if len(deletions):
            deletions.insert(0, tStrand)
            multiDels.append(deletions)
    # print "\n\n"
    # print len(multiDels[0])
    # time.sleep(5)
    if len(multiDels):
        return multiDels[0]
    else:
        return []
def main(args):
    """Turn a variant table plus Mutalyzer results into a VCF-style file.

    Reads ``args.dat_file`` and ``args.mutalyzer_results`` (both
    tab-separated), drops rows that have no usable ref/alt, normalises
    the allele and c. columns, joins the Mutalyzer chromosomal variants on
    ``'Input Variant'``, derives ``chrom``/``pos``, de-duplicates, and
    hands the result to ``write_tab`` together with the 2bit genome and
    ``args.vcf_out``.
    """
    dat_file = args.dat_file
    mutalyzer_results = args.mutalyzer_results
    twobit_file = args.twobit_file
    vcf_out = args.vcf_out
    #coord_hash = load_coord_hash(args.blat_coord_hash)
    mut_df = pandas.read_csv(mutalyzer_results, sep='\t')
    #mut_df.loc[:, 'Chromosomal Variant'] = mut_df.apply(lambda row: fix_mutalyzer(row, coord_hash), axis=1)
    genome = twobitreader.TwoBitFile(twobit_file)
    df_init = pandas.read_csv(dat_file, sep='\t')
    # Keep rows with a real Ref, an Alt that is not dup/del/ins, and no
    # 'Leu' in the transcript string.
    crit = df_init.apply(lambda row: row['Ref'] != '-' and not row['Alt'] in ('dup', 'del', 'ins') and not 'Leu' in row['Transcript'], axis=1)
    df_pre = df_init[crit]

    # rm duplicate rows
    # NOTE(review): df_pre is a filtered selection of df_init; assigning a
    # column into it may trigger pandas' SettingWithCopyWarning.
    df_pre.loc[:, 'simple_nm'] = df_pre.apply(fix_transcript, axis=1)
    cols = [x for x in df_pre.columns.values if x != 'Transcript']
    # Duplicates are judged on every column except 'Transcript'.
    idx_vals = df_pre[cols].drop_duplicates().index.values
    df = df_pre.loc[idx_vals][df_init.columns.values]

    df.loc[:, 'ref'] = df.apply(fix_ref, axis=1)
    df.loc[:, 'alt'] = df.apply(fix_alt, axis=1)
    df.loc[:, 'c.'] = df.apply(fix_cdot, axis=1)
    df.loc[:, 'Input Variant'] = df.apply(mk_var_new, axis=1)
    # Whitespace inside the classification is replaced by underscores.
    df.loc[:, 'clinical_class'] = df.apply(lambda row: '_'.join(str(row['Classification']).split()), axis=1)

    # should be left, but inner to skip mistakes/missing for mutalyzer
    # need to drop bad mutalyzer results
    # (the left join followed by dropna on 'Chromosomal Variant' removes
    # rows without a Mutalyzer match)
    m = pandas.merge(df, mut_df, on='Input Variant', how='left').dropna(subset=('Chromosomal Variant',))
    m.loc[:, 'chrom'] = m.apply(get_chrom, axis=1)
    m.loc[:, 'pos'] = m.apply(get_pos, axis=1)
    # print('debug')
    # print(m[m.pos==149771732][['chrom','pos','fixed_chrom']])
    # print('debug')
    uc = ['chrom', 'pos', 'ref', 'alt', 'clinical_class', 'Pos Fam Cnt', 'Neg Fam Cnt', 'Homozygous Fam Cnt', 'Hemizygous Fam Cnt', 'Heterozygous Fam Cnt', 'Chromosomal Variant', 'Input Variant']
    new_cols = {'Pos Fam Cnt':'pos_fam_count', 'Neg Fam Cnt':'neg_fam_count', 'Homozygous Fam Cnt':'hom_fam_count', 'Hemizygous Fam Cnt':'hemi_fam_count', 'Heterozygous Fam Cnt':'het_fam_count' }
    df_final = m[uc].rename(columns=new_cols).sort_values(by=['chrom', 'pos'])
    # Final de-duplication ignores 'Input Variant'.
    final_cols = [x for x in df_final.columns.values if x != 'Input Variant']
    idx_vals = df_final[final_cols].drop_duplicates().index.values
    df_fix = df_final.loc[idx_vals]

    write_tab(df_fix, genome, vcf_out)
def seqbias(peak, tag, out, sequence, flank):
    """Estimate per-n-mer cut bias from tags relative to peak background.

    Background: ``2 * flank``-mers inside each peak region are counted.
    Foreground: for every tag the ``2 * flank``-mer centred on the 5' end
    (column 2 for ``+``, column 3 passed through ``rev`` for ``-``) is
    counted. One line per n-mer is written to ``out``: n-mer,
    foreground/background ratio (``-1`` when the background count is 0),
    foreground count and background count.

    :param peak: BED file with the background regions
    :param tag: BED file with the tags; chromosomes containing ``.`` or
        ``_`` are skipped and ``chrMT`` is read as ``chrM``. Once a line
        with fewer than six columns is seen, that line and all later ones
        are counted on both strands
    :param out: output file path
    :param sequence: path of the 2bit genome
    :param flank: half the n-mer length
    """
    genome = twobitreader.TwoBitFile(sequence)
    width = 2 * flank
    pcut = make_nmer_dict(width)
    bgseq = make_nmer_dict(width)

    with open(peak) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            seq = genome[ll[0]][int(ll[1]):int(ll[2])]
            # NOTE(review): this range stops one short of the last full
            # window in the region; kept as-is to leave the counts
            # unchanged - confirm whether that is intended.
            for i in range(len(seq) - width):
                subseq = seq[i:(i + width)]
                if subseq in bgseq:
                    bgseq[subseq] += 1

    PEtag = 0
    with open(tag) as inf:
        for line in inf:
            ll = line.strip().split("\t")
            if "." in ll[0] or "_" in ll[0]:
                continue
            chrm = 'chrM' if ll[0] == "chrMT" else ll[0]
            if len(ll) < 6:
                PEtag = 1
            if PEtag == 1 or ll[5] in ('+', '.'):
                left = int(ll[1])
                seq = genome[chrm][(left - flank):(left + flank)].upper()
                if seq in pcut:
                    pcut[seq] += 1
            if PEtag == 1 or ll[5] in ('-', '.'):
                right = int(ll[2])
                seq = rev(genome[chrm][(right - flank):
                                       (right + flank)].upper())
                if seq in pcut:
                    pcut[seq] += 1

    with open(out, 'w') as outf:
        for seqtype in sorted(pcut.keys()):
            if bgseq[seqtype] == 0:
                pbias = -1
            else:
                pbias = float(pcut[seqtype]) / float(bgseq[seqtype])
            outf.write("\t".join(
                map(str, [seqtype, pbias, pcut[seqtype], bgseq[seqtype]])) +
                "\n")
def divideReadsFromBias(infile, outname, BiasfileA, BiasfileD, sequence,
                        flank):
    """Split reads into four files by ATAC and DNase sequence bias.

    The ``2 * flank``-mer around each read's 5' end is looked up in both
    bias tables. Reads whose n-mer is missing from either table are
    dropped; all others are written to ``<outname>_all.bed`` and to one of
    ``_biasG11/_biasG10/_biasG01/_biasG00.bed``, where the first digit is 1
    when the ATAC bias exceeds its median and the second when the DNase
    bias exceeds its median.

    Reads without a ``+``/``-`` strand are assigned one end at random
    (fixed seed 1228) and rewritten as a 1-bp record on that strand.

    :param infile: tab-separated BED with the strand in column 6
    :param outname: output prefix
    :param BiasfileA: ATAC bias table read by ``readDict``
    :param BiasfileD: DNase bias table read by ``readDict``
    :param sequence: path of the 2bit genome
    :param flank: half the n-mer length
    """
    genome = twobitreader.TwoBitFile(sequence)
    biasATAC, medATAC = readDict(BiasfileA)
    biasDNase, medDNase = readDict(BiasfileD)

    with open(infile) as inf, \
            open(outname + "_all.bed", 'w') as outf, \
            open(outname + "_biasG11.bed", 'w') as outf11, \
            open(outname + "_biasG10.bed", 'w') as outf10, \
            open(outname + "_biasG01.bed", 'w') as outf01, \
            open(outname + "_biasG00.bed", 'w') as outf00:
        random.seed(1228)
        for line in inf:
            ll = line.strip().split("\t")
            chrom_seq = genome[ll[0]]
            left = int(ll[1])
            right = int(ll[2])

            if ll[5] == '+':
                seq = chrom_seq[(left - flank):(left + flank)].upper()
            elif ll[5] == '-':
                seq = rev(chrom_seq[(right - flank):(right + flank)].upper())
            else:
                # Unstranded read: pick one end at random and emit it as a
                # stranded 1-bp record.
                rdnum = random.randint(0, 1)
                if rdnum == 0:
                    seq = chrom_seq[(left - flank):(left + flank)].upper()
                    newll = [ll[0], ll[1], str(left + 1), ll[3], ll[4], "+"]
                else:
                    seq = rev(
                        chrom_seq[(right - flank):(right + flank)].upper())
                    newll = [ll[0], str(right - 1), ll[2], ll[3], ll[4], "-"]
                line = "\t".join(newll) + "\n"

            if seq not in biasATAC or seq not in biasDNase:
                continue
            biasA = biasATAC[seq]
            biasD = biasDNase[seq]

            outf.write(line)
            if biasA > medATAC:
                target = outf11 if biasD > medDNase else outf10
            else:
                target = outf01 if biasD > medDNase else outf00
            target.write(line)
def test_pickle(self):
    """A pickled and restored TwoBitFile exposes the same sequences."""
    t = twobitreader.TwoBitFile(self.filename)
    try:
        buf = StringIO()
        pickle.dump(t, buf)
        buf.seek(0)
        t2 = pickle.load(buf)
        self.assertListEqual(sorted(t.keys()), sorted(t2.keys()))
        for k in t:
            self.assertEqual(str(t[k]), str(t2[k]))
    finally:
        # Close the file even when an assertion fails.
        t.close()
def get_twobit_seq(fpath, chrom):
    """
    Open a 2bit file and hand back one of its sequences.

    Important to remember here that the return value is a
    TwoBitSequence object, which allows sliced access.

    :param fpath: path of the 2bit file
    :param chrom: name of the sequence to look up
    :return: the entry stored under ``chrom``
    :rtype: TwoBitSequence
    """
    return tbr.TwoBitFile(fpath)[chrom]