def __init__(self, location, verbose=True):
    if os.path.isdir(location):
        # Directory of FASTA files: one entry per file, keyed by the file name
        # without its extension.
        self.genome = {}
        filenames = os.listdir(location)
        count = 0
        for f in filenames:
            count += 1
            self.genome.update(
                {f.split('.')[0]: self.readFasta(os.path.join(location, f))})
            if verbose:
                functions.showProgress(count, len(filenames), 'Loading genome')
        if verbose:
            print('')
    elif os.path.isfile(location):
        # Single (multi-)FASTA file: one entry per header, keyed by the first
        # header token without the leading '>'.
        self.genome = {}
        with open(location, 'r') as fc:
            line = fc.readline()
            header, *_ = line.split()
            tmp_name = header.lstrip('>')
            seq = ''
            line = fc.readline()
            while line:
                if line[0] == '>':
                    self.genome[tmp_name] = seq
                    seq = ''
                    header, *_ = line.split()
                    tmp_name = header.lstrip('>')
                else:
                    seq = seq + line.replace('\n', '').upper()
                line = fc.readline()
            self.genome[tmp_name] = seq
    else:
        sys.stderr.write('No file or directory. Exit\n')

def main(parclipfile, gfffile, upstream, downstream, sense, minSize, maxSize,
         verbose, xbins, ybins, vstring=''):
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    totalsize = upstream + maxSize + 1 + downstream
    anno.sort2size()
    pc = ParclipSiteContainer.from_file(parclipfile)
    mat = []
    annosize = []
    for g in range(anno.size()):
        tmp = [-1] * totalsize
        if verbose:
            functions.showProgress(g, (anno.size() - 1), vstring)
        if anno.strand[g] == '+':
            values = pc.getValues(anno.chr[g], anno.start[g], anno.strand[g],
                                  sense, upstream,
                                  (anno.stop[g] - anno.start[g]) + downstream)
        else:
            values = pc.getValues(anno.chr[g], anno.stop[g], anno.strand[g],
                                  sense, upstream,
                                  (anno.stop[g] - anno.start[g]) + downstream)
        if values is not None:
            # copy the retrieved values into the fixed-size row
            # (slicing to len(values) avoids growing the row by one element)
            tmp[0:len(values)] = values
        mat.append(functions.shrinkValues(tmp, xbins))
        annosize.append(anno.stop[g] - anno.start[g])
    if verbose:
        print()
    smat = []
    sannosize = []
    if ybins >= anno.size():
        print('Warning: --ybins >= entries in ' + gfffile)
        ybins = anno.size()
    ystep = round(anno.size() / ybins)
    ystart = 0
    ystop = ystep
    while ystop < anno.size():
        tmp = [0] * xbins
        for i in range(xbins):
            count = 0
            tmpanno = 0
            for j in range(ystart, ystop):
                tmp[i] += mat[j][i]  # [row][col]
                tmpanno += annosize[j]
                count += 1
            tmp[i] = tmp[i] / count
            tmpanno = tmpanno / count
        smat.append(tmp)
        sannosize.append(tmpanno)
        ystart = ystop
        ystop += ystep
    return smat, sannosize

def precalculateDistributions(minN, maxN, par, verbose):
    distributions = [0] * (maxN + 1)
    for i in range(minN, (maxN + 1)):
        tmp_probabilities = []
        for j in range((i + 1)):
            tmp_probabilities.append(prob_bg(j, i, par))
        distributions[i] = tmp_probabilities
        if verbose:
            functions.showProgress(i + 1, (maxN + 1),
                                   'precalculating probability distributions')
    if verbose:
        print('')
    return distributions

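# Usage sketch for precalculateDistributions (illustrative only; the exact
# meaning of `par` depends on prob_bg, defined elsewhere in this package --
# here it is assumed to be the background mutation probability).
if __name__ == '__main__':
    dists = precalculateDistributions(minN=2, maxN=100, par=0.005,
                                      verbose=False)
    # dists[n] is assumed to hold prob_bg(k, n, par) for k = 0..n, i.e. the
    # background probability of observing k mutations at coverage n.
    print(len(dists[50]))   # 51 entries: k = 0..50
    print(dists[50][3])     # background probability of 3 mutations at coverage 50
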
def processPileup(file_pileup, verbose, reference='T', mutation='C'):
    pseudocount = 1
    maxr = 1000  # upper limit for sequencing depth used for parameter estimation
    reference_reverse = functions.makeReverseComplement(reference)
    mutation_reverse = functions.makeReverseComplement(mutation)

    # count lines once so progress can be reported
    lines = 0
    with open(file_pileup) as handle:
        for line in handle:
            lines += 1

    # initialize count matrices for parameter estimation:
    # mr_list_*[coverage][mutations] holds the number of observed positions
    mr_list_neg = [[0]]
    mr_list_signal = [[0]]
    for i in range(1, maxr):
        mr_list_neg.append([pseudocount] * (i + 1))
        mr_list_signal.append([pseudocount] * (i + 1))

    with open(file_pileup, 'r') as pileup_handle:
        line = pileup_handle.readline()
        count = percent_old = percent_new = 0
        if verbose:
            functions.showProgress(count, lines, 'Processing Pileup')
        while line:
            count += 1
            split = line.split('\t')
            if split[2] == reference:
                tmp_counts = functions.getCounts(split[4], forward=True)
                counts = [tmp_counts[0], tmp_counts[1][mutation]]
                if counts[0] < maxr:
                    mr_list_signal[counts[0]][counts[1]] += 1
            elif split[2] == reference_reverse:
                tmp_counts = functions.getCounts(split[4], forward=False)
                counts = [tmp_counts[0], tmp_counts[1][mutation_reverse]]
                if counts[0] < maxr:
                    mr_list_signal[counts[0]][counts[1]] += 1
            else:
                tmp_counts = functions.getCounts(split[4], forward=True)
                counts = [tmp_counts[0], max(tmp_counts[1].values())]
                if counts[0] < maxr:
                    mr_list_neg[counts[0]][counts[1]] += 1
            percent_new = math.trunc((count / lines) * 100)
            if percent_new > percent_old:
                if verbose:
                    functions.showProgress(count, lines, 'Processing Pileup')
                percent_old = percent_new
            line = pileup_handle.readline()
    if verbose:
        print('')
    return [mr_list_neg, mr_list_signal]

def getCountMat(file_pileup, minCoverage, verbose):
    alphabet = ['A', 'C', 'G', 'T']
    translate = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    if verbose:
        lines = 0
        with open(file_pileup) as infile:
            for line in infile:
                lines += 1
    mat = [[0] * 4, [0] * 4, [0] * 4, [0] * 4]
    with open(file_pileup) as file_pileup:
        count = 0
        percent_old = 0
        percent_new = 0
        if verbose:
            functions.showProgress(count, lines, 'Processing Pileup')
        for line in file_pileup:
            count += 1
            split = line.split('\t')
            nuc = split[2].upper()
            if nuc != 'N':
                tmp_counts = functions.getCounts(split[4], forward=True)
                if tmp_counts[0] >= minCoverage:
                    for c in alphabet:
                        if c == nuc:
                            mat[translate[nuc]][translate[c]] += tmp_counts[0] - tmp_counts[2]
                        else:
                            mat[translate[nuc]][translate[c]] += tmp_counts[1][c]
            if verbose:
                percent_new = math.trunc(count / lines * 100)
                if percent_new > percent_old:
                    functions.showProgress(count, lines, 'Processing Pileup')
                    percent_old = percent_new
    if verbose:
        print()
    return mat

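# Usage sketch for getCountMat (illustrative; the pileup path is hypothetical).
# The returned 4x4 matrix is indexed [reference base][observed base] in the
# order A, C, G, T, so off-diagonal entries count mismatches.
if __name__ == '__main__':
    count_mat = getCountMat('sample.pileup', minCoverage=5, verbose=False)
    base_index = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    t_row = count_mat[base_index['T']]
    total_t = sum(t_row)
    if total_t > 0:
        # fraction of T positions read as C, the PAR-CLIP conversion signature
        print('T->C fraction: %.4f' % (t_row[base_index['C']] / total_t))
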
def main(parclipfile, outputfile, gfffile, downstream, upstream, gene, sense,
         minSize, maxSize, verbose, vstring=''):
    anno = gff.GFF(gfffile)
    anno.filterSize(minSize, maxSize)
    pc = ParclipSiteContainer()
    pc.loadFromFile(parclipfile)
    with open(outputfile, 'w') as fc_out:
        for g in range(anno.size()):
            if verbose:
                functions.showProgress(g, (anno.size() - 1), vstring)
            if anno.strand[g] == '+':
                values_upstream = pc.getValues(anno.chr[g], anno.start[g],
                                               anno.strand[g], sense,
                                               upstream, gene)
                values_downstream = pc.getValues(anno.chr[g], anno.stop[g],
                                                 anno.strand[g], sense,
                                                 gene, downstream)
            else:
                values_upstream = pc.getValues(anno.chr[g], anno.stop[g],
                                               anno.strand[g], sense,
                                               upstream, gene)
                values_downstream = pc.getValues(anno.chr[g], anno.start[g],
                                                 anno.strand[g], sense,
                                                 gene, downstream)
            if values_upstream is not None and values_downstream is not None:
                print(*chain(values_upstream, values_downstream),
                      sep='\t', file=fc_out)
    if verbose:
        print()

def main(parclip, outdir, prefix, genomepath, negset, gfffile, kmer, key,
         useQuantiles, verbose, args):
    scriptPath = os.path.dirname(os.path.realpath(__file__))
    plot_script = os.path.join(scriptPath, 'plotKmerLogOdds.R')
    pc = ParclipSiteContainer.from_file(parclip)
    if gfffile is not None:
        pc.remove_gff_sites(gfffile)
    pc.sort(by=key, ascending=False)
    kmers = functions.makekmers(kmer, list('ACGT'))[kmer - 1]
    negfreq = loadNegTable(negset)
    with EfficientGenome(genomepath) as genomeseq:
        allfreqs = []
        fileprefix = '%s_logodds_%smer_sort_%s' % (prefix, kmer, key)
        if useQuantiles:
            fileprefix = fileprefix + '_quantiles'
            allfreqs.append(
                getkmerLogs(pc, genomeseq, negfreq, kmers, 0, 1000, 15))
            quantiles = [
                0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.125, 0.15, 0.175, 0.2,
                0.225, 0.25, 0.275, 0.3, 0.325, 0.35, 0.375, 0.4, 0.45, 0.5,
                0.55, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9
            ]
            count = 1
            stop = 1000
            for q in quantiles:
                if verbose:
                    functions.showProgress(
                        count, len(quantiles),
                        'Getting kmer log-odds from quantiles...')
                old_stop = stop
                start = functions.getQuantileIndex(len(pc), q) - 500
                stop = functions.getQuantileIndex(len(pc), q) + 500
                if start < 0:
                    start = 0
                if stop > len(pc) - 2:
                    break
                count = count + 1
                if (stop - 500) < old_stop:
                    msg_pat = 'Bin %s and %s are overlapping by %s sites!'
                    # TODO 2x quantiles[count - 2] is probably a bug
                    msg = msg_pat % (quantiles[count - 2],
                                     quantiles[count - 2],
                                     old_stop - (stop - 500))
                    print(msg, file=sys.stderr)
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop, 15))
        else:
            maxsize = 50000
            stepsize = 1000
            start = 0
            stop = 1000
            run = True
            while run:
                if stop > len(pc) - 2 or stop > maxsize:
                    print()
                    print('STOP at: %s' % stop)
                    run = False
                    break
                if verbose:
                    functions.showProgress(
                        stop, maxsize, 'Getting kmer log-odds from bins...')
                allfreqs.append(
                    getkmerLogs(pc, genomeseq, negfreq, kmers, start, stop, 15))
                start = stop
                stop = stop + stepsize
    table_file = os.path.join(outdir, fileprefix + '.table')
    pdf_file = os.path.join(outdir, fileprefix + '.pdf')
    sortAndSave(allfreqs, table_file, kmers)
    cmd = [
        'R', '-q', '--slave',
        '-f %r' % plot_script,
        '--args',
        '%r' % table_file,
        '%r' % pdf_file,
    ]
    execute(cmd)
    if not args.keep_tmp_files:
        os.remove(table_file)

def findPvalueParclipInPileup(pileup, output_file, mincov, maxcov,
                              probabilities, verbose, reference='T',
                              mutation='C', maxPvalue=0.001, SNPlikely=False):
    reference_reverse = functions.makeReverseComplement(reference)
    mutation_reverse = functions.makeReverseComplement(mutation)
    found_sites = 0

    # count lines once so progress can be reported
    lines = 0
    with open(pileup) as handle:
        for line in handle:
            lines += 1

    with open(pileup) as file_pileup, open(output_file, 'w') as file_table:
        header = list(PC_MANDATORY_FIELDS) + ['p_value']
        print(*header, sep='\t', file=file_table)
        line = file_pileup.readline()
        linecount = 0
        percent_old = 0
        percent_new = 0
        if verbose:
            functions.showProgress(linecount, lines, 'Processing Pileup')
        while line:
            linecount += 1
            split = line.split('\t')
            counts = [0, 0]
            pvalue = 1
            if split[2] == reference:
                tmp_counts = functions.getCounts(split[4], forward=True)
                counts = [tmp_counts[0], tmp_counts[1][mutation]]
                if counts[0] > mincov and counts[1] > 0:
                    if counts[0] > 500:
                        # cap coverage at 500 by rescaling the mutation count
                        pvalue = getPvalue(
                            round((counts[1] / counts[0]) * 500), 500,
                            probabilities, SNPlikely)
                    else:
                        pvalue = getPvalue(counts[1], counts[0],
                                           probabilities, SNPlikely)
                    if pvalue <= maxPvalue:
                        print(split[0], split[1], counts[1], counts[0],
                              1 - pvalue, '+', 0, pvalue,
                              sep='\t', file=file_table)
                        found_sites += 1
            if split[2] == reference_reverse:
                tmp_counts = functions.getCounts(split[4], forward=False)
                counts = [tmp_counts[0], tmp_counts[1][mutation_reverse]]
                if counts[0] > mincov and counts[1] > 0:
                    if counts[0] > 500:
                        pvalue = getPvalue(
                            round((counts[1] / counts[0]) * 500), 500,
                            probabilities, SNPlikely)
                    else:
                        pvalue = getPvalue(counts[1], counts[0],
                                           probabilities, SNPlikely)
                    if pvalue <= maxPvalue:
                        print(split[0], split[1], counts[1], counts[0],
                              1 - pvalue, '-', 0, pvalue,
                              sep='\t', file=file_table)
                        found_sites += 1
            percent_new = math.trunc((linecount / lines) * 100)
            if percent_new > percent_old:
                if verbose:
                    functions.showProgress(linecount, lines,
                                           'Processing Pileup')
                percent_old = percent_new
            line = file_pileup.readline()
    print('Found %s PAR-CLIP sites.' % found_sites)

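# Pipeline sketch: precompute background distributions, then run the p-value
# scan above. All paths and numeric settings are illustrative assumptions;
# `par` is whatever background parameter prob_bg expects. Coverages above 500
# are rescaled to 500 inside findPvalueParclipInPileup, so maxN=500 suffices.
if __name__ == '__main__':
    probs = precalculateDistributions(minN=2, maxN=500, par=0.005,
                                      verbose=True)
    findPvalueParclipInPileup('sample.pileup', 'sites.table',
                              mincov=2, maxcov=500, probabilities=probs,
                              verbose=True, reference='T', mutation='C',
                              maxPvalue=0.001, SNPlikely=False)
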