def replacementDict(tc, dc, cc, tr):
    """Build the placeholder -> text mapping for one chunk of tags.

    For every tag index ``i`` the result holds ``tag_<i>`` (the tag itself),
    ``tag_<i>_aan`` (``a_or_an(tag)``), ``tag_<i>_predicate_<j>`` for ``j``
    in ``range(3)`` and ``tag_<i>_dump`` (a shuffled list of sentences).

    ``rc`` and ``rs`` are helpers defined elsewhere in the file -- presumably
    ``random.choice`` and ``random.sample``; TODO confirm.

    Args:
        tc: sequence of tags.
        dc: sequence parallel to ``tc``; each item is either falsy or a dict
            mapping a relation-verb key to a sequence of lemmas.  Every key
            is itself passed to ``rc``, so keys are assumed to be sequences
            of verb variants -- TODO confirm against the caller.
        cc: unused; kept so existing callers keep working.
        tr: sequence of transition phrases; one is drawn per dump sentence.

    Returns:
        dict mapping placeholder names to their replacement values.
    """
    MAX_PREDICATES = 3
    repl = {}
    for i, tag in enumerate(tc):
        repl["tag_%i" % i] = tag
        repl["tag_%i_aan" % i] = a_or_an(tag)

        predicates = dc[i]
        if not predicates:
            # Nothing is known about this tag: fill every slot with a stub.
            for j in range(MAX_PREDICATES):
                repl["tag_%i_predicate_%i" % (i, j)] = "remains unknown"
            repl["tag_%i_dump" % i] = []
            continue

        # Materialise the keys once: a dict view cannot be appended to,
        # indexed or sampled from on Python 3.
        verbs = list(predicates.keys())
        if len(verbs) < MAX_PREDICATES:
            relVerbs = list(verbs)
            lemmas = [rc(predicates[relVerb]) for relVerb in relVerbs]
            # Too few distinct verbs: reuse random ones to fill the slots.
            for _ in range(MAX_PREDICATES - len(relVerbs)):
                useAgain = rc(verbs)
                relVerbs.append(useAgain)
                lemmas.append(rc(predicates[useAgain]))
        else:
            relVerbs = rs(verbs, MAX_PREDICATES)
            lemmas = [rc(predicates[relVerb]) for relVerb in relVerbs]

        # list() so the pairs can be indexed on Python 3 as well.
        zipped = list(zip(relVerbs, lemmas))
        for j in range(MAX_PREDICATES):
            repl["tag_%i_predicate_%i" % (i, j)] = "%s %s" % (
                rc(zipped[j][0]), zipped[j][1])

        dump = []
        for rv in verbs:
            for lemma in predicates[rv]:
                dump.append("%s it %s %s." % (rc(tr), rc(rv), lemma))
        random.shuffle(dump)
        repl["tag_%i_dump" % i] = dump
    return repl
def replacementDict(tc, dc, cc, tr):
    """Build the placeholder -> text mapping for one chunk of tags.

    For every tag index ``i`` the result holds ``tag_<i>`` (the tag itself),
    ``tag_<i>_aan`` (``a_or_an(tag)``), ``tag_<i>_predicate_<j>`` for ``j``
    in ``range(3)`` and ``tag_<i>_dump`` (a shuffled list of sentences).

    ``rc`` and ``rs`` are helpers defined elsewhere in the file -- presumably
    ``random.choice`` and ``random.sample``; TODO confirm.

    Args:
        tc: sequence of tags.
        dc: sequence parallel to ``tc``; each item is either falsy or a dict
            mapping a relation verb to a sequence of lemmas.  Verb and lemma
            are joined with ``" ".join``, so both must be strings.
        cc: unused; kept so existing callers keep working.
        tr: sequence of transition phrases; one is drawn per dump sentence.

    Returns:
        dict mapping placeholder names to their replacement values.
    """
    MAX_PREDICATES = 3
    repl = {}
    for i, tag in enumerate(tc):
        repl["tag_%i" % i] = tag
        repl["tag_%i_aan" % i] = a_or_an(tag)

        predicates = dc[i]
        if not predicates:
            # Nothing is known about this tag: fill every slot with a stub.
            for j in range(MAX_PREDICATES):
                repl["tag_%i_predicate_%i" % (i, j)] = "remains unknown"
            repl["tag_%i_dump" % i] = []
            continue

        # Materialise the keys once: a dict view cannot be appended to,
        # indexed or sampled from on Python 3.
        verbs = list(predicates.keys())
        if len(verbs) < MAX_PREDICATES:
            relVerbs = list(verbs)
            lemmas = [rc(predicates[relVerb]) for relVerb in relVerbs]
            # Too few distinct verbs: reuse random ones to fill the slots.
            for _ in range(MAX_PREDICATES - len(relVerbs)):
                useAgain = rc(verbs)
                relVerbs.append(useAgain)
                lemmas.append(rc(predicates[useAgain]))
        else:
            relVerbs = rs(verbs, MAX_PREDICATES)
            lemmas = [rc(predicates[relVerb]) for relVerb in relVerbs]

        # list() so the pairs can be indexed on Python 3 as well.
        zipped = list(zip(relVerbs, lemmas))
        for j in range(MAX_PREDICATES):
            repl["tag_%i_predicate_%i" % (i, j)] = " ".join(zipped[j])

        dump = []
        for rv in verbs:
            for lemma in predicates[rv]:
                dump.append("%s it %s %s." % (rc(tr), rv, lemma))
        random.shuffle(dump)
        repl["tag_%i_dump" % i] = dump
    return repl
def chTitle(hi):
    """Derive a title from the noun phrases of a generated HTML page.

    Reads ``APPPATH + 'static/output/' + hi + '.html'``, takes the contents
    of the first ``<p>`` element, parses it with ``parsetree`` and collects
    every chunk of type ``"NP"``.  A random selection of the phrases that do
    not contain ``"&"`` is made with ``rs(..., ri(1, 2))`` (helpers defined
    elsewhere -- presumably ``random.sample`` / ``random.randint``; TODO
    confirm).  Phrases that do not already start with an article get one via
    ``a_or_an``; the phrases are joined with ``" and "`` and title-cased.

    Args:
        hi: base name (without extension) of the HTML file to read.

    Returns:
        The title encoded as ASCII, with non-ASCII characters replaced by
        XML character references.
    """
    # Context manager so the handle is released even if read() fails.
    with open(APPPATH+'static/output/'+hi+'.html', 'r') as htmlFile:
        html = htmlFile.read()

    soup = BeautifulSoup(html)
    text = "\n".join(
        [unicode(i) for i in soup.p.contents]).replace("<br/>", "\n")

    nounPhrases = [
        chunk.string
        for sentence in parsetree(text)
        for chunk in sentence.chunks
        if chunk.type == "NP"
    ]
    selectNPs = rs(
        [phrase for phrase in nounPhrases if "&" not in phrase], ri(1, 2))

    articles = ["a", "an", "the"]
    nps = [
        phrase if startsWithCheck(phrase, articles) else a_or_an(phrase)
        for phrase in selectNPs
    ]

    # Joining covers both the one-phrase and the two-phrase case and, unlike
    # the former if/elif, can never leave ``title`` unassigned.
    title = titlecase(" and ".join(nps))
    return title.encode('ascii', 'xmlcharrefreplace')
def get_random(self):
    """Return a pseudo-random integer in ``range(self.scope)``.

    A starting value is obtained from ``rs(1, self.max)`` (``rs`` is defined
    elsewhere; presumably a random-integer helper -- TODO confirm), mixed
    with its own square, shifted right by 32 bits and scaled by
    ``self.max``.  The fractional part of that ratio is what ends up in the
    result.
    """
    value = rs(1, self.max)
    # Shift off bits, discarding the sign.  Discarding the sign is
    # important because OR w/ 5 can give us + or - numbers.
    value = value + ((value * value) | 5)
    ratio = (value >> 32) / self.max
    fractional, _ = modf(ratio)
    return int(fractional * 10000000) % self.scope
def grafBuilder(exploDictsConf):
    """Render one paragraph per chunk of tags.

    ``exploDictsConf`` is unzipped into tags, per-tag dictionaries and
    confidences.  The tags are split with ``chunks(tags, MAX_GRAF_DENSITY)``;
    for each chunk a substitution dictionary is built with
    ``replacementDict``, extended with ``transition_<j>`` and
    ``tag_<j>_dump<k>`` entries, and fed to the template whose number equals
    the chunk length.

    ``rs`` is defined elsewhere (presumably ``random.sample``; TODO confirm).

    Args:
        exploDictsConf: non-empty iterable of ``(tag, dict, confidence)``
            triples.

    Returns:
        list with one rendered paragraph per chunk.
    """
    MAX_TRANSITIONS = 7
    tags, exploDicts, confs = zip(*exploDictsConf)
    grafs = []
    templates = [open_template(n) for n in range(1, 7)]

    # Context manager so the handle is closed even if reading fails.
    with open(APPPATH + 'lists/transitions.txt', 'r') as transFile:
        transitions = [l.strip() for l in transFile.readlines()]

    i = 0  # offset of the current chunk inside the parallel sequences
    for tagsChunk in chunks(tags, MAX_GRAF_DENSITY):
        chunkLen = len(tagsChunk)
        dictsChunk = exploDicts[i:i + chunkLen]
        confsChunk = confs[i:i + chunkLen]
        replDict = replacementDict(
            tagsChunk, dictsChunk, confsChunk, transitions)

        transes = rs(transitions, MAX_TRANSITIONS)
        for j in range(MAX_TRANSITIONS):
            replDict["transition_%i" % j] = transes[j]

        for j in range(chunkLen):
            dump = replDict["tag_%i_dump" % j]
            # k is 2 and 4: each variant joins k + 1 sentences from the dump
            # when it holds more than k, and is empty otherwise.
            for k in range(2, 5, 2):
                if len(dump) > k:
                    replDict["tag_%i_dump%i" % (j, k)] = " ".join(
                        rs(dump, k + 1))
                else:
                    replDict["tag_%i_dump%i" % (j, k)] = ""

        # Template n (1-based) is the one used for chunks of n tags.
        grafs.append(templates[chunkLen - 1].substitute(**replDict))
        i += chunkLen
    return grafs
def grafBuilder(exploDictsConf):
    """Render one paragraph per chunk of tags.

    ``exploDictsConf`` is unzipped into tags, per-tag dictionaries and
    confidences.  The tags are split with ``chunks(tags, MAX_GRAF_DENSITY)``;
    for each chunk a substitution dictionary is built with
    ``replacementDict``, extended with ``transition_<j>`` and
    ``tag_<j>_dump<k>`` entries, and fed to the template whose number equals
    the chunk length.

    ``rs`` is defined elsewhere (presumably ``random.sample``; TODO confirm).

    Args:
        exploDictsConf: non-empty iterable of ``(tag, dict, confidence)``
            triples.

    Returns:
        list with one rendered paragraph per chunk.
    """
    MAX_TRANSITIONS = 7
    tags, exploDicts, confs = zip(*exploDictsConf)
    grafs = []
    templates = [open_template(n) for n in range(1, 7)]

    # Context manager so the handle is closed even if reading fails.
    with open(APPPATH+'lists/transitions.txt', 'r') as transFile:
        transitions = [l.strip() for l in transFile.readlines()]

    i = 0  # offset of the current chunk inside the parallel sequences
    for tagsChunk in chunks(tags, MAX_GRAF_DENSITY):
        chunkLen = len(tagsChunk)
        dictsChunk = exploDicts[i:i+chunkLen]
        confsChunk = confs[i:i+chunkLen]
        replDict = replacementDict(
            tagsChunk, dictsChunk, confsChunk, transitions)

        transes = rs(transitions, MAX_TRANSITIONS)
        for j in range(MAX_TRANSITIONS):
            replDict["transition_%i" % j] = transes[j]

        for j in range(chunkLen):
            dump = replDict["tag_%i_dump" % j]
            # k is 2 and 4: each variant joins k + 1 sentences from the dump
            # when it holds more than k, and is empty otherwise.
            for k in range(2, 5, 2):
                if len(dump) > k:
                    replDict["tag_%i_dump%i" % (j, k)] = " ".join(
                        rs(dump, k+1))
                else:
                    replDict["tag_%i_dump%i" % (j, k)] = ""

        # Template n (1-based) is the one used for chunks of n tags.
        grafs.append(templates[chunkLen-1].substitute(**replDict))
        i += chunkLen
    return grafs
def random_search_lgbm(self, param, n_iterations, X, y):
    '''Select the best parameters for the lgbm model.

    Runs ``n_iterations`` rounds.  Each round draws one value per
    hyper-parameter with ``rs(values, 1)[0]`` (``rs`` is defined elsewhere;
    presumably ``random.sample`` -- TODO confirm) and scores an
    ``LGBMClassifier`` with 10-fold stratified cross-validation on F1.

    The estimator is handed to ``cross_val_score`` unfitted: it is cloned
    and fitted per fold there, so a separate fit on the full data would
    only cost time.

    Args:
        param: dict mapping each of ``num_leaves``, ``min_data_in_leaf``,
            ``learning_rate``, ``n_estimators``, ``max_depth``,
            ``colsample_bytree`` and ``min_child_weight`` to a sequence of
            candidate values.
        n_iterations: number of random configurations to evaluate.
        X: training features.
        y: training labels.

    Returns:
        DataFrame with columns ``'mean f1-score'``, ``'std'`` and
        ``'parameters'`` holding at most the ten best rounds, best first.
    '''
    columns = ['mean f1-score', 'std', 'parameters']
    # Rows are collected and turned into a frame once; concatenating inside
    # the loop re-copies every earlier row and repeats index label 0.
    rows = []

    for _ in range(n_iterations):
        # choose values for parameters randomly
        hp = {k: rs(v, 1)[0] for k, v in param.items()}

        model_random_search = LGBMClassifier(
            objective='binary',
            num_leaves=hp['num_leaves'],
            min_data_in_leaf=hp['min_data_in_leaf'],
            learning_rate=hp['learning_rate'],
            n_estimators=hp['n_estimators'],
            max_depth=hp['max_depth'],
            colsample_bytree=hp['colsample_bytree'],
            min_child_weight=hp['min_child_weight'],
            random_state=42,
            n_jobs=-1)

        # define CV strategy
        sk_fold = StratifiedKFold(n_splits=10, random_state=None)

        # calculate cross validation
        cv_scores = cross_val_score(model_random_search, X, y, cv=sk_fold,
                                    scoring='f1', n_jobs=-1)

        rows.append([mean(cv_scores), std(cv_scores), hp])

    final_result = pd.DataFrame(rows, columns=columns)
    return final_result.sort_values(
        'mean f1-score', ascending=False).head(10)
def process_captions(self, ch, method, properties, body):
    """Handle one captioned-image message and publish its captions.

    ``body`` has the form ``"<img_hash>#<caption>,<caption>,..."``.
    Duplicate and blank captions are dropped.  Unless ``self.manual`` is
    set, at most ``self.sentence_count`` captions are kept, chosen with
    ``rs`` (defined elsewhere; presumably ``random.sample`` -- TODO
    confirm).  The captions that were not kept are stored in
    ``self.unused_captions`` and their per-sentence share in
    ``self.unused_captions_per_graf``.  Each kept caption gets its first
    character upper-cased and is published to the ``CaptionToExpand``
    queue as ``img_hash + '#' + self.pre_seed + caption``.

    Args:
        ch, method, properties: not used in this body; the signature looks
            like a message-consumer callback -- TODO confirm.
        body: message payload as described above.
    """
    img_hash, csv = body.split('#', 1)
    # Single-argument form prints the same text on Python 2 and 3.
    print("%s CAPTIONED" % img_hash)

    # Blank entries (e.g. body "hash#" or "a,,b") used to crash the
    # capitalisation step with IndexError; they carry no text, so skip them.
    captions_raw = list(set(c for c in csv.split(',') if c))

    if self.manual or len(captions_raw) <= self.sentence_count:
        captions_cut = captions_raw
    else:
        captions_cut = rs(captions_raw, self.sentence_count)

    self.unused_captions = list(set(captions_raw) - set(captions_cut))
    # Floor division keeps the integer result on Python 2 and 3 alike.
    self.unused_captions_per_graf = (
        len(self.unused_captions) // self.sentence_count)

    captions = [c[0].upper() + c[1:] for c in captions_cut]

    approved_captions = []
    for c in captions:
        approved = True
        if self.manual:
            # NOTE(review): ``>`` lets sentence_count + 1 captions through;
            # confirm whether ``>=`` was intended.
            if len(approved_captions) > self.sentence_count:
                approved = False
            else:
                approved = self.approve(c)
        if approved:
            approved_captions.append(c)

    for c in approved_captions:
        self.channel.basic_publish(
            exchange='',
            routing_key='CaptionToExpand',
            body=img_hash + '#' + self.pre_seed + c
        )
def peaks_from_info(bam_fileobj, wiggle, pos_counts, lengths, loc,
                    gene_length, margin=25, fdr_alpha=0.05,
                    binom_alpha=0.001, method="Randomization",
                    user_threshold=None, minreads=20, poisson_cutoff=0.05,
                    plotit=False, width_cutoff=10, windowsize=1000,
                    SloP=False, correct_p=False, max_width=None,
                    min_width=None, max_gap=None, algorithm="spline"):
    """Call peaks for an individual gene.

    The coverage track is split into sections with ``find_sections``; every
    section with enough reads whose coverage reaches the threshold is
    handed to the selected fitter, and each region it reports is filtered
    and stored as a ``Peak``.

    Args:
        bam_fileobj: object whose ``count(chrom, start=, end=)`` gives the
            number of reads in a genomic interval.
        wiggle: per-position coverage of the gene (converted from the bam
            file).
        pos_counts: one point per read instead of coverage of entire read;
            indexed relative to the gene start.
        lengths: lengths of the aligned portions of reads.
        loc: ``(chrom, gene_name, tx_start, tx_end, strand)``.
        gene_length: effective length of gene.
        margin: space between sections for calling new peaks.
        fdr_alpha: false discovery rate passed to ``get_FDR_cutoff_mean``.
        binom_alpha: alpha passed to ``get_Binom_cutoff`` when
            ``method == "Binomial"``.
        method: ``"Binomial"`` selects the binomial cutoff; anything else
            uses the FDR cutoff.
        user_threshold: user defined threshold; overrides the computed one.
        minreads: min reads in a section / peak to try and call peaks.
        poisson_cutoff: accepted for compatibility; not used in this body.
        plotit: makes figures, then waits for one character on stdin.
        width_cutoff: peaks narrower than this are discarded.
        windowsize: for the super local calculation, distance left and
            right of the peak center to look.
        SloP: super local p-value instead of gene-wide p-value.
        correct_p: boolean, Bonferroni correction of the stored p-values.
        max_width, min_width, max_gap: passed to the ``Classic`` fitter.
        algorithm: ``"spline"``, ``"gaussian"`` or ``"classic"``.

    Returns:
        dict with keys ``clusters`` (list of ``Peak``), ``sections``
        (per-section bookkeeping), ``nreads``, ``threshold``, ``loc`` and
        ``Nclusters``.

    Raises:
        TypeError: if the gene-wide threshold is not an ``int``.
        ValueError: if a section has to be fitted and ``algorithm`` is not
            one of the supported names.
    """
    peak_dict = {}

    # data munging
    chrom, gene_name, tx_start, tx_end, strand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    # used for the gene-wide poisson calculation
    nreads_in_gene = sum(pos_counts)

    # decide which threshold applies to the whole gene
    if user_threshold is None:
        if method == "Binomial":
            # Uses Binomial Distribution to get cutoff if specified by user
            gene_threshold = get_Binom_cutoff(lengths, gene_length,
                                              binom_alpha)
        else:
            gene_threshold = get_FDR_cutoff_mean(lengths, gene_length,
                                                 alpha=fdr_alpha)
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold

    if not isinstance(gene_threshold, int):
        raise TypeError

    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peak_number = 1

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]

        # this cts is alright because we know the reads are bounded
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        section_info = peak_dict['sections'][sect] = {}
        section_info['nreads'] = int(Nreads)

        # makes sure there are enough reads
        if Nreads < minreads:
            logging.info("""%d is not enough reads, skipping section: %s"""
                         % (Nreads, sect))
            section_info['tried'] = False
            continue
        logging.info("""Analyzing section %s with %d reads"""
                     % (sect, Nreads))

        if user_threshold is None:
            if SloP:
                # gets random subset of lengths of reads for calculations on
                # a section; not exactly the right way to do this but it
                # should be very close.
                sect_read_lengths = rs(lengths, Nreads)

                # use the minimum FDR cutoff between superlocal and
                # gene-wide calculations
                threshold = min(gene_threshold,
                                get_FDR_cutoff_mean(sect_read_lengths,
                                                    sect_length,
                                                    alpha=fdr_alpha))
                logging.info("Using super-local threshold %d" % (threshold))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        # saves threshold for each individual section
        section_info['threshold'] = threshold
        section_info['tried'] = True
        section_info['nPeaks'] = 0

        if max(data) < threshold:
            logging.info("data does not excede threshold, stopping")
            continue

        if algorithm == "spline":
            initial_smoothing_value = (sectstop - sectstart + 1)
            fitter = SmoothingSpline(
                xvals, data, initial_smoothing_value,
                lossFunction="get_norm_penalized_residuals")
        elif algorithm == "gaussian":
            fitter = GaussMix(xvals, data)
        elif algorithm == "classic":
            fitter = Classic(xvals, data, max_width, min_width, max_gap)
        else:
            # Previously fell through and failed later on an unbound name.
            raise ValueError("unknown peak calling algorithm: %r"
                             % (algorithm,))

        try:
            peak_definitions = fitter.peaks(threshold, plotit)
        except Exception:
            logging.error(gene_name)
            raise  # bare raise keeps the original traceback

        # subsections that are above threshold
        # peak center is actually the location where we think binding should
        # occur, not the average of start and stop
        for peak_start, peak_stop, peak_center in peak_definitions:
            genomic_start = tx_start + sectstart + peak_start
            genomic_stop = tx_start + sectstart + peak_stop

            number_reads_in_peak = bam_fileobj.count(chrom,
                                                     start=genomic_start,
                                                     end=genomic_stop)
            logging.info("""Peak %d (%d - %d) has %d reads"""
                         % (peak_number, peak_start, (peak_stop + 1),
                            number_reads_in_peak))

            # makes sure there enough reads
            if (number_reads_in_peak < minreads or
                    max(data[peak_start:(peak_stop + 1)]) < threshold):
                logging.info(
                    """skipping genomic_center, %d is not enough reads"""
                    % (number_reads_in_peak))
                continue

            # highest point in start stop
            genomic_center = tx_start + sectstart + peak_center

            # makes it thicker so we can see on the browser, clamped to the
            # peak so bed files do not break
            thick_start = genomic_center - 2
            thick_stop = genomic_center + 2
            if thick_start < genomic_start:
                thick_start = genomic_start
            if thick_stop > genomic_stop:
                thick_stop = genomic_stop

            peak_length = genomic_stop - genomic_start + 1

            # skip really small peaks
            if peak_length < width_cutoff:
                continue

            # super local logic: window around the center, in coordinates
            # relative to the gene start and clamped to the gene
            if genomic_center - tx_start - windowsize < 0:
                area_start = 0
            else:
                area_start = genomic_center - tx_start - windowsize

            if genomic_center + windowsize > tx_end:
                # Distance from gene start to gene stop.  This used to be
                # tx_start - tx_end + 1, which is negative and produced a
                # wrong slice and a negative area size.
                area_stop = tx_end - tx_start + 1
            else:
                area_stop = genomic_center - tx_start + windowsize

            # use area reads:
            area_reads = sum(pos_counts[area_start:area_stop])
            area_size = area_stop - area_start + 1

            # calculates poisson based of whole gene vs genomic_center
            if algorithm == "classic" and peak_length < min_width:
                peak_length = min_width

            gene_pois_p = poissonP(nreads_in_gene, number_reads_in_peak,
                                   gene_length, peak_length)
            if SloP is True:
                # same thing except for based on super local p-value
                slop_pois_p = poissonP(area_reads, number_reads_in_peak,
                                       area_size, peak_length)
            else:
                # keeps slop_pois_p defined when SloP is off
                slop_pois_p = gene_pois_p

            if math.isnan(slop_pois_p):
                slop_pois_p = 1

            # TODO This should be abstracted out: separate model from view
            peak_dict['clusters'].append(
                Peak(chrom,
                     genomic_start,
                     genomic_stop,
                     gene_name,  # need this is a unique id for later analysis
                     slop_pois_p,
                     strand,
                     thick_start,
                     thick_stop,
                     peak_number,
                     number_reads_in_peak,
                     gene_pois_p,
                     peak_length,
                     0))

            peak_number += 1
            section_info['nPeaks'] += 1

    # inflate p-values based on # of comparisons (bonferroni)
    # NOTE(review): peak_number starts at 1, so both the correction factor
    # and 'Nclusters' are one more than the number of stored peaks; confirm
    # that callers expect this.
    if correct_p is True:
        for genomic_center in peak_dict['clusters']:
            genomic_center.p = genomic_center.p * peak_number

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        sys.stdin.read(1)  # block until a character arrives on stdin
    return peak_dict
def make_chars(self):
    """Create all characters, their friendships and source material.

    User-defined characters come from ``args.charnames`` (``First_Last``
    with an underscore); ``CC`` further characters get random names.  Every
    character then receives friends, an introduction and ``TSV`` setting
    scenes, ``DTV`` drug trips, ``SCP`` scp reports and ``GGV`` gberg
    excerpts, each drawn from material selected up front.  The result is
    stored in ``self.characters``.

    ``ri``, ``rc`` and ``rs`` are defined elsewhere (presumably
    ``random.randint`` / ``random.choice`` / ``random.sample``; TODO
    confirm).
    """
    # establish gender ratio
    charGenders = [ri(0, 1) for _ in range(CC)]

    # initialize list of characters
    chars = []

    # add user defined characters
    for firstlast in args.charnames:
        fl_list = firstlast.split('_')  # Note that split is an underscore!
        chars.append(Character(fl_list[0], fl_list[1]))

    # add generated characters
    for b in charGenders:
        if b:
            chars.append(Character(rc(fFirstNames), rc(surnames)))
        else:
            chars.append(Character(rc(mFirstNames), rc(surnames)))

    # establish the material each character draws from
    introScenePaths = rs(characterTropeFiles, len(chars))
    settings = rs(settingTropeFiles, len(chars)*TSV)
    trips = rs(erowidExpPaths, len(chars)*DTV)
    scps = rs(scpPaths, len(chars)*SCP)
    # list(): a dict view is not a sequence and cannot be sampled from on
    # Python 3.
    gbergs = rs(list(gPaths.values()), len(chars)*GGV)

    for idx, c in enumerate(chars):
        # make friends; a lone character has nobody to befriend, and
        # ri(1, 0) would be an empty range.
        if len(chars) > 1:
            c.friends += rs(chars, ri(1, len(chars)-1))
        if c in c.friends:
            c.friends.remove(c)

        # add introduction description
        c.introDesc = self.personal_trope([c], introScenePaths[idx])

        # Character idx owns the idx-th slice of each material list.
        # add setting scenes
        for k in range(TSV):
            c.scenes.append(
                self.personal_trope([c]+c.friends, settings[idx*TSV + k]))

        # add drug trip scenes
        for n in range(DTV):
            c.drugTrips.append(
                self.personal_trip([c]+c.friends, trips[idx*DTV + n]))

        # add scp articles
        for q in range(SCP):
            c.scpReports.append(
                self.personal_scp([c]+c.friends, scps[idx*SCP + q]))

        # add gberg excerpts
        for t in range(GGV):
            c.gbergExcerpts.append(
                self.personal_gberg([c]+c.friends, gbergs[idx*GGV + t]))

    self.characters = chars
def shuffle(self) -> List[int]:
    """
    Pass the stored list of numbers through ``rs`` and return that list.

    ``rs`` is defined elsewhere (presumably ``random.shuffle``, which
    reorders its argument in place -- TODO confirm).  The list object that
    is returned is the one held in ``self.rand``, not a copy.
    """
    numbers = self.rand
    rs(numbers)
    return numbers
current_game = None nextgame_counter = 10 return ret = current_game.step() if ret in ["draw", "win"]: status = ret if __name__=="__main__": # fix bot permissions os.system("chmod u+x bots/*") # create games. every bot plays agains every bot, including itself and every game is played two times with different colors games = [] for g in itertools.product([i for i in os.listdir("bots") if i[0]!="."], repeat = 2): games.append(game.Game([game.Bot(g[0].split(".")[0], "bots/"+g[0]), game.Bot(g[1].split(".")[0], "bots/"+g[1])])) rs(games) # debug: drop a is b games games = [i for i in games if i.white.name != i.black.name] status = "running" current_game = None nextgame_counter = NEXT_GAME_DELAY task.LoopingCall(step).start(GAME_STEP_INTERVAL, True) reactor.listenTCP(8080, server.Site(ChessGameServer())) reactor.run()
from random import shuffle as rs NOVICE = [ 'r', 'r', 'r', 'r', 'r', 'r', 'b', 'b', 'b', 'b', 'b', 'b', 'w', 'w', 'e' ] APPRENTICE = [ 'r', 'r', 'r', 'r', 'r', 'r', 'b', 'b', 'b', 'b', 'w', 'w', 'y', 'y', 'e' ] EXPERT = [ 'r', 'r', 'r', 'r', 'b', 'b', 'b', 'b', 'w', 'w', 'y', 'y', 'g', 'g', 'e' ] MASTER = [ 'r', 'r', 'r', 'r', 'b', 'b', 'w', 'w', 'y', 'y', 'g', 'g', 'p', 'p', 'e' ] rs(NOVICE) rs(APPRENTICE) rs(EXPERT) rs(MASTER) with open("input1.txt", "w") as f: for i in range(50): rs(NOVICE) for char in NOVICE: f.write(char + " ") f.write("\n") with open("input2.txt", "w") as f: for i in range(50): rs(APPRENTICE) for char in APPRENTICE:
def peaks_from_info(wiggle, pos_counts, lengths, loc, gene_length,
                    margin=25, fdr_alpha=0.05, user_threshold=None,
                    minreads=20, poisson_cutoff=0.05, plotit=False,
                    width_cutoff=10, windowsize=1000, SloP=False,
                    correct_p=False):
    """Call peaks for an individual gene by fitting smoothing splines.

    The coverage track is split into sections with ``find_sections``.  For
    every section with enough reads whose coverage reaches the threshold a
    smoothing spline is fitted; each region of the rounded spline above the
    threshold yields a peak, which is filtered and stored as a bed line.

    Args:
        wiggle: per-position coverage of the gene (converted from the bam
            file).
        pos_counts: one point per read instead of coverage of entire read;
            indexed relative to the gene start.
        lengths: lengths of the aligned portions of reads.
        loc: ``(chrom, gene_name, tx_start, tx_end, strand)``.
        gene_length: effective length of gene.
        margin: space between sections for calling new peaks.
        fdr_alpha: false discovery rate passed to ``get_FDR_cutoff_mean``.
        user_threshold: user defined threshold; overrides the computed one.
        minreads: min reads in a section / peak to try and call peaks.
        poisson_cutoff: accepted for compatibility; it does not filter
            anything in this body.
        plotit: makes figures.
        width_cutoff: peaks narrower than this are discarded.
        windowsize: for the super local calculation, distance left and
            right of the peak to look.
        SloP: super local p-value instead of gene-wide p-value.
        correct_p: boolean, Bonferroni correction of p-values (see the
            review note at the end of the body).

    Returns:
        dict with keys ``clusters`` (bed line -> ``GeneP`` / ``SloP`` /
        ``Nreads`` / ``size``), ``sections`` (per-section threshold and read
        count), ``nreads``, ``threshold``, ``loc`` and ``Nclusters``.
    """
    peak_dict = {}

    # data munging
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    # used for the gene-wide poisson calculation
    nreads_in_gene = sum(pos_counts)

    # decides FDR calculation
    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths, gene_length,
                                             alpha=fdr_alpha)
    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        # NOTE(review): this value is overwritten for every section before
        # it is read, so the fallback has no effect; confirm whether
        # gene_threshold was meant to be replaced instead.
        threshold = 50

    peak_dict['clusters'] = {}
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peakn = 1

    def _record_peak(rel_start, rel_stop, rel_summit, sectstart, cts, data,
                     threshold, peakn):
        """Filter one candidate peak and store its bed line.

        The three ``rel_*`` positions are relative to the section start.
        Returns True when the peak was stored, False when it was skipped.
        """
        n_reads_in_peak = sum(cts[rel_start:(rel_stop + 1)])

        # makes sure there enough reads
        if (n_reads_in_peak < minreads or
                max(data[rel_start:(rel_stop + 1)]) < threshold):
            return False

        # start and stop for bed track to be created
        g_start = tx_start + sectstart + rel_start
        g_stop = tx_start + sectstart + rel_stop
        peak_length = g_stop - g_start + 1

        # skip really small peaks
        if peak_length < width_cutoff:
            return False

        # highest point in start stop
        peak = tx_start + sectstart + rel_summit

        # makes it thicker so we can see on the browser, clamped to the
        # peak so bed files do not break
        thick_start = peak - 2
        thick_stop = peak + 2
        if thick_start < g_start:
            thick_start = g_start
        if thick_stop > g_stop:
            thick_stop = g_stop

        peak_name = "%s_%s_%s" % (gene_name, peakn, int(n_reads_in_peak))

        # super local logic: window around the peak, in coordinates
        # relative to the gene start and clamped to the gene
        if peak - tx_start - windowsize < 0:
            area_start = 0
        else:
            area_start = peak - tx_start - windowsize

        if peak + windowsize > tx_end:
            # Distance from gene start to gene stop.  This used to be
            # tx_start - tx_end + 1, which is negative and produced a wrong
            # slice and a negative area size.
            area_stop = tx_end - tx_start + 1
        else:
            area_stop = peak - tx_start + windowsize

        # use area reads:
        area_reads = sum(pos_counts[area_start:area_stop])
        area_size = area_stop - area_start + 1

        # calculates poisson based of whole gene vs peak
        gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length,
                               peak_length)
        if SloP is True:
            # same thing except for based on super local p-value
            slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size,
                                   peak_length)
        else:
            # keeps slop_pois_p defined when SloP is off
            slop_pois_p = gene_pois_p

        if math.isnan(slop_pois_p):
            slop_pois_p = 1

        # defines the bedline of a peak for returning
        # TODO This should be abstracted out: separate model from view
        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
            chrom, g_start, g_stop, peak_name, slop_pois_p, signstrand,
            thick_start, thick_stop)

        # metadata for the specific bedline
        peak_dict['clusters'][bedline] = {
            'GeneP': gene_pois_p,
            'SloP': slop_pois_p,
            'Nreads': n_reads_in_peak,
            'size': peak_length,
        }
        return True

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        peak_dict['sections'][sect] = {}

        # makes sure there are enough reads
        if Nreads < minreads:
            continue

        # sets super-local if requested
        if user_threshold is None:
            if SloP is True:
                # gets random subset of lengths of reads for calculations on
                # a section; not exactly the right way to do this but it
                # should be very close.  Drawn only here, where it is used.
                sect_read_lengths = rs(lengths, Nreads)

                # use the minimum FDR cutoff between superlocal and
                # gene-wide calculations
                threshold = min(
                    gene_threshold,
                    get_FDR_cutoff_mean(sect_read_lengths, sect_length,
                                        alpha=fdr_alpha))
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        # saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        # if wiggle track never exceeds threshold
        if max(data) < threshold:
            continue

        # fitting splines logic, black magic
        try:
            degree = 3  # cubic spline
            weights = None

            # for very large windows with many reads a large smoothing
            # parameter is required.  test several different options to
            # determine a reasonable initial estimate

            # step 1, identify good initial value
            initial_smoothing_value = (sectstop - sectstart + 1)
            best_smoothing_value = initial_smoothing_value

            # step 2, refine so as not to run into local minima later
            best_error = find_spline_residuals(initial_smoothing_value,
                                               xvals, data, degree, weights)
            for i in range(2, 11):
                cur_smoothing_value = initial_smoothing_value * i
                cur_error = find_spline_residuals(cur_smoothing_value,
                                                  xvals, data, degree,
                                                  weights)
                # NOTE(review): best_error is never updated here, so the
                # last candidate that beats the initial value wins rather
                # than the best one; left unchanged because it alters the
                # fit -- confirm the intent.
                if cur_error < best_error:
                    best_smoothing_value = cur_smoothing_value

            try:
                # fine optimization of smoothing parameter
                cutoff = float(0)
                tries = 0

                # shouldn't get smoothing coef's this small.. increase
                # the initial estimate and try again. WARNING: BLACK MAGIC
                while cutoff < 5:
                    tries += 1

                    # increasing this may improve accuracy,
                    # but at the cost of running time.
                    if tries == 3:
                        break

                    spline = optimize.minimize(
                        find_spline_residuals,
                        best_smoothing_value,
                        args=(xvals, data, degree, weights),
                        options={
                            'disp': False,
                            'maxiter': 10,
                        },
                        # method="Powell", # old method
                        method="L-BFGS-B",  # abnormal termination sometimes
                        # method="COBYLA",
                        bounds=((.1, None), ),
                    )

                    # keep the optimised parameter only when the optimiser
                    # reports success
                    if spline.success:
                        cutoff = spline.x

                    best_smoothing_value += sect_length
            except Exception as err:
                # sys.stderr.write emits the same text on Python 2 and 3.
                sys.stderr.write("best smoothing value is: %s\n"
                                 % (best_smoothing_value,))
                sys.stderr.write(
                    "%s failed spline fitting at section %s (major crash)\n"
                    % (loc, sect))
                sys.stderr.write("%s\n" % (err,))
                continue

            # final fit spline
            spline = find_univariate_spline(cutoff, xvals, data, degree,
                                            weights)
            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(
                threshold, spline_values)

            # walks along spline, and calls peaks along spline
            # subsections that are above threshold
            for p_start, p_stop in starts_and_stops:
                # peaks with-in this subsection, indexed from section
                # (not subsection) start: all local maxima
                peaks = [
                    x + p_start
                    for x in xvals[find_local_maxima(
                        spline_values[p_start:(p_stop + 1)])]
                ]

                # there should be one or zero peaks in every section
                assert len(peaks) in (0, 1)

                if len(peaks) <= 0:
                    continue

                if len(peaks) == 1:
                    if _record_peak(p_start, p_stop, peaks[0], sectstart,
                                    cts, data, threshold, peakn):
                        peakn += 1
                else:
                    # there are more than one peaks in this window; only
                    # reachable when assertions are disabled
                    # NO LONGER NECESSARY SHOULD REMOVE

                    # local minima in subsection, relative to section start
                    valleys = array([
                        x + p_start
                        for x in xvals[diff(sign(diff(
                            spline(xvals[p_start:p_stop + 1])))) > 0]
                    ])

                    for subpeak in peaks:
                        # a subpeak runs from the nearest valley (or region
                        # start) on its left to the nearest valley (or
                        # region stop) on its right
                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]

                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]

                        if _record_peak(subpeak_start, subpeak_stop,
                                        subpeak, sectstart, cts, data,
                                        threshold, peakn):
                            peakn += 1
        except NameError as err:
            sys.stderr.write("%s\n" % (err,))
            # (loc,) because loc is itself a tuple; "%s" % (loc) raised
            # TypeError here and hid the real error.
            sys.stderr.write("spline fitting failed for %s\n" % (loc,))
            raise

    # inflate p-values based on # of comparisons (bonferroni)
    # NOTE(review): no 'p' entry is ever stored for a cluster (only 'GeneP'
    # and 'SloP'), so this raises KeyError as soon as one cluster exists;
    # confirm which p-value should be corrected.  peakn is also one more
    # than the number of stored peaks.
    if correct_p is True:
        for peak in peak_dict['clusters']:
            peak_dict['clusters'][peak]['p'] = peak_dict['clusters'][peak][
                'p'] * peakn
    peak_dict['Nclusters'] = peakn

    return peak_dict
def peaks_from_info(
    wiggle,
    pos_counts,
    lengths,
    loc,
    gene_length,
    margin=25,
    fdr_alpha=0.05,
    user_threshold=None,
    minreads=20,
    poisson_cutoff=0.05,
    plotit=False,
    width_cutoff=10,
    windowsize=1000,
    SloP=False,
    correct_p=False,
):
    """Call peaks for a single gene and return them with per-section metadata.

    The gene is cut into sections with ``find_sections``; each section with
    enough reads and enough coverage gets a smoothing spline fitted to it, the
    regions where the rounded spline is above the threshold are located, and
    every local maximum in such a region is scored with ``poissonP`` and stored
    under its BED line.

    Parameters
    ----------
    wiggle : per-position coverage track of the gene (per the original notes,
        converted from a BAM file); sliced with gene-relative indices.
    pos_counts : per-position read counts, "one point per read instead of
        coverage of entire read"; indexed like ``wiggle``.
    lengths : lengths of the aligned portions of the reads.
    loc : sequence ``(chrom, gene_name, tx_start, tx_end, signstrand)``;
        ``tx_start`` / ``tx_end`` are converted with ``int``.
    gene_length : effective length of the gene, passed to
        ``get_FDR_cutoff_mean`` and ``poissonP``.
    margin : passed to ``find_sections`` (space between sections).
    fdr_alpha : passed as ``alpha`` to ``get_FDR_cutoff_mean``.
    user_threshold : when not ``None``, used as the height threshold instead of
        the value computed by ``get_FDR_cutoff_mean``.
    minreads : minimum number of reads a section, and later a peak, must hold.
    poisson_cutoff : currently unused; peaks are deliberately NOT filtered on
        their p-value here so that a later multiple-testing correction still
        sees all of them.
    plotit : when ``True``, ``plot_sections`` / ``plot_spline`` are called.
    width_cutoff : peaks narrower than this are discarded.
    windowsize : distance to look left and right of the peak summit for the
        super-local p-value.
    SloP : when ``True``, thresholds are computed per section and the reported
        p-value is the super-local one; otherwise the gene-wide one is used.
    correct_p : when ``True``, a ``"p"`` entry holding the reported p-value
        multiplied by ``peakn`` is added to every cluster.

    Returns
    -------
    dict with keys ``"clusters"`` (BED line -> ``GeneP``/``SloP``/``Nreads``/
    ``size``), ``"sections"`` (section -> ``threshold``/``nreads``; empty dict
    for sections skipped for too few reads), ``"nreads"``, ``"threshold"``,
    ``"loc"`` and ``"Nclusters"``.

    Raises
    ------
    NameError is logged to stderr and re-raised; any exception raised while
    optimising the smoothing parameter is logged and the section is skipped.
    AssertionError is raised (unless asserts are disabled) when a region above
    threshold holds more than one local maximum.
    """
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = int(tx_start), int(tx_end)

    # Total reads in the gene: background for the gene-wide Poisson test.
    nreads_in_gene = sum(pos_counts)

    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths, gene_length, alpha=fdr_alpha)
    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        # Fallback threshold of 50. The original assigned this to `threshold`,
        # which is overwritten before it is ever read, so the fallback never
        # took effect.
        # NOTE(review): presumably "best_error" is the failure sentinel of
        # get_FDR_cutoff_mean -- confirm against that function.
        gene_threshold = 50

    peak_dict = {
        "clusters": {},
        "sections": {},
        "nreads": int(nreads_in_gene),
        "threshold": gene_threshold,
        "loc": loc,
    }
    peakn = 1  # 1-based number used in the name of the next recorded peak

    def record_peak(peak_number, g_start, g_stop, peak, n_reads_in_peak, peak_length):
        """Score one peak and store it in peak_dict["clusters"] under its BED line."""
        # Thick region of +/- 2 around the summit, clamped to the peak itself.
        thick_start = max(peak - 2, g_start)
        thick_stop = min(peak + 2, g_stop)
        peak_name = "%s_%s_%s" % (gene_name, peak_number, int(n_reads_in_peak))

        # Super-local window around the summit in gene-relative coordinates,
        # clamped to the gene.
        area_start = max(peak - tx_start - windowsize, 0)
        if peak + windowsize > tx_end:
            # Gene-relative end of the gene. The original had the operands
            # swapped (tx_start - tx_end + 1), which yields a negative bound.
            area_stop = tx_end - tx_start + 1
        else:
            area_stop = peak - tx_start + windowsize
        area_reads = sum(pos_counts[area_start:area_stop])
        area_size = area_stop - area_start + 1

        gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length)
        if SloP is True:
            slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length)
        else:
            # Keep SloP defined even when the super-local test is not requested.
            slop_pois_p = gene_pois_p
        if math.isnan(slop_pois_p):
            slop_pois_p = 1

        # Peaks with slop_pois_p > poisson_cutoff are intentionally kept.
        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
            chrom,
            g_start,
            g_stop,
            peak_name,
            slop_pois_p,
            signstrand,
            thick_start,
            thick_stop,
        )
        peak_dict["clusters"][bedline] = {
            "GeneP": gene_pois_p,
            "SloP": slop_pois_p,
            "Nreads": n_reads_in_peak,
            "size": peak_length,
        }

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        # Subset of read lengths standing in for the reads of this section
        # (approximation kept from the original). Drawn before the read-count
        # check, exactly as before.
        # NOTE(review): rs is presumably random.sample -- confirm.
        sect_read_lengths = rs(lengths, Nreads)
        peak_dict["sections"][sect] = {}

        if Nreads < minreads:
            continue

        # Height threshold for this section: super-local if requested.
        if user_threshold is None:
            if SloP is True:
                threshold = get_FDR_cutoff_mean(sect_read_lengths, sect_length, alpha=fdr_alpha)
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        peak_dict["sections"][sect]["threshold"] = threshold
        peak_dict["sections"][sect]["nreads"] = int(Nreads)

        # Coverage never reaches the threshold: nothing to call here.
        if max(data) < threshold:
            continue

        try:
            degree = 3  # cubic spline
            weights = None

            # Step 1: coarse search for a starting smoothing value among
            # (section length) * 1..10, keeping the one with the lowest
            # residual. The original never updated best_error, so it kept the
            # LAST multiplier that beat the initial residual, not the best.
            best_smoothing_value = sect_length
            best_error = find_spline_residuals(sect_length, xvals, data, degree, weights)
            for i in range(2, 11):
                cur_smoothing_value = sect_length * i
                cur_error = find_spline_residuals(cur_smoothing_value, xvals, data, degree, weights)
                if cur_error < best_error:
                    best_error = cur_error
                    best_smoothing_value = cur_smoothing_value

            try:
                # Step 2: fine optimisation. Smoothing values below 5 are
                # treated as implausible: the starting estimate is increased
                # and the fit retried. The `tries == 3` guard allows at most
                # two optimiser runs.
                cutoff = float(0)
                tries = 0
                while cutoff < 5:
                    tries += 1
                    if tries == 3:
                        break
                    spline = optimize.minimize(
                        find_spline_residuals,
                        best_smoothing_value,
                        args=(xvals, data, degree, weights),
                        options={"disp": False, "maxiter": 10},
                        method="L-BFGS-B",
                        bounds=((0.1, None),),
                    )
                    if spline.success:
                        cutoff = spline.x
                    best_smoothing_value += sect_length
            except Exception as exc:
                # Deliberate best-effort: report and move on to the next section.
                sys.stdout.write("best smoothing value is: %s\n" % (best_smoothing_value,))
                sys.stderr.write("%s failed spline fitting at section %s (major crash)\n" % (loc, sect))
                sys.stderr.write("%s\n" % (exc,))
                continue

            # Final fit with the chosen smoothing value.
            spline = find_univariate_spline(cutoff, xvals, data, degree, weights)
            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(threshold, spline_values)

            # Each (p_start, p_stop) is a stretch above threshold, indexed
            # from the section start.
            for p_start, p_stop in starts_and_stops:
                # Local maxima of the spline inside the stretch.
                peaks = [x + p_start for x in xvals[find_local_maxima(spline_values[p_start:(p_stop + 1)])]]

                # There should be one or zero peaks in every stretch.
                assert len(peaks) in (0, 1)

                if len(peaks) <= 0:
                    continue

                if len(peaks) == 1:
                    n_reads_in_peak = sum(cts[p_start:(p_stop + 1)])
                    if n_reads_in_peak < minreads or max(data[p_start:(p_stop + 1)]) < threshold:
                        continue

                    # Genomic coordinates of the peak and of its summit.
                    g_start = tx_start + sectstart + p_start
                    g_stop = tx_start + sectstart + p_stop
                    peak = tx_start + sectstart + peaks[0]

                    peak_length = g_stop - g_start + 1
                    if peak_length < width_cutoff:
                        continue

                    record_peak(peakn, g_start, g_stop, peak, n_reads_in_peak, peak_length)
                    peakn += 1
                else:
                    # Several maxima in one stretch. Only reachable when
                    # asserts are disabled; the original marks this branch as
                    # no longer necessary. Split the stretch at the local
                    # minima of the spline.
                    valleys = array(
                        [x + p_start for x in xvals[diff(sign(diff(spline(xvals[p_start:p_stop + 1])))) > 0]]
                    )
                    for subpeak in peaks:
                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]
                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]

                        peak_length = subpeak_stop - subpeak_start + 1
                        if peak_length < width_cutoff:
                            continue

                        n_reads_in_peak = sum(cts[subpeak_start:(subpeak_stop + 1)])
                        if n_reads_in_peak < minreads or max(data[subpeak_start:(subpeak_stop + 1)]) < threshold:
                            continue

                        g_start = tx_start + subpeak_start + sectstart
                        g_stop = tx_start + subpeak_stop + sectstart
                        peak = tx_start + subpeak + sectstart

                        record_peak(peakn, g_start, g_stop, peak, n_reads_in_peak, peak_length)
                        peakn += 1
        except NameError as exc:
            sys.stderr.write("%s\n" % (exc,))
            # (loc,) so a tuple-valued loc cannot break the formatting and
            # mask the NameError being reported.
            sys.stderr.write("spline fitting failed for %s\n" % (loc,))
            raise

    if correct_p is True:
        # Bonferroni-style inflation. The original read a "p" entry that is
        # never stored, so it raised KeyError as soon as one peak existed.
        # "SloP" is the p-value written into the BED line (and equals GeneP
        # when SloP is False), so it is used as the uncorrected value.
        # NOTE(review): peakn starts at 1, so the factor is (number of
        # recorded peaks + 1); kept as in the original -- confirm intent.
        for cluster in peak_dict["clusters"].values():
            cluster["p"] = cluster["SloP"] * peakn

    peak_dict["Nclusters"] = peakn
    return peak_dict
def get_namepos(n):
    """Return a list of possessive first names such as "Anna's".

    Draws ``n // 2`` names from ``mFirstNames`` and ``n // 2`` names from
    ``fFirstNames`` with ``rs`` and appends ``'s`` to each; the male names come
    first. For odd ``n`` the result therefore has ``n - 1`` entries.

    NOTE(review): ``rs`` is presumably ``random.sample`` -- confirm; if so it
    raises ``ValueError`` when ``n // 2`` exceeds the size of either name list.
    """
    # Explicit floor division: plain `/` only yields an integer on Python 2,
    # and the sample size must be an integer.
    half = n // 2
    male_names = rs(mFirstNames, half)
    female_names = rs(fFirstNames, half)
    # List comprehension instead of map() so a real list is returned on every
    # Python version.
    return [name + "'s" for name in male_names + female_names]