Example #1
def replacementDict(tc, dc, cc, tr):
    MAX_PREDICATES = 3
    repl = {}
    for i in range(len(tc)):
        repl["tag_%i"%i] = tc[i]
        repl["tag_%i_aan"%i] = a_or_an(tc[i])

        if dc[i]:
            if len(dc[i].keys()) < MAX_PREDICATES:
                relVerbs = dc[i].keys()
                lemmas = [rc(dc[i][relVerb]) for relVerb in relVerbs]
                for j in range(MAX_PREDICATES - len(relVerbs)):
                    useAgain = rc(dc[i].keys())
                    relVerbs.append(useAgain)
                    lemmas.append(rc(dc[i][useAgain]))
            else:
                relVerbs = rs(dc[i].keys(), MAX_PREDICATES)
                lemmas = [rc(dc[i][relVerb]) for relVerb in relVerbs]

            zipped = zip(relVerbs, lemmas)
            for j in range(MAX_PREDICATES):
                repl["tag_%i_predicate_%i"%(i,j)] = "%s %s" % (zipped[j][0], zipped[j][1])

            dump = []
            for rv in dc[i].keys():
                for lemma in dc[i][rv]:
                    dump.append("%s it %s %s." % (rc(tr), rv, lemma))
            random.shuffle(dump)
            repl["tag_%i_dump"%i] = dump
        else:
            for j in range(MAX_PREDICATES):
                repl["tag_%i_predicate_%i"%(i,j)] = "remains unknown"
            repl["tag_%i_dump"%i] = []

    return repl
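
Note: across these examples, rs and rc are terse aliases whose meaning varies by project; in this function they read as random.sample and random.choice. A minimal sketch of how replacementDict might be driven, with hypothetical toy data and a naive stand-in for the a_or_an helper (both are assumptions, not taken from the original project):

from random import sample as rs, choice as rc
import random

def a_or_an(phrase):
    # hypothetical helper: prepend "a" or "an" via a naive initial-vowel test
    return ("an " if phrase[0].lower() in "aeiou" else "a ") + phrase

tc = ["ocean"]                                              # tag concepts
dc = [{"covers": ["the planet"], "holds": ["salt water"]}]  # relation verb -> lemmas
tr = ["Notably", "In fact"]                                 # transition phrases
repl = replacementDict(tc, dc, [1.0], tr)
# expect keys such as tag_0, tag_0_aan, tag_0_predicate_0..2, and tag_0_dump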
Example #2
def replacementDict(tc, dc, cc, tr):
    MAX_PREDICATES = 3
    repl = {}
    for i in range(len(tc)):
        repl["tag_%i"%i] = tc[i]
        repl["tag_%i_aan"%i] = a_or_an(tc[i])

        if dc[i]:
            if len(dc[i].keys()) < MAX_PREDICATES:
                relVerbs = dc[i].keys()
                lemmas = [rc(dc[i][relVerb]) for relVerb in relVerbs]
                for j in range(MAX_PREDICATES - len(relVerbs)):
                    useAgain = rc(dc[i].keys())
                    relVerbs.append(useAgain)
                    lemmas.append(rc(dc[i][useAgain]))
            else:
                relVerbs = rs(dc[i].keys(), MAX_PREDICATES)
                lemmas = [rc(dc[i][relVerb]) for relVerb in relVerbs]

            zipped = zip(relVerbs, lemmas)
            for j in range(MAX_PREDICATES):
                repl["tag_%i_predicate_%i"%(i,j)] = " ".join(zipped[j])

            dump = []
            for rv in dc[i].keys():
                for lemma in dc[i][rv]:
                    dump.append("%s it %s %s." % (rc(tr), rv, lemma))
            random.shuffle(dump)
            repl["tag_%i_dump"%i] = dump
        else:
            for j in range(MAX_PREDICATES):
                repl["tag_%i_predicate_%i"%(i,j)] = "remains unknown"
            repl["tag_%i_dump"%i] = []

    return repl
Example #3
def chTitle(hi):
    htmlFile = open(APPPATH+'static/output/'+hi+'.html', 'r')
    html = htmlFile.read()
    htmlFile.close()
    soup = BeautifulSoup(html)
    text = "\n".join([unicode(i) for i in soup.p.contents]).replace("<br/>", "\n")
    s = parsetree(text)
    nounPhrases = []
    for sentence in s:
        for chunk in sentence.chunks:
            if chunk.type == "NP":
                nounPhrases.append(chunk.string)
    selectNPs = rs([np for np in nounPhrases if "&" not in np], ri(1,2))

    articles = ["a", "an", "the"]

    nps = []

    for np in selectNPs:
        if startsWithCheck(np, articles):
            nps.append(np)
        else:
            nps.append(a_or_an(np))

    if len(selectNPs) == 1:
        title = titlecase(nps[0])
    elif len(selectNPs) == 2:
        title = titlecase(" and ".join(nps))
    # elif len(selectNPs) == 3:
    #     title = titlecase("%s, %s, and %s" % tuple(nps))

    return title.encode('ascii', 'xmlcharrefreplace')
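
The chunking here appears to use the pattern library's parsetree (sentence.chunks, chunk.type == "NP"), while rs/ri read as random.sample/random.randint. startsWithCheck and titlecase are project helpers; a plausible startsWithCheck, as an assumption only:

def startsWithCheck(phrase, articles):
    # hypothetical: True if the phrase already begins with one of the given articles
    first_word = phrase.split(" ", 1)[0].lower()
    return first_word in articles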
Example #4
    def get_random(self):
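        # NOTE: in this snippet rs must return a number (e.g. random.randint
        # or random.uniform); the import is not shown, so this is an assumption.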
        seed = rs(1, self.max)

        # Shift off bits, discarding the sign. Discarding the sign is
        # important because OR w/ 5 can give us + or - numbers.

        seed += (seed * seed) | 5

        r = (seed >> 32) / self.max

        return int(modf(r)[0] * 10000000) % self.scope
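
A hedged sketch of a wrapper this method could live in. The class name and constructor are hypothetical (max bounds the seed, scope bounds the result), and it assumes Python 3 true division with an integer-returning rs:

from math import modf
from random import randint as rs

class QuickRand:
    # hypothetical container for get_random; attribute names inferred from the body
    def __init__(self, max_value=1 << 48, scope=1000):
        self.max = max_value    # upper bound for the seed
        self.scope = scope      # results land in range(scope)

    def get_random(self):
        seed = rs(1, self.max)
        seed += (seed * seed) | 5
        r = (seed >> 32) / self.max
        return int(modf(r)[0] * 10000000) % self.scope

print([QuickRand().get_random() for _ in range(3)])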
Example #5
def grafBuilder(exploDictsConf):
    MAX_TRANSITIONS = 7

    tags, exploDicts, confs = zip(*exploDictsConf)

    confMean = float(sum(confs)) / len(confs)

    grafs = []

    templates = [open_template(n) for n in range(1, 7)]

    transFile = open(APPPATH + 'lists/transitions.txt', 'r')
    transitions = [l.strip() for l in transFile.readlines()]
    transFile.close()

    i = 0
    for tagsChunk in chunks(tags, MAX_GRAF_DENSITY):
        dictsChunk = exploDicts[i:i + len(tagsChunk)]
        confsChunk = confs[i:i + len(tagsChunk)]

        replDict = replacementDict(tagsChunk, dictsChunk, confsChunk,
                                   transitions)

        transes = rs(transitions, MAX_TRANSITIONS)
        for j in range(MAX_TRANSITIONS):
            replDict["transition_%i" % j] = transes[j]

        for j in range(len(tagsChunk)):
            for k in range(2, 5, 2):
                if len(replDict["tag_%i_dump" % j]) > k:
                    replDict["tag_%i_dump%i" % (j, k)] = " ".join(
                        rs(replDict["tag_%i_dump" % j], k + 1))
                else:
                    replDict["tag_%i_dump%i" % (j, k)] = ""

        templNo = len(tagsChunk)
        grafs.append(templates[templNo - 1].substitute(**replDict))

        i += len(tagsChunk)

    return grafs
Example #6
def grafBuilder(exploDictsConf):
    MAX_TRANSITIONS = 7

    tags, exploDicts, confs = zip(*exploDictsConf)

    confMean = float(sum(confs))/len(confs)

    grafs = []

    templates = [open_template(n) for n in range(1,7)]

    transFile = open(APPPATH+'lists/transitions.txt', 'r')
    transitions = [l.strip() for l in transFile.readlines()]
    transFile.close()

    i = 0
    for tagsChunk in chunks(tags, MAX_GRAF_DENSITY):
        dictsChunk = exploDicts[i:i+len(tagsChunk)]
        confsChunk = confs[i:i+len(tagsChunk)]

        replDict = replacementDict(tagsChunk, dictsChunk, confsChunk, transitions)

        transes = rs(transitions, MAX_TRANSITIONS)
        for j in range(MAX_TRANSITIONS):
            replDict["transition_%i"%j] = transes[j]

        for j in range(len(tagsChunk)):
            for k in range(2,5,2):
                if len(replDict["tag_%i_dump"%j]) > k:
                    replDict["tag_%i_dump%i"%(j,k)] = " ".join(rs(replDict["tag_%i_dump"%j], k+1))
                else:
                    replDict["tag_%i_dump%i"%(j,k)] = ""

        templNo = len(tagsChunk)
        grafs.append(templates[templNo-1].substitute(**replDict))

        i+=len(tagsChunk)

    return grafs
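
Both grafBuilder variants lean on helpers that are not shown: open_template(n) presumably loads a template for an n-tag paragraph (so MAX_GRAF_DENSITY can be at most 6, given templates 1-6), and chunks splits a sequence into consecutive pieces. A plausible chunks, as an assumption:

def chunks(seq, n):
    # hypothetical: yield consecutive slices of at most n items
    for i in range(0, len(seq), n):
        yield seq[i:i + n]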
Example #7
    def random_search_lgbm(self, param, n_iterations, X, y):
        '''Select the best parameters for the lgbm model'''
        # allocate all results in dataframe
        final_result = pd.DataFrame(
            columns=['mean f1-score', 'std', 'parameters'])
        for i in range(n_iterations):
            # choose values for parameters randomly
            hp = {k: rs(v, 1)[0] for k, v in param.items()}

            # model
            model_random_search = LGBMClassifier(
                objective='binary',
                num_leaves=hp['num_leaves'],
                min_data_in_leaf=hp['min_data_in_leaf'],
                learning_rate=hp['learning_rate'],
                n_estimators=hp['n_estimators'],
                max_depth=hp['max_depth'],
                colsample_bytree=hp['colsample_bytree'],
                min_child_weight=hp['min_child_weight'],
                random_state=42,
                n_jobs=-1).fit(X, y)

            # define CV strategy
            sk_fold = StratifiedKFold(n_splits=10, random_state=None)

            # calculate cross validation
            cv_scores = cross_val_score(model_random_search,
                                        X,
                                        y,
                                        cv=sk_fold,
                                        scoring='f1',
                                        n_jobs=-1)

            # append cv scores in dataframe
            result = pd.DataFrame(
                [[mean(cv_scores), std(cv_scores), hp]],
                columns=['mean f1-score', 'std', 'parameters'])
            final_result = pd.concat([final_result, result])

        return final_result.sort_values('mean f1-score',
                                        ascending=False).head(10)
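
For context, param is expected to map each hyperparameter name to a list of candidate values (rs(v, 1)[0] draws one of them, with rs as random.sample), and mean/std presumably come from numpy; note the initial .fit(X, y) is redundant work, since cross_val_score clones and refits the model on each fold. A hypothetical invocation, with placeholder names (trainer, X_train, y_train) and illustrative value ranges:

param = {
    'num_leaves': [31, 63, 127],
    'min_data_in_leaf': [20, 50, 100],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 300, 500],
    'max_depth': [-1, 5, 10],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'min_child_weight': [0.001, 0.01, 0.1],
}
top10 = trainer.random_search_lgbm(param, n_iterations=20, X=X_train, y=y_train)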
Example #8
    def process_captions(self, ch, method, properties, body):
        def int_to_enc(i):
            return "{0:b}".format(i).replace('0', '~').replace('1', '|')

        img_hash, csv = body.split('#', 1)

        print img_hash, "CAPTIONED"

        captions_raw = list(set(csv.split(',')))

        if self.manual or len(captions_raw) <= self.sentence_count:
            captions_cut = captions_raw
        else:
            captions_cut = rs(captions_raw, self.sentence_count)

        self.unused_captions = list(set(captions_raw) - set(captions_cut))
        self.unused_captions_per_graf = len(self.unused_captions) / self.sentence_count
        
        captions = map(
            # lambda (i, x): int_to_enc(i%8) + x[0].upper() + x[1:],
            lambda (i, x): x[0].upper() + x[1:],
            enumerate(captions_cut)
        )
        approved_captions = list()

        for c in captions:
            approved = True
            if self.manual:
                if len(approved_captions) > self.sentence_count:
                    approved = False
                else:
                    approved = self.approve(c)
            if approved:
                approved_captions.append(c)

        for c in approved_captions:
            self.channel.basic_publish(
                exchange = '',
                routing_key = 'CaptionToExpand',
                body = img_hash + '#' + self.pre_seed + c
            )
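
The int_to_enc closure renders an integer's binary form with '~' for 0 and '|' for 1; it is defined but, given the commented-out lambda, no longer applied. A quick illustration (Python 2, matching the print statements above):

def int_to_enc(i):
    return "{0:b}".format(i).replace('0', '~').replace('1', '|')

print int_to_enc(5)  # "101" becomes "|~|"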
Example #9
def peaks_from_info(bam_fileobj, wiggle, pos_counts, lengths, loc, gene_length, 
                    margin=25, fdr_alpha=0.05, binom_alpha=0.001, method="Randomization" ,user_threshold=None,
                    minreads=20, poisson_cutoff=0.05, plotit=False, 
                    width_cutoff=10, windowsize=1000, SloP=False, 
                    correct_p=False, max_width=None, min_width=None, 
                    max_gap=None, algorithm="spline"):

    """
    
    same args as before 
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths aligned portions of reads 
    rest are the same fix later


    calls peaks for an individual gene 
    

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup)
    user_threshold - user defined FDR thershold (probably should be factored into fdr_alpha
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for signifance cut off for number of reads in genomic_center that gets called - might want to use ashifted distribution
    plotit - makes figures 
    
    w_cutoff - width cutoff, peaks narrower than this are discarted 
    windowssize - for super local calculation distance left and right to look 
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferoni correction of p-values from poisson
    algorithm - str the algorithm to run
    """

    peak_dict = {}
    
    #all the information necessary to record a genomic_center, used later, but declared outside of loops

    
    #these are what is built in this dict, complicated enough that it might 
    #be worth turning into an object
    #peak_dict['clusters'] = {}
    #peak_dict['sections'] = {}
    #peak_dict['nreads'] = int()
    #peak_dict['threshold'] = int()
    #peak_dict['loc'] = loc
    
    #data munging
    chrom, gene_name, tx_start, tx_end, strand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]    
    
    #used for poisson calculation?
    nreads_in_gene = sum(pos_counts)

    #decides FDR calculation, maybe move get_FDR_cutoff_mean into C code
    gene_threshold = 0
    
    if user_threshold is None:    
        if method == "Binomial":  #Uses Binomial Distribution to get cutoff if specified by user                             
            gene_threshold = get_Binom_cutoff(lengths,gene_length,binom_alpha)
        else:
            gene_threshold = get_FDR_cutoff_mean(lengths, gene_length,alpha=fdr_alpha)     
    else:
        logging.info("using user threshold")
        gene_threshold = user_threshold
        
        
    
    if not isinstance(gene_threshold, int):
        raise TypeError("gene_threshold should be an int, got %s" % type(gene_threshold))
        
    peak_dict['clusters'] = []
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peak_number=1

 
    sections = find_sections(wiggle, margin)
    if plotit is True:      
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        
        #this cts is alright because we know the reads are bounded
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        peak_dict['sections'][sect] = {}
        threshold = int()
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #makes sure there are enough reads
        if Nreads < minreads:
            logging.info("""%d is not enough reads, skipping section: %s""" %(Nreads, sect))
            peak_dict['sections'][sect]['tried'] = False            
            continue
        else:
            logging.info("""Analyzing section %s with %d reads""" %(sect, Nreads))
            pass
        
            
        if user_threshold is None:
            if SloP:
                
                #gets random subset of lengths of reads for calculations on a section
                #not exactly the right way to do this but it should be very close.
                sect_read_lengths = rs(lengths, Nreads) 
                
                #use the minimum FDR cutoff between superlocal and gene-wide calculations
                threshold = min(gene_threshold, get_FDR_cutoff_mean(sect_read_lengths, 
                                                sect_length, 
                                                alpha=fdr_alpha))
                logging.info("Using super-local threshold %d" %(threshold))
                
            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)
        peak_dict['sections'][sect]['tried'] = True
        peak_dict['sections'][sect]['nPeaks'] = 0
        
        if max(data) < threshold:
            logging.info("data does not exceed threshold, stopping")
            continue
        
        if algorithm == "spline":
            
            initial_smoothing_value = (sectstop - sectstart + 1)
            fitter = SmoothingSpline(xvals, data, initial_smoothing_value,
                            lossFunction="get_norm_penalized_residuals")
            
        elif algorithm == "gaussian":
            fitter = GaussMix(xvals, data)
            
        elif algorithm == "classic":
            fitter = Classic(xvals, data, max_width, min_width, max_gap)
        try:
            peak_definitions = fitter.peaks(threshold, plotit)

        except Exception as error:
            logging.error(gene_name)
            raise error
            
        #subsections that are above threshold
        #peak center is actually the location where we think binding should
        #occur, not the average of start and stop
        for peak_start, peak_stop, peak_center in peak_definitions: 
 
             genomic_start = tx_start + sectstart + peak_start
             genomic_stop = tx_start + sectstart + peak_stop
             
             number_reads_in_peak = bam_fileobj.count(chrom, start=genomic_start, end=genomic_stop)
             #sum(cts[peak_start:(peak_stop + 1)])
             logging.info("Peak %d (%d - %d) has %d reads" % (peak_number,
                                                              peak_start,
                                                              (peak_stop + 1),
                                                              number_reads_in_peak))

             #makes sure there are enough reads
             if (number_reads_in_peak < minreads or 
                 max(data[peak_start:(peak_stop + 1)]) < threshold):
                 logging.info("""skipping genomic_center, %d is not enough reads"""
                              %(number_reads_in_peak))
                 continue

  
             #highest point in start stop
             genomic_center = tx_start + sectstart + peak_center

             #makes it thicker so we can see on the browser 
             thick_start = genomic_center - 2
             thick_stop = genomic_center + 2

             #error checking logic to keep bed files from breaking
             if thick_start < genomic_start:
                 thick_start = genomic_start
             if thick_stop > genomic_stop:
                 thick_stop = genomic_stop

             peak_length = genomic_stop - genomic_start + 1

             #skip really small peaks
             if peak_length < width_cutoff:
                 continue
           

             #super local logic
             #error check to make sure area is in the area of the gene

             #distance from gene start
             if genomic_center - tx_start - windowsize < 0: 
                 area_start = 0

             #for super local gets area around genomic_center for calculation
             else:  
                 area_start = genomic_center - tx_start - windowsize
                 #area_start = sectstart

             #same thing except for end of gene instead of start
             if genomic_center + windowsize > tx_end: #distance to gene stop
                 area_stop = tx_end - tx_start + 1
             else:
                 area_stop = genomic_center - tx_start + windowsize
                 #area_stop = sectstop

             #use area reads + 1/2 all other reads in gene: 
             #area_reads = sum(pos_counts[area_start:area_stop]) + 
             #0.5*(sum(pos_counts) - 
             #sum(pos_counts[area_start:area_stop]))

             #use area reads:
             area_reads = sum(pos_counts[area_start:area_stop])
             area_size = area_stop - area_start + 1

             #area_reads = sum(pos_counts[sectstart:sectstop])
             #area_size = sect_length

             #calculates Poisson based on whole gene vs genomic_center
             if algorithm == "classic" and peak_length < min_width:
                 peak_length = min_width
                 
             gene_pois_p = poissonP(nreads_in_gene, 
                                    number_reads_in_peak, 
                                    gene_length, 
                                    peak_length)
             if SloP is True:
                 #same thing except for based on super local p-value
                 slop_pois_p = poissonP(area_reads, 
                                       number_reads_in_peak, 
                                       area_size, 
                                       peak_length)

             #makes sure slop_pois_p is defined, even if it's
             #just the gene-wide value; something to be removed later,
             #SloP should only be used when defined as true
             else:
                 slop_pois_p = gene_pois_p


             if math.isnan(slop_pois_p):
                 slop_pois_p = 1

             #defines the bedline of a genomic_center for returning
             #TODO This should be abstracted out for now... separate model from view
             
             peak_dict['clusters'].append(Peak(chrom, 
                                               genomic_start, 
                                               genomic_stop, 
                                               gene_name, #needed as a unique id for later analysis
                                               slop_pois_p, 
                                               strand,
                                               thick_start,
                                               thick_stop,
                                               peak_number,
                                               number_reads_in_peak,
                                               gene_pois_p,
                                               peak_length,
                                               0
                                               )
                                          )

             peak_number += 1
             peak_dict['sections'][sect]['nPeaks'] +=1
           
    #inflate p-values based on the number of comparisons (Bonferroni corrected)
    if correct_p is True:
        #best I can tell this never executes...
        for genomic_center in peak_dict['clusters']:
            genomic_center.p = genomic_center.p * peak_number  #Bonferroni-correct p-value for MHT
        
        

    peak_dict['Nclusters'] = peak_number
    if plotit:
        import sys
        plt.show()
        v = sys.stdin.read(1)
    return peak_dict
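
Peak is constructed here but defined elsewhere; a minimal stand-in consistent with this call site, assuming positional fields in the order passed and a mutable p (the Bonferroni branch rescales it):

class Peak(object):
    # hypothetical stand-in; field names are guesses from the arguments above
    def __init__(self, chrom, genomic_start, genomic_stop, gene_name,
                 super_local_poisson_p, strand, thick_start, thick_stop,
                 peak_number, number_reads_in_peak, gene_poisson_p,
                 peak_length, p):
        self.chrom = chrom
        self.genomic_start, self.genomic_stop = genomic_start, genomic_stop
        self.gene_name = gene_name
        self.super_local_poisson_p = super_local_poisson_p
        self.strand = strand
        self.thick_start, self.thick_stop = thick_start, thick_stop
        self.peak_number = peak_number
        self.number_reads_in_peak = number_reads_in_peak
        self.gene_poisson_p = gene_poisson_p
        self.peak_length = peak_length
        self.p = p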
Example #10
	def make_chars(self):
		# establish gender ratio
		charGenders = [ri(0,1) for _ in range(CC)]
		
		# initialize list of characters
		chars = []

		# add user defined characters
		for firstlast in args.charnames:
			fl_list = firstlast.split('_')  # Note that the split delimiter is an underscore!
			chars.append(Character(fl_list[0], fl_list[1]))

		# add generated characters
		for b in charGenders:
			if b:
				chars.append(Character(rc(fFirstNames), rc(surnames)))
			else:
				chars.append(Character(rc(mFirstNames), rc(surnames)))

		# establish list of intro scenes
		introScenePaths = rs(characterTropeFiles, len(chars))

		# establish list of settings
		settings = rs(settingTropeFiles, len(chars)*TSV)

		# establish list of drug trips
		trips = rs(erowidExpPaths, len(chars)*DTV)

		# establish list of scp articles
		scps = rs(scpPaths, len(chars)*SCP)

		# establish list of gberg excerpts
		gbergs = rs(gPaths.values(), len(chars)*GGV)

		i = 0
		j = 0
		m = 0
		p = 0
		s = 0
		for c in chars:

			# make friends
			c.friends += rs(chars, ri(1,len(chars)-1))
			if c in c.friends:
				c.friends.remove(c)

			# add introduction description
			c.introDesc = self.personal_trope([c], introScenePaths[i])

			# add setting scenes
			for k in range(TSV):
				c.scenes.append(self.personal_trope([c]+c.friends, settings[j+k]))

			# add drug trip scenes
			for n in range(DTV):
				c.drugTrips.append(self.personal_trip([c]+c.friends, trips[m+n]))

			# add scp articles
			for q in range(SCP):
				c.scpReports.append(self.personal_scp([c]+c.friends, scps[p+q]))

			# add gberg excerpts
			for t in range(GGV):
				c.gbergExcerpts.append(self.personal_gberg([c]+c.friends, gbergs[s+t]))

			i += 1
			j += TSV
			m += DTV
			p += SCP
			s += GGV

		self.characters = chars
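
Character, the name pools (fFirstNames, mFirstNames, surnames), and the various path lists come from the surrounding script; here rs/rc/ri read as random.sample/random.choice/random.randint. A minimal Character consistent with the attribute accesses above, as an assumption:

class Character(object):
    # hypothetical minimal version carrying only the attributes used here
    def __init__(self, first, last):
        self.first = first
        self.last = last
        self.friends = []
        self.scenes = []
        self.drugTrips = []
        self.scpReports = []
        self.gbergExcerpts = []
        self.introDesc = None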
Example #11
    def shuffle(self) -> List[int]:
        '''
        Returns a random shuffling of the list of numbers.
        '''
        rs(self.rand)
        return self.rand
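
This method reads like part of a "Shuffle an Array" exercise where rs is random.shuffle; a minimal surrounding class, as an assumption:

from random import shuffle as rs
from typing import List

class Solution:
    def __init__(self, nums: List[int]):
        self.original = nums
        self.rand = list(nums)  # working copy that shuffle permutes in place

    def reset(self) -> List[int]:
        return self.original

    def shuffle(self) -> List[int]:
        '''
        Returns a random shuffling of the list of numbers.
        '''
        rs(self.rand)
        return self.rand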
Example #12
		current_game = None
		nextgame_counter = 10
		return

	ret = current_game.step()
	if ret in ["draw", "win"]:
		status = ret


if __name__ == "__main__":
	# fix bot permissions
	os.system("chmod u+x bots/*")

	# create games. every bot plays against every bot, including itself, and every game is played two times with different colors
	games = []
	for g in itertools.product([i for i in os.listdir("bots") if i[0]!="."], repeat = 2):
		games.append(game.Game([game.Bot(g[0].split(".")[0], "bots/"+g[0]), game.Bot(g[1].split(".")[0], "bots/"+g[1])]))

	rs(games)
	# debug: drop a is b games
	games = [i for i in games if i.white.name != i.black.name]

	status = "running"
	current_game = None
	nextgame_counter = NEXT_GAME_DELAY


	task.LoopingCall(step).start(GAME_STEP_INTERVAL, True)
	reactor.listenTCP(8080, server.Site(ChessGameServer()))
	reactor.run()
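
The fragment above begins mid-function and assumes imports and globals that were cut off; a plausible header, as an assumption (game is a project-local module providing Game and Bot, and the constant values are illustrative):

import itertools
import os
from random import shuffle as rs

from twisted.internet import reactor, task
from twisted.web import server

import game

GAME_STEP_INTERVAL = 0.1  # hypothetical value
NEXT_GAME_DELAY = 10      # hypothetical value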
Example #13
from random import shuffle as rs

NOVICE = [
    'r', 'r', 'r', 'r', 'r', 'r', 'b', 'b', 'b', 'b', 'b', 'b', 'w', 'w', 'e'
]
APPRENTICE = [
    'r', 'r', 'r', 'r', 'r', 'r', 'b', 'b', 'b', 'b', 'w', 'w', 'y', 'y', 'e'
]
EXPERT = [
    'r', 'r', 'r', 'r', 'b', 'b', 'b', 'b', 'w', 'w', 'y', 'y', 'g', 'g', 'e'
]
MASTER = [
    'r', 'r', 'r', 'r', 'b', 'b', 'w', 'w', 'y', 'y', 'g', 'g', 'p', 'p', 'e'
]

rs(NOVICE)
rs(APPRENTICE)
rs(EXPERT)
rs(MASTER)

with open("input1.txt", "w") as f:
    for i in range(50):
        rs(NOVICE)
        for char in NOVICE:
            f.write(char + " ")
        f.write("\n")

with open("input2.txt", "w") as f:
    for i in range(50):
        rs(APPRENTICE)
        for char in APPRENTICE:
            f.write(char + " ")
        f.write("\n")
Example #14
def peaks_from_info(wiggle,
                    pos_counts,
                    lengths,
                    loc,
                    gene_length,
                    margin=25,
                    fdr_alpha=0.05,
                    user_threshold=None,
                    minreads=20,
                    poisson_cutoff=0.05,
                    plotit=False,
                    width_cutoff=10,
                    windowsize=1000,
                    SloP=False,
                    correct_p=False):
    """
    
    same args as before 
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths aligned portions of reads 
    rest are the same fix later


    calls peaks for an individual gene 
    

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup)
    user_threshold - user defined FDR thershold (probably should be factored into fdr_alpha
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for signifance cut off for number of reads in peak that gets called - might want to use ashifted distribution
    plotit - makes figures 
    
    w_cutoff - width cutoff, peaks narrower than this are discarted 
    windowssize - for super local calculation distance left and right to look 
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferoni correction of p-values from poisson
        
    """

    peak_dict = {}

    #these are what is built in this dict, complicated enough that it might
    #be worth turning into an object
    #peak_dict['clusters'] = {}
    #peak_dict['sections'] = {}
    #peak_dict['nreads'] = int()
    #peak_dict['threshold'] = int()
    #peak_dict['loc'] = loc

    #data munging
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    #used for poisson calculation?
    nreads_in_gene = sum(pos_counts)

    #decides FDR calculation, maybe move get_FDR_cutoff_mean into C code

    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths,
                                             gene_length,
                                             alpha=fdr_alpha)

    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        #verboseprint("""I had a hard time with this one: %s.
        #                I think I'll use a threshold of 50""" % (loc))

        gene_threshold = 50

    peak_dict['clusters'] = {}
    peak_dict['sections'] = {}
    peak_dict['nreads'] = int(nreads_in_gene)
    peak_dict['threshold'] = gene_threshold
    peak_dict['loc'] = loc
    peakn = 1

    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart:(sectstop + 1)]
        cts = pos_counts[sectstart:(sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        #gets random subset of lengths of reads for calculations on a section
        #not exactly the right way to do this but it should be very close.
        sect_read_lengths = rs(lengths, Nreads)
        peak_dict['sections'][sect] = {}
        threshold = int()

        #makes sure there are enough reads
        if Nreads < minreads:
            #verboseprint("""%d is not enough reads, skipping section:
            #                %s""" % (Nreads, sect))
            continue

        else:
            pass
            #verboseprint("""Analyzing section %s with %d reads"""
            #              % (sect, Nreads))

        #sets super-local if requested, might be able to factor this
        if user_threshold is None:
            if SloP is True:
                #use the minimum FDR cutoff between superlocal and gene-wide calculations
                threshold = min(
                    gene_threshold,
                    get_FDR_cutoff_mean(sect_read_lengths,
                                        sect_length,
                                        alpha=fdr_alpha))

            #verboseprint("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        #saves threshold for each individual section
        peak_dict['sections'][sect]['threshold'] = threshold
        peak_dict['sections'][sect]['nreads'] = int(Nreads)

        #if wiggle track never exceeds threshold
        if max(data) < threshold:
            #verboseprint("data does not exceed threshold, stopping")
            continue

        #fitting splines logic, black magic
        try:
            degree = 3  #cubic spline
            weights = None

            #for very large windows with many reads a large smoothing
            #parameter is required.  test several different options
            #to determine a reasonable initial estimate
            #Goal is to find the optimal smoothing parameter in multiple steps
            #initial_smoothing_value is the initial estimate of the smoothing parameter
            #step 1, identify good initial value
            initial_smoothing_value = (sectstop - sectstart + 1)
            best_smoothing_value = initial_smoothing_value
            best_estimate = 1

            #step 2, refine so as not to run into local minima later,
            #try to come up with a good way of getting the optimal parameter
            best_error = find_spline_residuals(initial_smoothing_value, xvals,
                                               data, degree, weights)

            for i in range(2, 11):
                cur_smoothing_value = initial_smoothing_value * i

                #tries to find the optimal initial smoothing parameter in this loop
                cur_error = find_spline_residuals(cur_smoothing_value, xvals,
                                                  data, degree, weights)
                if cur_error < best_error:
                    best_smoothing_value = cur_smoothing_value
                    best_estimate = i

            try:
                #fine optimization of smoothing parameter
                cutoff = float(0)
                tries = 0

                # shouldn't get smoothing coef's this small.. increase
                #the initial estimate and try again. WARNING: BLACK MAGIC
                while cutoff < 5:
                    tries += 1

                    # increasing this may improve accuracy,
                    #but at the cost of running time.
                    if tries == 3:
                        break

                    spline = optimize.minimize(
                        find_spline_residuals,
                        best_smoothing_value,
                        args=(xvals, data, degree, weights),
                        options={
                            'disp': False,
                            'maxiter': 10,
                        },
                        #method="Powell", # old method
                        method="L-BFGS-B",  #abnormal termination sometimes
                        #method="COBYLA",
                        bounds=((.1, None), ),
                    )

                    #fit a smoothing spline using an optimal parameter
                    #for smoothing and with weights proportional to the
                    #number of reads aligned at each position if weights
                    #is set
                    if spline.success:
                        cutoff = spline.x
                        #print "cutoff is %s" % (cutoff)
                    else:
                        #print "%s failed spline building at section %s" % (loc, sect)
                        #print spline.message
                        pass

                    best_smoothing_value += sect_length
            except Exception as error:
                print >> sys.stderr, "best smoothing value is:", best_smoothing_value
                print >> sys.stderr, "%s failed spline fitting at section %s (major crash)" % (
                    loc, sect)
                print >> sys.stderr, error
                continue
                continue

            #verboseprint("optimized smoothing parameter; cutoff is %s" %(str(cutoff)))
            #if we were going to save and output as a pickle, it would be here
            #final fit spline

            spline = find_univariate_spline(cutoff, xvals, data, degree,
                                            weights)

            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(
                threshold, spline_values)

            #walks along spline, and calls peaks along spline
            #for each start, take the next stop and find the peak
            #between the start and the stop this is where I need to
            #fix, some peaks starts start right after another start,
            #but not on top of it make sure the next start is after the
            #previous stop

            #subsections that are above threshold
            for p_start, p_stop in starts_and_stops:

                #peaks with-in this subsection, indexed from section
                #(not subsection) start
                #find all local maxima
                peaks = [
                    x + p_start
                    for x in xvals[find_local_maxima(spline_values[p_start:(
                        p_stop + 1)])]
                ]
                #map(lambda x: x + p_start,
                #            xvals[diff(sign(diff(spline(xvals[p_start:(p_stop + 1)])))) < 0])

                #there should be one or zero peaks in every section
                assert len(peaks) in (0, 1)

                #handles logic if there are multiple peaks between
                #start and stop
                if len(peaks) <= 0:
                    continue
                if len(peaks) == 1:
                    #TODO All this formatting logic doesn't belong here
                    #should be simplifed
                    #gets reads in peak
                    n_reads_in_peak = sum(cts[p_start:(p_stop + 1)])
                    #verboseprint(""""Peak %d (%d - %d) has %d
                    #                 reads""" % (peakn,
                    #                             p_start,
                    #                             (p_stop + 1),
                    #                             n_reads_in_peak))

                    #makes sure there are enough reads
                    if (n_reads_in_peak < minreads
                            or max(data[p_start:(p_stop + 1)]) < threshold):
                        #    verboseprint("""skipping peak, %d is not enough reads"""
                        #                  % (n_reads_in_peak))
                        continue

                    #formatting of bed track
                    #start and stop for bed track to be created
                    g_start = tx_start + sectstart + p_start
                    g_stop = tx_start + sectstart + p_stop

                    #highest point in start stop
                    peak = tx_start + sectstart + peaks[0]

                    #makes it thicker so we can see on the browser
                    thick_start = peak - 2
                    thick_stop = peak + 2

                    #error checking logic to keep bed files from breaking
                    if thick_start < g_start:
                        thick_start = g_start
                    if thick_stop > g_stop:
                        thick_stop = g_stop

                    peak_length = g_stop - g_start + 1

                    #skip really small peaks
                    if peak_length < width_cutoff:
                        continue
                    peak_name = gene_name + "_" + str(peakn) + "_" + str(
                        int(n_reads_in_peak))

                    #super local logic
                    #error check to make sure area is in the area of the gene

                    #distance from gene start
                    if peak - tx_start - windowsize < 0:
                        area_start = 0

                    #for super local gets area around peak for calculation
                    else:
                        area_start = peak - tx_start - windowsize
                        #area_start = sectstart

                    #same thing except for end of gene instead of start
                    if peak + windowsize > tx_end:  #distance to gene stop
                        area_stop = tx_end - tx_start + 1
                    else:
                        area_stop = peak - tx_start + windowsize
                        #area_stop = sectstop

                    #use area reads + 1/2 all other reads in gene:
                    #area_reads = sum(pos_counts[area_start:area_stop]) +
                    #0.5*(sum(pos_counts) -
                    #sum(pos_counts[area_start:area_stop]))

                    #use area reads:
                    area_reads = sum(pos_counts[area_start:area_stop])
                    area_size = area_stop - area_start + 1

                    #area_reads = sum(pos_counts[sectstart:sectstop])
                    #area_size = sect_length

                    #calculates Poisson based on whole gene vs peak
                    gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak,
                                           gene_length, peak_length)
                    if SloP is True:
                        #same thing except for based on super local p-value
                        slop_pois_p = poissonP(area_reads, n_reads_in_peak,
                                               area_size, peak_length)

                    #makes sure slop_pois_p is defined, even if it's
                    #just the gene-wide value; something to be removed later,
                    #slop should only be used when defined as true
                    else:
                        slop_pois_p = gene_pois_p

                    if math.isnan(slop_pois_p):
                        slop_pois_p = 1

                    #remove later
                    if slop_pois_p > poisson_cutoff:
                        #continue
                        pass

                    #defines the bedline of a peak for returning
                    #TODO This should be abstracted out for now... separate model from view
                    bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                        chrom, g_start, g_stop, peak_name, slop_pois_p,
                        signstrand, thick_start, thick_stop)

                    #metadata for the specific bedline
                    peak_dict['clusters'][bedline] = {}
                    peak_dict['clusters'][bedline]['GeneP'] = gene_pois_p
                    peak_dict['clusters'][bedline]['SloP'] = slop_pois_p
                    peak_dict['clusters'][bedline]['Nreads'] = n_reads_in_peak
                    peak_dict['clusters'][bedline]['size'] = peak_length

                    peakn += 1

                #there is more than one peak in this window
                #NO LONGER NECESSARY, SHOULD REMOVE
                else:
                    #this handles peaks within peaks logic

                    #local minima in subsection, relative to section start
                    valleys = array(
                        map(
                            lambda x: x + p_start, xvals[diff(
                                sign(diff(spline(xvals[p_start:p_stop +
                                                       1])))) > 0]))

                    for subpeak in peaks:
                        subpeak_start = int()
                        subpeak_stop = int()

                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]

                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]
                        peak_length = subpeak_stop - subpeak_start + 1

                        if peak_length < width_cutoff:  #skip really small peaks
                            continue
                        n_reads_in_peak = sum(cts[subpeak_start:(subpeak_stop +
                                                                 1)])

                        if (n_reads_in_peak < minreads
                                or max(data[subpeak_start:(subpeak_stop + 1)])
                                < threshold):
                            continue

                        g_start = tx_start + subpeak_start + sectstart
                        g_stop = tx_start + subpeak_stop + sectstart
                        peak = tx_start + subpeak + sectstart
                        thick_start = peak - 2

                        if thick_start < g_start:
                            thick_start = g_start
                        thick_stop = peak + 2

                        if thick_stop > g_stop:
                            thick_stop = g_stop
                        peak_name = "%s_%s_%s" % (gene_name, peakn,
                                                  int(n_reads_in_peak))

                        #distance from gene start
                        if peak - tx_start - windowsize < 0:
                            area_start = 0
                        else:
                            area_start = peak - tx_start - windowsize

                        if peak + windowsize > tx_end:  #distance to gene stop
                            area_stop = tx_end - tx_start + 1
                        else:
                            #area_stop = sectstop
                            area_stop = peak - tx_start + windowsize

                        area_reads = sum(pos_counts[area_start:area_stop])
                        area_size = area_stop - area_start + 1

                        gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak,
                                               gene_length, peak_length)

                        if SloP is True:
                            slop_pois_p = poissonP(area_reads, n_reads_in_peak,
                                                   area_size, peak_length)
                        else:
                            slop_pois_p = gene_pois_p

                        if math.isnan(slop_pois_p):
                            slop_pois_p = 1

                        #leave these in to allow for BH p-value correction
                        if slop_pois_p > poisson_cutoff:
                            pass

                        #output results again
                        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                            chrom, g_start, g_stop, peak_name, slop_pois_p,
                            signstrand, thick_start, thick_stop)

                        peak_dict['clusters'][bedline] = {}
                        peak_dict['clusters'][bedline]['SloP'] = slop_pois_p
                        peak_dict['clusters'][bedline]['GeneP'] = gene_pois_p
                        peak_dict['clusters'][bedline][
                            'Nreads'] = n_reads_in_peak
                        peak_dict['clusters'][bedline]['size'] = peak_length
                        peakn += 1
        except NameError as error:
            print >> sys.stderr, error
            print >> sys.stderr, "spline fitting failed for %s" % (loc)
            raise

    #inflate p-values based on the number of comparisons (Bonferroni corrected)
    if correct_p is True:
        for peak in peak_dict['clusters']:
            peak_dict['clusters'][peak]['p'] = peak_dict['clusters'][peak][
                'p'] * peakn  #Bonferroni-correct p-value for MHT

    peak_dict['Nclusters'] = peakn

    return peak_dict
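
poissonP, find_sections, get_FDR_cutoff_mean and the spline helpers are imported from the surrounding peak-calling package. A plausible poissonP, assuming it returns the probability of seeing at least that many reads in the peak under a uniform Poisson model (a scipy-based sketch, not the package's actual code):

from scipy import stats

def poissonP(reads_in_gene, reads_in_peak, gene_length, peak_length):
    # expected reads in the peak if reads were spread uniformly over the gene
    rate = float(reads_in_gene) * peak_length / gene_length
    # P(X >= reads_in_peak) for X ~ Poisson(rate)
    return stats.poisson.sf(reads_in_peak - 1, rate)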
Example #15
def peaks_from_info(
    wiggle,
    pos_counts,
    lengths,
    loc,
    gene_length,
    margin=25,
    fdr_alpha=0.05,
    user_threshold=None,
    minreads=20,
    poisson_cutoff=0.05,
    plotit=False,
    width_cutoff=10,
    windowsize=1000,
    SloP=False,
    correct_p=False,
):

    """
    
    same args as before 
    wiggle is converted from bam file
    pos_counts - one point per read instead of coverage of entire read
    lengths - lengths aligned portions of reads 
    rest are the same fix later


    calls peaks for an individual gene 
    

    gene_length - effective length of gene
    margin - space between sections for calling new peaks
    fdr_alpha - false discovery rate, p-value bonferoni correct from peaks script (called in setup)
    user_threshold - user defined FDR thershold (probably should be factored into fdr_alpha
    minreads - min reads in section to try and call peaks
    poisson_cutoff - p-value for signifance cut off for number of reads in peak that gets called - might want to use ashifted distribution
    plotit - makes figures 
    
    w_cutoff - width cutoff, peaks narrower than this are discarted 
    windowssize - for super local calculation distance left and right to look 
    SloP - super local p-value instead of gene-wide p-value
    correct_p - boolean bonferoni correction of p-values from poisson
        
    """

    peak_dict = {}

    # these are what is built in this dict, complicated enough that it might
    # be worth turning into an object
    # peak_dict['clusters'] = {}
    # peak_dict['sections'] = {}
    # peak_dict['nreads'] = int()
    # peak_dict['threshold'] = int()
    # peak_dict['loc'] = loc

    # data munging
    chrom, gene_name, tx_start, tx_end, signstrand = loc
    tx_start, tx_end = [int(x) for x in [tx_start, tx_end]]

    # used for poisson calculation?
    nreads_in_gene = sum(pos_counts)

    # decides FDR calculation, maybe move get_FDR_cutoff_mean into C code

    if user_threshold is None:
        gene_threshold = get_FDR_cutoff_mean(lengths, gene_length, alpha=fdr_alpha)

    else:
        gene_threshold = user_threshold

    if gene_threshold == "best_error":
        # verboseprint("""I had a hard time with this one: %s.
        #                I think I'll use a threshold of 50""" % (loc))

        gene_threshold = 50

    peak_dict["clusters"] = {}
    peak_dict["sections"] = {}
    peak_dict["nreads"] = int(nreads_in_gene)
    peak_dict["threshold"] = gene_threshold
    peak_dict["loc"] = loc
    peakn = 1

    # verboseprintprint("Testing %s" % (loc))
    # verboseprint("Gene threshold is: %d" % (gene_threshold))

    # print wiggle
    # print margin
    sections = find_sections(wiggle, margin)
    if plotit is True:
        plot_sections(wiggle, sections, gene_threshold)

    for sect in sections:
        sectstart, sectstop = sect
        sect_length = sectstop - sectstart + 1
        data = wiggle[sectstart : (sectstop + 1)]
        cts = pos_counts[sectstart : (sectstop + 1)]
        xvals = arange(0, sect_length)
        Nreads = sum(cts)

        # gets random subset of lengths of reads for calculations on a section
        # not exactly the right way to do this but it should be very close.
        sect_read_lengths = rs(lengths, Nreads)
        peak_dict["sections"][sect] = {}
        threshold = int()

        # makes sure there are enough reads
        if Nreads < minreads:
            # verboseprint("""%d is not enough reads, skipping section:
            #                %s""" % (Nreads, sect))
            continue

        else:
            pass
            # verboseprint("""Analyzing section %s with %d reads"""
            #              % (sect, Nreads))

        # sets super-local if requested, might be able to factor this
        if user_threshold is None:
            if SloP is True:
                threshold = get_FDR_cutoff_mean(sect_read_lengths, sect_length, alpha=fdr_alpha)

            # verboseprint("Using super-local threshold %d" % (threshold))

            else:
                threshold = gene_threshold
        else:
            threshold = user_threshold

        # saves threshold for each individual section
        peak_dict["sections"][sect]["threshold"] = threshold
        peak_dict["sections"][sect]["nreads"] = int(Nreads)

        # if wiggle track never exceeds threshold
        if max(data) < threshold:
            # verboseprint("data does not exceed threshold, stopping")
            continue

        # fitting splines logic, black magic
        try:
            degree = 3  # cubic spline
            weights = None

            # for very large windows with many reads a large smoothing
            # parameter is required.  test several different options
            # to determine a reasonable initial estimate
            # Goal is to find the optimal smoothing parameter in multiple steps
            # initial_smoothing_value is the initial estimate of the smoothing parameter
            # step 1, identify good initial value
            initial_smoothing_value = sectstop - sectstart + 1
            best_smoothing_value = initial_smoothing_value
            best_estimate = 1

            # step 2, refine so as not to run into local minima later,
            # try to come up with a good way of getting the optimal parameter
            best_error = find_spline_residuals(initial_smoothing_value, xvals, data, degree, weights)

            for i in range(2, 11):
                cur_smoothing_value = initial_smoothing_value * i

                # tries to find the optimal initial smoothing parameter in this loop
                cur_error = find_spline_residuals(cur_smoothing_value, xvals, data, degree, weights)
                if cur_error < best_error:
                    best_smoothing_value = cur_smoothing_value
                    best_estimate = i

            # verboseprint("""I'm using (region length) * %d as the
            #                initial estimate for the smoothing
            #                parameter""" % (best_estimate))

            try:
                # fine optimization of smoothing parameter
                cutoff = float(0)
                tries = 0

                # shouldn't get smoothing coef's this small.. increase
                # the initial estimate and try again. WARNING: BLACK MAGIC
                while cutoff < 5:
                    tries += 1

                    # increasing this may improve accuracy,
                    # but at the cost of running time.
                    if tries == 3:
                        break

                    spline = optimize.minimize(
                        find_spline_residuals,
                        best_smoothing_value,
                        args=(xvals, data, degree, weights),
                        options={"disp": False, "maxiter": 10},
                        # method="Powell", # old method
                        method="L-BFGS-B",  # abnormal termination sometimes
                        # method="COBYLA",
                        bounds=((0.1, None),),
                    )

                    # fit a smoothing spline using an optimal parameter
                    # for smoothing and with weights proportional to the
                    # number of reads aligned at each position if weights
                    # is set
                    if spline.success:
                        cutoff = spline.x
                        # print "cutoff is %s" % (cutoff)
                    else:
                        # print "%s failed spline building at section %s" % (loc, sect)
                        # print spline.message
                        pass

                    best_smoothing_value += sect_length
            except Exception as error:
                print >> sys.stderr, "best smoothing value is:", best_smoothing_value
                print >> sys.stderr, "%s failed spline fitting at section %s (major crash)" % (loc, sect)
                print >> sys.stderr, error
                continue

            # verboseprint("optimized smoothing parameter; cutoff is %s" %(str(cutoff)))
            # if we were going to save and output as a pickle, it would be here
            # final fit spline

            spline = find_univariate_spline(cutoff, xvals, data, degree, weights)

            spline_values = array([round(x) for x in spline(xvals)])
            if plotit is True:
                plot_spline(spline, data, xvals, peakn, threshold)

            starts_and_stops, starts, stops = get_regions_above_threshold(threshold, spline_values)

            # walks along spline, and calls peaks along spline
            # for each start, take the next stop and find the peak
            # between the start and the stop this is where I need to
            # fix, some peaks starts start right after another start,
            # but not on top of it make sure the next start is after the
            # previous stop

            # subsections that are above threshold
            for p_start, p_stop in starts_and_stops:

                # peaks with-in this subsection, indexed from section
                # (not subsection) start
                # find all local maxima
                peaks = [x + p_start for x in xvals[find_local_maxima(spline_values[p_start : (p_stop + 1)])]]
                # map(lambda x: x + p_start,
                #            xvals[diff(sign(diff(spline(xvals[p_start:(p_stop + 1)])))) < 0])

                if not len(peaks) in (0, 1):
                    # print gene_name
                    # print "spline ", spline(xvals)
                    # print "threshold: %s" % (threshold)
                    # print "full spline ", spline_values
                    # print "peaks", peaks
                    # print p_start, p_stop
                    # print starts_and_stops
                    # print "spline values", spline_values[p_start:(p_stop + 1)]
                    # print "peaks at in section", xvals[find_local_maxima(spline_values[p_start:(p_stop + 1)])]
                    assert len(peaks) in (0, 1)  # there should be one or zero peaks in every section

                # handles logic if there are multiple peaks between
                # start and stop
                if len(peaks) <= 0:
                    continue
                if len(peaks) == 1:

                    # gets reads in peak
                    n_reads_in_peak = sum(cts[p_start : (p_stop + 1)])
                    # verboseprint(""""Peak %d (%d - %d) has %d
                    #                 reads""" % (peakn,
                    #                             p_start,
                    #                             (p_stop + 1),
                    #                             n_reads_in_peak))

                    # makes sure there are enough reads
                    if n_reads_in_peak < minreads or max(data[p_start : (p_stop + 1)]) < threshold:
                        #    verboseprint("""skipping peak, %d is not enough reads"""
                        #                  % (n_reads_in_peak))
                        continue

                    # formatting of bed track
                    # start and stop for bed track to be created
                    g_start = tx_start + sectstart + p_start
                    g_stop = tx_start + sectstart + p_stop

                    # highest point in start stop
                    peak = tx_start + sectstart + peaks[0]

                    # makes it thicker so we can see on the browser
                    thick_start = peak - 2
                    thick_stop = peak + 2
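                    # thick_start/thick_stop feed the BED thickStart/thickEnd
                    # columns: a 5 bp box centered on the peak summit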

                    # error-checking logic to keep bed files from breaking
                    if thick_start < g_start:
                        thick_start = g_start
                    if thick_stop > g_stop:
                        thick_stop = g_stop

                    peak_length = g_stop - g_start + 1

                    # skip really small peaks
                    if peak_length < width_cutoff:
                        continue
                    peak_name = gene_name + "_" + str(peakn) + "_" + str(int(n_reads_in_peak))

                    # super local logic
                    # error check to make sure the area stays within the gene

                    # distance from gene start
                    if peak - tx_start - windowsize < 0:
                        area_start = 0

                    # for super local gets area around peak for calculation
                    else:
                        area_start = peak - tx_start - windowsize
                        # area_start = sectstart

                    # same thing except for the end of the gene instead of the start
                    if peak + windowsize > tx_end:  # distance to gene stop
                        # clamp to the gene end, in gene-relative coordinates
                        area_stop = tx_end - tx_start + 1
                    else:
                        area_stop = peak - tx_start + windowsize
                        # area_stop = sectstop
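                    # the super-local window is [peak - windowsize,
                    # peak + windowsize] clipped to the gene body, expressed in
                    # gene-relative coordinates so it can index pos_counts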

                    # use area reads + 1/2 all other reads in gene:
                    # area_reads = sum(pos_counts[area_start:area_stop]) +
                    # 0.5*(sum(pos_counts) -
                    # sum(pos_counts[area_start:area_stop]))

                    # use area reads:
                    area_reads = sum(pos_counts[area_start:area_stop])
                    area_size = area_stop - area_start + 1

                    # area_reads = sum(pos_counts[sectstart:sectstop])
                    # area_size = sect_length

                    # calculates the Poisson p-value of the peak against the whole gene
                    gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length)
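                    # i.e. could a peak this tall arise by chance if the
                    # gene's reads were spread uniformly along its length?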
                    if SloP is True:
                        # same thing, except based on the super-local p-value
                        slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length)
                    else:
                        # makes sure slop_pois_p is defined even when SloP is
                        # off, falling back to the gene-level value; something
                        # to be removed later, SloP should only be used when
                        # defined as true
                        slop_pois_p = gene_pois_p

                    if math.isnan(slop_pois_p):
                        slop_pois_p = 1

                    # left in (as pass rather than continue) so peaks above
                    # the cutoff survive for later p-value correction
                    if slop_pois_p > poisson_cutoff:
                        # continue
                        pass

                    # defines the bedline of a peak for returning
                    # TODO: this should be abstracted out to separate model from view
                    bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                        chrom,
                        g_start,
                        g_stop,
                        peak_name,
                        slop_pois_p,
                        signstrand,
                        thick_start,
                        thick_stop,
                    )
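                    # BED8 columns: chrom, start, stop, name, score (the SloP
                    # p-value), strand, thickStart, thickEnd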

                    # metadata for the specific bedline
                    peak_dict["clusters"][bedline] = {}
                    peak_dict["clusters"][bedline]["GeneP"] = gene_pois_p
                    peak_dict["clusters"][bedline]["SloP"] = slop_pois_p
                    peak_dict["clusters"][bedline]["Nreads"] = n_reads_in_peak
                    peak_dict["clusters"][bedline]["size"] = peak_length

                    peakn += 1

                # there is more than one peak in this window
                # NO LONGER NECESSARY, SHOULD REMOVE
                else:
                    # this handles peaks within peaks logic

                    # local minima in subsection, relative to section start
                    valleys = array(
                        [x + p_start for x in xvals[diff(sign(diff(spline(xvals[p_start : p_stop + 1])))) > 0]]
                    )
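                    # diff(sign(diff(...))) > 0 marks where the spline's slope
                    # flips from negative to positive, i.e. local minima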

                    for subpeak in peaks:
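                        # bound the subpeak by its nearest flanking valleys,
                        # falling back to the enclosing region's start/stop
                        # when no valley exists on that side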

                        if any(valleys < subpeak):
                            subpeak_start = valleys[valleys < subpeak][-1]
                        else:
                            subpeak_start = starts[starts < subpeak][-1]

                        if any(valleys > subpeak):
                            subpeak_stop = valleys[valleys > subpeak][0]
                        else:
                            subpeak_stop = stops[stops > subpeak][0]
                        peak_length = subpeak_stop - subpeak_start + 1

                        if peak_length < width_cutoff:  # skip really small peaks
                            continue
                        n_reads_in_peak = sum(cts[subpeak_start : (subpeak_stop + 1)])

                        if n_reads_in_peak < minreads or max(data[subpeak_start : (subpeak_stop + 1)]) < threshold:
                            continue

                        g_start = tx_start + subpeak_start + sectstart
                        g_stop = tx_start + subpeak_stop + sectstart
                        peak = tx_start + subpeak + sectstart
                        thick_start = peak - 2

                        if thick_start < g_start:
                            thick_start = g_start
                        thick_stop = peak + 2

                        if thick_stop > g_stop:
                            thick_stop = g_stop
                        peak_name = "%s_%s_%s" % (gene_name, peakn, int(n_reads_in_peak))

                        # distance from gene start
                        if peak - tx_start - windowsize < 0:
                            area_start = 0
                        else:
                            area_start = peak - tx_start - windowsize

                        if peak + windowsize > tx_end:  # distance to gene stop
                            # clamp to the gene end, in gene-relative coordinates
                            area_stop = tx_end - tx_start + 1
                        else:
                            # area_stop = sectstop
                            area_stop = peak - tx_start + windowsize

                        area_reads = sum(pos_counts[area_start:area_stop])
                        area_size = area_stop - area_start + 1

                        gene_pois_p = poissonP(nreads_in_gene, n_reads_in_peak, gene_length, peak_length)

                        if SloP is True:
                            slop_pois_p = poissonP(area_reads, n_reads_in_peak, area_size, peak_length)
                        else:
                            slop_pois_p = gene_pois_p

                        if math.isnan(slop_pois_p):
                            slop_pois_p = 1

                        # leave these in to allow for BH p-value correction
                        if slop_pois_p > poisson_cutoff:
                            pass

                        # output results again
                        bedline = "%s\t%d\t%d\t%s\t%s\t%s\t%d\t%d" % (
                            chrom,
                            g_start,
                            g_stop,
                            peak_name,
                            slop_pois_p,
                            signstrand,
                            thick_start,
                            thick_stop,
                        )

                        peak_dict["clusters"][bedline] = {}
                        peak_dict["clusters"][bedline]["SloP"] = slop_pois_p
                        peak_dict["clusters"][bedline]["GeneP"] = gene_pois_p
                        peak_dict["clusters"][bedline]["Nreads"] = n_reads_in_peak
                        peak_dict["clusters"][bedline]["size"] = peak_length
                        peakn += 1
        except NameError as best_error:
            print >> sys.stderr, best_error
            print >> sys.stderr, "spline fitting failed for %s" % (loc)
            raise

    # inflate p-values based on the number of comparisons (Bonferroni correction)
    if correct_p is True:
        for peak in peak_dict["clusters"]:
            peak_dict["clusters"][peak]["p"] = (
                peak_dict["clusters"][peak]["p"] * peakn
            )  # bonferroni correct p-value for MHT
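            # e.g. with peakn = 200 tested peaks, a raw p-value of 0.0001 is
            # inflated to 0.0001 * 200 = 0.02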

    peak_dict["Nclusters"] = peakn

    return peak_dict
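
# The calls to poissonP above assume a helper with the signature
# poissonP(background_reads, reads_in_peak, background_length, peak_length)
# that is not shown in this snippet. Below is a minimal sketch of such a test
# (inferred from the call sites, not the original implementation): score the
# peak by the upper-tail probability of a Poisson whose rate is the background
# read density scaled to the peak width.
from scipy.stats import poisson as poisson_dist

def poissonP_sketch(background_reads, reads_in_peak, background_length, peak_length):
    # expected reads in a window of peak_length if reads were spread uniformly
    lam = float(background_reads) * peak_length / background_length
    # P(X >= reads_in_peak) for X ~ Poisson(lam); sf(k - 1) is the upper tail
    return poisson_dist.sf(reads_in_peak - 1, lam)
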
Exemplo n.º 16
0
def get_namepos(n):
    # returns n possessive first names: half sampled from the male list,
    # half from the female list (integer division drops one name for odd n)
    maleNames = rs(mFirstNames, n // 2)
    femaleNames = rs(fFirstNames, n // 2)
    return [name + "'s" for name in maleNames + femaleNames]
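
# Usage sketch for get_namepos, assuming rs is random.sample and the two name
# lists are plain lists of strings (all three names are assumptions; none of
# them are defined in this snippet):
from random import sample as rs

mFirstNames = ["James", "John", "Robert", "Michael"]
fFirstNames = ["Mary", "Patricia", "Linda", "Barbara"]

print(get_namepos(4))  # e.g. ["John's", "James's", "Mary's", "Linda's"]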