Пример #1
0
def score_counts(counts, state, EmissionParameters):
    '''
    Score the counts separately under every mixture component of a state.

    Row k of the returned array holds what diag_event_model.pred_log_lik
    returns for mixture component k alone (single_mix=k), plus the log of the
    weight stored for that component and state under
    EmissionParameters['Diag_event_params']['mix_comp'].

    The result has one row per mixture component and counts.shape[1] columns.
    '''

    diag_params = EmissionParameters['Diag_event_params']
    n_components = diag_params['nr_mix_comp']

    # One row per mixture component, one column per column of counts
    result = np.zeros((n_components, counts.shape[1]))

    # Each row is a view into result, so both steps below write in place
    for k, row in enumerate(result):
        row[:] = diag_event_model.pred_log_lik(counts, state, EmissionParameters, single_mix=k)
        row += np.log(diag_params['mix_comp'][state][k])

    return result
Пример #2
0
def PlotGene(Sequences,
             Background,
             gene,
             IterParameters,
             TransitionTypeFirst='nonhomo',
             no_plot=False,
             Start=0,
             Stop=-1,
             figsize=(6, 8),
             dir_ylim=None,
             out_name=None):
    '''
    Plot the coverage, the model probabilities and the decoded path of a gene.

    Parameters:
        Sequences: mapping from gene name to the foreground data; its keys
            define the gene numbering handed to the expression model.
        Background: the same kind of mapping for the background data.
        gene: key of the gene to process.
        IterParameters: sequence whose element 0 holds the emission parameters
            and element 1 the transition parameters.
        TransitionTypeFirst: 'nonhomo' predicts the transitions with
            trans.PredictTransistions; any other value tiles the log of
            TransitionParameters[0] over all positions.
        no_plot: if true, return right after decoding without plotting.
        Start, Stop: plotted position range; Stop == -1 means up to the last
            position.
        figsize: figure size handed to plt.subplots.
        dir_ylim: optional y-limits of the DMM panel; None or an empty
            sequence keeps the automatic limits.
        out_name: if not None, the figure is saved under this name.

    Returns:
        (MostLikelyPath, TransistionProbabilities, EmmisionProbGene) when
        no_plot is true, otherwise (MostLikelyPath, TransistionProbabilities,
        EmmisionProbGeneNB_fg). The differing third element is kept unchanged
        for existing callers.
    '''

    # reload() is a builtin on Python 2 and lives in importlib on Python 3
    try:
        from importlib import reload as _reload
    except ImportError:
        _reload = reload  # noqa: F821 (Python 2 builtin)

    _reload(diag_event_model)
    _reload(emission)
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]
    TransitionType = EmissionParameters['TransitionType']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    if EmissionParameters['FilterSNPs']:
        Ix = tools.GetModelIx(Sequences_per_gene,
                              Type='no_snps_conv',
                              snps_thresh=EmissionParameters['SnpRatio'],
                              snps_min_cov=EmissionParameters['SnpAbs'],
                              Background=Background_per_gene)
    else:
        Ix = tools.GetModelIx(Sequences_per_gene)

    #2) Compute the probabilities for all states
    # Every array starts at log(1 / NrOfStates) and is overwritten below
    # wherever the corresponding model component is evaluated.
    uniform_log_prob = np.log(
        np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))
    EmmisionProbGene = uniform_log_prob.copy()
    EmmisionProbGene_Dir = uniform_log_prob.copy()
    EmmisionProbGeneNB_fg = uniform_log_prob.copy()
    EmmisionProbGeneNB_bg = uniform_log_prob.copy()

    CurrStackSum = tools.StackData(Sequences_per_gene)
    CurrStackVar = tools.StackData(Sequences_per_gene, add='no')
    nr_of_genes = len(Sequences.keys())
    gene_nr_dict = {curr_gene: i for i, curr_gene in enumerate(Sequences.keys())}

    #Compute the emission probability
    # "is not None" instead of "== None": comparing an ndarray with == is
    # element-wise, which makes the truth test ambiguous.
    has_expr_model = EmissionParameters['ExpressionParameters'][0] is not None
    for State in range(NrOfStates):
        if has_expr_model:
            # Each likelihood is evaluated once and reused for the combined
            # array and for the per-component array that is only plotted.
            fg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                CurrStackSum, State, nr_of_genes, gene_nr_dict[gene],
                EmissionParameters)
            EmmisionProbGene[State, :] = fg_log_lik
            EmmisionProbGeneNB_fg[State, :] = fg_log_lik
            if EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck'):
                bg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                    tools.StackData(Background, gene, add='only_cov') + 0,
                    State,
                    nr_of_genes,
                    gene_nr_dict[gene],
                    EmissionParameters,
                    curr_type='bg')
                EmmisionProbGene[State, :] += bg_log_lik
                EmmisionProbGeneNB_bg[State, :] = bg_log_lik
        if not EmissionParameters['ign_diag']:
            diag_log_lik = diag_event_model.pred_log_lik(
                CurrStackVar[:, Ix], State, EmissionParameters)
            EmmisionProbGene[State, Ix] += diag_log_lik
            EmmisionProbGene_Dir[State, Ix] = diag_log_lik

    #Get the transition probabilities
    if TransitionTypeFirst == 'nonhomo':
        Counts = tools.StackData(Sequences_per_gene, add='all')
        if TransitionType in ('unif_bck', 'binary_bck'):
            # These transition types additionally see the background coverage
            CountsBck = tools.StackData(Background_per_gene, add='only_cov')
            Counts = np.vstack((Counts, CountsBck))
        TransistionProbabilities = np.float64(
            trans.PredictTransistions(Counts, TransitionParameters, NrOfStates,
                                      TransitionType))
    else:
        TransistionProbabilities = np.float64(
            np.tile(np.log(TransitionParameters[0]),
                    (EmmisionProbGene.shape[1], 1, 1)).T)

    MostLikelyPath, LogLik = viterbi.viterbi(np.float64(EmmisionProbGene),
                                             TransistionProbabilities,
                                             np.float64(np.log(PriorMatrix)))
    # Report how many positions were assigned to each state
    for j in range(NrOfStates):
        print(str(np.sum(MostLikelyPath == j)))

    if no_plot:
        return MostLikelyPath, TransistionProbabilities, EmmisionProbGene

    set2 = brewer2mpl.get_map('Dark2', 'qualitative', 8).mpl_colors
    fig, axes = plt.subplots(nrows=9, figsize=figsize)
    fig.subplots_adjust(hspace=1.001)

    Counts = tools.StackData(Sequences_per_gene, gene, add='no')
    if Stop == -1:
        Stop = Counts.shape[1]
    plt_rng = np.arange(Start, Stop)

    # Panel 0: summed count tracks of the foreground plus background coverage.
    # Each entry is (legend label, track numbers handed to repl_track_nr).
    nr_of_rep_fg = len(Sequences[gene]['Coverage'].keys())
    track_groups = (
        ('TC', [2, 16]),
        ('NonTC', [0, 1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 17, 18]),
        ('Read-ends', [20]),
        ('Deletions', [4, 9, 14, 19]),
        ('Coverage', [21]),
    )
    for color_nr, (label, tracks) in enumerate(track_groups):
        rows = repl_track_nr(tracks, 22, nr_of_rep_fg)
        ppl.plot(axes[0],
                 plt_rng, (np.sum(Counts[rows, :], axis=0))[Start:Stop],
                 label=label,
                 linewidth=2,
                 color=set2[color_nr])
    bck_color = set2[len(track_groups)]

    # NOTE(review): the first replicate is read with the integer key 0 and
    # the others with string keys, and += adds into the preloaded array in
    # place - kept as in the original, confirm both are intended.
    BckCov = Background_per_gene['Coverage'][0]
    for rep in range(1, len(Background_per_gene['Coverage'].keys())):
        BckCov += Background_per_gene['Coverage'][str(rep)]

    ppl.plot(axes[0],
             plt_rng, (BckCov.T)[Start:Stop],
             ls='-',
             label='Bck',
             linewidth=2,
             color=bck_color)
    ppl.legend(axes[0])
    _finish_axis(axes[0], 'Counts', 'Coverage and Conversions')

    # Panel 1: log-probability of staying in each state
    for j in range(NrOfStates):
        ppl.plot(axes[1],
                 plt_rng, (TransistionProbabilities[j, j, :])[Start:Stop],
                 label='Transition ' + str(j) + ' ' + str(j),
                 linewidth=2,
                 color=set2[j])
    ppl.legend(axes[1])
    _finish_axis(axes[1], 'log-transition probability',
                 'Transition probability')

    # With the 'Coverage_bck' background type the lower y-limit of the
    # emission panels is taken from the first states only.
    clip_ylim = EmissionParameters['BckType'] == 'Coverage_bck'

    # Panel 2: combined emission log-probability
    _plot_state_tracks(axes[2], plt_rng, EmmisionProbGene, NrOfStates, Start,
                       Stop, 'Emission ', set2)
    if clip_ylim:
        axes[2].set_ylim((np.min(EmmisionProbGene[0:2, Start:Stop]), 1))
    ppl.legend(axes[2])
    _finish_axis(axes[2], 'log-GLM probability', 'Emission probability')

    # Panel 3: decoded state sequence
    ppl.plot(axes[3], plt_rng, MostLikelyPath[Start:Stop])
    _finish_axis(axes[3], 'State', 'Most likely path')

    # Panel 4: diagnostic event model alone
    _plot_state_tracks(axes[4], plt_rng, EmmisionProbGene_Dir, NrOfStates,
                       Start, Stop, 'Dir State ', set2)
    if dir_ylim is not None and len(dir_ylim) > 0:
        axes[4].set_ylim(dir_ylim)
    ppl.legend(axes[4])
    _finish_axis(axes[4], 'log-DMM probability', 'DMM probability')

    # Panel 5: expression model on the foreground alone
    _plot_state_tracks(axes[5], plt_rng, EmmisionProbGeneNB_fg, NrOfStates,
                       Start, Stop, 'NB fg ', set2)
    if clip_ylim:
        axes[5].set_ylim([np.min(EmmisionProbGeneNB_fg[0:2, Start:Stop]), 1])
    ppl.legend(axes[5])
    _finish_axis(axes[5], 'prob', 'prob-fg')

    # Panel 6: expression model on the background alone
    _plot_state_tracks(axes[6], plt_rng, EmmisionProbGeneNB_bg, NrOfStates,
                       Start, Stop, 'NB bg ', set2)
    if clip_ylim:
        axes[6].set_ylim([np.min(EmmisionProbGeneNB_bg[0:3, Start:Stop]), 1])
    ppl.legend(axes[6])
    _finish_axis(axes[6], 'prob', 'prob-bg')

    # Panels 7 and 8: log-odds of the foreground state against all others
    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    ix_bg = [s for s in range(EmmisionProbGene.shape[0]) if s != fg_state]

    SiteScore = _log_odds_track(EmmisionProbGene, fg_state, ix_bg)
    ppl.plot(axes[7], plt_rng, SiteScore[Start:Stop])
    _finish_axis(axes[7], 'log-odd score', 'log-odd score')

    SiteScore = _log_odds_track(EmmisionProbGene_Dir, fg_state, ix_bg)
    ppl.plot(axes[8], plt_rng, SiteScore[Start:Stop])
    _finish_axis(axes[8], 'DMM log-odd score', 'DMM log-odd score')

    if out_name is not None:
        print('Saving result')
        fig.savefig(out_name)

    plt.show()

    return MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg


def _finish_axis(ax, ylabel, title):
    '''Label a panel and switch off the offset notation of its x axis.'''
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Position')
    ax.set_title(title)
    ax.get_xaxis().get_major_formatter().set_useOffset(False)


def _plot_state_tracks(ax, plt_rng, log_probs, NrOfStates, Start, Stop,
                       label_prefix, colors):
    '''Draw one line per state from the rows of log_probs over Start:Stop.'''
    for state in range(NrOfStates):
        ppl.plot(ax,
                 plt_rng,
                 log_probs[state, :][Start:Stop],
                 label=label_prefix + str(state),
                 linewidth=2,
                 color=colors[state])


def _log_odds_track(log_probs, fg_state, alt_states):
    '''
    Return, per position, the foreground row minus the logsumexp of the rows
    listed in alt_states.

    Positions whose normaliser is infinite or NaN are set to NaN so that the
    track keeps one value per position and stays aligned with the plotted
    range. If every position is affected, 'Score problematic' is printed and
    the unnormalised foreground row is returned.
    '''
    fg_score = log_probs[fg_state, :]
    norm = logsumexp(log_probs[alt_states, :], axis=0)
    unusable = np.isinf(norm) | np.isnan(norm)
    if np.sum(unusable) < norm.shape[0]:
        track = np.full(norm.shape[0], np.nan)
        track[~unusable] = fg_score[~unusable] - norm[~unusable]
        return track
    print('Score problematic')
    return fg_score
Пример #3
0
def GetMostLikelyPath(MostLikelyPaths, Sequences, Background, EmissionParameters, TransitionParameters, TransitionTypeFirst, RandomNoise = False, verbosity=1):
    '''
    Compute the most likely state path of every gene with the Viterbi algorithm.

    Parameters:
        MostLikelyPaths: accepted for interface compatibility; the paths are
            collected in a fresh dictionary and the passed object is not read.
        Sequences: mapping from gene name to the foreground data.
        Background: mapping from gene name to the background data.
        EmissionParameters: dictionary with the emission model settings.
        TransitionParameters: element 0 is the transition matrix; its log is
            used unchanged at every position (homogeneous transitions).
        TransitionTypeFirst: accepted for interface compatibility ('homo' or
            'nonhomo'); it is not read by this function.
        RandomNoise: if true, low-level uniform noise is mixed into the
            emission scores before decoding.
        verbosity: values above 0 print the elapsed time at the end.

    Returns:
        (MostLikelyPaths, LogLikelihood): the dictionary gene -> path and the
        sum of the values returned by viterbi.viterbi over all decoded genes.
    '''

    MostLikelyPaths = {}
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']
    LogLikelihood = 0

    #The position of a gene in the key order is its gene number
    genes = list(Sequences.keys())
    nr_of_genes = len(genes)

    t = time.time()
    for gene_nr, gene in enumerate(genes):
        #Progress indicator: one dot every 1000 genes
        if gene_nr % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        #1) Determine the positions where an observation is possible
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = PreloadSequencesForGene(Background, gene)

        Ix = GetModelIx(Sequences_per_gene, Type='all')

        if np.sum(Ix) == 0:
            #Nothing to decode: store an empty placeholder path.
            #np.int was removed from NumPy; the builtin int is the same dtype.
            MostLikelyPaths[gene] = 2 * np.ones((0, Ix.shape[0]), dtype=int)
            continue

        if EmissionParameters['FilterSNPs']:
            Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
        else:
            Ix = GetModelIx(Sequences_per_gene)

        #2) Compute the emission scores for all states
        #NOTE(review): the start value is 1 / NrOfStates and not its log,
        #although log-likelihoods are added to it below - confirm.
        EmmisionProbGene = np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates))

        CurrStackSum = StackData(Sequences_per_gene)
        CurrStackVar = StackData(Sequences_per_gene, add = 'no')

        #"is not None" instead of "== None": comparing an ndarray with == is
        #element-wise, which makes the truth test ambiguous.
        has_expr_model = EmissionParameters['ExpressionParameters'][0] is not None
        uses_bg_cov = has_expr_model and EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck')
        #The background coverage does not depend on the state: stack it once
        CurrStackSumBck = StackData(Background, gene, add = 'only_cov') if uses_bg_cov else None

        for State in range(NrOfStates):
            if has_expr_model:
                EmmisionProbGene[State, :] = emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                if uses_bg_cov:
                    EmmisionProbGene[State, :] += emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')

            EmmisionProbGene[State, Ix] += diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)

        if RandomNoise:
            #Noise is drawn between 4 and 1 below the smallest finite score
            min_score = np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)])
            EmmisionProbGene = np.logaddexp(EmmisionProbGene, np.random.uniform(min_score - 4, min_score - 1, EmmisionProbGene.shape))

        #Get the transition probabilities (identical at every position)
        TransistionProbabilities = np.float64(np.tile(np.log(TransitionParameters[0]), (EmmisionProbGene.shape[1],1,1)).T)

        #Perform Viterbi algorithm and store the path
        CurrPath, Currloglik = viterbi.viterbi(np.float64(EmmisionProbGene), TransistionProbabilities, np.float64(np.log(PriorMatrix)))
        MostLikelyPaths[gene] = CurrPath

        #Accumulate the log-likelihood over the genes
        LogLikelihood += Currloglik
        #Release the per-gene arrays before the next gene is loaded
        del TransistionProbabilities, EmmisionProbGene, CurrStackSum, CurrStackVar, CurrStackSumBck

    if verbosity > 0:
        print('\nDone: Elapsed time: ' + str(time.time() - t))

    return MostLikelyPaths, LogLikelihood
Пример #4
0
def GetSitesForGene(data):
    '''
    Score the candidate sites of one gene.

    Parameters:
        data: 12-tuple (Sites, gene, nr_of_genes, gene_nr, seq_file, bck_file,
            EmissionParameters, TransitionParameters, TransitionTypeFirst,
            fg_state, merge_neighbouring_sites, minimal_site_length). Only
            Sites, gene, nr_of_genes, gene_nr, EmissionParameters and fg_state
            are read here. Each entry of Sites is indexed as (start, stop).
            The data is read from the HDF5 files named by
            EmissionParameters['DataOutFile_seq'] and ['DataOutFile_bck'].

    Returns:
        (gene, sites) where sites is a list with one dictionary per kept
        site. The list is empty if no sites were passed or if
        GetModelIx(..., Type='all') sums to zero. Sites whose 'SiteScore' is
        negative are left out.
    '''

    Sites, gene, nr_of_genes, gene_nr, seq_file, bck_file, EmissionParameters, TransitionParameters, TransitionTypeFirst, fg_state, merge_neighbouring_sites, minimal_site_length = data
    if len(Sites) == 0:
        return gene, []

    NrOfStates = EmissionParameters['NrOfStates']

    #The with block closes both files on every exit path, including the
    #early return below and exceptions
    with h5py.File(EmissionParameters['DataOutFile_seq'], 'r') as Sequences, \
            h5py.File(EmissionParameters['DataOutFile_bck'], 'r') as Background:
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = PreloadSequencesForGene(Background, gene)

        Ix = GetModelIx(Sequences_per_gene, Type='all')

        if np.sum(Ix) == 0:
            return gene, []

        if EmissionParameters['FilterSNPs']:
            Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
        else:
            Ix = GetModelIx(Sequences_per_gene, Type='Conv')

        #Only compute the expression likelihood where a site is (each site
        #is extended by one position on both sides)
        ix_sites = np.zeros_like(Ix)
        ix_sites_len = Ix.shape[0]
        for currsite in Sites:
            ix_sites[max(0, currsite[0] - 1) : min(ix_sites_len, currsite[1] + 1)] = 1
        ix_sites = ix_sites == 1

        #2) Compute the probabilities for all states
        EmmisionProbGene = np.log(np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))
        CurrStackSum = StackData(Sequences_per_gene)
        CurrStackVar = StackData(Sequences_per_gene, add = 'no')
        CurrStackSumBck = StackData(Background_per_gene, add = 'only_cov')

        CurrStackVarSumm = StackData(Sequences_per_gene, add = 'only_var_summed')
        EmmisionProbGeneDir = np.zeros_like(EmmisionProbGene)

        #weight1 scales the expression terms, weight2 the diagnostic event
        #term. A negative glm_weight switches the weighting off; exactly 0
        #or 1 is nudged inwards so that both logs stay finite.
        glm_weight = EmissionParameters['glm_weight']
        if glm_weight < 0.0:
            weight1 = 1.0
            weight2 = 1.0
        elif glm_weight == 0.0:
            weight1 = 0.0000001
            weight2 = 1.0 - weight1
        elif glm_weight == 1.0:
            weight1 = 0.9999999
            weight2 = 1.0 - weight1
        else:
            weight1 = glm_weight
            weight2 = (1.0 - glm_weight)
        log_weight1 = np.log(weight1)
        log_weight2 = np.log(weight2)

        uses_bg_cov = EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck')
        #The site columns are the same for every state: slice them once
        fg_cov_at_sites = CurrStackSum[:, ix_sites]
        bg_cov_at_sites = CurrStackSumBck[:, ix_sites] if uses_bg_cov else None
        var_counts = CurrStackVar[:, Ix]

        for State in range(NrOfStates):
            EmmisionProbGene[State, ix_sites] = log_weight1 + emission_prob.predict_expression_log_likelihood_for_gene(fg_cov_at_sites, State, nr_of_genes, gene_nr, EmissionParameters)
            if uses_bg_cov:
                EmmisionProbGene[State, ix_sites] += log_weight1 + emission_prob.predict_expression_log_likelihood_for_gene(bg_cov_at_sites, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
            EmmisionProbGeneDir[State, Ix] = log_weight2 + diag_event_model.pred_log_lik(var_counts, State, EmissionParameters)
            #NOTE(review): log(weight2) is already contained in
            #EmmisionProbGeneDir and is added a second time here - kept as
            #in the original, confirm.
            EmmisionProbGene[State, Ix] += log_weight2 + EmmisionProbGeneDir[State, Ix]

        Counts = StackData(Sequences_per_gene, add = 'all')
        Score = EmmisionProbGene

        #A stored strand of 0 is reported as -1
        strand = Sequences_per_gene['strand']
        if strand == 0:
            strand = -1

        #Get the coverages for the foreground and background
        CountsSeq = StackData(Sequences_per_gene, add = 'only_cov')
        CountsBck = StackData(Background_per_gene, add = 'only_cov')

        #This list contains the returned sites
        sites = []
        for currsite in Sites:
            #Sites with a negative score are not reported, so they are
            #dropped before the remaining statistics are computed
            site_score = EvaluateSite(Score, currsite, fg_state)
            if site_score < 0.0:
                continue

            mean_mat_fg, var_mat_fg, mean_mat_bg, var_mat_bg, counts_fg, counts_bg = ComputeStatsForSite(CountsSeq, CountsBck, currsite, fg_state, nr_of_genes, gene_nr, EmissionParameters)

            site = {}
            site['Start'] = currsite[0]
            site['Stop'] = currsite[1]
            site['Strand'] = strand
            site['SiteScore'] = site_score
            site['Coverage'] = np.sum(np.sum(Counts[:, site['Start'] : site['Stop']], axis=0))
            site['Variants'] = np.sum(CurrStackVarSumm[:, site['Start'] : site['Stop']], axis=1)
            site['mean_mat_fg'] = mean_mat_fg
            site['var_mat_fg'] = var_mat_fg
            site['mean_mat_bg'] = mean_mat_bg
            site['var_mat_bg'] = var_mat_bg
            site['counts_fg'] = counts_fg
            site['counts_bg'] = counts_bg

            #Negative binomial parameters from the mean and variance of the
            #foreground, then the log survival function of the counts.
            #NOTE(review): var_mat_fg == mean_mat_fg divides by zero.
            p = mean_mat_fg / var_mat_fg
            n = (mean_mat_fg ** 2) / (var_mat_fg - mean_mat_fg)
            site['pv'] = nbinom.logsf(counts_fg, n, p)
            site['max_pos'] = get_max_position(Score, currsite, fg_state, strand)
            site['dir_score'] = EvaluateSite(EmmisionProbGeneDir, currsite, fg_state)
            sites.append(site)

        return gene, sites
Пример #5
0
def ParallelGetMostLikelyPathForGene(data):
    '''
    Compute the most likely state path of one gene.

    Parameters:
        data: 7-tuple (gene, nr_of_genes, gene_nr, EmissionParameters,
            TransitionParameters, TransitionTypeFirst, RandomNoise).
            TransitionTypeFirst is not read: the log of
            TransitionParameters[0] is used at every position. The data is
            read from the HDF5 files named by
            EmissionParameters['DataOutFile_seq'] and ['DataOutFile_bck'].

    Returns:
        [gene, CurrPath, Currloglik]: the decoded path as int8 and the value
        returned by viterbi.viterbi. If GetModelIx(..., Type='all') sums to
        zero, an empty placeholder path and 0 are returned instead.
    '''

    gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters, TransitionTypeFirst, RandomNoise = data

    #The with block closes both files on every exit path, including the
    #early return below and exceptions
    with h5py.File(EmissionParameters['DataOutFile_seq'], 'r') as Sequences, \
            h5py.File(EmissionParameters['DataOutFile_bck'], 'r') as Background:
        #Parse the parameters
        PriorMatrix = EmissionParameters['PriorMatrix']
        NrOfStates = EmissionParameters['NrOfStates']

        fg_state, bg_state = emission_prob.get_fg_and_bck_state(EmissionParameters, final_pred=True)
        fg_pen = EmissionParameters['fg_pen']

        #1) Determine the positions where an observation is possible
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = PreloadSequencesForGene(Background, gene)

        Ix = GetModelIx(Sequences_per_gene, Type='all')

        if np.sum(Ix) == 0:
            #np.int was removed from NumPy; the builtin int is the same dtype
            CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
            return [gene, CurrPath, 0]

        if EmissionParameters['FilterSNPs']:
            Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
        else:
            Ix = GetModelIx(Sequences_per_gene)

        #2) Compute the emission scores for all states
        #NOTE(review): the start value is 1 / NrOfStates and not its log,
        #although log-likelihoods are added to it below - confirm.
        EmmisionProbGene = np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates))

        CurrStackSum = StackData(Sequences_per_gene)
        CurrStackVar = StackData(Sequences_per_gene, add = 'no')
        CurrStackSumBck = StackData(Background_per_gene, add = 'only_cov')

        #weight1 scales the expression terms, weight2 the diagnostic event
        #term. A negative glm_weight switches the weighting off; exactly 0
        #or 1 is nudged inwards so that both logs stay finite.
        glm_weight = EmissionParameters['glm_weight']
        if glm_weight < 0.0:
            weight1 = 1.0
            weight2 = 1.0
        elif glm_weight == 0.0:
            weight1 = 0.0000001
            weight2 = 1.0 - weight1
        elif glm_weight == 1.0:
            weight1 = 0.9999999
            weight2 = 1.0 - weight1
        else:
            weight1 = glm_weight
            weight2 = (1.0 - glm_weight)
        log_weight1 = np.log(weight1)
        log_weight2 = np.log(weight2)

        #The expression model is used only if it is not switched off and
        #its first parameter entry is an array
        use_glm = (not EmissionParameters['ign_GLM']) and isinstance(EmissionParameters['ExpressionParameters'][0], np.ndarray)
        use_bg_cov = use_glm and EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck')
        use_diag = not EmissionParameters['ign_diag']

        for State in range(NrOfStates):
            if use_glm:
                EmmisionProbGene[State, :] = log_weight1 + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                if use_bg_cov:
                    EmmisionProbGene[State, :] += log_weight1 + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
            if use_diag:
                EmmisionProbGene[State, Ix] += log_weight2 + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
            #In the last iteration the foreground state is penalised
            if State == fg_state and EmissionParameters['LastIter']:
                EmmisionProbGene[State, :] -= fg_pen

        if RandomNoise:
            #Noise is drawn between 4 and 0.1 below the smallest finite score
            min_score = np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)])
            EmmisionProbGene = np.logaddexp(EmmisionProbGene, np.random.uniform(min_score - 4, min_score - 0.1, EmmisionProbGene.shape))

        #Get the transition probabilities (identical at every position)
        TransistionProbabilities = np.float64(np.tile(np.log(TransitionParameters[0]), (EmmisionProbGene.shape[1],1,1)).T)

        CurrPath, Currloglik = viterbi.viterbi(np.float64(EmmisionProbGene), TransistionProbabilities, np.float64(np.log(PriorMatrix)))
        CurrPath = np.int8(CurrPath)

        return [gene, CurrPath, Currloglik]