Exemplos de StackData em Python, exemplos de tools.StackData em Python

Exemplo n.º 1

0

Exibir arquivo

def _plot_state_rows(ax, plt_rng, rows, labels, colors):
    '''
    Draw one line per state on ax.

    rows[j] holds the values of state j (already cut to the plotted range),
    labels[j] its legend label and colors[j] its color.
    '''
    for j, (row, label) in enumerate(zip(rows, labels)):
        ppl.plot(ax, plt_rng, row, label=label, linewidth=2, color=colors[j])


def _label_axis(ax, ylabel, title):
    '''
    Set the y-label and title of ax, label the x-axis 'Position' and switch
    off the offset notation of the x tick labels.
    '''
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Position')
    ax.set_title(title)
    ax.get_xaxis().get_major_formatter().set_useOffset(False)


def _log_odds_score(log_probs, fg_state, ix_bg):
    '''
    Return, per position, log_probs[fg_state] minus the logsumexp over the
    rows listed in ix_bg.

    Positions where the logsumexp is inf or NaN are set to NaN, so that the
    result keeps one entry per position and stays aligned with the x-axis.
    If every position is invalid, 'Score problematic' is printed and the raw
    foreground row is returned.
    '''
    FGScore = log_probs[fg_state, :]
    norm = logsumexp(log_probs[ix_bg, :], axis=0)
    ix_bad = np.isinf(norm) | np.isnan(norm)
    if np.sum(ix_bad) < norm.shape[0]:
        SiteScore = np.full(norm.shape, np.nan)
        SiteScore[~ix_bad] = FGScore[~ix_bad] - norm[~ix_bad]
        return SiteScore
    print('Score problematic')
    return FGScore


def PlotGene(Sequences,
             Background,
             gene,
             IterParameters,
             TransitionTypeFirst='nonhomo',
             no_plot=False,
             Start=0,
             Stop=-1,
             figsize=(6, 8),
             dir_ylim=(),
             out_name=None):
    '''
    Compute the most likely state path for one gene and plot the coverage
    together with the model parameters in a nine-panel figure.

    Parameters:
        Sequences: foreground data; indexed as Sequences[gene]['Coverage']
            and handed to PreloadSequencesForGene.
        Background: background data; handed to PreloadSequencesForGene and
            tools.StackData.
        gene: key of the gene to process.
        IterParameters: sequence holding the emission parameters at index 0
            and the transition parameters at index 1.
        TransitionTypeFirst: 'nonhomo' predicts the transitions with
            trans.PredictTransistions; any other value tiles
            log(TransitionParameters[0]) over all positions.
        no_plot: if true, return right after the Viterbi decoding.
        Start, Stop: plotted position range; Stop == -1 plots up to the last
            position.
        figsize: figure size passed to plt.subplots.
        dir_ylim: optional y-limits of the DMM panel (ignored when empty).
        out_name: if not None, the figure is saved under this name.

    Returns:
        (MostLikelyPath, TransistionProbabilities, EmmisionProbGene) when
        no_plot is true, otherwise
        (MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg).
        NOTE(review): the third element differs between the two paths;
        confirm this is intended.
    '''

    reload(diag_event_model)
    reload(emission)
    set2 = brewer2mpl.get_map('Dark2', 'qualitative', 8).mpl_colors
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]
    TransitionType = EmissionParameters['TransitionType']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    if EmissionParameters['FilterSNPs']:
        Ix = tools.GetModelIx(Sequences_per_gene,
                              Type='no_snps_conv',
                              snps_thresh=EmissionParameters['SnpRatio'],
                              snps_min_cov=EmissionParameters['SnpAbs'],
                              Background=Background_per_gene)
    else:
        Ix = tools.GetModelIx(Sequences_per_gene)

    #2) Compute the probabilities for the states
    #All four matrices start from the uniform log-probability log(1/NrOfStates)
    uniform_log_prob = np.log(
        np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))
    EmmisionProbGene = uniform_log_prob.copy()
    EmmisionProbGene_Dir = uniform_log_prob.copy()
    EmmisionProbGeneNB_fg = uniform_log_prob.copy()
    EmmisionProbGeneNB_bg = uniform_log_prob.copy()

    CurrStackSum = tools.StackData(Sequences_per_gene)
    CurrStackVar = tools.StackData(Sequences_per_gene, add='no')
    nr_of_genes = len(Sequences.keys())
    gene_nr_dict = {
        curr_gene: i for i, curr_gene in enumerate(Sequences.keys())
    }

    #Compute the emission probability
    for State in range(NrOfStates):
        #'is not None' instead of '== None': the latter compares elementwise
        #when the entry is an array
        if EmissionParameters['ExpressionParameters'][0] is not None:
            fg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                CurrStackSum, State, nr_of_genes, gene_nr_dict[gene],
                EmissionParameters)
            EmmisionProbGene[State, :] = fg_log_lik
            EmmisionProbGeneNB_fg[State, :] = fg_log_lik
            #Both background types are treated identically
            if EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck'):
                #NOTE(review): StackData is called here with (Background, gene)
                #but elsewhere with the preloaded per-gene data only; confirm
                #which signature tools.StackData expects.
                bg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                    tools.StackData(Background, gene, add='only_cov') + 0,
                    State,
                    nr_of_genes,
                    gene_nr_dict[gene],
                    EmissionParameters,
                    curr_type='bg')
                EmmisionProbGene[State, :] += bg_log_lik
                EmmisionProbGeneNB_bg[State, :] = bg_log_lik
        if not EmissionParameters['ign_diag']:
            diag_log_lik = diag_event_model.pred_log_lik(
                CurrStackVar[:, Ix], State, EmissionParameters)
            EmmisionProbGene[State, Ix] += diag_log_lik
            EmmisionProbGene_Dir[State, Ix] = diag_log_lik

    #Get the transition probabilities
    if TransitionTypeFirst == 'nonhomo':
        if TransitionType == 'unif_bck' or TransitionType == 'binary_bck':
            CountsSeq = tools.StackData(Sequences_per_gene, add='all')
            CountsBck = tools.StackData(Background_per_gene, add='only_cov')
            Counts = np.vstack((CountsSeq, CountsBck))
        else:
            Counts = tools.StackData(Sequences_per_gene, add='all')
        TransistionProbabilities = np.float64(
            trans.PredictTransistions(Counts, TransitionParameters, NrOfStates,
                                      TransitionType))
    else:
        TransistionProbabilities = np.float64(
            np.tile(np.log(TransitionParameters[0]),
                    (EmmisionProbGene.shape[1], 1, 1)).T)

    MostLikelyPath, LogLik = viterbi.viterbi(np.float64(EmmisionProbGene),
                                             TransistionProbabilities,
                                             np.float64(np.log(PriorMatrix)))
    #Report how many positions were assigned to each state
    for j in range(NrOfStates):
        print(str(np.sum(MostLikelyPath == j)))

    if no_plot:
        return MostLikelyPath, TransistionProbabilities, EmmisionProbGene

    fig, axes = plt.subplots(nrows=9, figsize=figsize)
    fig.subplots_adjust(hspace=1.001)

    #NOTE(review): same signature question as above (gene passed in addition
    #to the preloaded per-gene data).
    Counts = tools.StackData(Sequences_per_gene, gene, add='no')
    if Stop == -1:
        Stop = Counts.shape[1]
    plt_rng = np.arange(Start, Stop)

    #Panel 0: summed count tracks. The row lists are expanded by
    #repl_track_nr (presumably to all replicates, with 22 tracks per
    #replicate - confirm against repl_track_nr).
    nr_of_rep_fg = len(Sequences[gene]['Coverage'].keys())
    track_groups = [
        ([2, 16], 'TC'),
        ([0, 1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 17, 18], 'NonTC'),
        ([20], 'Read-ends'),
        ([4, 9, 14, 19], 'Deletions'),
        ([21], 'Coverage'),
    ]
    for color_ix, (track_rows, label) in enumerate(track_groups):
        Ix = repl_track_nr(track_rows, 22, nr_of_rep_fg)
        ppl.plot(axes[0],
                 plt_rng, (np.sum(Counts[Ix, :], axis=0))[Start:Stop],
                 label=label,
                 linewidth=2,
                 color=set2[color_ix])
    _label_axis(axes[0], 'Counts', 'Coverage and Conversions')

    #NOTE(review): the first replicate is looked up with the key 0 and the
    #others with str(i); the in-place += also modifies the preloaded array.
    BckCov = Background_per_gene['Coverage'][0]
    for i in range(1, len(Background_per_gene['Coverage'].keys())):
        BckCov += Background_per_gene['Coverage'][str(i)]

    #The background uses the next free color after the count tracks
    ppl.plot(axes[0],
             plt_rng, (BckCov.T)[Start:Stop],
             ls='-',
             label='Bck',
             linewidth=2,
             color=set2[len(track_groups)])
    ppl.legend(axes[0])

    states = range(NrOfStates)
    is_coverage_bck = EmissionParameters['BckType'] == 'Coverage_bck'

    #Panel 1: self-transition probabilities
    _plot_state_rows(
        axes[1], plt_rng,
        [(TransistionProbabilities[j, j, :])[Start:Stop] for j in states],
        ['Transition ' + str(j) + ' ' + str(j) for j in states], set2)
    ppl.legend(axes[1])
    _label_axis(axes[1], 'log-transition probability',
                'Transition probability')

    #Panel 2: total emission probabilities
    _plot_state_rows(axes[2], plt_rng,
                     [EmmisionProbGene[j, :][Start:Stop] for j in states],
                     ['Emission ' + str(j) for j in states], set2)
    if is_coverage_bck:
        axes[2].set_ylim((np.min(EmmisionProbGene[0:2, :][:, Start:Stop]), 1))
    ppl.legend(axes[2])
    _label_axis(axes[2], 'log-GLM probability', 'Emission probability')

    #Panel 3: decoded path
    ppl.plot(axes[3], plt_rng, MostLikelyPath[Start:Stop])
    _label_axis(axes[3], 'State', 'Most likely path')

    #Panel 4: diagnostic event (DMM) probabilities
    _plot_state_rows(axes[4], plt_rng,
                     [EmmisionProbGene_Dir[j, :][Start:Stop] for j in states],
                     ['Dir State ' + str(j) for j in states], set2)
    if len(dir_ylim) > 0:
        axes[4].set_ylim(dir_ylim)
    ppl.legend(axes[4])
    _label_axis(axes[4], 'log-DMM probability', 'DMM probability')

    #Panel 5: foreground expression probabilities
    _plot_state_rows(axes[5], plt_rng,
                     [EmmisionProbGeneNB_fg[j, :][Start:Stop] for j in states],
                     ['NB fg ' + str(j) for j in states], set2)
    if is_coverage_bck:
        axes[5].set_ylim(
            [np.min(EmmisionProbGeneNB_fg[0:2, :][:, Start:Stop]), 1])
    ppl.legend(axes[5])
    _label_axis(axes[5], 'prob', 'prob-fg')

    #Panel 6: background expression probabilities
    _plot_state_rows(axes[6], plt_rng,
                     [EmmisionProbGeneNB_bg[j, :][Start:Stop] for j in states],
                     ['NB bg ' + str(j) for j in states], set2)
    if is_coverage_bck:
        axes[6].set_ylim(
            [np.min(EmmisionProbGeneNB_bg[0:3, :][:, Start:Stop]), 1])
    ppl.legend(axes[6])
    _label_axis(axes[6], 'prob', 'prob-bg')

    #Panels 7 and 8: log-odds of the foreground state against all others
    fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters,
                                                final_pred=True)
    #A list is needed here: range objects have no remove() on Python 3
    ix_bg = list(range(EmmisionProbGene.shape[0]))
    ix_bg.remove(fg_state)

    SiteScore = _log_odds_score(EmmisionProbGene, fg_state, ix_bg)
    ppl.plot(axes[7], plt_rng, SiteScore[Start:Stop])
    _label_axis(axes[7], 'log-odd score', 'log-odd score')

    SiteScore = _log_odds_score(EmmisionProbGene_Dir, fg_state, ix_bg)
    ppl.plot(axes[8], plt_rng, SiteScore[Start:Stop])
    _label_axis(axes[8], 'DMM log-odd score', 'DMM log-odd score')

    if out_name is not None:
        print('Saving result')
        fig.savefig(out_name)

    plt.show()

    return MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg

Exemplo n.º 2

0

Exibir arquivo

Arquivo: trans.py Projeto: ejeanvoi/omniCLIP

def FitTransistionParametersSimple(Sequences,
                                   Background,
                                   TransitionParameters,
                                   CurrPath,
                                   C,
                                   verbosity=1):
    '''
    Fit a logistic regression (SGDClassifier) that separates positions where
    the path stays in its state (class 1) from positions where it changes
    state (class 0), using the coverage features of at most 1000 randomly
    sampled genes.

    Parameters:
        Sequences: data handed to tools.PreloadSequencesForGene per gene.
        Background: unused in this function.
        TransitionParameters: sequence whose first entry is the transition
            matrix; only its number of rows (the number of states) is used.
        CurrPath: mapping gene -> state path (compared elementwise against
            state numbers, so presumably a numpy array - confirm).
        C: unused in this function.
        verbosity: values > 0 print progress and memory usage.

    Returns:
        The fitted SGDClassifier.
    '''

    TransitionMatrix = TransitionParameters[0]
    t = time.time()
    assert (TransitionMatrix.shape[0] >
            1), 'Only two states are currently allowed'

    genes = list(CurrPath.keys())
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    SampleSame = []
    SampleOther = []
    print("Learning transition model")
    print("Iterating over genes")
    if verbosity > 0:
        print('Fitting transition parameters: I')
        print('Memory usage: %s (kb)' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    for i, gene in enumerate(genes):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        #Get data
        Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
        CovMat = tools.StackData(Sequences_per_gene, add='all')
        CovMat[CovMat < 0] = 0
        #Genes with NaN coverage are skipped (same handling as in
        #FitTransistionParametersSimpleBck) instead of stopping in pdb
        if np.isnan(CovMat).any():
            continue
        nr_of_samples = CovMat.shape[0]
        for CurrState in range(NrOfStates):
            #Positions where the path is in the current state
            Ix1 = CurrPath[gene][:-1] == CurrState
            for NextState in range(NrOfStates):
                #Positions where the subsequent position is in the "next" state
                Ix2 = CurrPath[gene][1:] == NextState
                #Positions with a CurrState -> NextState transition
                Ix = np.where(Ix1 * Ix2)[0]

                CovMatIx = GenerateFeatures(Ix, CovMat)
                if np.isnan(CovMatIx).any() or np.isinf(CovMatIx).any():
                    continue
                #Transitions that never occur contribute one all-zero sample
                if CovMatIx.shape[1] == 0:
                    CovMatIx = np.zeros((nr_of_samples, 1))

                if CurrState == NextState:
                    SampleSame.append(CovMatIx)
                else:
                    SampleOther.append(CovMatIx)
        del Sequences_per_gene, CovMat

    if verbosity > 0:
        print('Fitting transition parameters: II')
        print('Memory usage: %s (kb)' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    #Builtin sum keeps the lengths integral even for an empty list
    len_same = int(sum(Mat.shape[1] for Mat in SampleSame))
    len_other = int(sum(Mat.shape[1] for Mat in SampleOther))

    X = np.concatenate(SampleSame + SampleOther, axis=1).T
    del SampleSame, SampleOther

    #Create Y: 1 for "same state" samples, 0 for "other state" samples
    Y = np.concatenate((np.ones(len_same, dtype=int),
                        np.zeros(len_other, dtype=int)))
    classes = np.unique(Y)
    if verbosity > 0:
        print('Fitting transition parameters: III')
        print('Memory usage: %s (kb)' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    #At least 5 epochs; more for small data sets. Must be an int for both
    #max_iter and range().
    n_iter = int(max(5, np.ceil(10**6 / float(Y.shape[0]))))

    #NOTE(review): recent scikit-learn releases name this loss "log_loss".
    NewTransitionParametersLogReg = SGDClassifier(loss="log", max_iter=n_iter)
    ix_shuffle = np.arange(X.shape[0])
    for n in range(n_iter):
        #Reshuffle every epoch and train on 50 mini batches
        np.random.shuffle(ix_shuffle)
        for batch_ix in np.array_split(ix_shuffle, 50):
            NewTransitionParametersLogReg.partial_fit(X[batch_ix, :],
                                                      Y[batch_ix],
                                                      classes=classes)

    if verbosity > 0:
        print('Fitting transition parameters: IV')
        print('Memory usage: %s (kb)' %
              resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
        print('Done: Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg

Exemplo n.º 3

0

Exibir arquivo

def FitTransistionParametersMultinomialSeparate(Sequences, Background,
                                                TransitionParameters, CurrPath,
                                                C):
    '''
    Fit one multinomial logistic regression (SGDClassifier) per current
    state that predicts the next state from the coverage features, using at
    most 1000 randomly sampled genes.

    Parameters:
        Sequences: data handed to tools.PreloadSequencesForGene per gene.
        Background: unused in this function.
        TransitionParameters: sequence whose first entry is the transition
            matrix; only its number of rows (the number of states) is used.
        CurrPath: mapping gene -> state path (compared elementwise against
            state numbers, so presumably a numpy array - confirm).
        C: unused in this function.

    Returns:
        dict mapping each current state to its fitted SGDClassifier; the
        class labels are the next-state numbers.
    '''

    TransitionMatrix = TransitionParameters[0]
    NewTransitionParametersLogReg = {}
    t = time.time()
    assert (TransitionMatrix.shape[0] >
            1), 'Only two states are currently allowed'

    #random.sample needs a sequence, not a dict view
    genes = list(CurrPath.keys())
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    for CurrState in range(NrOfStates):
        Xs = []
        Ys = []
        print("Learning transistion model for State " + str(CurrState))
        for NextState in range(NrOfStates):
            Samples = []
            #Iterate over the genes
            print('Loading data')
            for i, gene in enumerate(genes):
                if i % 1000 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                #Positions where the path is in the current state
                Ix1 = CurrPath[gene][:-1] == CurrState
                #Positions where the subsequent position is in the "next" state
                Ix2 = CurrPath[gene][1:] == NextState
                #Positions with a CurrState -> NextState transition
                Ix = np.where(Ix1 * Ix2)[0]
                Sequences_per_gene = tools.PreloadSequencesForGene(
                    Sequences, gene)
                CovMat = tools.StackData(Sequences_per_gene, add='all')
                CovMat[CovMat < 0] = 0
                #Invalid data is skipped (same handling as in
                #FitTransistionParametersSimpleBck) instead of stopping in pdb
                if np.isnan(CovMat).any():
                    continue
                Features = GenerateFeatures(Ix, CovMat)
                if np.isnan(Features).any() or np.isinf(Features).any():
                    continue
                #Transitions that never occur contribute one all-zero sample;
                #its height follows the feature matrix so that the
                #concatenation below works for any number of rows
                if Features.shape[1] == 0:
                    Features = np.zeros((Features.shape[0], 1))
                Samples.append(Features)
            print('\n')
            #Create X
            X = np.concatenate(Samples, axis=1)
            #Create Y: every sample of this block is labelled with NextState
            Y = np.full(X.shape[1], NextState, dtype=int)
            Xs.append(X)
            Ys.append(Y)

        X = np.concatenate(Xs, axis=1)
        Y = np.concatenate(Ys)
        #At least 5 epochs; more for small data sets
        n_iter = int(max(5, np.ceil(10**6 / float(len(Y)))))
        #NOTE(review): recent scikit-learn releases name this loss "log_loss".
        LR = SGDClassifier(loss="log", max_iter=n_iter).fit(X.T, Y.T)

        NewTransitionParametersLogReg[CurrState] = LR
    print('Done: Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg

Exemplo n.º 4

0

Exibir arquivo

def FitTransistionParametersSimpleBck(Sequences, Background,
                                      TransitionParameters, CurrPath, C):
    '''
    Fit a logistic regression (SGDClassifier) that separates positions where
    the path stays in its state (class 1) from positions where it changes
    state (class 0). The features are built from the stacked foreground data
    plus the background coverage of at most 1000 randomly sampled genes.

    Parameters:
        Sequences: foreground data handed to tools.PreloadSequencesForGene.
        Background: background data handed to tools.PreloadSequencesForGene.
        TransitionParameters: sequence whose first entry is the transition
            matrix; only its number of rows (the number of states) is used.
        CurrPath: mapping gene -> state path (compared elementwise against
            state numbers, so presumably a numpy array - confirm).
        C: unused in this function.

    Returns:
        The fitted SGDClassifier.
    '''
    TransitionMatrix = TransitionParameters[0]

    t = time.time()
    assert (TransitionMatrix.shape[0] >
            1), 'Only two states are currently allowed'

    #random.sample needs a sequence, not a dict view
    genes = list(CurrPath.keys())
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    SampleSame = []
    SampleOther = []
    print("Learning transistion model")
    print("Iterating over genes")

    for i, gene in enumerate(genes):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        #Get data
        Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = tools.PreloadSequencesForGene(Background, gene)

        CovMatSeq = tools.StackData(Sequences_per_gene, add='all')
        CovMatBck = tools.StackData(Background_per_gene, add='only_cov')
        CovMat = np.vstack((CovMatSeq, CovMatBck))
        del CovMatSeq, CovMatBck
        nr_of_samples = CovMat.shape[0]
        CovMat[CovMat < 0] = 0
        #Skip genes with NaN coverage. Checking once per gene also avoids
        #deleting a feature matrix that was never created for such genes.
        if np.isnan(CovMat).any():
            continue
        for CurrState in range(NrOfStates):
            #Positions where the path is in the current state
            Ix1 = CurrPath[gene][:-1] == CurrState
            for NextState in range(NrOfStates):
                #Positions where the subsequent position is in the "next" state
                Ix2 = CurrPath[gene][1:] == NextState
                #Positions with a CurrState -> NextState transition
                Ix = np.where(Ix1 * Ix2)[0]

                CovMatIx = GenerateFeatures(Ix, CovMat)
                if np.isnan(CovMatIx).any() or np.isinf(CovMatIx).any():
                    continue
                #Transitions that never occur contribute one all-zero sample
                if CovMatIx.shape[1] == 0:
                    CovMatIx = np.zeros((nr_of_samples, 1))

                if CurrState == NextState:
                    SampleSame.append(CovMatIx)
                else:
                    SampleOther.append(CovMatIx)
        del CovMat

    X = np.concatenate(SampleSame + SampleOther, axis=1)
    #Create Y: 1 for "same state" samples, 0 for "other state" samples.
    #Builtin sum keeps the lengths integral even for an empty list.
    len_same = int(sum(Mat.shape[1] for Mat in SampleSame))
    len_other = int(sum(Mat.shape[1] for Mat in SampleOther))
    Y = np.concatenate((np.ones(len_same, dtype=int),
                        np.zeros(len_other, dtype=int)))

    #At least 5 epochs; more for small data sets
    n_iter = int(max(5, np.ceil(10**6 / float(len(Y)))))
    #NOTE(review): recent scikit-learn releases name this loss "log_loss".
    NewTransitionParametersLogReg = SGDClassifier(
        loss="log", max_iter=n_iter).fit(X.T, Y.T)

    print('Memory usage: %s (kb)' %
          resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('Done: Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg

Exemplo n.º 5

0

Exibir arquivo

def FitTransistionParametersUnif2(Sequences, Background, TransitionParameters,
                                  CurrPath, C):
    '''
    Fit one cross-validated logistic regression (LogisticRegressionCV) per
    state that separates positions where the path stays in the state
    (class 0) from positions where it moves to the other state (class 1).

    Parameters:
        Sequences: data handed to PreloadSequencesForGene per gene.
        Background: unused in this function.
        TransitionParameters: sequence whose first entry is the transition
            matrix; only its number of rows (the number of states) is used.
        CurrPath: mapping gene -> state path (compared elementwise against
            state numbers, so presumably a numpy array - confirm).
        C: unused in this function.

    Returns:
        dict mapping each state to its fitted LogisticRegressionCV.

    NOTE(review): the "other" state is computed as 1 - CurrState, which is
    only meaningful for a two-state model.
    '''

    TransitionMatrix = TransitionParameters[0]
    NewTransitionParametersLogReg = {}
    t = time.time()

    assert (TransitionMatrix.shape[0] >
            1), 'Only two states are currently allowed'
    for CurrState in range(TransitionMatrix.shape[0]):
        print("Learning transistion model for State " + str(CurrState))
        SampleSame = []
        SampleOther = []

        #Iterate over the genes
        print('Loading data')
        for i, gene in enumerate(CurrPath.keys()):
            if i % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            #Load and stack the gene once; both feature sets use the same data
            Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
            CovMat = tools.StackData(Sequences_per_gene, add='all')

            #Positions where the path is in the current state
            InCurrState = CurrPath[gene][:-1] == CurrState

            #Positions where the path changes to the other state
            ToOther = CurrPath[gene][1:] == (1 - CurrState)
            Features = GenerateFeatures(
                np.where(InCurrState * ToOther)[0], CovMat)
            #Keep only samples whose features do not sum to zero or less
            SampleOther.append(Features[:, np.sum(Features, axis=0) > 0])

            #Positions where the path stays in the current state
            ToSame = CurrPath[gene][1:] == CurrState
            Features = GenerateFeatures(
                np.where(InCurrState * ToSame)[0], CovMat)
            SampleSame.append(Features[:, np.sum(Features, axis=0) > 0])
        print('\n')
        #Create X
        X = np.concatenate(SampleSame + SampleOther, axis=1)
        #Create Y: 0 for "same state" samples, 1 for "other state" samples.
        #Builtin sum keeps the lengths integral even for an empty list.
        len_same = int(sum(Mat.shape[1] for Mat in SampleSame))
        len_other = int(sum(Mat.shape[1] for Mat in SampleOther))
        Y = np.concatenate((np.zeros(len_same, dtype=int),
                            np.ones(len_other, dtype=int)))

        Cs = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
        #'balanced' is the current name of the former 'auto' class weighting
        LR = LogisticRegressionCV(Cs=Cs,
                                  penalty='l2',
                                  tol=0.01,
                                  class_weight='balanced')
        LR.fit(X.T, Y.T)
        NewTransitionParametersLogReg[CurrState] = LR
        print('Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg