def _plotgene_label_axis(ax, ylabel, title):
    '''
    Set the y-label, the shared 'Position' x-label and the title of one
    PlotGene panel, and switch off offset notation on its x-axis.
    '''
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Position')
    ax.set_title(title)
    ax.get_xaxis().get_major_formatter().set_useOffset(False)


def _plotgene_log_odds(log_lik, fg_state):
    '''
    Return the per-position log-odds of ``fg_state`` against all other states.

    ``log_lik`` is a (states x positions) array of log-likelihoods. Positions
    whose normaliser (logsumexp over the non-foreground states) is inf/NaN are
    returned as NaN so that the result keeps one entry per position and stays
    aligned with the plotted x-range. If every position is affected, the raw
    foreground log-likelihood is returned instead.
    '''
    # A list comprehension instead of range(...).remove(...), which only
    # works where range() returns a list.
    alt_states = [s for s in range(log_lik.shape[0]) if s != fg_state]
    fg_score = log_lik[fg_state, :]
    norm = logsumexp(log_lik[alt_states, :], axis=0)
    bad = np.isinf(norm) | np.isnan(norm)
    if np.all(bad):
        print('Score problematic')
        return fg_score
    score = np.full(norm.shape, np.nan)
    score[~bad] = fg_score[~bad] - norm[~bad]
    return score


def PlotGene(Sequences, Background, gene, IterParameters, TransitionTypeFirst='nonhomo', no_plot=False, Start=0, Stop=-1, figsize=(6, 8), dir_ylim=[], out_name=None):
    '''
    This function plots the coverage and the parameters of the model for one gene.

    Parameters
    ----------
    Sequences, Background : mappings handed to PreloadSequencesForGene and
        tools.StackData; Sequences[gene]['Coverage'] is read to count the
        foreground replicates.
    gene : key of the gene to plot.
    IterParameters : sequence with the emission parameters (a dict) at index 0
        and the transition parameters at index 1.
    TransitionTypeFirst : 'nonhomo' predicts position-specific transitions via
        trans.PredictTransistions; any other value tiles
        log(TransitionParameters[0]) over all positions.
    no_plot : if True, return right after the Viterbi decoding.
    Start, Stop : plotted position range; Stop == -1 means "up to the end".
    figsize : figure size passed to plt.subplots.
    dir_ylim : optional y-limits for the DMM panel (ignored when empty; it is
        never mutated here).
    out_name : if not None, the figure is saved to this path.

    Returns
    -------
    (MostLikelyPath, TransistionProbabilities, EmmisionProbGene) if no_plot,
    otherwise (MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg).
    NOTE(review): the third element differs between the two paths - confirm
    that callers expect this.
    '''
    # NOTE(review): `reload` must be available at file level (on Python 3 it
    # lives in importlib) - confirm against the file's imports.
    reload(diag_event_model)
    reload(emission)

    set2 = brewer2mpl.get_map('Dark2', 'qualitative', 8).mpl_colors
    TransitionParameters = IterParameters[1]
    EmissionParameters = IterParameters[0]
    TransitionType = EmissionParameters['TransitionType']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    if EmissionParameters['FilterSNPs']:
        Ix = tools.GetModelIx(Sequences_per_gene,
                              Type='no_snps_conv',
                              snps_thresh=EmissionParameters['SnpRatio'],
                              snps_min_cov=EmissionParameters['SnpAbs'],
                              Background=Background_per_gene)
    else:
        Ix = tools.GetModelIx(Sequences_per_gene)

    #2) Compute the probabilities for all states.
    # Every table starts out as a uniform log-probability over the states.
    uniform_log_prob = np.log(
        np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))
    EmmisionProbGene = uniform_log_prob.copy()
    EmmisionProbGene_Dir = uniform_log_prob.copy()
    EmmisionProbGeneNB_fg = uniform_log_prob.copy()
    EmmisionProbGeneNB_bg = uniform_log_prob.copy()

    CurrStackSum = tools.StackData(Sequences_per_gene)
    CurrStackVar = tools.StackData(Sequences_per_gene, add='no')
    nr_of_genes = len(Sequences.keys())
    gene_nr_dict = {curr_gene: nr for nr, curr_gene in enumerate(Sequences.keys())}

    # `is not None` instead of `== None`: comparing an array with `==` yields
    # an element-wise result whose truth value is ambiguous.
    has_expression_model = EmissionParameters['ExpressionParameters'][0] is not None
    # Both background types were handled by identical code.
    use_bg_coverage = EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck')

    #Compute the emission probability
    for State in range(NrOfStates):
        if has_expression_model:
            # Each likelihood is evaluated once and written to both tables
            # (assumes the predictor is deterministic).
            fg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                CurrStackSum, State, nr_of_genes, gene_nr_dict[gene],
                EmissionParameters)
            EmmisionProbGene[State, :] = fg_log_lik
            EmmisionProbGeneNB_fg[State, :] = fg_log_lik
            if use_bg_coverage:
                bg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                    tools.StackData(Background, gene, add='only_cov') + 0,
                    State, nr_of_genes, gene_nr_dict[gene],
                    EmissionParameters, curr_type='bg')
                EmmisionProbGene[State, :] += bg_log_lik
                EmmisionProbGeneNB_bg[State, :] = bg_log_lik
        if not EmissionParameters['ign_diag']:
            dir_log_lik = diag_event_model.pred_log_lik(
                CurrStackVar[:, Ix], State, EmissionParameters)
            EmmisionProbGene[State, Ix] += dir_log_lik
            EmmisionProbGene_Dir[State, Ix] = dir_log_lik

    #Get the transition probabilities
    if TransitionTypeFirst == 'nonhomo':
        if TransitionType in ('unif_bck', 'binary_bck'):
            CountsSeq = tools.StackData(Sequences_per_gene, add='all')
            CountsBck = tools.StackData(Background_per_gene, add='only_cov')
            Counts = np.vstack((CountsSeq, CountsBck))
        else:
            Counts = tools.StackData(Sequences_per_gene, add='all')
        TransistionProbabilities = np.float64(
            trans.PredictTransistions(Counts, TransitionParameters,
                                      NrOfStates, TransitionType))
    else:
        TransistionProbabilities = np.float64(
            np.tile(np.log(TransitionParameters[0]),
                    (EmmisionProbGene.shape[1], 1, 1)).T)

    MostLikelyPath, LogLik = viterbi.viterbi(
        np.float64(EmmisionProbGene), TransistionProbabilities,
        np.float64(np.log(PriorMatrix)))
    # Report how many positions were assigned to each state.
    for j in range(NrOfStates):
        print(str(np.sum(MostLikelyPath == j)))

    if no_plot:
        return MostLikelyPath, TransistionProbabilities, EmmisionProbGene

    fig, axes = plt.subplots(nrows=9, figsize=figsize)
    fig.subplots_adjust(hspace=1.001)

    Counts = tools.StackData(Sequences_per_gene, gene, add='no')
    if Stop == -1:
        Stop = Counts.shape[1]
    plt_rng = np.arange(Start, Stop)

    # Panel 0: count tracks.
    # presumably repl_track_nr expands the row indices of one 22-row replicate
    # block to all foreground replicates - TODO confirm.
    nr_of_rep_fg = len(Sequences[gene]['Coverage'].keys())
    tracks = (
        ('TC', [2, 16]),
        ('NonTC', [0, 1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 17, 18]),
        ('Read-ends', [20]),
        ('Deletions', [4, 9, 14, 19]),
        ('Coverage', [21]),
    )
    for color, (label, rows) in zip(set2, tracks):
        track_ix = repl_track_nr(rows, 22, nr_of_rep_fg)
        ppl.plot(axes[0], plt_rng,
                 (np.sum(Counts[track_ix, :], axis=0))[Start:Stop],
                 label=label, linewidth=2, color=color)
    _plotgene_label_axis(axes[0], 'Counts', 'Coverage and Conversions')

    # NOTE(review): the first replicate is read with key 0 and the others with
    # str(i), and `+=` may accumulate in place into that first entry - confirm
    # both against the layout returned by PreloadSequencesForGene.
    BckCov = Background_per_gene['Coverage'][0]
    for rep in range(1, len(Background_per_gene['Coverage'].keys())):
        BckCov += Background_per_gene['Coverage'][str(rep)]
    # The background track uses the next free colour after the count tracks.
    ppl.plot(axes[0], plt_rng, (BckCov.T)[Start:Stop], ls='-', label='Bck',
             linewidth=2, color=set2[len(tracks)])
    ppl.legend(axes[0])

    # Panel 1: self-transition probabilities.
    for j in range(NrOfStates):
        ppl.plot(axes[1], plt_rng,
                 (TransistionProbabilities[j, j, :])[Start:Stop],
                 label='Transition ' + str(j) + ' ' + str(j),
                 linewidth=2, color=set2[j])
    ppl.legend(axes[1])
    _plotgene_label_axis(axes[1], 'log-transition probability',
                         'Transition probability')

    # Panel 2: combined emission log-probabilities.
    for j in range(NrOfStates):
        ppl.plot(axes[2], plt_rng, (EmmisionProbGene[j, :][Start:Stop]),
                 label='Emission ' + str(j), linewidth=2, color=set2[j])
    if EmissionParameters['BckType'] == 'Coverage_bck':
        axes[2].set_ylim(
            (np.min(EmmisionProbGene[0:2, :][:, Start:Stop]), 1))
    ppl.legend(axes[2])
    _plotgene_label_axis(axes[2], 'log-GLM probability', 'Emission probability')

    # Panel 3: Viterbi path.
    ppl.plot(axes[3], plt_rng, MostLikelyPath[Start:Stop])
    _plotgene_label_axis(axes[3], 'State', 'Most likely path')

    # Panel 4: diagnostic-event (DMM) log-probabilities.
    for j in range(NrOfStates):
        ppl.plot(axes[4], plt_rng, EmmisionProbGene_Dir[j, :][Start:Stop],
                 label='Dir State ' + str(j), linewidth=2, color=set2[j])
    if len(dir_ylim) > 0:
        axes[4].set_ylim(dir_ylim)
    ppl.legend(axes[4])
    _plotgene_label_axis(axes[4], 'log-DMM probability', 'DMM probability')

    # Panel 5: foreground expression log-likelihood.
    for j in range(NrOfStates):
        ppl.plot(axes[5], plt_rng, EmmisionProbGeneNB_fg[j, :][Start:Stop],
                 label='NB fg ' + str(j), linewidth=2, color=set2[j])
    if EmissionParameters['BckType'] == 'Coverage_bck':
        axes[5].set_ylim(
            [np.min(EmmisionProbGeneNB_fg[0:2, :][:, Start:Stop]), 1])
    ppl.legend(axes[5])
    _plotgene_label_axis(axes[5], 'prob', 'prob-fg')

    # Panel 6: background expression log-likelihood.
    for j in range(NrOfStates):
        ppl.plot(axes[6], plt_rng, EmmisionProbGeneNB_bg[j, :][Start:Stop],
                 label='NB bg ' + str(j), linewidth=2, color=set2[j])
    if EmissionParameters['BckType'] == 'Coverage_bck':
        axes[6].set_ylim(
            [np.min(EmmisionProbGeneNB_bg[0:3, :][:, Start:Stop]), 1])
    ppl.legend(axes[6])
    _plotgene_label_axis(axes[6], 'prob', 'prob-bg')

    fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters,
                                                final_pred=True)

    # Panel 7: log-odds of the foreground state from the combined emissions.
    SiteScore = _plotgene_log_odds(EmmisionProbGene, fg_state)
    ppl.plot(axes[7], plt_rng, SiteScore[Start:Stop])
    _plotgene_label_axis(axes[7], 'log-odd score', 'log-odd score')

    # Panel 8: log-odds of the foreground state from the DMM component only.
    SiteScore = _plotgene_log_odds(EmmisionProbGene_Dir, fg_state)
    ppl.plot(axes[8], plt_rng, SiteScore[Start:Stop])
    _plotgene_label_axis(axes[8], 'DMM log-odd score', 'DMM log-odd score')

    if out_name is not None:
        print('Saving result')
        fig.savefig(out_name)

    plt.show()

    return MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg
def FitTransistionParametersSimple(Sequences, Background, TransitionParameters, CurrPath, C, verbosity=1):
    '''
    This function determines the optimal parameters of the logistic regression
    for predicting the TransitionParameters.

    Features of all positions where the path stays in its state form class 1,
    features of all positions where the state changes form class 0. At most
    1000 randomly sampled genes of CurrPath are used.

    Parameters
    ----------
    Sequences : passed to tools.PreloadSequencesForGene for every gene.
    Background : unused in this variant.
    TransitionParameters : sequence whose element 0 is the transition matrix;
        only its number of rows (the number of states) is used.
    CurrPath : mapping gene -> state path along the gene.
    C : unused.
    verbosity : if > 0, print progress and peak memory usage.

    Returns
    -------
    The SGDClassifier trained with partial_fit.

    Raises
    ------
    ValueError : if CurrPath contains no genes.
    '''

    #Generate features from the CurrPaths and the Information in the coverage
    TransitionMatrix = TransitionParameters[0]
    t = time.time()
    assert (TransitionMatrix.shape[0] > 1), 'Only two states are currently allowed'

    genes = list(CurrPath.keys())
    if not genes:
        raise ValueError('CurrPath contains no genes to fit the transition model on')
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    SampleSame = []
    SampleOther = []
    print("Learning transition model")
    print("Iterating over genes")
    if verbosity > 0:
        print('Fitting transition parameters: I')
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    for i, gene in enumerate(genes):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        #Get data
        Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
        CovMat = tools.StackData(Sequences_per_gene, add='all')
        CovMat[CovMat < 0] = 0
        nr_of_samples = CovMat.shape[0]

        # NOTE(review): debugger hooks kept from the original; consider
        # replacing them with an exception or a skip for batch runs.
        # The check does not depend on the states, so it is done once per gene.
        if np.isnan(CovMat).any():
            pdb.set_trace()

        path = CurrPath[gene]
        for CurrState in range(NrOfStates):
            #Positions where the path is in the current state
            in_curr_state = path[:-1] == CurrState
            for NextState in range(NrOfStates):
                #Positions where the subsequent position is in the "next" state
                in_next_state = path[1:] == NextState
                #Positions with a CurrState -> NextState transition
                Ix = np.where(in_curr_state & in_next_state)[0]

                CovMatIx = GenerateFeatures(Ix, CovMat)
                if np.isnan(CovMatIx).any() or np.isinf(CovMatIx).any():
                    pdb.set_trace()

                # A transition that never occurs contributes one all-zero sample.
                if CovMatIx.shape[1] == 0:
                    CovMatIx = np.zeros((nr_of_samples, 1))
                if CurrState == NextState:
                    SampleSame.append(CovMatIx)
                else:
                    SampleOther.append(CovMatIx)
        del Sequences_per_gene, CovMat

    if verbosity > 0:
        print('Fitting transition parameters: II')
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    len_same = sum(Mat.shape[1] for Mat in SampleSame)
    len_other = sum(Mat.shape[1] for Mat in SampleOther)

    X = np.concatenate(SampleSame + SampleOther, axis=1).T
    # Release the per-gene matrices before fitting to keep the peak memory down.
    del SampleSame, SampleOther

    #Create Y (1 = stays in state, 0 = changes state); plain int replaces the
    #np.int alias, which current NumPy no longer provides.
    Y = np.concatenate((np.ones(len_same, dtype=int),
                        np.zeros(len_other, dtype=int)))
    classes = np.unique(Y)

    if verbosity > 0:
        print('Fitting transition parameters: III')
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    # Cast to int: np.ceil returns a float, but the value is used as an epoch count.
    n_iter = int(max(5, np.ceil(10**6 / Y.shape[0])))

    # NOTE(review): recent scikit-learn releases name this loss "log_loss" -
    # confirm against the pinned scikit-learn version.
    NewTransitionParametersLogReg = SGDClassifier(loss="log", max_iter=n_iter)
    ix_shuffle = np.arange(X.shape[0])
    for _ in range(n_iter):
        np.random.shuffle(ix_shuffle)
        for batch_ix in np.array_split(ix_shuffle, 50):
            # With fewer than 50 samples some batches are empty, which
            # partial_fit does not accept.
            if batch_ix.size == 0:
                continue
            NewTransitionParametersLogReg.partial_fit(X[batch_ix, :], Y[batch_ix], classes=classes)

    if verbosity > 0:
        print('Fitting transition parameters: IV')
        print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg
def FitTransistionParametersMultinomialSeparate(Sequences, Background, TransitionParameters, CurrPath, C):
    '''
    This function determines the optimal parameters of the logistic regression
    for predicting the TransitionParameters.

    One classifier is fitted per current state; its classes are the possible
    next states. At most 1000 randomly sampled genes of CurrPath are used.

    Parameters
    ----------
    Sequences : passed to tools.PreloadSequencesForGene for every gene.
    Background : unused in this variant.
    TransitionParameters : sequence whose element 0 is the transition matrix;
        only its number of rows (the number of states) is used.
    CurrPath : mapping gene -> state path along the gene.
    C : unused.

    Returns
    -------
    dict mapping each current state to its fitted SGDClassifier.

    Raises
    ------
    ValueError : if CurrPath contains no genes.
    '''

    #Generate features from the CurrPaths and the Information in the coverage
    TransitionMatrix = TransitionParameters[0]
    NewTransitionParametersLogReg = {}
    t = time.time()
    assert (TransitionMatrix.shape[0] > 1), 'Only two states are currently allowed'

    # list(...) because random.sample needs a sequence, not a dict view.
    genes = list(CurrPath.keys())
    if not genes:
        raise ValueError('CurrPath contains no genes to fit the transition model on')
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    for CurrState in range(NrOfStates):
        Xs = []
        Ys = []
        print("Learning transistion model for State " + str(CurrState))
        for NextState in range(NrOfStates):
            Samples = []
            #Iterate over the genes
            print('Loading data')
            for i, gene in enumerate(genes):
                if i % 1000 == 0:
                    sys.stdout.write('.')
                    sys.stdout.flush()
                path = CurrPath[gene]
                #Positions where the path is in the current state
                Ix1 = path[:-1] == CurrState
                #Positions where the subsequent position is in the "next" state
                Ix2 = path[1:] == NextState
                #Positions with a CurrState -> NextState transition
                Ix = np.where(Ix1 & Ix2)[0]

                Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
                CovMat = tools.StackData(Sequences_per_gene, add='all')
                CovMat[CovMat < 0] = 0

                # NOTE(review): debugger hooks kept from the original; consider
                # replacing them with an exception or a skip for batch runs.
                if np.isnan(CovMat).any():
                    pdb.set_trace()
                CovMat = GenerateFeatures(Ix, CovMat)
                if np.isnan(CovMat).any() or np.isinf(CovMat).any():
                    pdb.set_trace()

                if CovMat.shape[1] == 0:
                    # NOTE(review): the placeholder is hard-coded to 2 rows,
                    # whereas the sibling fit functions use the row count of
                    # the stacked data - confirm which one matches
                    # GenerateFeatures' output.
                    CovMat = np.zeros((2, 1))
                Samples.append(CovMat)
                del CovMat
            print('\n')

            #Create X
            X = np.concatenate(Samples, axis=1)
            #Create Y: the class label of a sample is its next state
            Y = np.full(X.shape[1], NextState, dtype=int)
            Xs.append(X)
            Ys.append(Y)

        X = np.concatenate(Xs, axis=1)
        Y = np.concatenate(Ys)
        n_iter = int(max(5, np.ceil(10**6 / len(Y))))
        # max_iter replaces the older n_iter keyword (the sibling
        # FitTransistionParametersSimple already uses it); tol=None is meant
        # to keep the fixed number of epochs that n_iter used to run.
        LR = SGDClassifier(loss="log", max_iter=n_iter, tol=None).fit(X.T, Y)
        NewTransitionParametersLogReg[CurrState] = LR

    print('Done: Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg
def FitTransistionParametersSimpleBck(Sequences, Background, TransitionParameters, CurrPath, C):
    '''
    This function determines the optimal parameters of the logistic regression
    for predicting the TransitionParameters.

    Same scheme as FitTransistionParametersSimple, but the stacked background
    coverage is appended to the stacked foreground data before the features
    are generated. Genes whose stacked data contain NaN, and transitions whose
    features contain NaN/inf, are skipped. At most 1000 randomly sampled genes
    of CurrPath are used.

    Parameters
    ----------
    Sequences, Background : passed to tools.PreloadSequencesForGene per gene.
    TransitionParameters : sequence whose element 0 is the transition matrix;
        only its number of rows (the number of states) is used.
    CurrPath : mapping gene -> state path along the gene.
    C : unused.

    Returns
    -------
    The fitted SGDClassifier (class 1 = stays in state, class 0 = changes state).

    Raises
    ------
    ValueError : if CurrPath contains no genes.
    '''

    #Generate features from the CurrPaths and the Information in the coverage
    TransitionMatrix = TransitionParameters[0]
    t = time.time()
    assert (TransitionMatrix.shape[0] > 1), 'Only two states are currently allowed'

    # list(...) because random.sample needs a sequence, not a dict view.
    genes = list(CurrPath.keys())
    if not genes:
        raise ValueError('CurrPath contains no genes to fit the transition model on')
    genes = random.sample(genes, min(len(genes), 1000))

    NrOfStates = TransitionMatrix.shape[0]
    SampleSame = []
    SampleOther = []
    print("Learning transistion model")
    print("Iterating over genes")
    for i, gene in enumerate(genes):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        #Get data
        Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = tools.PreloadSequencesForGene(Background, gene)
        CovMatSeq = tools.StackData(Sequences_per_gene, add='all')
        CovMatBck = tools.StackData(Background_per_gene, add='only_cov')
        CovMat = np.vstack((CovMatSeq, CovMatBck))
        nr_of_samples = CovMat.shape[0]
        CovMat[CovMat < 0] = 0

        # The NaN check does not depend on the states, so the whole gene is
        # skipped up front. (Checking inside the state loops skipped every
        # transition before CovMatIx was bound, and the `del` that followed
        # then raised a NameError.)
        if np.isnan(CovMat).any():
            continue

        path = CurrPath[gene]
        for CurrState in range(NrOfStates):
            #Positions where the path is in the current state
            in_curr_state = path[:-1] == CurrState
            for NextState in range(NrOfStates):
                #Positions where the subsequent position is in the "next" state
                in_next_state = path[1:] == NextState
                #Positions with a CurrState -> NextState transition
                Ix = np.where(in_curr_state & in_next_state)[0]

                CovMatIx = GenerateFeatures(Ix, CovMat)
                if np.isnan(CovMatIx).any() or np.isinf(CovMatIx).any():
                    continue

                # A transition that never occurs contributes one all-zero sample.
                if CovMatIx.shape[1] == 0:
                    CovMatIx = np.zeros((nr_of_samples, 1))
                if CurrState == NextState:
                    SampleSame.append(CovMatIx)
                else:
                    SampleOther.append(CovMatIx)
        del CovMat, CovMatSeq, CovMatBck

    X = np.concatenate(SampleSame + SampleOther, axis=1)

    #Create Y (1 = stays in state, 0 = changes state); plain int replaces the
    #np.int alias, which current NumPy no longer provides.
    len_same = sum(Mat.shape[1] for Mat in SampleSame)
    len_other = sum(Mat.shape[1] for Mat in SampleOther)
    Y = np.concatenate((np.ones(len_same, dtype=int),
                        np.zeros(len_other, dtype=int)))

    n_iter = int(max(5, np.ceil(10**6 / len(Y))))
    # max_iter replaces the older n_iter keyword (the sibling
    # FitTransistionParametersSimple already uses it); tol=None is meant to
    # keep the fixed number of epochs that n_iter used to run.
    NewTransitionParametersLogReg = SGDClassifier(loss="log", max_iter=n_iter, tol=None).fit(X.T, Y)

    print('Memory usage: %s (kb)' % resource.getrusage(resource.RUSAGE_SELF).ru_maxrss)
    print('Done: Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg
def FitTransistionParametersUnif2(Sequences, Background, TransitionParameters, CurrPath, C):
    '''
    This function determines the optimal parameters of the logistic regression
    for predicting the TransitionParameters.

    For every current state a cross-validated logistic regression is fitted
    that separates positions where the path stays in the state (class 0) from
    positions where it switches to state ``1 - CurrState`` (class 1), so the
    labelling only makes sense for two states. Feature columns that sum to
    zero are dropped.

    Parameters
    ----------
    Sequences : passed to PreloadSequencesForGene for every gene.
    Background : unused in this variant.
    TransitionParameters : sequence whose element 0 is the transition matrix;
        only its number of rows (the number of states) is used.
    CurrPath : mapping gene -> state path along the gene.
    C : unused (the regularisation strength is chosen by cross-validation).

    Returns
    -------
    dict mapping each current state to its fitted LogisticRegressionCV.

    Raises
    ------
    ValueError : if CurrPath contains no genes.
    '''

    #Generate features from the CurrPaths and the Information in the coverage
    TransitionMatrix = TransitionParameters[0]
    NewTransitionParametersLogReg = {}
    t = time.time()
    assert (TransitionMatrix.shape[0] > 1), 'Only two states are currently allowed'

    genes = list(CurrPath.keys())
    if not genes:
        raise ValueError('CurrPath contains no genes to fit the transition model on')

    for CurrState in range(TransitionMatrix.shape[0]):
        print("Learning transistion model for State " + str(CurrState))
        SampleSame = []
        SampleOther = []
        #Iterate over the genes
        print('Loading data')
        for i, gene in enumerate(genes):
            if i % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()

            # Load and stack the gene once; both feature sets below are
            # derived from the same matrix (the other fit functions in this
            # file likewise call GenerateFeatures repeatedly on one matrix).
            Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
            CovMat = tools.StackData(Sequences_per_gene, add='all')

            path = CurrPath[gene]
            #Positions where the path is in the current state
            in_curr_state = path[:-1] == CurrState

            #Positions where the path changes from the current to the other state
            Ix = np.where(in_curr_state & (path[1:] == (1 - CurrState)))[0]
            Features = GenerateFeatures(Ix, CovMat)
            SampleOther.append(Features[:, np.sum(Features, axis=0) > 0])

            #Positions where the path stays in the current state
            Ix = np.where(in_curr_state & (path[1:] == CurrState))[0]
            Features = GenerateFeatures(Ix, CovMat)
            SampleSame.append(Features[:, np.sum(Features, axis=0) > 0])
            del CovMat, Features
        print('\n')

        #Create X
        X = np.concatenate(SampleSame + SampleOther, axis=1)
        #Create Y (0 = stays in state, 1 = changes state); plain int replaces
        #the np.int alias, which current NumPy no longer provides.
        len_same = sum(Mat.shape[1] for Mat in SampleSame)
        len_other = sum(Mat.shape[1] for Mat in SampleOther)
        Y = np.concatenate((np.zeros(len_same, dtype=int),
                            np.ones(len_other, dtype=int)))

        Cs = [1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
        # NOTE(review): 'balanced' replaces the former 'auto' class weighting,
        # which scikit-learn no longer accepts - confirm against the pinned
        # scikit-learn version.
        LR = LogisticRegressionCV(Cs=Cs, penalty='l2', tol=0.01, class_weight='balanced')
        LR.fit(X.T, Y)
        NewTransitionParametersLogReg[CurrState] = LR
        print('Elapsed time: ' + str(time.time() - t))

    return NewTransitionParametersLogReg