def FitEmissionParameters(Sequences, Background, NewPaths,
                          OldEmissionParameters, First):
    '''
    This function re-fits the emission parameters for the current paths.

    The state prior is recomputed from the state counts in NewPaths, the
    expression parameters are re-estimated with
    emission.estimate_expression_param and, unless the diagnostic event model
    is skipped, the mixture model is re-fitted with mixture_tools.em.

    Sequences, Background: dict-like read containers keyed by gene
    NewPaths: dict-like, maps each gene to its array of state assignments
    OldEmissionParameters: dict holding the current emission parameters. It
        is NOT copied: the entries 'PriorMatrix' and 'ExpressionParameters'
        are modified in place before the dict is handed to the estimators.
    First: flag that is passed on unchanged to mixture_tools.em

    Returns the emission parameter dictionary produced by the estimators.
    A temporary 'Pseudo' gene (if one was added) is removed from Sequences,
    Background and NewPaths again before returning.
    '''

    def _print_mem_usage():
        # Peak resident set size of this process as reported by getrusage
        print('Memory usage: %s (kb)' % resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss)

    print('Fitting emission parameters')
    t = time.time()

    #Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    #This is an alias and not a copy: the caller's dict is updated in place
    NewEmissionParameters = OldEmissionParameters

    #Compute new prior matrix (number of positions assigned to each state)
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    #Unused states get a tenth of the smallest non-zero count so that the
    #normalised prior contains no zeros. The uncorrected PriorMatrix is kept
    #because it is passed to add_pseudo_gene below.
    CorrectedPriorMatrix = np.copy(PriorMatrix)
    CorrectedPriorMatrix[CorrectedPriorMatrix == 0] = np.min(
        CorrectedPriorMatrix[CorrectedPriorMatrix > 0]) / 10
    CorrectedPriorMatrix /= np.sum(CorrectedPriorMatrix)
    NewEmissionParameters['PriorMatrix'] = CorrectedPriorMatrix

    #Add Pseudo gene to Sequences, Background and Paths
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths, _ = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)

    #Compute parameters for the expression
    sample_size = 10000

    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            #Insert a row (the mean of the gene rows) for the pseudo gene
            nr_of_genes = len(Sequences.keys())
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes), :],
                                  np.mean(new_pars[:(nr_of_genes), :]),
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    print('Estimating expression parameters')
    _print_mem_usage()

    bg_type = NewEmissionParameters['BckType']
    expr_data = (NewEmissionParameters, Sequences, Background, NewPaths,
                 sample_size, bg_type)
    NewEmissionParameters = emission.estimate_expression_param(expr_data)
    _print_mem_usage()

    if NewEmissionParameters['BckType'] != 'None':
        if 'Pseudo' in Sequences:
            #Drop the row that belongs to the pseudo gene again
            nr_of_genes = len(Sequences.keys())
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes - 1), :],
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    #FIX: the second operand used to read the undefined name
    #'EmissionParameters', which raised a NameError whenever
    #'skip_diag_event_mdl' was set. The '== False' comparison is kept on
    #purpose so that the truth table of the flag is unchanged.
    fit_diag_model = (
        (NewEmissionParameters['skip_diag_event_mdl'] == False) or
        (NewEmissionParameters['use_precomp_diagmod'] is not None))

    if fit_diag_model:
        #Compute parameters for the ratios
        print('computing sufficient statitics for fitting md')
        _print_mem_usage()
        SuffStat = tools.GetSuffStat(Sequences, Background, NewPaths,
                                     NrOfStates, Type='Conv',
                                     EmissionParameters=NewEmissionParameters)

        #Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)
        _print_mem_usage()

        if NewEmissionParameters['Subsample']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('fitting md distribution')
        _print_mem_usage()

        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(
                Sequences, Background, NewPaths, NrOfStates, Type='Conv',
                EmissionParameters=NewEmissionParameters)

            #Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(
                SuffStatBck)

            if NewEmissionParameters['Subsample']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(
                    CountsBck, NrOfCountsBck)

            #Overwrite counts in other bins: every non-foreground state gets
            #the background counts stored under the foreground state
            fg_state, bg_state = emission.get_fg_and_bck_state(
                NewEmissionParameters, final_pred=True)
            for curr_state in Counts.keys():
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]

        NewEmissionParameters = mixture_tools.em(
            Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha,
            First=First)
        _print_mem_usage()
        #Release the (potentially large) count arrays
        del Counts, NrOfCounts, SuffStat

    #Remove the pseudo gene again
    if 'Pseudo' in Sequences:
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
def PlotGene(Sequences, Background, gene, IterParameters,
             TransitionTypeFirst='nonhomo', no_plot=False, Start=0, Stop=-1,
             figsize=(6, 8), dir_ylim=[], out_name=None):
    '''
    This function plot the coverage and the parameters for the model

    Sequences, Background: read containers keyed by gene
    gene: key of the gene that is scored and plotted
    IterParameters: pair (EmissionParameters, TransitionParameters)
    TransitionTypeFirst: 'nonhomo' predicts position-specific transitions
        with trans.PredictTransistions; any other value tiles the log of
        TransitionParameters[0] over all positions
    no_plot: if True, only the Viterbi path is computed and nothing is drawn
    Start, Stop: plotted position window; Stop == -1 means "up to the end"
    figsize: forwarded to plt.subplots
    dir_ylim: y-limits of the DMM panel; an empty sequence keeps autoscaling
        (the default list is only read, never modified)
    out_name: if not None, the figure is saved to this path

    Returns (MostLikelyPath, TransistionProbabilities, EmmisionProbGene) if
    no_plot is set and (MostLikelyPath, TransistionProbabilities,
    EmmisionProbGeneNB_fg) otherwise. Note that the third element differs
    between the two cases.
    '''
    reload(diag_event_model)
    reload(emission)

    set2 = brewer2mpl.get_map('Dark2', 'qualitative', 8).mpl_colors
    TransitionParameters = IterParameters[1]
    EmissionParameters = IterParameters[0]
    TransitionType = EmissionParameters['TransitionType']
    PriorMatrix = EmissionParameters['PriorMatrix']
    NrOfStates = EmissionParameters['NrOfStates']

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    #1) Determine the positions that are used by the diagnostic event model
    if EmissionParameters['FilterSNPs']:
        Ix = tools.GetModelIx(Sequences_per_gene, Type='no_snps_conv',
                              snps_thresh=EmissionParameters['SnpRatio'],
                              snps_min_cov=EmissionParameters['SnpAbs'],
                              Background=Background_per_gene)
    else:
        Ix = tools.GetModelIx(Sequences_per_gene)

    #2) Compute the probabilities for both states
    def _uniform_log_prob():
        #Log of a uniform distribution over the states for every position
        return np.log(np.ones((NrOfStates, Ix.shape[0])) *
                      (1 / np.float64(NrOfStates)))

    EmmisionProbGene = _uniform_log_prob()
    EmmisionProbGene_Dir = _uniform_log_prob()
    EmmisionProbGeneNB_fg = _uniform_log_prob()
    EmmisionProbGeneNB_bg = _uniform_log_prob()

    CurrStackSum = tools.StackData(Sequences_per_gene)
    CurrStackVar = tools.StackData(Sequences_per_gene, add='no')
    nr_of_genes = len(Sequences.keys())
    gene_nr_dict = {}
    for i, curr_gene in enumerate(Sequences.keys()):
        gene_nr_dict[curr_gene] = i

    #FIX: 'not X == None' compares an array element-wise on recent numpy
    #versions (and then fails in the 'not'); the identity test is what the
    #rest of the file uses.
    has_expr_params = EmissionParameters['ExpressionParameters'][0] is not None
    use_bg_cov = EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck')

    #Compute the emission probapility
    for State in range(NrOfStates):
        if has_expr_params:
            #The prediction is computed once and shared by both tracks
            fg_log_lik = emission.predict_expression_log_likelihood_for_gene(
                CurrStackSum, State, nr_of_genes, gene_nr_dict[gene],
                EmissionParameters)
            EmmisionProbGene[State, :] = fg_log_lik
            EmmisionProbGeneNB_fg[State, :] = fg_log_lik
            if use_bg_cov:
                #NOTE(review): this StackData call passes (Background, gene)
                #while the other calls pass a per-gene object only - confirm
                #against the current tools.StackData signature.
                bg_log_lik = \
                    emission.predict_expression_log_likelihood_for_gene(
                        tools.StackData(Background, gene, add='only_cov') + 0,
                        State, nr_of_genes, gene_nr_dict[gene],
                        EmissionParameters, curr_type='bg')
                EmmisionProbGene[State, :] += bg_log_lik
                EmmisionProbGeneNB_bg[State, :] = bg_log_lik
        if not EmissionParameters['ign_diag']:
            diag_log_lik = diag_event_model.pred_log_lik(
                CurrStackVar[:, Ix], State, EmissionParameters)
            EmmisionProbGene[State, Ix] += diag_log_lik
            EmmisionProbGene_Dir[State, Ix] = diag_log_lik

    #Get the transition probabilities
    if TransitionTypeFirst == 'nonhomo':
        if TransitionType == 'unif_bck' or TransitionType == 'binary_bck':
            CountsSeq = tools.StackData(Sequences_per_gene, add='all')
            CountsBck = tools.StackData(Background_per_gene, add='only_cov')
            Counts = np.vstack((CountsSeq, CountsBck))
        else:
            Counts = tools.StackData(Sequences_per_gene, add='all')
        TransistionProbabilities = np.float64(
            trans.PredictTransistions(Counts, TransitionParameters,
                                      NrOfStates, TransitionType))
    else:
        TransistionProbabilities = np.float64(
            np.tile(np.log(TransitionParameters[0]),
                    (EmmisionProbGene.shape[1], 1, 1)).T)

    MostLikelyPath, LogLik = viterbi.viterbi(
        np.float64(EmmisionProbGene), TransistionProbabilities,
        np.float64(np.log(PriorMatrix)))
    #Report how many positions were assigned to each state
    for j in range(NrOfStates):
        print(str(np.sum(MostLikelyPath == j)))

    if no_plot:
        return MostLikelyPath, TransistionProbabilities, EmmisionProbGene

    def _finish_axis(ax, ylabel, title):
        #Common axis decoration of all panels
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Position')
        ax.set_title(title)
        ax.get_xaxis().get_major_formatter().set_useOffset(False)

    def _log_odds(log_prob):
        #Log-odds of the foreground state against all other states.
        #FIX: positions with a non-finite normaliser used to be dropped,
        #which shortened the track and broke the alignment with plt_rng.
        #They are now kept and set to NaN so that they appear as gaps.
        fg_score = log_prob[fg_state, :]
        norm = logsumexp(log_prob[ix_bg, :], axis=0)
        is_bad = np.isinf(norm) + np.isnan(norm)
        if np.sum(is_bad) < norm.shape[0]:
            score = np.ones(norm.shape) * np.nan
            score[is_bad == 0] = fg_score[is_bad == 0] - norm[is_bad == 0]
        else:
            print('Score problematic')
            score = fg_score
        return score

    fig, axes = plt.subplots(nrows=9, figsize=figsize)
    fig.subplots_adjust(hspace=1.001)

    #NOTE(review): see the StackData remark above, 'gene' is passed here too
    Counts = tools.StackData(Sequences_per_gene, gene, add='no')
    if Stop == -1:
        Stop = Counts.shape[1]
    plt_rng = np.array(range(Start, Stop))

    #Panel 0: coverage and conversions. Each entry is a label together with
    #the track numbers that are handed to repl_track_nr.
    nr_of_rep_fg = len(Sequences[gene]['Coverage'].keys())
    track_groups = [
        ('TC', [2, 16]),
        ('NonTC', [0, 1, 3, 5, 6, 7, 8, 10, 11, 12, 13, 15, 17, 18]),
        ('Read-ends', [20]),
        ('Deletions', [4, 9, 14, 19]),
        ('Coverage', [21]),
    ]
    for color_nr, (label, tracks) in enumerate(track_groups):
        track_ix = repl_track_nr(tracks, 22, nr_of_rep_fg)
        ppl.plot(axes[0], plt_rng,
                 (np.sum(Counts[track_ix, :], axis=0))[Start:Stop],
                 label=label, linewidth=2, color=set2[color_nr])
    #The background track uses the next free colour
    bck_color = set2[len(track_groups)]
    _finish_axis(axes[0], 'Counts', 'Coverage and Conversions')

    #NOTE(review): the first replicate is read with the integer key 0 and
    #the remaining ones with string keys; '+=' may also modify the first
    #replicate in place - confirm against PreloadSequencesForGene.
    BckCov = Background_per_gene['Coverage'][0]
    for rep in range(1, len(Background_per_gene['Coverage'].keys())):
        BckCov += Background_per_gene['Coverage'][str(rep)]
    ppl.plot(axes[0], plt_rng, (BckCov.T)[Start:Stop], ls='-', label='Bck',
             linewidth=2, color=bck_color)
    ppl.legend(axes[0])

    #Panel 1: self-transition probabilities
    for j in range(NrOfStates):
        ppl.plot(axes[1], plt_rng,
                 (TransistionProbabilities[j, j, :])[Start:Stop],
                 label='Transition ' + str(j) + ' ' + str(j), linewidth=2,
                 color=set2[j])
    ppl.legend(axes[1])
    _finish_axis(axes[1], 'log-transition probability',
                 'Transition probability')

    #Panel 2: total emission probability
    for j in range(NrOfStates):
        ppl.plot(axes[2], plt_rng, (EmmisionProbGene[j, :][Start:Stop]),
                 label='Emission ' + str(j), linewidth=2, color=set2[j])
    if EmissionParameters['BckType'] == 'Coverage_bck':
        axes[2].set_ylim(
            (np.min(np.min(EmmisionProbGene[0:2, :][:, Start:Stop])), 1))
    ppl.legend(axes[2])
    _finish_axis(axes[2], 'log-GLM probability', 'Emission probability')

    #Panel 3: Viterbi path
    ppl.plot(axes[3], plt_rng, MostLikelyPath[Start:Stop])
    _finish_axis(axes[3], 'State', 'Most likely path')

    #Panel 4: diagnostic event model
    for j in range(NrOfStates):
        ppl.plot(axes[4], plt_rng, EmmisionProbGene_Dir[j, :][Start:Stop],
                 label='Dir State ' + str(j), linewidth=2, color=set2[j])
    if len(dir_ylim) > 0:
        axes[4].set_ylim(dir_ylim)
    ppl.legend(axes[4])
    _finish_axis(axes[4], 'log-DMM probability', 'DMM probability')

    #Panel 5: expression model on the foreground
    for j in range(NrOfStates):
        ppl.plot(axes[5], plt_rng, EmmisionProbGeneNB_fg[j, :][Start:Stop],
                 label='NB fg ' + str(j), linewidth=2, color=set2[j])
    if EmissionParameters['BckType'] == 'Coverage_bck':
        axes[5].set_ylim(
            [np.min(np.min(EmmisionProbGeneNB_fg[0:2, :][:, Start:Stop])), 1])
    ppl.legend(axes[5])
    _finish_axis(axes[5], 'prob', 'prob-fg')

    #Panel 6: expression model on the background
    for j in range(NrOfStates):
        ppl.plot(axes[6], plt_rng, EmmisionProbGeneNB_bg[j, :][Start:Stop],
                 label='NB bg ' + str(j), linewidth=2, color=set2[j])
    if EmissionParameters['BckType'] == 'Coverage_bck':
        axes[6].set_ylim(
            [np.min(np.min(EmmisionProbGeneNB_bg[0:3, :][:, Start:Stop])), 1])
    ppl.legend(axes[6])
    _finish_axis(axes[6], 'prob', 'prob-bg')

    #Panels 7 and 8: log-odds of the foreground state
    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    ix_bg = list(range(EmmisionProbGene.shape[0]))
    ix_bg.remove(fg_state)

    ppl.plot(axes[7], plt_rng, _log_odds(EmmisionProbGene)[Start:Stop])
    _finish_axis(axes[7], 'log-odd score', 'log-odd score')

    ppl.plot(axes[8], plt_rng, _log_odds(EmmisionProbGene_Dir)[Start:Stop])
    _finish_axis(axes[8], 'DMM log-odd score', 'DMM log-odd score')

    if not (out_name is None):
        print('Saving result')
        fig.savefig(out_name)

    plt.show()

    return MostLikelyPath, TransistionProbabilities, EmmisionProbGeneNB_fg
def pred_sites(args):
    '''
    This function generates the predictions from the parameters that were
    saved to 'IterSaveFile.dat' in the output directory.

    args: is overwritten immediately, the command line is re-parsed with the
        global parser.

    The function returns None. It returns early (after printing a message)
    if the background type is neither 'Coverage' nor 'Coverage_bck' and
    exits the interpreter if no CLIP libraries are given.
    '''
    # Get the args
    args = parser.parse_args()
    print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        #sys.exit raises SystemExit by itself, no 'raise' is needed
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir == None:
        out_path = os.getcwdu()
    else:
        out_path = args.out_dir

    # process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    #Load the saved parameters.
    #FIX: this used to happen only after the reads were loaded, although
    #loading the reads already needs EmissionParameters (UnboundLocalError).
    #The pickled args are not needed here and are therefore not unpacked.
    #NOTE: unpickling executes code from the file, only use trusted files.
    with open(os.path.join(out_path, 'IterSaveFile.dat'), 'r') as handle:
        tmp_file = cPickle.load(handle)
    IterParameters = tmp_file[0]
    EmissionParameters = IterParameters[0]
    #FIX: TransitionParameters and Paths were never defined in this function
    TransitionParameters = IterParameters[1]
    Paths = {}

    #Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    t = time.time()
    print('Loading reads')
    DataOutFile = os.path.join(out_path, 'fg_reads.dat')
    Sequences = LoadReads.load_data(
        args.fg_libs, GenomeDir, GeneAnnotation, DataOutFile,
        load_from_file=True, save_results=False, Collapse=args.fg_collapsed,
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    DataOutFile = os.path.join(out_path, 'bg_reads.dat')
    Background = LoadReads.load_data(
        args.bg_libs, GenomeDir, GeneAnnotation, DataOutFile,
        load_from_file=True, save_results=False, Collapse=args.bg_collapsed,
        OnlyCoverage=True, ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    #Removing genes without any reads in the CLIP data
    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(all_genes):
        curr_cov = np.sum(np.array([
            np.sum(Sequences[gene]['Coverage'][rep].toarray())
            for rep in Sequences[gene]['Coverage'].keys()]))
        curr_neg_vars = np.sum(np.array([
            np.sum(np.sum(Sequences[gene]['Variants'][rep].toarray() < 0))
            for rep in Sequences[gene]['Variants'].keys()]))

        if curr_cov < 100 or curr_neg_vars > 0:
            continue

        genes_to_keep.append(gene)
        #Stop sampling genes once the index exceeds args.gene_sample; all
        #genes that were not looked at are deleted below as well
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))
    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]
    del all_genes, genes_to_del, genes_to_keep

    print('Done: Elapsed time: ' + str(time.time() - t))

    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        #Reopen the read files after the path computation, the same way
        #run_omniCLIP does
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    #FIX: OutFile was never defined. The name is built the same way as in
    #run_omniCLIP so that both entry points write to the same file.
    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')

    #NOTE(review): run_omniCLIP passes Paths as first argument to
    #tools.GeneratePred; the argument list here is left as it was - confirm
    #against the current signature.
    tools.GeneratePred(Sequences, Background, IterParameters, GeneAnnotation,
                       OutFile, fg_state, bg_state)

    print('Done')
def _load_iter_file(path):
    #Load a pickled object and close the file again.
    #NOTE: unpickling executes code from the file, only use trusted files.
    with open(path, 'r') as handle:
        return cPickle.load(handle)


def _dump_iter_file(obj, path):
    #Pickle obj to path; the file is flushed and closed before returning.
    with open(path, 'w') as handle:
        cPickle.dump(obj, handle)


def _reserve_tmp_read_file(tmp_dir):
    #Create an empty, uniquely named '.dat' file in tmp_dir, return its path.
    handle, path = tempfile.mkstemp(suffix='.dat', dir=tmp_dir)
    os.close(handle)
    return path


def _log_lik_still_changing(curr_ll, old_ll, abs_tol, rel_tol=0.01):
    #True if the relative AND the absolute log-likelihood change exceed
    #their tolerance.
    abs_change = abs(curr_ll - old_ll)
    rel_change = abs_change / max(abs(curr_ll), abs(old_ll))
    return (rel_change > rel_tol) and (abs_change > abs_tol)


def run_omniCLIP(args):
    '''
    This function runs the complete pipeline: it loads the reads, fits the
    model parameters iteratively and writes the predictions.

    args: is overwritten immediately, the command line is re-parsed with the
        global parser.

    Files written to the output directory (args.out_dir or the current
    working directory): 'IterSaveFile.dat' after every iteration,
    'IterSaveFileHist.dat' if args.safe_tmp is set and the prediction file
    'pred[_no_glm][_no_diag].txt'. The read files are copied to temporary
    files before they are masked; these copies are removed at the end only
    if a tmp_dir was given.

    The function returns None. It returns early if the background type is
    not supported or if the expression parameters have not been fitted and
    it exits the interpreter if no CLIP libraries are given.
    '''

    def _print_mem_usage():
        # Peak resident set size of this process as reported by getrusage
        print('Memory usage: %s (kb)' % resource.getrusage(
            resource.RUSAGE_SELF).ru_maxrss)

    # Get the args
    args = parser.parse_args()
    print(args)

    #Check parameters
    if len(args.fg_libs) == 0:
        #sys.exit raises SystemExit by itself, no 'raise' is needed
        sys.exit('No CLIP-libraries given')

    if len(args.bg_libs) == 0:
        bg_type = 'None'
    else:
        bg_type = args.bg_type

    if args.out_dir == None:
        out_path = os.getcwdu()
    else:
        out_path = args.out_dir

    MaxIter = args.max_it
    # process the parameters
    if not (bg_type == 'Coverage' or bg_type == 'Coverage_bck'):
        print('Bg-type: ' + bg_type + ' has not been implemented yet')
        return

    #Set seed for the random number generators
    #NOTE(review): only the generator of the 'random' module is seeded, the
    #np.random draws below are not affected by it - confirm whether numpy
    #should be seeded as well.
    if args.rnd_seed is not None:
        random.seed(args.rnd_seed)
        print('setting seed')

    #Set the p-value cutoff for the bed-file creation
    pv_cutoff = args.pv_cutoff

    #Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)
    GenomeDir = args.genome_dir

    #Load the reads
    _print_mem_usage()
    print('Loading reads')

    EmissionParameters = {}

    #Check whether existing iteration parameters should be used
    restart_from_file = args.restart_from_file
    EmissionParameters['restart_from_file'] = restart_from_file

    EmissionParameters['glm_weight'] = args.glm_weight
    EmissionParameters['mask_flank_variants'] = args.mask_flank_variants
    EmissionParameters['max_mm'] = args.max_mm
    EmissionParameters['rev_strand'] = args.rev_strand
    EmissionParameters['skip_diag_event_mdl'] = args.skip_diag_event_mdl
    EmissionParameters['ign_out_rds'] = args.ign_out_rds
    EmissionParameters['DataOutFile_seq'] = os.path.join(
        out_path, 'fg_reads.dat')
    EmissionParameters['DataOutFile_bck'] = os.path.join(
        out_path, 'bg_reads.dat')
    EmissionParameters['tmp_dir'] = args.tmp_dir

    t = time.time()

    Sequences = LoadReads.load_data(
        args.fg_libs, GenomeDir, GeneAnnotation,
        EmissionParameters['DataOutFile_seq'],
        load_from_file=((not args.overwrite_fg) or restart_from_file),
        save_results=True, Collapse=args.fg_collapsed,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])
    Background = LoadReads.load_data(
        args.bg_libs, GenomeDir, GeneAnnotation,
        EmissionParameters['DataOutFile_bck'],
        load_from_file=((not args.overwrite_bg) or restart_from_file),
        save_results=True, Collapse=args.bg_collapsed,
        OnlyCoverage=args.only_coverage,
        mask_flank_variants=EmissionParameters['mask_flank_variants'],
        max_mm=EmissionParameters['max_mm'],
        ign_out_rds=EmissionParameters['ign_out_rds'],
        rev_strand=EmissionParameters['rev_strand'])

    #Mask the positions that overlap miRNA sites in the geneome
    Sequences.close()
    Background.close()

    f_name_read_fg = EmissionParameters['DataOutFile_seq']
    f_name_read_bg = EmissionParameters['DataOutFile_bck']

    #Create temporary read-files that can be modified by the masking
    #operations
    if EmissionParameters['tmp_dir'] is None:
        f_name_read_fg_tmp = EmissionParameters['DataOutFile_seq'].replace(
            'fg_reads.dat', 'fg_reads.tmp.dat')
        f_name_read_bg_tmp = EmissionParameters['DataOutFile_bck'].replace(
            'bg_reads.dat', 'bg_reads.tmp.dat')
    else:
        #mkstemp replaces the private tempfile._get_candidate_names(): the
        #name is reserved on disk, so it cannot clash with another process
        f_name_read_fg_tmp = _reserve_tmp_read_file(
            EmissionParameters['tmp_dir'])
        f_name_read_bg_tmp = _reserve_tmp_read_file(
            EmissionParameters['tmp_dir'])

    shutil.copy(f_name_read_fg, f_name_read_fg_tmp)
    shutil.copy(f_name_read_bg, f_name_read_bg_tmp)

    #open the temporary read files
    Sequences = h5py.File(f_name_read_fg_tmp, 'r+')
    Background = h5py.File(f_name_read_bg_tmp, 'r+')

    EmissionParameters['DataOutFile_seq'] = f_name_read_fg_tmp
    EmissionParameters['DataOutFile_bck'] = f_name_read_bg_tmp

    #Set coverage for regions that overlapp annotated miRNAs to zero
    EmissionParameters['mask_miRNA'] = args.mask_miRNA
    if args.mask_miRNA:
        print('Removing miRNA-coverage')
        Sequences = mask_miRNA_positions(Sequences, GeneAnnotation)

    #Mask regions where genes overlap
    EmissionParameters['mask_ovrlp'] = args.mask_ovrlp
    if EmissionParameters['mask_ovrlp']:
        print('Masking overlapping positions')
        Sequences = mark_overlapping_positions(Sequences, GeneAnnotation)

    #Estimate the library size
    EmissionParameters['BckLibrarySize'] = tools.estimate_library_size(
        Background)
    EmissionParameters['LibrarySize'] = tools.estimate_library_size(Sequences)

    #Removing genes without any reads in the CLIP data
    print("Removing genes without CLIP coverage")

    genes_to_keep = []
    all_genes = list(Sequences.keys())
    for i, gene in enumerate(all_genes):
        curr_cov = sum([
            Sequences[gene]['Coverage'][rep].value.sum()
            for rep in Sequences[gene]['Coverage'].keys()])
        curr_neg_vars = sum([
            np.sum(np.sum(Sequences[gene]['Variants'][rep].value < 0))
            for rep in Sequences[gene]['Variants'].keys()])

        if curr_cov <= 100 or curr_neg_vars > 0:
            continue

        genes_to_keep.append(gene)
        #Stop sampling genes once the index exceeds args.gene_sample; all
        #genes that were not looked at are deleted below as well
        if i > args.gene_sample:
            break

    genes_to_del = list(set(all_genes).difference(set(genes_to_keep)))
    for gene in genes_to_del:
        del Sequences[gene]
        del Background[gene]
    del all_genes, genes_to_del, genes_to_keep

    print('Done: Elapsed time: ' + str(time.time() - t))
    _print_mem_usage()

    #Initializing parameters
    print('Initialising the parameters')
    if bg_type == 'Coverage_bck':
        NrOfStates = 4
    else:
        NrOfStates = 3

    #Remove the gene sequence from the Sequences and Background when not
    #needed. Currently this is always the case:
    for gene in list(Sequences.keys()):
        if 'GeneSeq' in Sequences[gene]:
            del Sequences[gene]['GeneSeq']

    for gene in list(Background.keys()):
        if 'GeneSeq' in Background[gene]:
            del Background[gene]['GeneSeq']

    TransMat = np.ones((NrOfStates, NrOfStates)) + np.eye(NrOfStates)
    TransMat = TransMat / np.sum(np.sum(TransMat))
    TransitionParameters = [TransMat, []]

    NrOfReplicates = len(args.fg_libs)
    #The first remaining gene is used to determine the number of tracks
    gene = list(Sequences.keys())[0]

    EmissionParameters['PriorMatrix'] = np.ones(
        (NrOfStates, 1)) / float(NrOfStates)
    EmissionParameters['diag_bg'] = args.diag_bg
    EmissionParameters['emp_var'] = args.emp_var
    EmissionParameters['norm_class'] = args.norm_class

    #Define flag for penalized path prediction
    EmissionParameters['LastIter'] = False
    EmissionParameters['fg_pen'] = args.fg_pen

    EmissionParameters['Diag_event_params'] = {}
    EmissionParameters['Diag_event_params']['nr_mix_comp'] = args.nr_mix_comp
    EmissionParameters['Diag_event_params']['mix_comp'] = {}
    for state in range(NrOfStates):
        mixtures = np.random.uniform(0.0, 1.0, size=(args.nr_mix_comp))
        EmissionParameters['Diag_event_params']['mix_comp'][
            state] = mixtures / np.sum(mixtures)

    #initialise the parameter vector alpha
    alphashape = (Sequences[gene]['Variants']['0'].value.shape[0] +
                  Sequences[gene]['Coverage']['0'].value.shape[0] +
                  Sequences[gene]['Read-ends']['0'].value.shape[0])
    alpha = {}
    for state in range(NrOfStates):
        alpha[state] = np.random.uniform(
            0.9, 1.1, size=(alphashape, args.nr_mix_comp))

    EmissionParameters['Diag_event_params']['alpha'] = alpha
    EmissionParameters['Diag_event_type'] = args.diag_event_mod
    EmissionParameters['NrOfStates'] = NrOfStates
    EmissionParameters['NrOfReplicates'] = NrOfReplicates
    EmissionParameters['ExpressionParameters'] = [None, None]
    EmissionParameters['BckType'] = bg_type
    EmissionParameters['NrOfBckReplicates'] = len(args.bg_libs)
    EmissionParameters['TransitionType'] = args.tr_type
    EmissionParameters['Verbosity'] = args.verbosity
    EmissionParameters['NbProc'] = args.nb_proc
    EmissionParameters['Subsample'] = args.subs

    EmissionParameters['FilterSNPs'] = args.filter_snps
    EmissionParameters['SnpRatio'] = args.snps_thresh
    EmissionParameters['SnpAbs'] = args.snps_min_cov
    EmissionParameters['ign_diag'] = args.ign_diag
    if EmissionParameters['ign_out_rds']:
        EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
    EmissionParameters['ign_GLM'] = args.ign_GLM
    EmissionParameters['only_pred'] = args.only_pred

    EmissionParameters['use_precomp_diagmod'] = args.use_precomp_diagmod

    # Transistion parameters
    IterParameters = [EmissionParameters, TransitionParameters]

    #Start computation

    #Iterativly fit the parameters of the model
    OldLogLikelihood = 0
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    IterSaveFile = os.path.join(out_path, 'IterSaveFile.dat')
    IterSaveFileHist = os.path.join(out_path, 'IterSaveFileHist.dat')
    Paths = {}
    iter_cond = True

    #Check whether to preload the iteration file
    if EmissionParameters['only_pred']:
        IterParameters, args_old = _load_iter_file(IterSaveFile)
        #FIX: the three overrides below used to be written into the dict
        #that is replaced by the loaded one, so they had no effect. They are
        #now applied to the loaded parameters, like in the restart branch.
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        if EmissionParameters['ign_out_rds']:
            EmissionParameters['ign_diag'] = EmissionParameters['ign_out_rds']
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        OldLogLikelihood = -np.inf
        fg_state, bg_state = emission.get_fg_and_bck_state(
            EmissionParameters, final_pred=True)

        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        First = 0
        #No fitting in prediction-only mode
        iter_cond = False

    if restart_from_file:
        IterParameters, args_old = _load_iter_file(IterSaveFile)
        EmissionParameters = IterParameters[0]
        EmissionParameters['mask_miRNA'] = args.mask_miRNA
        EmissionParameters['glm_weight'] = args.glm_weight
        EmissionParameters['restart_from_file'] = restart_from_file
        EmissionParameters['ign_diag'] = args.ign_diag
        EmissionParameters['ign_GLM'] = args.ign_GLM
        TransitionParameters = IterParameters[1]
        OldLogLikelihood = -np.inf
        Paths, CurrLogLikelihood = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')
        First = 1
        iter_cond = True

    #Take the diagnostic event model from a previous run if requested
    if not EmissionParameters['use_precomp_diagmod'] is None:
        IterParametersPreComp, args_old = _load_iter_file(
            EmissionParameters['use_precomp_diagmod'])
        IterParameters[0]['Diag_event_params'] = IterParametersPreComp[0][
            'Diag_event_params']

    while iter_cond:
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['Verbosity'] > 0:
            print(IterParameters[0])

        OldLogLikelihood = CurrLogLikelihood

        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(
            Sequences, Background, IterParameters, NrOfStates, First, Paths)
        gc.collect()

        #Save the current parameters so that a run can be restarted
        _dump_iter_file([IterParameters, args], IterSaveFile)

        if args.safe_tmp:
            #The history is kept on disk between iterations to save memory
            if CurrIter > 0:
                IterHist = _load_iter_file(IterSaveFileHist)
            else:
                IterHist = []
            IterHist.append([IterParameters, CurrLogLikelihood])
            _dump_iter_file(IterHist, IterSaveFileHist)
            del IterHist

        print("Log-likelihood: " + str(CurrLogLikelihood))
        LoglikelihodList.append(CurrLogLikelihood)

        print(LoglikelihodList)
        CurrIter += 1

        if CurrIter >= MaxIter:
            print('Maximal number of iterations reached')

        if not restart_from_file:
            #NOTE(review): with max(3, MaxIter) a fresh run always performs
            #max(3, MaxIter) iterations; the convergence test in the else
            #branch is only reached once CurrIter >= MaxIter and is then
            #always False. Confirm whether min(3, MaxIter) was intended.
            if CurrIter < max(3, MaxIter):
                iter_cond = True
            else:
                iter_cond = (CurrIter < MaxIter) and _log_lik_still_changing(
                    CurrLogLikelihood, OldLogLikelihood, args.tol_lg_lik)
        else:
            if np.isinf(OldLogLikelihood):
                #The relative change is undefined for an infinite old value
                iter_cond = (CurrIter < MaxIter) and (
                    abs(CurrLogLikelihood - OldLogLikelihood) >
                    args.tol_lg_lik)
            else:
                iter_cond = (CurrIter < MaxIter) and _log_lik_still_changing(
                    CurrLogLikelihood, OldLogLikelihood, args.tol_lg_lik)

    #Return the fitted parameters
    print('Finished fitting of parameters')

    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0],
                      np.ndarray):
        print('Emmision parameters have not been fit yet')
        return

    out_file_base = 'pred'
    if EmissionParameters['ign_GLM']:
        out_file_base += '_no_glm'
    if EmissionParameters['ign_diag']:
        out_file_base += '_no_diag'
    OutFile = os.path.join(out_path, out_file_base + '.txt')

    #determine which state has higher weight in fg.
    _print_mem_usage()
    fg_state, bg_state = emission.get_fg_and_bck_state(EmissionParameters,
                                                       final_pred=True)
    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo')
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    tools.GeneratePred(Paths, Sequences, Background, IterParameters,
                       GeneAnnotation, OutFile, fg_state, bg_state,
                       seq_file=EmissionParameters['DataOutFile_seq'],
                       bck_file=EmissionParameters['DataOutFile_bck'],
                       pv_cutoff=pv_cutoff)
    print('Done')

    #Remove the temporary files
    if not (EmissionParameters['tmp_dir'] is None):
        print('removing temporary files')
        os.remove(EmissionParameters['DataOutFile_seq'])
        os.remove(EmissionParameters['DataOutFile_bck'])

    return
def ParallelGetMostLikelyPathForGene(data):
    '''
    Compute the most likely state path for a single gene with Viterbi.

    data is a 7-tuple that is unpacked as
    (gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters,
    TransitionTypeFirst, RandomNoise).

    The HDF5 files named by EmissionParameters['DataOutFile_seq'] and
    EmissionParameters['DataOutFile_bck'] are opened read-only inside this
    function and are closed again on every exit path (normal return, early
    return and exception).

    Returns the list [gene, CurrPath, Currloglik]. If
    GetModelIx(..., Type='all') selects no position for the gene, an empty
    integer array of shape (0, len(Ix)) and a log-likelihood of 0 are
    returned instead.
    '''
    (gene, nr_of_genes, gene_nr, EmissionParameters, TransitionParameters,
     TransitionTypeFirst, RandomNoise) = data

    Sequences = None
    Background = None
    try:
        #Open the data files here (instead of receiving the open objects) so
        #that the helper functions below can be used unmodified
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        #Parse the parameters
        PriorMatrix = EmissionParameters['PriorMatrix']
        NrOfStates = EmissionParameters['NrOfStates']
        TransitionType = EmissionParameters['TransitionType']
        fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters,
                                                    final_pred=True)
        fg_pen = EmissionParameters['fg_pen']

        #Score the state sequences
        #1) Determine the positions where an observation is possible
        Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
        Background_per_gene = PreloadSequencesForGene(Background, gene)

        Ix = GetModelIx(Sequences_per_gene, Type='all')
        if np.sum(Ix) == 0:
            #Nothing to model for this gene.
            #NOTE(review): the (0, n) shape yields an array without any
            #elements; kept as in the original because callers may rely on
            #it - confirm that this is intended.
            #The builtin int is what the removed alias np.int referred to.
            CurrPath = 2 * np.ones((0, Ix.shape[0]), dtype=int)
            return [gene, CurrPath, 0]

        if EmissionParameters['FilterSNPs']:
            Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv',
                            snps_thresh=EmissionParameters['SnpRatio'],
                            snps_min_cov=EmissionParameters['SnpAbs'],
                            Background=Background_per_gene)
        else:
            Ix = GetModelIx(Sequences_per_gene)

        #2) Compute the emission scores for all states
        #NOTE(review): the rows start at 1 / NrOfStates (not its logarithm)
        #and are only overwritten when the expression model is used; the
        #offset is the same for every state.
        EmmisionProbGene = np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates))

        CurrStackSum = StackData(Sequences_per_gene)
        CurrStackVar = StackData(Sequences_per_gene, add = 'no')
        CurrStackSumBck = StackData(Background_per_gene, add = 'only_cov')

        #Weights of the expression model (weight1) and of the diagnostic
        #event model (weight2). A negative glm_weight switches the weighting
        #off (log(1) == 0); 0 and 1 are nudged so that log() stays finite.
        glm_weight = EmissionParameters['glm_weight']
        if glm_weight < 0.0:
            weight1 = 1.0
            weight2 = 1.0
        elif glm_weight == 0.0:
            weight1 = 0.0000001
            weight2 = 1.0 - weight1
        elif glm_weight == 1.0:
            weight1 = 0.9999999
            weight2 = 1.0 - weight1
        else:
            weight1 = glm_weight
            weight2 = 1.0 - glm_weight

        #These switches do not depend on the state, evaluate them once
        use_glm = (not EmissionParameters['ign_GLM']) and isinstance(
            EmissionParameters['ExpressionParameters'][0], np.ndarray)
        add_bck_cov = use_glm and (
            EmissionParameters['BckType'] in ('Coverage', 'Coverage_bck'))
        use_diag = not EmissionParameters['ign_diag']

        for State in range(NrOfStates):
            if use_glm:
                EmmisionProbGene[State, :] = np.log(weight1) + emission.predict_expression_log_likelihood_for_gene(
                    CurrStackSum, State, nr_of_genes, gene_nr, EmissionParameters)
                if add_bck_cov:
                    #Both background types add the same 'bg' term
                    EmmisionProbGene[State, :] += np.log(weight1) + emission.predict_expression_log_likelihood_for_gene(
                        CurrStackSumBck, State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
            if use_diag:
                EmmisionProbGene[State, Ix] += np.log(weight2) + diag_event_model.pred_log_lik(
                    CurrStackVar[:, Ix], State, EmissionParameters)
            if State == fg_state:
                #Penalise the foreground state when 'LastIter' is set
                if EmissionParameters['LastIter']:
                    EmmisionProbGene[State, :] -= fg_pen

        if RandomNoise:
            #Add some random noise, drawn just below the smallest finite score
            finite_min = np.min(EmmisionProbGene[np.isfinite(EmmisionProbGene)])
            noise = np.random.uniform(finite_min - 4, finite_min - 0.1,
                                      EmmisionProbGene.shape)
            EmmisionProbGene = np.logaddexp(EmmisionProbGene, noise)

        #Get the transition probabilities
        if TransitionTypeFirst == 'nonhomo':
            if TransitionType == 'unif_bck' or TransitionType == 'binary_bck':
                CountsSeq = StackData(Sequences_per_gene, add = 'all')
                CountsBck = StackData(Background_per_gene, add = 'only_cov')
                Counts = np.vstack((CountsSeq, CountsBck))
            else:
                Counts = StackData(Sequences_per_gene, add = 'all')
            TransistionProbabilities = np.float64(trans.PredictTransistions(
                Counts, TransitionParameters, NrOfStates, TransitionType))
        else:
            #Use the same log transition matrix at every position
            TransistionProbabilities = np.float64(np.tile(
                np.log(TransitionParameters[0]),
                (EmmisionProbGene.shape[1], 1, 1)).T)

        CurrPath, Currloglik = viterbi.viterbi(np.float64(EmmisionProbGene),
                                               TransistionProbabilities,
                                               np.float64(np.log(PriorMatrix)))
        return [gene, CurrPath, Currloglik]
    finally:
        #Release the file handles on every exit path
        for handle in (Sequences, Background):
            if handle is not None:
                handle.close()
def GetSuffStatBck(Sequences, Background, Paths, NrOfStates, Type, ResetNotUsedStates = True, EmissionParameters=None):
    '''
    Collect the sufficient statistics of the background data for the
    foreground state.

    For every gene the stacked background data is restricted to the positions
    that are selected by GetModelIx, have a non-zero column sum and are
    assigned to the foreground state in Paths[gene] (and, if
    EmissionParameters['mask_ovrlp'] is set, are not masked). Each distinct
    column is counted; all-zero columns assigned to the foreground state are
    counted separately for speed.

    Sequences and Background are closed (best effort) and re-opened read-only
    from EmissionParameters['DataOutFile_seq'] / ['DataOutFile_bck']; the
    re-opened files are closed again before returning. EmissionParameters is
    therefore required although it defaults to None. NrOfStates and
    ResetNotUsedStates are not used by this function.

    Returns a dict {fg_state: defaultdict(int)} that maps each observed
    column (as a tuple) to the number of times it was seen.
    '''
    #Initialize the sufficient statistics variable
    print("Getting suffcient statistic")
    t = time.time()
    fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters, final_pred=True)
    SuffStatBck = {fg_state: defaultdict(int)}

    #The passed objects may be plain dictionaries or handles that are already
    #closed, so closing them is deliberately best effort
    for handle in (Sequences, Background):
        try:
            handle.close()
        except Exception:
            pass

    Sequences = None
    Background = None
    try:
        Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
        Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

        #Fill the sufficient statistics variable
        CurrState = fg_state
        for gene in Sequences.keys():
            rep = list(Background[gene]['Coverage'].keys())[0]
            CurrGenePath = Paths[gene]

            #Stack the matrices together and convert to dense matrix
            Background_per_gene = PreloadSequencesForGene(Background, gene)
            Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
            if Type == 'Conv':
                CurrStack = StackData(Background_per_gene, add = 'variants')
            else:
                CurrStack = StackData(Background_per_gene, add = 'all')

            if EmissionParameters['FilterSNPs'] and Type == 'Conv':
                Ix = GetModelIx(Background_per_gene, Type='no_snps_conv',
                                snps_thresh=EmissionParameters['SnpRatio'],
                                snps_min_cov=EmissionParameters['SnpAbs'],
                                Background=Background_per_gene)
            else:
                Ix = GetModelIx(Background_per_gene, Type)

            #Determine the nonzero elements
            NonZero = np.sum(CurrStack, axis = 0) > 0
            InState = (CurrGenePath == CurrState)

            if EmissionParameters['mask_ovrlp']:
                CurrIx = Ix * (Sequences_per_gene['mask'][rep][0, :] == 0) * NonZero * InState > 0
            else:
                CurrIx = Ix * NonZero * InState > 0

            #View every selected column as one structured element so that
            #np.unique can count identical columns. view() to a wider dtype
            #needs each row to be contiguous in memory; ascontiguousarray is
            #a no-op if that is already the case.
            data = np.ascontiguousarray(CurrStack[:, CurrIx].T)
            ncols = data.shape[1]
            dtype = data.dtype.descr * ncols
            struct = data.view(dtype)
            vals, val_counts = np.unique(struct, return_counts=True)

            #Save the tuples and how many times they have been seen so far.
            for curr_val, curr_count in zip(vals, val_counts):
                SuffStatBck[CurrState][tuple(curr_val)] += curr_count

            #Treat the 0 tuple separately for speed improvement
            if len(Ix) > 0:
                NullCount = np.sum((NonZero == 0) * InState > 0)
                if NullCount > 0:
                    NullTuple = tuple(np.zeros_like(CurrStack[:, 0]).T)
                    SuffStatBck[CurrState][NullTuple] += NullCount

            #Free the per-gene arrays before the next gene is loaded
            del CurrStack, NonZero, InState, CurrGenePath, Ix
    finally:
        #The files opened here are not handed back to the caller
        for handle in (Sequences, Background):
            if handle is not None:
                handle.close()

    print('Done: Elapsed time: ' + str(time.time() - t))
    return SuffStatBck
def em(counts, nr_of_counts, EmissionParameters, x_0=None, First=False, max_nr_iter=15, tol=0.0001, rand_sample_size=10):
    '''
    Re-estimate the mixture parameters of the diagnostic event model for
    every state in counts.

    For each state Parallel_estimate_mixture_params is called with a deep copy
    of the incoming parameters (taken before any update), so the result for
    one state does not influence the estimation of another. The returned
    alpha and mixture components are written to
    EmissionParameters['Diag_event_params'].

    If EmissionParameters['diag_bg'] is set, only the first non-foreground
    state that is encountered is estimated; it serves as a template whose
    parameters are copied to all later non-foreground states. The foreground
    state is always estimated.

    EmissionParameters is modified in place and also returned. x_0, First and
    tol are accepted for interface compatibility but not used here.
    '''
    fg_state, _ = emission.get_fg_and_bck_state(EmissionParameters, final_pred=True)

    #Snapshot of the parameters that every estimation starts from
    OldEmissionParameters = deepcopy(EmissionParameters)
    diag_params = EmissionParameters['Diag_event_params']

    have_template = False
    template_state = None
    for curr_state in counts:
        #Only compute the emission probabilities once for the
        #non-foreground states
        if EmissionParameters['diag_bg'] and curr_state != fg_state:
            if have_template:
                #NOTE(review): the message prints the state that is being
                #filled, not the template state it is copied from
                print('Using template state ' + str(curr_state))
                diag_params['mix_comp'][curr_state] = deepcopy(
                    diag_params['mix_comp'][template_state])
                diag_params['alpha'][curr_state] = deepcopy(
                    diag_params['alpha'][template_state])
                continue
            print('setting template state ' + str(curr_state))
            have_template = True
            template_state = curr_state

        print('Estimating state ' + str(curr_state))
        alpha, mixtures = Parallel_estimate_mixture_params(
            OldEmissionParameters, counts[curr_state],
            nr_of_counts[curr_state], curr_state, rand_sample_size,
            max_nr_iter, nr_of_iter=20, stop_crit=1.0, nr_of_init=10)
        diag_params['alpha'][curr_state] = alpha
        diag_params['mix_comp'][curr_state] = mixtures

    return EmissionParameters