def FitTransistionParameters(Sequences, Background, TransitionParameters,
                             CurrPath, verbosity=1):
    """Fit the logistic regression that predicts the transition parameters.

    Thin wrapper around ``FitTransistionParametersSimple``: it announces the
    step, reports memory usage before and after the fit, and hands back the
    fitted model unchanged.

    Args:
        Sequences: Forwarded as-is to ``FitTransistionParametersSimple``.
        Background: Forwarded as-is to ``FitTransistionParametersSimple``.
        TransitionParameters: Forwarded as-is.
        CurrPath: Forwarded as-is.
        verbosity: Passed to ``get_mem_usage`` and to the wrapped fit.

    Returns:
        Whatever ``FitTransistionParametersSimple`` returns.
    """
    print('Fitting transition parameters')
    fit_inputs = (Sequences, Background, TransitionParameters, CurrPath)

    get_mem_usage(verbosity)
    fitted_log_reg = FitTransistionParametersSimple(
        *fit_inputs, verbosity=verbosity)
    get_mem_usage(verbosity)

    return fitted_log_reg
def _keep_fitting(curr_iter, max_it, curr_loglik, old_loglik, abs_tol,
                  rel_tol=0.01, min_it=3):
    """Return True while the parameter fit should run another iteration."""
    # The first iterations always run, exactly as before (the previous
    # condition never stopped before iteration three either).
    if curr_iter < min_it:
        return True
    if curr_iter >= max_it:
        return False
    abs_change = abs(curr_loglik - old_loglik)
    scale = max(abs(curr_loglik), abs(old_loglik))
    # Two likelihoods of exactly zero mean "no change"; avoid dividing by 0.
    rel_change = abs_change / scale if scale > 0 else 0.0
    return rel_change > rel_tol and abs_change > abs_tol


def run_omniCLIP(args):
    """Run the complete omniCLIP fit-and-predict pipeline.

    Steps, in order: parse the arguments, create temporary copies of the
    sequence/background files, load the gene annotation, iteratively fit the
    model parameters with ``PerformIteration``, write the predictions with
    ``tools.GeneratePred`` and finally delete the temporary data files.

    Iteration stops once ``max_it`` iterations have run, or (after at least
    three iterations) as soon as the log-likelihood changes by no more than
    1 % relatively or by no more than ``args.tol_lg_lik`` absolutely.
    Previously the convergence test was unreachable and the loop always ran
    ``max(3, max_it)`` iterations.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.gene_anno_file`` and ``args.tol_lg_lik`` directly and
            passes the whole object to the ``ParsingArgs`` helpers.

    Returns:
        None. If the expression parameters were never fitted the function
        prints a message and returns early without writing predictions.

    Raises:
        ValueError: If ``args.gene_anno_file`` does not end in ``.db``.
            Before, this case only surfaced as a ``NameError`` after the
            whole fit had finished.
    """
    # Fail fast: without a '.db' annotation there is nothing to hand to
    # GeneratePred, so do not create temp copies or start fitting.
    if args.gene_anno_file.split('.')[-1] != 'db':
        raise ValueError(
            'gene_anno_file must be a gffutils database ending in ".db", '
            'got: ' + str(args.gene_anno_file))

    # Parsing the arguments
    EmissionParameters = ParsingArgs.parsing_argparse(args)
    # Creating temp copies of Sequence and Background
    ParsingArgs.dup_seqfiles(EmissionParameters)
    # Parsing arguments dependents of Sequence and Background
    EmissionParameters = ParsingArgs.parsing_files(args, EmissionParameters)

    # Load the gene annotation
    print('Loading gene annotation')
    GeneAnnotation = gffutils.FeatureDB(args.gene_anno_file, keep_order=True)

    # From here on every warning is raised as an exception.
    import warnings
    warnings.filterwarnings('error')

    # Load the reads
    get_mem_usage(EmissionParameters['verbosity'])
    t = time.time()
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r+')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r+')
    msg = 'Done: Elapsed time: ' + str(time.time() - t)
    get_mem_usage(EmissionParameters['verbosity'], t=t, msg=msg)

    # Initializing parameters
    print('Initialising the parameters')
    # Transition parameters: [transition matrix, fitted predictors]
    TransitionParameters = [EmissionParameters['TransMat'], []]
    IterParameters = [EmissionParameters, TransitionParameters]

    # Start computation
    # Iteratively fit the parameters of the model
    CurrLogLikelihood = -np.inf
    CurrIter = 0
    LoglikelihodList = []
    First = 1
    Paths = {}
    iter_cond = True
    while iter_cond:
        print("\n")
        print("Iteration: " + str(CurrIter))
        if EmissionParameters['verbosity'] > 1:
            print(IterParameters[0])

        OldLogLikelihood = CurrLogLikelihood
        CurrLogLikelihood, IterParameters, First, Paths = PerformIteration(
            Sequences, Background, IterParameters,
            EmissionParameters['NrOfStates'], First, Paths,
            verbosity=EmissionParameters['verbosity'])
        gc.collect()

        if EmissionParameters['verbosity'] > 1:
            print("Log-likelihood: " + str(CurrLogLikelihood))
        LoglikelihodList.append(CurrLogLikelihood)
        if EmissionParameters['verbosity'] > 1:
            print(LoglikelihodList)

        CurrIter += 1
        if CurrIter >= EmissionParameters['max_it']:
            print('Maximal number of iterations reached')

        iter_cond = _keep_fitting(
            CurrIter, EmissionParameters['max_it'],
            CurrLogLikelihood, OldLogLikelihood, args.tol_lg_lik)

    # Return the fitted parameters
    print('Finished parameter fitting')
    EmissionParameters, TransitionParameters = IterParameters
    if not isinstance(EmissionParameters['ExpressionParameters'][0],
                      np.ndarray):
        print('Emmision parameters have not been fit yet')
        return

    OutFile = os.path.join(EmissionParameters['out_dir'],
                           EmissionParameters['out_file_base'] + '.txt')

    # Determine which state has higher weight in fg.
    get_mem_usage(EmissionParameters['verbosity'])
    fg_state, bg_state = emission_prob.get_fg_and_bck_state(
        EmissionParameters, final_pred=True)

    if EmissionParameters['fg_pen'] > 0.0:
        print('Recomputing paths')
        EmissionParameters['LastIter'] = True
        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
        Paths, LogLike = tools.ParallelGetMostLikelyPath(
            Paths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'nonhomo',
            verbosity=EmissionParameters['verbosity'])

    # Always hand fresh read-only handles to the prediction step.
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
    tools.GeneratePred(
        Paths, Sequences, Background, IterParameters, GeneAnnotation,
        OutFile, fg_state, bg_state,
        pv_cutoff=EmissionParameters['pv_cutoff'],
        verbosity=EmissionParameters['verbosity'])
    print('Done')

    # Remove the temporary files
    print('Removing temporary files')
    # Release the handles first so the files are not deleted while open.
    for handle in (Sequences, Background):
        handle.close()
    os.remove(EmissionParameters['dat_file_clip'])
    os.remove(EmissionParameters['dat_file_bg'])
    return
def FitEmissionParameters(Sequences, Background, NewPaths, OldEmissionParameters, First, verbosity=1):
    """Fit EmissionParameters.

    Re-estimates, from the current state paths, the state prior, the
    expression parameters (via ``emission_prob.estimate_expression_param``)
    and, unless ``skip_diag_event_mdl`` is set, the diagnostic-event mixture
    (via ``mixture_tools.em``).

    Args:
        Sequences: Open data handle for the CLIP data; closed and reopened
            from ``dat_file_clip`` inside this function.
        Background: Open data handle for the background data; reopened from
            ``dat_file_bg`` inside this function.
        NewPaths: Mapping from gene key to state path. A ``'Pseudo'`` entry
            may be added and is deleted again before returning.
        OldEmissionParameters: Parameter dict. It is NOT copied: the
            ``'PriorMatrix'`` and ``'ExpressionParameters'`` entries are
            overwritten in place.
        First: Truthy on the first iteration; skips the insertion of a
            parameter row for the pseudo gene and is forwarded to
            ``mixture_tools.em``.
        verbosity: Forwarded to the helpers; values > 0 print the elapsed
            time.

    Returns:
        The updated emission parameter dict.
    """
    print('Fitting emission parameters')
    t = time.time()
    # Unpack the arguments
    OldAlpha = OldEmissionParameters['Diag_event_params']
    NrOfStates = OldEmissionParameters['NrOfStates']
    OldPriorMatrix = OldEmissionParameters['PriorMatrix']
    # Alias, not a copy: writes below also modify the caller's dict.
    NewEmissionParameters = OldEmissionParameters

    # Compute new prior matrix
    # (number of path positions assigned to each state, over all genes)
    PriorMatrix = np.zeros_like(OldPriorMatrix)
    for State in range(NrOfStates):
        for path in NewPaths:
            PriorMatrix[State] += np.sum(NewPaths[path] == State)

    # Check if one of the states is not used and add pseudo gene to prevent
    # singularities during distribution fitting
    if np.sum(PriorMatrix == 0) > 0:
        # Reopen both files writable before calling add_pseudo_gene.
        LoadReads.close_data_handles(handles=[Sequences, Background])
        Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r+')
        Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r+')
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)
        print('Adds pseudo gene to prevent singular matrix during GLM fitting')

    # Unused states get a tenth of the smallest positive count, then the
    # vector is normalised to sum to one.
    # NOTE(review): if PriorMatrix has an integer dtype the in-place division
    # below would not be possible - confirm the dtype of 'PriorMatrix'.
    CorrPriorMatrix = np.copy(PriorMatrix)
    CorrPriorMatrix[CorrPriorMatrix == 0] = np.min(
        CorrPriorMatrix[CorrPriorMatrix > 0])/10
    CorrPriorMatrix /= np.sum(CorrPriorMatrix)
    # Keep a copy to check which states are not used
    NewEmissionParameters['PriorMatrix'] = CorrPriorMatrix

    # Add Pseudo gene to Sequences, Background and Paths
    # NOTE(review): this can run in addition to the call above; presumably
    # add_pseudo_gene tolerates being called twice - confirm.
    if NewEmissionParameters['ExpressionParameters'][0] is not None:
        Sequences, Background, NewPaths = add_pseudo_gene(
            Sequences, Background, NewPaths, PriorMatrix)

    # Compute parameters for the expression
    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    if (NewEmissionParameters['bg_type'] != 'None') and not First:
        if 'Pseudo' in list(Sequences.keys()):
            # Insert one extra parameter row (the overall mean of the leading
            # rows) after the first nr_of_genes rows.
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack(
                (new_pars[:(nr_of_genes), :],
                 np.mean(new_pars[:(nr_of_genes), :]),
                 new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    print('Estimating expression parameters')
    get_mem_usage(verbosity)
    NewEmissionParameters = emission_prob.estimate_expression_param(
        (NewEmissionParameters, NewPaths), verbosity=verbosity)
    get_mem_usage(verbosity)

    Sequences = h5py.File(NewEmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(NewEmissionParameters['dat_file_bg'], 'r')
    if NewEmissionParameters['bg_type'] != 'None':
        if 'Pseudo' in list(Sequences.keys()):
            # Drop the parameter row at index nr_of_genes - 1 again.
            nr_of_genes = len(list(Sequences.keys()))
            new_pars = NewEmissionParameters['ExpressionParameters'][0]
            new_pars = np.vstack((new_pars[:(nr_of_genes-1), :],
                                  new_pars[(nr_of_genes):, :]))
            NewEmissionParameters['ExpressionParameters'][0] = new_pars

    if NewEmissionParameters['skip_diag_event_mdl'] is False:
        # Compute parameters for the ratios
        print('Computing sufficient statistic for fitting md')
        get_mem_usage(verbosity)
        SuffStat = tools.GetSuffStat(
            NewPaths, NrOfStates, Type='Conv',
            EmissionParameters=NewEmissionParameters, verbosity=verbosity)
        # Vectorize SuffStat
        Counts, NrOfCounts = tools.ConvertSuffStatToArrays(SuffStat)
        del SuffStat
        get_mem_usage(verbosity)
        if NewEmissionParameters['subs']:
            Counts, NrOfCounts = tools.subsample_suff_stat(Counts, NrOfCounts)

        print('Fitting md distribution')
        get_mem_usage(verbosity)
        if NewEmissionParameters['diag_bg']:
            print("Adjusting background")
            SuffStatBck = tools.GetSuffStatBck(
                NewPaths, NrOfStates, Type='Conv',
                EmissionParameters=NewEmissionParameters, verbosity=verbosity)
            # Vectorize SuffStat
            CountsBck, NrOfCountsBck = tools.ConvertSuffStatToArrays(
                SuffStatBck)
            if NewEmissionParameters['subs']:
                CountsBck, NrOfCountsBck = tools.subsample_suff_stat(
                    CountsBck, NrOfCountsBck)
            # Overwrite counts in other bins
            # (every non-foreground state receives the background-derived
            # counts stored under the foreground state's key)
            fg_state, bg_state = emission_prob.get_fg_and_bck_state(
                NewEmissionParameters, final_pred=True)
            for curr_state in list(Counts.keys()):
                if curr_state != fg_state:
                    Counts[curr_state] = CountsBck[fg_state]
                    NrOfCounts[curr_state] = NrOfCountsBck[fg_state]
            del SuffStatBck

        # The previous diagnostic-event parameters serve as starting point.
        NewEmissionParameters = mixture_tools.em(
            Counts, NrOfCounts, NewEmissionParameters, x_0=OldAlpha,
            First=First, verbosity=verbosity)
        get_mem_usage(verbosity)
        del Counts, NrOfCounts

    # Remove the pseudo gene again so it does not leak into later steps.
    # NOTE(review): both handles were last opened with mode 'r' above -
    # confirm that deleting a key through them succeeds.
    if 'Pseudo' in list(Sequences.keys()):
        del Sequences['Pseudo']
        del Background['Pseudo']
        del NewPaths['Pseudo']

    if verbosity > 0:
        print('Done: Elapsed time: ' + str(time.time() - t))
    return NewEmissionParameters
def PerformIteration(Sequences, Background, IterParameters, NrOfStates, First,
                     NewPaths=None, verbosity=1):
    """Perform one iteration of the HMM parameter fitting.

    One iteration consists of (1) on the first call only, an initial most
    likely path, (2) refitting the emission parameters, (3) refitting the
    transition predictors and (4) recomputing the most likely path.

    Args:
        Sequences: Open data handle for the CLIP data.
        Background: Open data handle for the background data.
        IterParameters: ``[EmissionParameters, TransitionParameters]``.
            The transition list is updated in place (index 1).
        NrOfStates: Unused here; kept for interface compatibility.
        First: Truthy on the first iteration.
        NewPaths: Mapping from gene key to the current state path. Defaults
            to a fresh empty dict per call (the former ``{}`` default was a
            single dict shared between calls, and downstream code adds and
            removes keys on it).
        verbosity: Verbosity level forwarded to all helpers.

    Returns:
        Tuple ``(CurrLogLikelihood, NewIterParameters, First, NewPaths)``
        where ``First`` has been reset to ``0`` if it was truthy.
    """
    if NewPaths is None:
        NewPaths = {}

    # Unpack the Iteration parameters
    EmissionParameters = IterParameters[0]
    TransitionParameters = IterParameters[1]

    # Get new most likely path
    if First:
        # NOTE(review): the mode literal below looks garbled (the later call
        # uses 'nonhomo'); it is reproduced unchanged - confirm the value
        # expected by tools.ParallelGetMostLikelyPath.
        NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
            NewPaths, Sequences, Background, EmissionParameters,
            TransitionParameters, 'h**o', RandomNoise=True,
            verbosity=verbosity)
        # Handles are reopened after every path computation.
        Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
        Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
        get_mem_usage(verbosity)

    # Perform EM to compute the new emission parameters
    print('Fitting emission parameters')
    get_mem_usage(verbosity)
    NewEmissionParameters = FitEmissionParameters(
        Sequences, Background, NewPaths, EmissionParameters, First,
        verbosity=verbosity)
    if First:
        First = 0
    get_mem_usage(verbosity)

    # Fit the transition matrix parameters
    # Alias, not a copy: assigning index 1 below also updates
    # TransitionParameters, which the final path computation uses.
    NewTransitionParameters = TransitionParameters
    print('Fitting transition parameters')
    get_mem_usage(verbosity)
    LoadReads.close_data_handles(handles=[Sequences, Background])
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
    TransistionPredictors = trans.FitTransistionParameters(
        Sequences, Background, TransitionParameters, NewPaths,
        verbosity=verbosity)
    NewTransitionParameters[1] = TransistionPredictors
    get_mem_usage(verbosity)

    NewIterParameters = [NewEmissionParameters, NewTransitionParameters]

    print('Computing most likely path')
    get_mem_usage(verbosity)
    gc.collect()
    NewPaths, LogLike = tools.ParallelGetMostLikelyPath(
        NewPaths, Sequences, Background, EmissionParameters,
        TransitionParameters, 'nonhomo', verbosity=verbosity)
    # NOTE(review): these reopened handles are not used or returned here;
    # kept because the effect on later opens of the same files is unclear.
    Sequences = h5py.File(EmissionParameters['dat_file_clip'], 'r')
    Background = h5py.File(EmissionParameters['dat_file_bg'], 'r')
    CurrLogLikelihood = LogLike
    get_mem_usage(verbosity)
    if verbosity > 1:
        print('LogLik:')
        print(CurrLogLikelihood)
    return CurrLogLikelihood, NewIterParameters, First, NewPaths
def FitTransistionParametersSimple(Sequences, Background, TransitionParameters,
                                   CurrPath, verbosity=1):
    """Determine optimal logistic regression parameters.

    Builds a training set from at most 1000 randomly sampled genes: for every
    ordered state pair the features of the positions where the path makes
    that transition are collected with ``GenerateFeatures``. Feature columns
    of "stay in the same state" transitions are labelled 1, all others 0. An
    ``SGDClassifier`` is then trained with shuffled mini-batches.

    Args:
        Sequences: Data handle passed to ``tools.PreloadSequencesForGene``.
        Background: Unused; kept for interface compatibility.
        TransitionParameters: Sequence whose first element is the transition
            matrix; only its first dimension (number of states) is used.
        CurrPath: Mapping from gene key to the current state path.
        verbosity: Verbosity level forwarded to ``get_mem_usage``.

    Returns:
        The fitted ``SGDClassifier``.

    Raises:
        AssertionError: If the transition matrix has fewer than two states.
        ValueError: If ``CurrPath`` contains no genes.
    """
    # Generate features from the CurrPaths and the Information in the coverage
    TransitionMatrix = TransitionParameters[0]
    assert (TransitionMatrix.shape[0] > 1), 'At least two states are required'

    genes = list(CurrPath.keys())
    if not genes:
        # Previously surfaced as an opaque concatenate error much later.
        raise ValueError(
            'Cannot fit transition parameters: CurrPath contains no genes')

    t = time.time()
    genes = random.sample(genes, min(len(genes), 1000))
    NrOfStates = TransitionMatrix.shape[0]
    SampleSame = []
    SampleOther = []

    print("Learning transition model")
    print("Iterating over genes")
    get_mem_usage(verbosity, msg='Fitting transition parameters: I')

    # Iterate over the possible transitions
    for i, gene in enumerate(genes):
        if i % 1000 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

        # Get data
        Sequences_per_gene = tools.PreloadSequencesForGene(Sequences, gene)
        CovMat = tools.StackData(Sequences_per_gene, add='all')
        CovMat[CovMat < 0] = 0
        nr_of_samples = CovMat.shape[0]
        gene_path = CurrPath[gene]

        for CurrState in range(NrOfStates):
            # Positions where the path is in the current state
            in_curr_state = gene_path[:-1] == CurrState
            for NextState in range(NrOfStates):
                # Positions where the subsequent position is in the "next"
                # state
                in_next_state = gene_path[1:] == NextState
                # Positions where the path makes exactly this transition
                transition_ix = np.where(in_curr_state * in_next_state)[0]
                CovMatIx = GenerateFeatures(transition_ix, CovMat)
                # A transition that never occurs still contributes one
                # all-zero feature column.
                if CovMatIx.shape[1] == 0:
                    CovMatIx = np.zeros((nr_of_samples, 1))
                if CurrState == NextState:
                    SampleSame.append(CovMatIx)
                else:
                    SampleOther.append(CovMatIx)
        del Sequences_per_gene, CovMat

    get_mem_usage(verbosity, msg='Fitting transition parameters: II')
    len_same = sum(Mat.shape[1] for Mat in SampleSame)
    len_other = sum(Mat.shape[1] for Mat in SampleOther)

    # One row per collected feature column, "same" rows first.
    X = np.concatenate(SampleSame + SampleOther, axis=1).T
    del SampleSame, SampleOther

    # Create Y (np.int no longer exists in current NumPy; use builtin int)
    Y = np.concatenate((np.ones(len_same, dtype=int),
                        np.zeros(len_other, dtype=int)))
    classes = np.unique(Y)

    get_mem_usage(verbosity, msg='Fitting transition parameters: III')
    # np.ceil returns a float, which range() rejects - cast explicitly.
    n_iter = max(5, int(np.ceil(10**6 / Y.shape[0])))

    # NOTE(review): newer scikit-learn releases name this loss 'log_loss';
    # left unchanged to stay compatible with the version this project pins.
    NewTransitionParametersLogReg = SGDClassifier(loss="log", max_iter=n_iter)
    ix_shuffle = np.arange(X.shape[0])
    for _ in range(n_iter):
        np.random.shuffle(ix_shuffle)
        for batch_ix in np.array_split(ix_shuffle, 50):
            # With fewer than 50 rows some batches are empty.
            if batch_ix.size == 0:
                continue
            NewTransitionParametersLogReg.partial_fit(
                X[batch_ix, :], Y[batch_ix], classes=classes)

    del X, Y
    get_mem_usage(verbosity, t=t, msg='Fitting transition parameters: IV')
    return NewTransitionParametersLogReg
def estimate_expression_param(expr_data, verbosity=1):
    """Estimate the parameters for the expression GLM.

    Args:
        expr_data: Tuple ``(EmissionParameters, Paths)``. The data files are
            opened read-only from ``EmissionParameters['dat_file_clip']`` and
            ``EmissionParameters['dat_file_bg']``.
        verbosity: Verbosity level forwarded to ``get_mem_usage``.

    Returns:
        The same ``EmissionParameters`` dict, with its
        ``'ExpressionParameters'`` entry replaced by the fitted
        ``[start_params, disp]`` pair.
    """
    (EmissionParameters, Paths) = expr_data

    # 1) Get the library size
    print('Start estimation of expression parameters')
    bg_type = EmissionParameters['bg_type']
    lib_size = EmissionParameters['LibrarySize']
    bck_lib_size = EmissionParameters['BckLibrarySize']
    start_params = EmissionParameters['ExpressionParameters'][0]
    disp = EmissionParameters['ExpressionParameters'][1]

    # The handles are only needed for this call; the context manager closes
    # them again instead of leaking two open files per invocation.
    with h5py.File(EmissionParameters['dat_file_clip'], 'r') as Sequences, \
            h5py.File(EmissionParameters['dat_file_bg'], 'r') as Background:
        # 2) Estimate dispersion
        print('Constructing GLM matrix')
        t = time.time()

        # 3) Compute sufficient statistics
        get_mem_usage(
            verbosity,
            msg='Estimating expression parameters: before GLM matrix construction')
        A, w, Y, rep = construct_glm_matrix(
            EmissionParameters, Sequences, Background, Paths)
        print('Estimating expression parameters: GLM matrix constrution')
        get_mem_usage(verbosity, t=t)

        # Make sure that matrix A is in the right format
        if not sp.sparse.isspmatrix_csc(A):
            A = csc_matrix(A)
        get_mem_usage(
            verbosity,
            msg='Estimating expression parameters: before GLM matrix')

        # Create the offset for the library size
        # Positive entries of rep select CLIP replicates, negative entries
        # select background replicates (both numbered from 1).
        offset = np.zeros_like(rep)
        for i in range(EmissionParameters['NrOfReplicates']):
            offset[rep == (i + 1)] = lib_size[str(i)]
        if bg_type != 'None':
            for i in range(EmissionParameters['NrOfBckReplicates']):
                offset[rep == -(i + 1)] = bck_lib_size[str(i)]

        # 4) Fit GLM
        print('Fitting GLM')
        t = time.time()
        print('Estimating expression parameters: before fitting')
        get_mem_usage(verbosity)
        start_params, disp = fit_glm(
            A, w, Y, offset, disp, start_params,
            norm_class=EmissionParameters['norm_class'],
            tmp_dir=EmissionParameters['tmp_dir'])
        get_mem_usage(
            verbosity, msg='Estimating expression parameters: after fitting')
        del A, w, Y, offset
        get_mem_usage(
            verbosity, t=t,
            msg='Estimating expression parameters: after cleanup')

    # 5) Process the output
    EmissionParameters['ExpressionParameters'] = [start_params, disp]
    print('Finished expression parameter estimation')
    return EmissionParameters