Пример #1
0
def predict_ratios_with_SD(predictors, measures, nruns, outDir):
	'''
	Predict every flux ratio together with its standard deviation, then display and save the table.
	
	Parameters
	predictors: df, index are ratios, columns are ['model', 'errormodel', 'features']
	measures: df, measured MDVs, rows are MDVs, columns are ['mean', 'sd']
	nruns: int, # of runs for Monte Carlo simulation
	outDir: str, output directory
	
	Returns
	None. The table (columns ['predicted', 'sd'], one row per ratio) is handed to
	display_estimated_ratios_or_fluxes and save_data.
	'''
	
	predRatiosSDs = pd.DataFrame(columns = ['predicted', 'sd'])
	for ratio, [baseModel, errorModel, features] in predictors.iterrows():
		
		# only the MDVs this ratio's model was trained on
		subMeasures = measures.loc[features, :]
		
		if subMeasures['sd'].sum() == 0.0:
			# no measurement SDs available: pass the error model instead of SDs/nruns
			predRatio, predSD = predict_ratio_with_SD(baseModel, subMeasures['mean'], errorModel = errorModel)
			
		else:
			# measurement SDs available: pass them along with the number of Monte Carlo runs
			predRatio, predSD = predict_ratio_with_SD(baseModel, subMeasures['mean'], SDs = subMeasures['sd'], nruns = nruns)
		
		# np.asscalar was removed in NumPy 1.23; ndarray.item() is its documented replacement
		# and np.asarray lets it accept plain Python scalars as well as size-1 arrays
		predRatiosSDs.loc[ratio, :] = [np.asarray(predRatio).item(), np.asarray(predSD).item()]
		
	
	display_estimated_ratios_or_fluxes('ratio', predRatiosSDs)
	
	save_data(predRatiosSDs, 'predicted_ratios', outDir, True, True)


	

	
	
Пример #2
0
def generate_random_fluxes_in_parallel(S, revs, fluxCons, ratioCons, bndCons, nsims, njobs, outDir):
	'''
	Sample random net flux distributions, plot and save them, and convert them to total fluxes.
	
	Parameters
	S: df, stoichiometric matrix, balanced metabolites in rows, net reactions fluxes in columns
	revs: ser, reaction reversibility
	fluxCons: ser or None, flux value constraints
	ratioCons: df, ratio range constraints, columns are ['greater_than_zero', 'smaller_than_zero'], e.g. a < v1/v2 < b will be transformed into v1 - a*v2 > 0 and v1 - b*v2 < 0
	bndCons: df, flux range constraints, columns are ['lb', 'ub']
	nsims: int, # of flux distributions to simulate
	njobs: int, # of jobs run in parallel
	outDir: str, output directory
	
	Returns
	totalFluxDistribs: df, total fluxes distributions, columns are fluxes, rows are runs
	'''
	
	hasFluxCons = fluxCons is not None
	
	# report the null-space dimension of S against the number of flux value constraints
	nullDim = null_space(S).shape[1]
	if hasFluxCons:
		nFluxCons = fluxCons.shape[0]
	else:
		nFluxCons = 0
	display_DOF(nullDim, nFluxCons)
	
	# sample net fluxes, then plot and save them
	sampledNetFluxes, sampledBnds = flux_sampler(S, revs, fluxCons, ratioCons, bndCons, nsims, njobs)
	plot_random_flux_distributions(sampledNetFluxes, sampledBnds, outDir)
	save_data(sampledNetFluxes, 'random_fluxes', outDir, False, True)
	
	# scale factor: mean of the non-zero constrained flux values, or 1 without constraints
	if hasFluxCons:
		nonZeroCons = fluxCons[fluxCons != 0]
		scaleFactor = nonZeroCons.mean()
	else:
		scaleFactor = 1
	
	return generate_total_flux_from_net_flux(sampledNetFluxes, revs, scaleFactor)
Пример #3
0
def simulate_ratios_MDVs_in_parallel(simEMUs, symRatios, symAs, symBs,
                                     subMDVsAll, fluxDistribs, quantile, njobs,
                                     outDir):
    '''
    Simulate flux ratios and MDVs for every flux distribution using a process pool.

    Parameters
    simEMUs: lst, of which the MDVs will be simulated
    symRatios: df, index is ratio, columns are ['args', 'symbol']
    symAs: dict, key is size, value is like [[symbol variables of A], symbol matrix A, [column EMUs of A]]
    symBs: dict, key is size, value is like [[symbol variables of B], symbol matrix B, [column EMUs of B]]
    subMDVsAll: dict of dict, like {tracer: {substrate EMU: MDV}}
    fluxDistribs: df, fluxes distributions, columns are fluxes, rows are runs
    quantile: float, simulated values in the quantile interval (i.e. [0.5 - quantile/2, 0.5 + quantile/2]) are retained
    njobs: int, # of jobs run in parallel
    outDir: str, output directory

    Returns
    ratiosMDVsAll: df, combined ratiosMDVs, of which the index is flux distribution NO, columns are flux ratios and MDVs
    '''

    # split the runs into at most njobs contiguous chunks of equal (ceil) size
    length = int(np.ceil(fluxDistribs.shape[0] / njobs))
    fluxDistribChunks = [
        fluxDistribs[i * length:(i + 1) * length] for i in range(njobs)
    ]

    # when the runs do not fill every chunk the trailing chunks are empty; do not
    # dispatch those. If there are no runs at all, still make one call on the
    # whole (empty) frame so the result keeps whatever shape simulator returns.
    fluxDistribChunks = [
        chunk for chunk in fluxDistribChunks if len(chunk) > 0
    ] or [fluxDistribs]

    # the context manager terminates the workers even if submission raises
    with Pool(processes=njobs) as pool:

        pending = [
            pool.apply_async(func=simulator,
                             args=(simEMUs, symRatios, symAs, symBs,
                                   subMDVsAll, chunk))
            for chunk in fluxDistribChunks
        ]

        pool.close()
        pool.join()

        # get() re-raises any exception that occurred inside a worker
        ratiosMDVs = [res.get() for res in pending]

    ratiosMDVsAll = pd.concat(ratiosMDVs, ignore_index=True)

    ratiosMDVsAll = filter_ratios(symRatios.index, ratiosMDVsAll, quantile)

    save_data(ratiosMDVsAll, 'ratios_MDVs', outDir, False, True)

    # the docstring has always promised this value; it was never returned before
    return ratiosMDVsAll
Пример #4
0
def estimate_fluxes_with_SD(S, AeqConsAll, beqConsAll, bndCons, nruns, outDir):
    '''
    Estimate fluxes once per set of equality constraints and summarise the net
    fluxes by their mean and standard deviation over all runs.

    Parameters
    S: df, stoichiometric matrix, balanced metabolites in rows, total reactions in columns
    AeqConsAll: lst of df, A of equality constraints
    beqConsAll: lst of ser, b of equality constraints
    bndCons: df, boundary constraints of flux
    nruns: int, # of runs for Monte Carlo simulation
    outDir: str, output directory

    Returns
    estNetFluxesSDs: df, estimated net fluxes, rows are rxns, columns are ['estimated', 'sd']
    '''

    neqns = null_space(S).shape[1]
    # the first constraint set supplies the constraint count that is displayed
    ncons = AeqConsAll[0].shape[0]

    display_DOF(neqns, ncons)

    # one row per run, one column per reaction of S
    estTotalFluxes = pd.DataFrame(index=np.arange(nruns), columns=S.columns)
    for i, (AeqCons, beqCons) in enumerate(zip(AeqConsAll, beqConsAll)):

        estTotalFluxes.loc[i, :] = estimate_fluxes(S, AeqCons, beqCons,
                                                   bndCons)

    estNetFluxes = calculate_net_flux_from_total_flux(estTotalFluxes)

    # column-wise statistics over the runs
    estNetFluxesSDs = pd.DataFrame({
        'estimated': estNetFluxes.mean(axis=0),
        'sd': estNetFluxes.std(axis=0)
    })

    display_estimated_ratios_or_fluxes('flux', estNetFluxesSDs)

    save_data(estNetFluxesSDs, 'estimated_fluxes', outDir, True, True)

    # the docstring has always promised this table; it was never returned before
    return estNetFluxesSDs
Пример #5
0
def select_ratios(EMUs, EAMs, symAs, symBs, subMDVsAll, fluxDistrib, outDir, exNodes = [], thold1 = 1e12, thold2 = 1e-3):
	'''
	Select the flux ratios that can be determined from the simulated MDVs of the given EMUs.
	
	Parameters
	EMUs: lst, of which the MDVs will be simulated
	EAMs: dict, EMU adjacency matrix (EAM) of different size, like {size: EAM}. NOTE: the cells of EAM are symbols
	symAs: dict, key is size, value is like [[symbol variables of A], symbol matrix A, [column EMUs of A]]
	symBs: dict, key is size, value is like [[symbol variables of B], symbol matrix B, [column EMUs of B]]
	subMDVsAll: dict of dict, like {tracer: {substrate EMU: MDV}}
	fluxDistrib: ser, flux distribution
	outDir: str, output directory
	exNodes: lst, node metabolites excluded for ratio selection (only read, never modified)
	thold1: float, threshold to calculate the null space, the greater threshold, the easier to get non-empty null space (higher DOF)
	thold2: float, distance threshold, under which column MDVs will be considered equal (currently unused, see NOTE below)
	
	Returns
	selRatiosAll: df, selected ratios, index is ratio, columns are ['args', 'symbol']
	
	Raises
	ValueError: if no ratio is selected
	'''
	
	def find_independent_columns(data, thold):
		'''
		Parameters
		data: df, independent columns of which will be found
		thold: float, distance threshold, under which columns will be considered equal
		
		Returns
		indCols: lst, independent column names
		'''
		
		# cluster the columns; a column counts as independent when it is alone in its cluster
		labels = AgglomerativeClustering(n_clusters = None, distance_threshold = thold).fit_predict(data.values.T)
		
		labelMapping = {}
		for label, col in zip(labels, data.columns):
			labelMapping.setdefault(label, []).append(col)
		
		indCols = [cols[0] for cols in labelMapping.values() if len(cols) == 1]
		
		return indCols
	
	
	lamAs = lambdify_matrix(symAs)
	lamBs = lambdify_matrix(symBs)
	
	
	# per-EMU selections are collected here and concatenated once after the loops
	selRatiosParts = []
	for subMDVs in subMDVsAll.values():
		
		simMDVsAll = simulate_MDVs(EMUs, lamAs, lamBs, subMDVs, fluxDistrib, 2)
		
		# simulated MDVs take precedence over substrate MDVs; built once per tracer
		knownMDVs = ChainMap(simMDVsAll, subMDVs)
		
		for EMU in EMUs:
			
			# split e.g. 'Abc123' into metabolite name and atom numbers
			metab, atomNOs = re.match(r'^(\w+?)(\d+)$', EMU).groups()
			
			# non-zero entries of this EMU's column in the EAM of matching size
			EAMcol = EAMs[len(atomNOs)][EMU]
			inputInfo = EAMcol[EAMcol != 0]
			
			if metab not in exNodes and inputInfo.shape[0] > 1:
				
				# one column per precursor entry: convolution of the MDVs of its comma-separated EMUs
				inputMat = np.array([reduce(conv, [knownMDVs[preEMU] for preEMU in preEMUs.split(',')]) for preEMUs in inputInfo.index]).T
				inputMat = pd.DataFrame(inputMat, columns = inputInfo.index)
				
				DOF = null_space(inputMat.values, rcond = np.finfo(np.float64).eps * max(inputMat.shape) * thold1).shape[1]
				if DOF == 0:
					# precursor MDV columns are linearly independent: keep all of them
					selPreEMUs = inputInfo.index.tolist()
				
				else:
					# NOTE: selection by find_independent_columns(inputMat, thold = thold2)
					# is disabled, so an EMU with dependent columns contributes no ratio
					selPreEMUs = []
					
				if selPreEMUs:
					selRatios = pd.DataFrame()
					selRatios['symbol'] = inputInfo[selPreEMUs] / inputInfo.sum()
					selRatios['args'] = selRatios['symbol'].apply(lambda r: list(map(str, r.free_symbols)))
					selRatios['formula'] = inputInfo[selPreEMUs].index.str.replace(',', '+', regex = False) + '_' + inputInfo.name
					
					selRatiosParts.append(selRatios)
	
	# must be checked before touching the 'symbol' column: with nothing selected
	# there is no such column and drop_duplicates would raise KeyError instead
	if not selRatiosParts:
		raise ValueError('no ratio selected, simulation terminated.')
	
	selRatiosAll = pd.concat(selRatiosParts)
	selRatiosAll = selRatiosAll.drop_duplicates(subset = ['symbol'])
	selRatiosAll.index = ['r' + str(i) for i in range(1, selRatiosAll.shape[0]+1)]
	
	save_data(selRatiosAll[['formula', 'symbol']], 'selected_ratios', outDir, True, True)
	
	
	return selRatiosAll[['args', 'symbol']]
Пример #6
0
def model_selector(ratio,
                   Xtrain,
                   Xtest,
                   Ytrain,
                   Ytest,
                   methods,
                   outDir,
                   error=False,
                   nfolds=5):
    '''
    Tune and evaluate every ML method for one ratio, saving the models, the
    scores and the plots into a sub-directory named after the ratio.

    Parameters
    ratio: str, ratio ID
    Xtrain: df, feature matrix for training
    Xtest: df, feature matrix for testing
    Ytrain: ser, target for training
    Ytest: ser, target for testing
    methods: lst, ML methods to test
    outDir: str, output directory
    error: bool, whether to train an error model
    nfolds: int, cross validation folds
    '''

    subOutDir = r'%s/%s' % (outDir, ratio)
    os.makedirs(subOutDir, exist_ok=True)

    print('\nratio ' + ratio)

    predRess = pd.DataFrame(
        columns=pd.MultiIndex.from_product([methods, ['predicted', 'true']]))
    # explicit float dtype: an empty Series without dtype warns on pandas 1.x
    # and becomes object dtype on pandas 2.x
    MAEs = pd.Series(index=methods, dtype=float)
    R2s = pd.Series(index=methods, dtype=float)

    for method in methods:

        print('\ntuning %s ...' % method)

        bestModel, bestParams, predRes, MAE, R2 = evaluate_model(Xtrain,
                                                                 Xtest,
                                                                 Ytrain,
                                                                 Ytest,
                                                                 method,
                                                                 nfolds=nfolds)

        predRess.loc[:, idx[method, :]] = predRes.values
        MAEs[method] = MAE
        R2s[method] = R2

        display_best_params(bestParams)

        save_model(method, bestModel, subOutDir)

        if error:
            # squared prediction error on the test set is the target of the error model
            Ypredict = bestModel.predict(Xtest.values)

            YtestError = (Ypredict - Ytest)**2

            # NOTE(review): the error model is tuned on the test split itself - confirm this is intended
            bestErrorModel = tune_model(Xtest,
                                        YtestError,
                                        method,
                                        nfolds=nfolds)[0]

            save_model(method + '_error', bestErrorModel, subOutDir)

    plot_MAE(ratio, MAEs, subOutDir)
    save_data(MAEs, 'MAE', subOutDir, True, False)

    plot_predicted_vs_true(ratio, predRess, R2s, subOutDir)
    save_data(predRess, 'predicted_vs_true', subOutDir, False, True)
    save_data(R2s, 'R2', subOutDir, True, False)