def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '',
        varModSequence = '',
        precursor_filter = False,
        high_res_gauss_dist = 0.05):
    """Score a single peptide-spectrum match with GMTK and return the decoded PSM.

    Inputs:
       p = peptide string
       s0 = observed spectrum (the docstring of the original calls it an
            instance of class MS2Spectrum); its mz and intensity attributes
            are overwritten in place for later plotting
       c = psm charge
       highResMs2 = selects the high-resolution ion/filter helpers when true
       dripLearnedMeans = means file passed to load_drip_means (low-res only)
       dripLearnedCovars = covariances file passed to write_covar_file
       mods = static mods
       ntermMods = static nterm-mods
       ctermMods = static cterm-mods
       varModSequence = string denoting which amino acids carry variable mods;
                        must be non-empty when any variable mods are parsed
       precursor_filter = selects the 'top300TightSequest' preprocessing
                          pipeline instead of 'top300Sequest'
       high_res_gauss_dist = forwarded to the high-res filter, structure and
                             covariance helpers

    Returns the first entry of parse_dripExtract's result for (sid, c), after
    the observed spectrum, DRIP features and b/y sets have been attached.

    Raises AssertionError when variable mods are requested without
    varModSequence, and SystemExit(-1) when a GMTK input file cannot be built.

    Side effects: creates ./encode, ./encode/obs and ./log, writes GMTK input
    files there, and runs gmtkDTindex and gmtkViterbi as subprocesses.

    NOTE(review): stdo and stde are not parameters or locals of this function;
    they are presumably module-level streams -- confirm they exist.
    NOTE(review): this source contains a second definition of psm further down
    that additionally passes -DMAX_FRAGMENT_MASS; the later definition is the
    one bound at import time.
    """
    s = copy.deepcopy(s0)
    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced by the preprocessed m/z list *before*
    # the zip below, so the membership test is always true and the original
    # intensities are paired positionally with the preprocessed m/z values.
    # The filter was presumably meant to run against the original s0.mz --
    # confirm before changing, since downstream plotting may rely on this.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity) if mz in mz_vals]

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    pfile_dir = os.path.join(output_dir, 'obs') # sub directory of output_dir
    log_dir = os.path.abspath('log')
    for needed_dir in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(needed_dir):
            os.mkdir(needed_dir)

    var_mod_error = ("Variable mod enzyme options specified, but empty string "
                     "denoting which amino acids are var mods supplied. Exitting")

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if has_var_mods:
            # explicit raise (same exception type as the former assert) so the
            # check survives python -O
            if not varModSequence:
                raise AssertionError(var_mod_error)
            # fixed: the original passed the undefined names
            # varNtermMods / varCtermMods here
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c,
                                                      mods, ntermMods, ctermMods,
                                                      varMods, ntermVarMods, ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c,
                                             mods, ntermMods, ctermMods)
        # recorded before filtering; this is the NumBY value written to pepDB
        num_by = len(bNy)
        filter_theoretical_peaks_lowres(bNy, dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if has_var_mods:
            if not varModSequence:
                raise AssertionError(var_mod_error)
            # fixed: same undefined-name bug as in the low-res branch
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c,
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c,
                                      mods, ntermMods, ctermMods)
        num_by = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices
    ion_to_index_map = dict((ion, ind) for ind, ion in dripMeans.items())

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)

    peptide_obs_file = os.path.join(pfile_dir, 'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir, 'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepdb_file = os.path.join(output_dir, 'pepDB.txt')

    # context managers close both streams even if a writer below raises
    with open(dtFile, "w") as pep_dt:
        with open(pepdb_file, "w") as pepdb_list:
            pep_dt.write('%d\n\n' % (num_psms))
            # write peptide database to parse and identify GMTK segments later
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

            pep_num = 0
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy,
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, p, num_by, c))

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    build_steps = [
        (create_drip_structure,
         (highResMs2, args.structure_file, max_obs_mass, False, False,
          high_res_gauss_dist),
         "Could not create DRIP structure file %s, exitting" % args.structure_file),
        (create_drip_master,
         (highResMs2, args.master_file, max_obs_mass,
          "DRIP_MZ", "drip_collection/covar.txt",
          "DRIP_GAUSSIAN_COMPONENTS", "DRIP_GAUSSIAN_MIXTURES",
          "DRIP_MZ_GAUSSIANS"),
         "Could not create DRIP master file %s, exitting" % args.master_file),
        (triangulate_drip,
         (args.structure_file, args.master_file),
         "Could not create triangulate structure file %s, exitting" % args.structure_file),
        (write_covar_file,
         (highResMs2, args.covar_file, dripLearnedCovars, True,
          high_res_gauss_dist),
         "Could not create covariance file %s, exitting" % args.covar_file),
    ]
    for build_step, step_args, failure_msg in build_steps:
        try:
            build_step(*step_args)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # raised inside a helper are not masked by this message
            print(failure_msg)
            raise SystemExit(-1)

    # run GMTK
    # passed to gmtkViterbi as one argv entry, so no shell quoting is needed
    cppCommand = '-DITERABLE_DT=' + dtFile \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file

    # gmtkViterbi command line, built as an argument list so that paths
    # containing whitespace survive intact
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vit_cmd = ['gmtkViterbi',
               '-strFile', args.structure_file,
               '-triFile', args.structure_file + '.trifile',
               '-ni1', '0', '-nf1', '2', '-ni2', '1', '-nf2', '0',
               '-fdiffact2', 'rl',
               '-inputMasterFile', args.master_file,
               '-inputTrainableParameters', 'trained.params',
               '-failOnZeroClique', 'F',
               '-vitValsFile', vitValsFile,
               '-of1', spectrum_obs_file,
               '-fmt1', 'flatascii',
               '-of2', peptide_obs_file,
               '-fmt2', 'flatascii',
               '-cppCommand', cppCommand]
    call(vit_cmd, stdout = stdo, stderr = stde)

    # parse output
    t, _ = ppsm.parse_dripExtract(vitValsFile, pepdb_file)
    t = t[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
exit(-1) try: create_drip_master(args.high_res_ms2, args.master_file, args.max_obs_mass, "DRIP_MZ", "drip_collection/covar.txt", "DRIP_GAUSSIAN_COMPONENTS", "DRIP_GAUSSIAN_MIXTURES", "DRIP_MZ_GAUSSIANS") except: print "Could not create DRIP structure file %s, exitting" % args.structure_file exit(-1) try: triangulate_drip(args.structure_file, args.master_file) except: print "Could not create triangulate structure file %s, exitting" % args.structure_file exit(-1) try: write_covar_file(args.high_res_ms2, args.covar_file, args.learned_covars, True, args.high_res_gauss_dist) except: print "Could not create covariance file %s, exitting" % args.covar_file exit(-1) # run GMTK dtFile = os.path.join(args.output_dir, 'iterable.dts') cppCommand = '\'-DITERABLE_DT=' + dtFile \
def runDripExtract(args, stdo, stde):
    """Run drip once per spectrum, collapsing all charge-varying candidates
    into a single GMTK call.

    Inputs:
       args = options object; the attributes read here are spectra, charges,
              high_res_ms2, output_dir, obs_dir, logDir, max_obs_mass,
              high_res_gauss_dist, learned_covars and the GMTK file names
              (structure_file, master_file, covar_file, mean_file, gauss_file,
              mixture_file, collection_file). args.charges, args.mz_lb and
              args.mz_ub are overwritten with values from the loaded spectra.
       stdo, stde = streams used as stdout/stderr of the gmtkViterbi
                    subprocess (also forwarded to the data-writing helpers)

    Returns (t, d, spec_dict): the two values produced by parse_dripExtract
    plus the spectrum dictionary returned by the make_drip_data_* helper.

    Raises SystemExit(-1) when a GMTK input file cannot be built.
    """
    # for now, don't worry about checking whether peptide is in valid (i.e., present in the digested
    # set of peptide candidates given the protein database)
    # currently ignore ident file input for spectra filtering
    spectra, minMz, maxMz, validcharges, _ = load_spectra_ret_dict(args.spectra, args.charges)

    # update encountered charges
    args.charges = validcharges
    args.mz_lb = minMz
    args.mz_ub = maxMz

    # create GMTK observation files
    # add in support for cluster usage later; assume standalone with multithreading
    if args.high_res_ms2:
        make_drip_data = make_drip_data_highres
    else:
        make_drip_data = make_drip_data_lowres
    spec_dict, _num_psms = make_drip_data(args, spectra, stdo, stde)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)

    # create structure and master files then triangulate
    build_steps = [
        (create_drip_structure,
         (args.high_res_ms2, args.structure_file, args.max_obs_mass,
          False, False, args.high_res_gauss_dist),
         "Could not create DRIP structure file %s, exitting" % args.structure_file),
        (create_drip_master,
         (args.high_res_ms2, args.master_file, args.max_obs_mass,
          "DRIP_MZ", "drip_collection/covar.txt",
          "DRIP_GAUSSIAN_COMPONENTS", "DRIP_GAUSSIAN_MIXTURES",
          "DRIP_MZ_GAUSSIANS"),
         "Could not create DRIP master file %s, exitting" % args.master_file),
        (triangulate_drip,
         (args.structure_file, args.master_file),
         "Could not create triangulate structure file %s, exitting" % args.structure_file),
        (write_covar_file,
         (args.high_res_ms2, args.covar_file, args.learned_covars, True,
          args.high_res_gauss_dist),
         "Could not create covariance file %s, exitting" % args.covar_file),
    ]
    for build_step, step_args, failure_msg in build_steps:
        try:
            build_step(*step_args)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # raised inside a helper are not masked by this message
            print(failure_msg)
            raise SystemExit(-1)

    # run GMTK
    dtFile = os.path.join(args.output_dir, 'iterable.dts')
    # passed to gmtkViterbi as one argv entry, so no shell quoting is needed
    cppCommand = '-DITERABLE_DT=' + dtFile \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file

    # gmtkViterbi command line, built as an argument list so that paths
    # containing whitespace survive intact.
    # don't need frame/segment difference actions since each PSM corresponds to a specific spectrum,
    # so that there isn't much redundancy to exploit
    vitValsFile = os.path.join(args.logDir, 'vitVals.txt')
    vit_cmd = ['gmtkViterbi',
               '-strFile', args.structure_file,
               '-triFile', args.structure_file + '.trifile',
               '-ni1', '0', '-nf1', '2', '-ni2', '1', '-nf2', '0',
               '-fdiffact2', 'rl',
               '-inputMasterFile', args.master_file,
               '-inputTrainableParameters', 'trained.params',
               '-failOnZeroClique', 'F',
               '-vitValsFile', vitValsFile,
               '-of1', os.path.join(pfile_dir, 'spectrum.pfile'),
               '-of2', os.path.join(pfile_dir, 'pep-lengths.pfile'),
               '-cppCommand', cppCommand]
    call(vit_cmd, stdout = stdo, stderr = stde)

    # NOTE(review): a function named psm is also defined in this source; if
    # both live in the same module, `psm` here resolves to that function
    # rather than the parser module (which psm() itself reaches as `ppsm`).
    # Confirm which name is intended.
    t, d = psm.parse_dripExtract(vitValsFile, os.path.join(args.output_dir, 'pepDB.txt'))

    return t, d, spec_dict
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '',
        varModSequence = '',
        precursor_filter = False,
        high_res_gauss_dist = 0.05):
    """Score a single peptide-spectrum match with GMTK and return the decoded PSM.

    Inputs:
       p = peptide string
       s0 = observed spectrum (the docstring of the original calls it an
            instance of class MS2Spectrum); its mz and intensity attributes
            are overwritten in place for later plotting
       c = psm charge
       highResMs2 = selects the high-resolution ion/filter helpers when true
       dripLearnedMeans = means file passed to load_drip_means (low-res only)
       dripLearnedCovars = covariances file passed to write_covar_file
       mods = static mods
       ntermMods = static nterm-mods
       ctermMods = static cterm-mods
       varModSequence = string denoting which amino acids carry variable mods;
                        must be non-empty when any variable mods are parsed
       precursor_filter = selects the 'top300TightSequest' preprocessing
                          pipeline instead of 'top300Sequest'
       high_res_gauss_dist = forwarded to the high-res filter, structure and
                             covariance helpers

    Returns the first entry of parse_dripExtract's result for (sid, c), after
    the observed spectrum, DRIP features and b/y sets have been attached.

    Raises AssertionError when variable mods are requested without
    varModSequence, and SystemExit(-1) when a GMTK input file cannot be built.

    Side effects: creates ./encode, ./encode/obs and ./log, writes GMTK input
    files there, and runs gmtkDTindex and gmtkViterbi as subprocesses.

    NOTE(review): stdo and stde are not parameters or locals of this function;
    they are presumably module-level streams -- confirm they exist.
    """
    s = copy.deepcopy(s0)
    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is replaced by the preprocessed m/z list *before*
    # the zip below, so the membership test is always true and the original
    # intensities are paired positionally with the preprocessed m/z values.
    # The filter was presumably meant to run against the original s0.mz --
    # confirm before changing, since downstream plotting may rely on this.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity) if mz in mz_vals]

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    pfile_dir = os.path.join(output_dir, 'obs') # sub directory of output_dir
    log_dir = os.path.abspath('log')
    for needed_dir in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(needed_dir):
            os.mkdir(needed_dir)

    var_mod_error = ("Variable mod enzyme options specified, but empty string "
                     "denoting which amino acids are var mods supplied. Exitting")

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if has_var_mods:
            # explicit raise (same exception type as the former assert) so the
            # check survives python -O
            if not varModSequence:
                raise AssertionError(var_mod_error)
            # fixed: the original passed the undefined names
            # varNtermMods / varCtermMods here
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c,
                                                      mods, ntermMods, ctermMods,
                                                      varMods, ntermVarMods, ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c,
                                             mods, ntermMods, ctermMods)
        # recorded before filtering; this is the NumBY value written to pepDB
        num_by = len(bNy)
        filter_theoretical_peaks_lowres(bNy, dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if has_var_mods:
            if not varModSequence:
                raise AssertionError(var_mod_error)
            # fixed: same undefined-name bug as in the low-res branch
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c,
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c,
                                      mods, ntermMods, ctermMods)
        num_by = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    # reverse mapping, from ions to indices
    ion_to_index_map = dict((ion, ind) for ind, ion in dripMeans.items())

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)

    peptide_obs_file = os.path.join(pfile_dir, 'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir, 'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepdb_file = os.path.join(output_dir, 'pepDB.txt')

    # context managers close both streams even if a writer below raises
    with open(dtFile, "w") as pep_dt:
        with open(pepdb_file, "w") as pepdb_list:
            pep_dt.write('%d\n\n' % (num_psms))
            # write peptide database to parse and identify GMTK segments later
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

            pep_num = 0
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy,
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, p, num_by, c))

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    build_steps = [
        (create_drip_structure,
         (highResMs2, args.structure_file, max_obs_mass, False, False,
          high_res_gauss_dist),
         "Could not create DRIP structure file %s, exitting" % args.structure_file),
        (create_drip_master,
         (highResMs2, args.master_file, max_obs_mass,
          "DRIP_MZ", "drip_collection/covar.txt",
          "DRIP_GAUSSIAN_COMPONENTS", "DRIP_GAUSSIAN_MIXTURES",
          "DRIP_MZ_GAUSSIANS"),
         "Could not create DRIP master file %s, exitting" % args.master_file),
        (triangulate_drip,
         (args.structure_file, args.master_file),
         "Could not create triangulate structure file %s, exitting" % args.structure_file),
        (write_covar_file,
         (highResMs2, args.covar_file, dripLearnedCovars, True,
          high_res_gauss_dist),
         "Could not create covariance file %s, exitting" % args.covar_file),
    ]
    for build_step, step_args, failure_msg in build_steps:
        try:
            build_step(*step_args)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C and SystemExit
            # raised inside a helper are not masked by this message
            print(failure_msg)
            raise SystemExit(-1)

    # run GMTK
    # passed to gmtkViterbi as one argv entry, so no shell quoting is needed
    cppCommand = '-DITERABLE_DT=' + dtFile \
        + ' -DMAX_FRAGMENT_MASS=' + str(max_obs_mass) \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file

    # gmtkViterbi command line, built as an argument list so that paths
    # containing whitespace survive intact
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vit_cmd = ['gmtkViterbi',
               '-strFile', args.structure_file,
               '-triFile', args.structure_file + '.trifile',
               '-ni1', '0', '-nf1', '2', '-ni2', '1', '-nf2', '0',
               '-fdiffact2', 'rl',
               '-inputMasterFile', args.master_file,
               '-inputTrainableParameters', 'trained.params',
               '-failOnZeroClique', 'F',
               '-vitValsFile', vitValsFile,
               '-of1', spectrum_obs_file,
               '-fmt1', 'flatascii',
               '-of2', peptide_obs_file,
               '-fmt2', 'flatascii',
               '-cppCommand', cppCommand]
    call(vit_cmd, stdout = stdo, stderr = stde)

    # parse output
    t, _ = ppsm.parse_dripExtract(vitValsFile, pepdb_file)
    t = t[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t