def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '',
        varModSequence = '',
        precursor_filter = False,
        high_res_gauss_dist = 0.05):
    """Decode one peptide/spectrum pair with DRIP and return the resulting PSM.

    The GMTK inputs (decision tree, flat-ascii observation files, peptide
    database) are written under ``encode/`` and the Viterbi output under
    ``log/``, both relative to the current working directory.  gmtkDTindex
    and gmtkViterbi are then run as subprocesses and the Viterbi values
    are parsed.

    Inputs:
       p = peptide string
       s0 = observed spectrum, instance of class MS2Spectrum.  A deep copy
            is preprocessed for decoding; s0 itself has its mz and
            intensity attributes overwritten for plotting.
       c = psm charge
       highResMs2 = use the high-resolution fragment ion model
       dripLearnedMeans = learned means file (read in low-res mode only)
       dripLearnedCovars = learned covariances file
       mods = static mods
       ntermMods = static nterm-mods
       ctermMods = static cterm-mods
       varModSequence = string denoting which amino acids carry variable
                        mods; must be non-empty when any variable mod is
                        specified in the mods strings
       precursor_filter = selects the 'top300TightSequest' preprocessing
                          pipeline instead of 'top300Sequest'
       high_res_gauss_dist = passed to the high-res peak filter, structure
                             file and covariance file writers

    Returns the first PSM parsed for (spectrum id, c), with the observed
    spectrum, DRIP features and b-/y-ion sets attached.

    Raises AssertionError when variable mods are specified but
    varModSequence is empty.  Calls exit(-1) when a GMTK structure, master,
    triangulation or covariance file cannot be created.
    """
    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'
    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot
    # NOTE(review): s0.mz is overwritten before the zip below, so the
    # "mz in mz_vals" test is always true and the zip pairs the
    # preprocessed m/z values with the leading original intensities.
    # Looks unintended, but left unchanged -- confirm whether preprocess
    # preserves m/z values before reordering these statements.
    s0.mz = list(s.mz)
    mz_vals = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                    if mz in mz_vals]

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    obs_dir = 'obs' # sub directory of output_dir
    pfile_dir = os.path.join(output_dir, obs_dir)
    if not os.path.exists(pfile_dir):
        os.mkdir(pfile_dir)

    log_dir = os.path.abspath('log')
    if not os.path.exists(log_dir):
        os.mkdir(log_dir)

    var_mod_msg = "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied. Exitting"

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if has_var_mods:
            assert varModSequence, var_mod_msg
            # the parsed variable mods live in ntermVarMods/ctermVarMods;
            # the previously used varNtermMods/varCtermMods never existed
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c,
                                                      mods, ntermMods, ctermMods,
                                                      varMods, ntermVarMods, ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of b-/y-ions before filtering, recorded in pepDB.txt
        num_by = len(bNy)
        filter_theoretical_peaks_lowres(bNy, dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if has_var_mods:
            assert varModSequence, var_mod_msg
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c,
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        num_by = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = {}
        for i, ion in enumerate(bNy):
            dripMeans[i] = ion

    ion_to_index_map = {} # reverse mapping, from ions to indices
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)

    peptide_obs_file = os.path.join(pfile_dir, 'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir, 'spectrum')
    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepdb_file = os.path.join(output_dir, 'pepDB.txt')

    pep_num = 0
    # the with-block guarantees both streams are flushed and closed before
    # GMTK reads them, even if one of the writers raises
    with open(dtFile, "w") as pep_dt:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        with open(pepdb_file, "w") as pepdb_list:
            pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
            # create iterable dt and peptide pfile
            peptide_sentence_flatascii(pep_dt, p, bNy,
                                       pep_num, sid, max_obs_mass,
                                       peptide_obs_file, True, len(bNy))
            # create spectrum pfile
            spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
            pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid, p, num_by, c))

    # compile dt using gmtkDTIndex
    # stdo/stde are not bound in this function; presumably module-level
    # stream handles -- confirm against the rest of the file
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    try:
        create_drip_structure(highResMs2, args.structure_file,
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file,
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file,
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK
    cppCommand = '\'-DITERABLE_DT=' + dtFile \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file \
        + '\''

    # call gmtkViterbi
    vitStr0 = "gmtkViterbi -strFile " + args.structure_file \
        + " -triFile " + args.structure_file + ".trifile -ni1 0 -nf1 2 -ni2 1 -nf2 0" \
        + " -fdiffact2 rl" \
        + " -inputMasterFile " + args.master_file \
        + " -inputTrainableParameters trained.params -failOnZeroClique F"

    # gmtkViterbi command line
    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    vitStr = vitStr0 + ' -vitValsFile ' + vitValsFile \
        + ' -of1 ' + spectrum_obs_file \
        + ' -fmt1 flatascii ' \
        + ' -of2 ' + peptide_obs_file \
        + ' -fmt2 flatascii ' \
        + ' -cppCommand ' + cppCommand
    call(shlex.split(vitStr), stdout = stdo, stderr = stde)

    # parse output
    t, d = ppsm.parse_dripExtract(vitValsFile, pepdb_file)

    t = t[sid, c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
def percolatorPsms_gen_lorikeet(psmFile, spectrumFile,
                                outputDirectory,
                                isCruxPercolator = False,
                                plotList = 'currPsms.html',
                                mods_spec = '',
                                nterm_mods_spec = '',
                                cterm_mods_spec = '',
                                cMod = True):
    """Generate Lorikeet html files for the PSMs in a Percolator output file.

    One html file per PSM whose scan appears in spectrumFile is written to
    outputDirectory (created if missing), and a link to each is appended to
    the master list plotList.

    Inputs:
       psmFile = Percolator output, read by load_percolator_output
       spectrumFile = spectra file, read by load_spectra_minMaxMz
       outputDirectory = directory receiving the per-PSM html files
       isCruxPercolator = forwarded to load_percolator_output
       plotList = path of the html file listing all generated plots
       mods_spec, nterm_mods_spec, cterm_mods_spec = modification strings,
           parsed by parse_var_mods
       cMod = when true and no static mod is given for 'C', a shift of
              57.021464 is added for 'C'

    Calls exit(-1) when the n- or c-terminal static mods do not define one
    identical shift for every amino acid in allPeps.
    """
    # parse modifications; variable mods are not used here
    mods, _ = parse_var_mods(mods_spec, True)
    nterm_mods, _ = parse_var_mods(nterm_mods_spec, False)
    cterm_mods, _ = parse_var_mods(cterm_mods_spec, False)

    if cMod: # will typically be true
        if 'C' not in mods:
            mods['C'] = 57.021464

    # lorikeet only supports a single static mod to the n-terminus
    if nterm_mods:
        nm = []
        for aa in allPeps:
            if aa not in nterm_mods:
                print("Lorikeet only supports a single constant shift to the n-terminus, please specify only one n-terminal shift for using X")
                print("Exitting")
                exit(-1)
            nm.append(nterm_mods[aa])
        if len(set(nm)) > 1:
            print("different n-teriminal shifts supplied for different amino acids, Lorikeet only supports a single constant shift to the n-terminus.")
            print("Exitting")
            exit(-1)
        # collapse the per-amino-acid dictionary to the single shared shift
        nterm_mods = nm[0]

    # lorikeet only supports a single static mod to the c-terminus
    if cterm_mods:
        cm = []
        for aa in allPeps:
            if aa not in cterm_mods:
                # these messages previously named the n-terminus
                print("Lorikeet only supports a single constant shift to the c-terminus, please specify only one c-terminal shift for using X")
                print("Exitting")
                exit(-1)
            cm.append(cterm_mods[aa])
        if len(set(cm)) > 1:
            print("different c-terminal shifts supplied for different amino acids, Lorikeet only supports a single constant shift to the c-terminus.")
            print("Exitting")
            exit(-1)
        cterm_mods = cm[0]

    # load Percolator PSMs
    t = load_percolator_output(psmFile, isCruxPercolator)
    # load .ms2 spectra
    spectra, _, _, _ = load_spectra_minMaxMz(spectrumFile)

    # make output directory
    if not os.path.exists(outputDirectory):
        os.mkdir(outputDirectory)

    # master list of html files; closed even if writing a plot fails
    with open(plotList, 'w') as fid:
        for sid in t:
            if sid not in spectra:
                print("Scan number %d specified for PSM, but not appear in provided ms2 file, skipping" % sid)
                continue
            spec = spectra[sid]
            psm_rec = t[sid]
            # fields read here: index 1 is the peptide, index 3 the charge
            peptide = psm_rec[1]
            charge = psm_rec[3]

            filename = 'scan' + str(sid) + '-' + peptide + '-ch' + str(charge) + '.html'
            filename = os.path.join(outputDirectory, filename)
            write_lorikeet_file(psm_rec, spec, filename,
                                mods, nterm_mods, cterm_mods)
            fid.write("<a href=\"%s\">Scan %d, %s, Charge %d</a><br>\n" % (filename, sid, peptide, charge))
def gen_lorikeet(psmFile, spectrumFile,
                 outputDirectory,
                 plotList = 'currPsms.html',
                 mods_spec = '',
                 nterm_mods_spec = '',
                 cterm_mods_spec = '',
                 scanField = 'scan',
                 peptideField = 'sequence',
                 chargeField = 'charge',
                 scoreField = 'score',
                 varModStringField = '',
                 cMod = True):
    """Generate html files for Lorikeet plugin

    One html file per PSM whose scan appears in spectrumFile is written to
    outputDirectory (created if missing), and a link to each is appended to
    the master list plotList.

    Inputs:
       psmFile = PSM file, read by load_psm_for_lorikeet
       spectrumFile = spectra file, read by load_spectra_minMaxMz
       outputDirectory = directory receiving the per-PSM html files
       plotList = path of the html file listing all generated plots
       mods_spec, nterm_mods_spec, cterm_mods_spec = modification strings,
           parsed by parse_var_mods
       scanField, peptideField, chargeField, scoreField, varModStringField =
           field names forwarded to load_psm_for_lorikeet; when
           varModStringField is non-empty, element 4 of each PSM record is
           read as its per-residue variable mod string
       cMod = when true and no static mod is given for 'C', a shift of
              57.021464 is added for 'C'

    Raises AssertionError when a variable mod string marks an amino acid
    that has no matching variable mod.  Calls exit(-1) when the n- or
    c-terminal static mods do not define one identical shift for every
    amino acid in allPeps.
    """
    # parse modifications
    mods, var_mods = parse_var_mods(mods_spec, True)
    nterm_mods, nterm_var_mods = parse_var_mods(nterm_mods_spec, False)
    cterm_mods, cterm_var_mods = parse_var_mods(cterm_mods_spec, False)

    if cMod: # will typically be true
        if 'C' not in mods:
            mods['C'] = 57.021464

    # lorikeet only supports a single static mod to the n-terminus
    if nterm_mods:
        nm = []
        for aa in allPeps:
            if aa not in nterm_mods:
                print("Lorikeet only supports a single constant shift to the n-terminus, please specify only one n-terminal shift using X")
                print("Exitting")
                exit(-1)
            nm.append(nterm_mods[aa])
        if len(set(nm)) > 1:
            print("different n-teriminal shifts supplied for different amino acids, Lorikeet only supports a single constant shift to the n-terminus.")
            print("Exitting")
            exit(-1)
        # collapse the per-amino-acid dictionary to the single shared shift
        nterm_mods = nm[0]

    # lorikeet only supports a single static mod to the c-terminus
    if cterm_mods:
        cm = []
        for aa in allPeps:
            if aa not in cterm_mods:
                # these messages previously named the n-terminus
                print("Lorikeet only supports a single constant shift to the c-terminus, please specify only one c-terminal shift using X")
                print("Exitting")
                exit(-1)
            cm.append(cterm_mods[aa])
        if len(set(cm)) > 1:
            print("different c-terminal shifts supplied for different amino acids, Lorikeet only supports a single constant shift to the c-terminus.")
            print("Exitting")
            exit(-1)
        cterm_mods = cm[0]

    # load PSMs
    t = load_psm_for_lorikeet(psmFile, scanField, peptideField, chargeField, scoreField, varModStringField)
    # load .ms2 spectra
    spectra, _, _, _ = load_spectra_minMaxMz(spectrumFile)

    # make output directory
    if not os.path.exists(outputDirectory):
        os.mkdir(outputDirectory)

    # master list of html files; closed even if writing a plot fails
    with open(plotList, 'w') as fid:
        for sid in t:
            if sid not in spectra:
                print("Scan number %d specified for PSM, but not appear in provided ms2 file, skipping" % sid)
                continue
            spec = spectra[sid]
            # named psm_rec so the sibling function psm() is not shadowed
            psm_rec = t[sid]
            peptide = psm_rec[1]
            charge = psm_rec[3]

            # (1-based position, amino acid, shift) per variably modified residue
            varModTuple = []
            if varModStringField:
                varModString = psm_rec[4]
                for ind, (aa, v) in enumerate(zip(peptide, varModString)):
                    code = int(v)
                    # 0 (and any other value) means no variable mod here
                    if code == 1:
                        assert aa in var_mods, "Var mod string denotes variable mod %c not supported by variable mod enzyme settings" % aa
                        varModTuple.append((ind+1, aa, var_mods[aa][1]))
                    elif code == 2:
                        # NOTE(review): code 2 is looked up in cterm_var_mods
                        # while the message says "Nterm"; either the lookup
                        # for codes 2 and 3 is swapped or the message is
                        # wrong -- confirm against the var mod string
                        # convention before changing either.
                        assert aa in cterm_var_mods, "Nterm var mod string denotes nterm var mod %c not supported by nterm var mod enzyme settings" % aa
                        varModTuple.append((ind+1, aa, cterm_var_mods[aa][1]))
                    elif code == 3:
                        assert aa in nterm_var_mods, "Nterm var mod string denotes nterm var mod %c not supported by nterm var mod enzyme settings" % aa
                        varModTuple.append((ind+1, aa, nterm_var_mods[aa][1]))

            filename = 'scan' + str(sid) + '-' + peptide + '-ch' + str(charge) + '.html'
            filename = os.path.join(outputDirectory, filename)
            write_lorikeet_file(psm_rec, spec, filename,
                                mods, nterm_mods, cterm_mods,
                                varModTuple)
            fid.write("<a href=\"%s\">Scan %d, %s, Charge %d</a><br>\n" % (filename, sid, peptide, charge))
def plot_psms(psmFile, spectrumFile, plotList = 'currPsms.html',
              highResMs2 = False,
              dripLearnedMeans = 'dripLearned.means',
              dripLearnedCovars = 'dripLearned.covars',
              mods = '', ntermMods = '', ctermMods = '',
              precursor_filter = False,
              high_res_gauss_dist = 0.05):
    """Decode all PSMs in psmFile with DRIP and plot each Viterbi alignment.

    runDripExtract is run over the PSMs (GMTK output is redirected to the
    file 'gmtk_err' in the current directory).  Every target and decoy PSM
    is then annotated with its observed spectrum, DRIP features and
    b-/y-ion sets, plotted to a .png via plot_drip_viterbi, and linked from
    the html list plotList in order of decreasing score.

    Inputs:
       psmFile = PSM file to decode
       spectrumFile = spectra file, read by load_spectra_minMaxMz
       plotList = path of the html file listing all generated plots
       highResMs2 = use the high-resolution fragment ion model
       dripLearnedMeans = learned means file (read in low-res mode only)
       dripLearnedCovars = learned covariances file
       mods, ntermMods, ctermMods = modification strings, parsed by
           parse_var_mods
       precursor_filter = selects the 'top300TightSequest' preprocessing
                          pipeline instead of 'top300Sequest'
       high_res_gauss_dist = passed to the high-res theoretical peak filter

    Raises AssertionError when variable mods are specified but psmFile
    yields no per-peptide variable mod strings.
    """
    # initialize arguments for dripExtract
    args = dripExtractParams(psmFile, spectrumFile, 'all',
                             mods, ntermMods, ctermMods,
                             highResMs2,
                             dripLearnedMeans, dripLearnedCovars)

    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    # NOTE(review): always False regardless of the precursor_filter
    # argument; the argument only selects the normalization below --
    # confirm this is intended
    args.precursor_filter = False
    args.high_res_gauss_dist = high_res_gauss_dist
    if precursor_filter:
        args.normalize = 'top300TightSequest'
    else:
        args.normalize = 'top300Sequest'

    # decode DRIP PSMs; GMTK stdout and stderr both go to gmtk_err, which
    # is closed as soon as decoding finishes
    with open('gmtk_err', "w") as stde:
        stdo = stde
        t, d, spectra0 = runDripExtract(args, stdo, stde)

    # if variable mods, get variable mod string per PSM
    if has_var_mods:
        varModDict = psm_var_mods(psmFile)
        # the '%' was missing, which made a failing assert raise
        # "TypeError: 'str' object is not callable" instead of this message
        assert varModDict, "Variable mods specified in enzyme options, but strings denoting variables mods per peptide are not specified in %s, exitting" % (psmFile)

    spectra, minMz, maxMz, validCharges = load_spectra_minMaxMz(spectrumFile)

    # get original intensity values to plot
    # NOTE(review): mz is overwritten before the zip below, so the
    # "mz in mz_vals" test is always true and the zip pairs the
    # preprocessed m/z values with the leading original intensities.
    # Looks unintended, but left unchanged -- confirm whether the
    # preprocessing preserves m/z values before reordering.
    for sid in spectra0:
        spectra[sid].mz = list(spectra0[sid].mz)
        mz_vals = set(spectra0[sid].mz)
        z = max(spectra0[sid].intensity)
        spectra[sid].intensity = [i/z for mz, i in zip(spectra[sid].mz, spectra[sid].intensity)
                                  if mz in mz_vals]

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
    else:
        # high-res means are the sorted union of the filtered theoretical
        # ions over all targets, then all decoys
        dripMeansSet = set([])
        for psm_dict in (t, d):
            for sid, c in psm_dict:
                for p in psm_dict[sid, c]:
                    pep = p.peptide
                    if has_var_mods:
                        varModSequence = varModDict[sid, p.peptide]
                        bNy = interleave_b_y_ions_var_mods(Peptide(pep), c,
                                                           mods, ntermMods, ctermMods,
                                                           varMods, ntermVarMods, ctermVarMods,
                                                           varModSequence)
                    else:
                        bNy = interleave_b_y_ions(Peptide(pep), c, mods,
                                                  ntermMods, ctermMods)
                    filter_theoretical_peaks(bNy, minMz, maxMz, high_res_gauss_dist)
                    dripMeansSet |= set(bNy)

        dripMeans = {}
        for ind, ion in enumerate(sorted(dripMeansSet)):
            dripMeans[ind] = ion

    ion_to_index_map = {} # reverse mapping, from ions to indices
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    all_psms = []
    varModSequence = ''
    # targets first, then decoys, so equal scores keep that order after
    # the stable sort below
    for psm_dict in (t, d):
        for sid, c in psm_dict:
            s = spectra[sid]
            for p in psm_dict[sid, c]:
                p.add_obs_spectrum(s)
                p.calculate_drip_features(dripMeans)
                if has_var_mods:
                    varModSequence = varModDict[sid, p.peptide]
                p.calc_by_sets(c, mods,
                               ntermMods, ctermMods, highResMs2,
                               ion_to_index_map,
                               varMods, ntermVarMods, ctermVarMods,
                               varModSequence)
                all_psms.append(p)

    all_psms.sort(key = lambda r: r.score, reverse = True)

    with open(plotList, "w") as fid:
        for p in all_psms:
            if p.kind == 't':
                kind = 'target'
            elif p.kind == 'd':
                kind = 'decoy'
            else:
                # PSMs of any other kind are not plotted
                continue

            plotName = kind + 'Scan' + str(p.scan) + \
                'Charge' + str(p.charge) + \
                p.peptide + '.png'
            p.plot_drip_viterbi(plotName)
            fid.write("<a href=\"%s\">%s Scan %d Charge %d %s</a><br>\n" % (plotName, kind, p.scan, p.charge, p.peptide))
def make_drip_data_lowres(args, spectra, stdo, stde):
    """Generate the low-resolution DRIP test data (pfiles, decision tree,
    peptide database) for a single GMTK run over all PSMs.

    Decrease number of calls to GMTK by only calling once per spectrum
    and running for all charge states in one go.

    inputs:
       args - parsed input arguments; the attributes read here are
              mods_spec, nterm_peptide_mods_spec, cterm_peptide_mods_spec,
              learned_means, append_to_pin, psm_file, output_dir, obs_dir,
              normalize, filt_theo_peaks, per_spectrum_mz_bound, mz_lb,
              mz_ub and max_obs_mass
       spectra - dictionary of spectra keyed by scan id; spectra are
                 preprocessed in place
       stdo, stde - streams receiving the output of gmtkDTindex

    outputs:
       spec_dict - preprocessed spectra keyed by scan id, for every scan
                   with at least one PSM
       pep_num - number of PSMs written

    Calls exit(-1) when variable mods are specified but the first target
    PSM lacks the Var_mod_seq field.
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    varModKey = "Var_mod_seq"

    # load means
    dripMeans = load_drip_means(args.learned_means)
    # make master file
    make_master_parameters_lowres(args, dripMeans)

    if not args.append_to_pin:
        target, decoy, num_psms = load_psms(args.psm_file)
    else:
        target, decoy, num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files.  Only the first target PSM is inspected, and
    # only when variable mods are configured, so an empty target set no
    # longer raises IndexError here.
    if has_var_mods:
        first_psm = None
        for key in target:
            for candidate in target[key]:
                first_psm = candidate
                break
            break
        if first_psm is not None and varModKey not in first_psm.other:
            print("Variable modifications enzyme options specified,")
            print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
            print("Exitting")
            exit(-1)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))
    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    # previously preprocess was left unbound for 'filter0' and the call
    # below raised UnboundLocalError; 'filter0' is presumably "no
    # preprocessing" -- confirm
    preprocess = None
    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)

    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)
    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2, 0)

    spec_dict = {}
    pep_num = 0
    dt_file = os.path.join(args.output_dir, 'iterable.dts')
    pepdb_file = os.path.join(args.output_dir, 'pepDB.txt')

    # the with-block guarantees both streams are flushed and closed before
    # gmtkDTindex reads the decision tree, even if a writer raises
    with open(pepdb_file, "w") as pepdb_list, open(dt_file, "w") as pep_dt:
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
        pep_dt.write('%d\n\n' % (num_psms))

        for sid, charge in sid_charges:
            # preprocess each spectrum once, however many PSMs share it
            if sid not in spec_dict:
                s = spectra[sid]
                if preprocess is not None:
                    preprocess(s)
                spec_dict[sid] = s
            else:
                s = spec_dict[sid]

            if args.filt_theo_peaks:
                if args.per_spectrum_mz_bound:
                    minMz = s.mz[0]
                    maxMz = s.mz[-1]
                else:
                    minMz = args.mz_lb
                    maxMz = args.mz_ub

            # targets first, then decoys, for this (scan, charge)
            for kind, psm_dict, is_target in (('t', target, True),
                                              ('d', decoy, False)):
                if (sid, charge) not in psm_dict:
                    continue
                for psm_rec in psm_dict[sid, charge]:
                    pep = psm_rec.peptide
                    if has_var_mods:
                        varModSequence = psm_rec.other[varModKey]
                        bNy = interleave_b_y_ions_var_mods_lowres(Peptide(pep), charge,
                                                                  mods, ntermMods, ctermMods,
                                                                  varMods, ntermVarMods, ctermVarMods,
                                                                  varModSequence)
                    else:
                        bNy = interleave_b_y_ions_lowres(Peptide(pep), charge, mods,
                                                         ntermMods, ctermMods)
                    # numBY for DRIP features assumes all b-/y-ions, not just those
                    # unfiltered per spectrum, so it is recorded before filtering
                    pepdb_list.write("%s\t%d\t%s\t%d\t%d\n" % (kind, sid, pep, len(bNy), charge))
                    if args.filt_theo_peaks:
                        filter_theoretical_peaks_lowres(bNy, dripMeans,
                                                        minMz, maxMz)
                    drip_peptide_sentence(pep_dt, pep, bNy,
                                          pep_num, s.spectrum_id, args.max_obs_mass,
                                          peptide_pfile, is_target, len(bNy)-1)
                    drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                    pep_num += 1

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dt_file],
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num
def make_drip_data_highres(args, spectra, stdo, stde):
    """Generate the high-resolution DRIP test data (pfiles, decision tree,
    peptide database) for a single GMTK run over all PSMs.

    Decrease number of calls to GMTK by only calling once per spectrum
    and running for all charge states in one go.

    A first pass computes the theoretical b-/y-ions of every PSM and
    collects them into one global, sorted ion list; make_master_parameters
    is called with that list, and a second pass writes each PSM with its
    ions replaced by their indices into it.

    inputs:
       args - parsed input arguments; the attributes read here are
              mods_spec, nterm_peptide_mods_spec, cterm_peptide_mods_spec,
              append_to_pin, psm_file, output_dir, obs_dir, normalize,
              charges, filt_theo_peaks, per_spectrum_mz_bound, mz_lb,
              mz_ub and max_obs_mass
       spectra - dictionary of spectra keyed by scan id; spectra are
                 preprocessed in place
       stdo, stde - streams receiving the output of gmtkDTindex

    outputs:
       spec_dict - preprocessed spectra keyed by scan id, for every scan
                   with at least one PSM
       pep_num - number of PSMs written

    post (from the original notes; happens in make_master_parameters, not
    visible here -- confirm):
       - args.{mean_file, gauss_file, mixture_file, collection_file} will
         all be adjusted
       - args.max_mass will be updated to the number of unique theoretical
         fragmentation locations

    Calls exit(-1) when variable mods are specified but the first target
    PSM lacks the Var_mod_seq field.
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    varModKey = "Var_mod_seq"

    if not args.append_to_pin:
        target, decoy, num_psms = load_psms(args.psm_file)
    else:
        target, decoy, num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files.  Only the first target PSM is inspected, and
    # only when variable mods are configured, so an empty target set no
    # longer raises IndexError here.
    if has_var_mods:
        first_psm = None
        for key in target:
            for candidate in target[key]:
                first_psm = candidate
                break
            break
        if first_psm is not None and varModKey not in first_psm.other:
            print("Variable modifications enzyme options specified,")
            print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
            print("Exitting")
            exit(-1)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))
    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    # previously preprocess was left unbound for 'filter0' and the calls
    # below raised UnboundLocalError; 'filter0' is presumably "no
    # preprocessing" -- confirm
    preprocess = None
    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)

    validcharges = args.charges

    ion_dict = {} # global dictionary for used fragment ions
    # Both caches are keyed with the charge as well as the peptide: the
    # same peptide matched to one spectrum at two charges yields different
    # ions, and keying on (scan, peptide) alone let one overwrite the other.
    theo_spec_dict = {}
    numBY_dict_per_sid = {}
    psm_dicts = (target, decoy)

    # construct ion_dict
    for sid in spectra:
        s = spectra[sid]
        if preprocess is not None:
            preprocess(s)

        for charge in validcharges:
            key = (s.spectrum_id, charge)
            # membership is tested before indexing, as in the low-res
            # version, so a key present in only one of target/decoy no
            # longer raises KeyError
            if key not in target and key not in decoy:
                continue
            # check if we're filtering theoretical peaks outside observed m/z values
            if args.filt_theo_peaks:
                if args.per_spectrum_mz_bound:
                    minMz = s.mz[0]
                    maxMz = s.mz[-1]
                else:
                    minMz = args.mz_lb
                    maxMz = args.mz_ub

            # calculate target, then decoy, theoretical spectra
            for psm_dict in psm_dicts:
                if key not in psm_dict:
                    continue
                for psm_rec in psm_dict[key]:
                    pep = psm_rec.peptide
                    if has_var_mods:
                        varModSequence = psm_rec.other[varModKey]
                        bNy = interleave_b_y_ions_var_mods(Peptide(pep), charge,
                                                           mods, ntermMods, ctermMods,
                                                           varMods, ntermVarMods, ctermVarMods,
                                                           varModSequence)
                    else:
                        bNy = interleave_b_y_ions(Peptide(pep), charge, mods,
                                                  ntermMods, ctermMods)
                    # NumBY counts all b-/y-ions, so it is taken before filtering
                    numBY_dict_per_sid[sid, charge, pep] = len(bNy)
                    if args.filt_theo_peaks:
                        filter_theoretical_peaks(bNy, minMz, maxMz)
                    theo_spec_dict[s.spectrum_id, charge, pep] = bNy
                    for ion in bNy:
                        ion_dict[ion] = 1

    # map every used ion to its rank in the sorted global ion list
    ions = sorted(ion_dict)
    for i, ion in enumerate(ions):
        ion_dict[ion] = i

    # make collection per spectrum
    make_master_parameters(args, ion_dict, ions)

    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)
    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2, 0)

    spec_dict = {}
    pep_num = 0
    dt_file = os.path.join(args.output_dir, 'iterable.dts')
    pepdb_file = os.path.join(args.output_dir, 'pepDB.txt')

    # the with-block guarantees both streams are flushed and closed before
    # gmtkDTindex reads the decision tree, even if a writer raises
    with open(dt_file, "w") as pep_dt, open(pepdb_file, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

        for sid, charge in sid_charges:
            if sid not in spec_dict:
                s = spectra[sid]
                # NOTE(review): every spectrum was already preprocessed in
                # the first pass above, so this applies the pipeline a
                # second time to the same object.  Kept as in the original
                # because removing it changes the observations written --
                # confirm whether the pipeline is meant to run twice.
                if preprocess is not None:
                    preprocess(s)
                spec_dict[sid] = s
            else:
                s = spec_dict[sid]

            # targets first, then decoys, for this (scan, charge)
            for kind, psm_dict, is_target in (('t', target, True),
                                              ('d', decoy, False)):
                if (sid, charge) not in psm_dict:
                    continue
                for psm_rec in psm_dict[sid, charge]:
                    pep = psm_rec.peptide
                    bNy = theo_spec_dict[s.spectrum_id, charge, pep]
                    # replace each ion by its index in the global ion list
                    bNy = [ion_dict[bOrY] for bOrY in bNy]
                    drip_peptide_sentence(pep_dt, pep, bNy,
                                          pep_num, s.spectrum_id, args.max_obs_mass,
                                          peptide_pfile, is_target, len(bNy)-1)
                    drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                    pepdb_list.write("%s\t%d\t%s\t%d\t%d\n" % (kind,
                                                               sid,
                                                               pep,
                                                               numBY_dict_per_sid[sid, charge, pep],
                                                               charge))
                    pep_num += 1

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dt_file],
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num