Exemplo n.º 1
0
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '', varModSequence = '',
        precursor_filter = False, 
        high_res_gauss_dist = 0.05):
    """Decode a single peptide-spectrum match (PSM) with DRIP and return it.

    Writes the GMTK inputs for one peptide/spectrum pair under the
    directories 'encode' and 'log' (resolved against the current working
    directory), runs gmtkDTindex and gmtkViterbi, parses the Viterbi output
    and returns the decoded target PSM.

    Inputs:
           p = peptide string
           s0 = observed spectrum, instance of class MS2Spectrum.  NOTE: its
                mz and intensity attributes are overwritten in place with
                the peaks kept by preprocessing (intensities divided by the
                maximum raw intensity) so they can be plotted
           c = psm charge
           highResMs2 = if True, the means are built from this peptide's
                        theoretical ions; otherwise they are loaded from
                        dripLearnedMeans
           dripLearnedMeans = file passed to load_drip_means (low-res only)
           dripLearnedCovars = file passed to write_covar_file
           mods = static mods
           ntermMods = static nterm-mods
           ctermMods = static cterm-mods
           varModSequence = string denoting which amino acids carry
                            variable mods; must be non-empty when any of
                            the mod specs contains variable mods
           precursor_filter = selects the 'top300TightSequest' preprocessing
                              pipeline instead of 'top300Sequest'
           high_res_gauss_dist = forwarded to the high-res helpers

    Returns:
           the first PSM parsed for (spectrum id, c), after its observed
           spectrum, drip features and b/y sets have been attached

    Raises:
           AssertionError if variable mods are specified but varModSequence
           is empty.  Calls exit(-1) if any of the structure, master,
           triangulation or covariance files cannot be created.

    NOTE(review): stdo and stde are not defined in this function; they are
    presumably module-level file handles for GMTK's output -- confirm.
    """
    # preprocess a private copy so the raw peaks of s0 remain available
    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'

    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot.  The intensities must be
    # filtered while s0.mz still holds the raw m/z values: overwriting s0.mz
    # first pairs the surviving m/z values with the wrong raw intensities.
    # NOTE(review): assumes preprocessing keeps m/z values unchanged.
    kept_mz = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                    if mz in kept_mz]
    s0.mz = list(s.mz)

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    pfile_dir = os.path.join(output_dir, 'obs') # sub directory of output_dir
    log_dir = os.path.abspath('log')
    for directory in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c,
                                                      mods, ntermMods, ctermMods,
                                                      varMods, ntermVarMods, ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of theoretical ions before filtering, recorded in pepDB.txt
        numBY = len(bNy)
        filter_theoretical_peaks_lowres(bNy,
                                        dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c,
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        # number of theoretical ions before filtering, recorded in pepDB.txt
        numBY = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    ion_to_index_map = {} # reverse mapping, from ions to indices
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)
    peptide_obs_file = os.path.join(pfile_dir,'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir,'spectrum')

    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepDbFile = os.path.join(output_dir, 'pepDB.txt')
    pep_num = 0
    # the with-block closes both streams even if a helper raises
    with open(dtFile, "w") as pep_dt, open(pepDbFile, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
        # create iterable dt and peptide pfile
        peptide_sentence_flatascii(pep_dt, p, bNy,
                                   pep_num, sid, max_obs_mass,
                                   peptide_obs_file, True, len(bNy))
        # create spectrum pfile
        spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
        pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid,
                                                  p, numBY, c))

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    try:
        create_drip_structure(highResMs2, args.structure_file,
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file,
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file,
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK.  The preprocessor defines travel as ONE argv element, and the
    # command is built as a list so that paths containing whitespace are not
    # split apart.
    cppCommand = '-DITERABLE_DT=' + dtFile \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file

    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    # gmtkViterbi command line
    vitCommand = ['gmtkViterbi',
                  '-strFile', args.structure_file,
                  '-triFile', args.structure_file + '.trifile',
                  '-ni1', '0', '-nf1', '2', '-ni2', '1', '-nf2', '0',
                  '-fdiffact2', 'rl',
                  '-inputMasterFile', args.master_file,
                  '-inputTrainableParameters', 'trained.params',
                  '-failOnZeroClique', 'F',
                  '-vitValsFile', vitValsFile,
                  '-of1', spectrum_obs_file,
                  '-fmt1', 'flatascii',
                  '-of2', peptide_obs_file,
                  '-fmt2', 'flatascii',
                  '-cppCommand', cppCommand]
    call(vitCommand, stdout = stdo, stderr = stde)

    # parse output; only the target PSMs are needed here
    targets, _ = ppsm.parse_dripExtract(vitValsFile, pepDbFile)

    t = targets[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
Exemplo n.º 2
0
def plot_psms(psmFile, spectrumFile, plotList = 'currPsms.html',
              highResMs2 = False,
              dripLearnedMeans = 'dripLearned.means',
              dripLearnedCovars = 'dripLearned.covars',
              mods = '', ntermMods = '', ctermMods = '',
              precursor_filter = False, 
              high_res_gauss_dist = 0.05):
    """Decode every PSM in psmFile with DRIP and plot each one.

    Runs runDripExtract on the PSM and spectrum files, attaches the observed
    spectrum and DRIP features to every decoded target and decoy PSM, writes
    one PNG per PSM via plot_drip_viterbi, and writes an HTML index of links
    to those plots (sorted by descending score) to plotList.

    Inputs:
           psmFile = file of PSMs to decode
           spectrumFile = file of observed spectra
           plotList = path of the HTML index to write
           highResMs2 = if True, the means are built from the theoretical
                        ions of all decoded PSMs; otherwise they are loaded
                        from dripLearnedMeans
           dripLearnedMeans, dripLearnedCovars = forwarded to
                        dripExtractParams; the means file is also read by
                        load_drip_means in the low-res case
           mods, ntermMods, ctermMods = modification specs, parsed with
                        parse_var_mods
           precursor_filter = selects the 'top300TightSequest' normalization
                        instead of 'top300Sequest'
           high_res_gauss_dist = forwarded to the high-res helpers

    Raises:
           AssertionError if variable mods are specified but psmFile yields
           no per-peptide variable mod strings.

    Side effects: GMTK output is written to the file 'gmtk_err' in the
    current working directory.
    """
    # initialize arguments for dripExtract
    args = dripExtractParams(psmFile, spectrumFile, 'all',
                             mods, ntermMods, ctermMods,
                             highResMs2,
                             dripLearnedMeans, dripLearnedCovars)

    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    # NOTE(review): args.precursor_filter is always False; the
    # precursor_filter parameter only selects the normalization -- confirm
    # this is intended.
    args.precursor_filter = False
    args.high_res_gauss_dist = high_res_gauss_dist
    if precursor_filter:
        args.normalize = 'top300TightSequest'
    else:
        args.normalize = 'top300Sequest'

    # decode DRIP PSMs; GMTK's stdout and stderr share one log file, which
    # is closed again even if decoding raises
    with open('gmtk_err', "w") as stde:
        stdo = stde
        t, d, spectra0 = runDripExtract(args, stdo, stde)

    # if variable mods, get variable mod string per PSM
    varModDict = {}
    if hasVarMods:
        varModDict = psm_var_mods(psmFile)
        assert varModDict, "Variable mods specified in enzyme options, but strings denoting variables mods per peptide are not specified in %s, exitting" % (psmFile)
    spectra, minMz, maxMz, _ = load_spectra_minMaxMz(spectrumFile)

    # get original intensity values to plot.  The intensities must be
    # filtered while the raw m/z values are still in place: overwriting mz
    # first pairs the surviving m/z values with the wrong raw intensities.
    # NOTE(review): assumes preprocessing keeps m/z values unchanged, and
    # z is the maximum of the *preprocessed* intensities -- confirm that is
    # the intended normalizer.
    for sid in spectra0:
        raw = spectra[sid]
        kept_mz = set(spectra0[sid].mz)
        z = max(spectra0[sid].intensity)
        raw.intensity = [i/z for mz, i in zip(raw.mz, raw.intensity)
                         if mz in kept_mz]
        raw.mz = list(spectra0[sid].mz)

    def theoretical_ions(sid, c, pep):
        # b/y ions of one peptide, honoring variable mods when specified
        if hasVarMods:
            return interleave_b_y_ions_var_mods(Peptide(pep), c,
                                                mods, ntermMods, ctermMods,
                                                varMods, ntermVarMods, ctermVarMods,
                                                varModDict[sid, pep])
        return interleave_b_y_ions(Peptide(pep), c,
                                   mods, ntermMods, ctermMods)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
    else:
        # the means are the sorted union of all filtered theoretical ions
        # over targets and decoys
        dripMeansSet = set()
        for psms in (t, d):
            for sid, c in psms:
                for p in psms[sid,c]:
                    bNy = theoretical_ions(sid, c, p.peptide)
                    filter_theoretical_peaks(bNy, minMz, maxMz, high_res_gauss_dist)
                    dripMeansSet |= set(bNy)
        dripMeans = dict(enumerate(sorted(dripMeansSet)))

    ion_to_index_map = {} # reverse mapping, from ions to indices
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    all_psms = []
    for psms in (t, d):
        for sid, c in psms:
            s = spectra[sid]
            for p in psms[sid,c]:
                p.add_obs_spectrum(s)
                p.calculate_drip_features(dripMeans)
                varModSequence = ''
                if hasVarMods:
                    varModSequence = varModDict[sid, p.peptide]
                p.calc_by_sets(c,
                               mods, ntermMods, ctermMods,
                               highResMs2,
                               ion_to_index_map,
                               varMods, ntermVarMods, ctermVarMods,
                               varModSequence)
                # collect every PSM, not just the last one of each
                # (sid, charge) group
                all_psms.append(p)

    all_psms.sort(key = lambda r: r.score, reverse = True)
    kind_names = {'t': 'target', 'd': 'decoy'}
    with open(plotList, "w") as fid:
        for p in all_psms:
            if p.kind not in kind_names:
                # neither target nor decoy: nothing to plot
                continue
            kind = kind_names[p.kind]

            plotName = kind + 'Scan' + str(p.scan) + \
                'Charge' + str(p.charge) + \
                p.peptide + '.png'

            p.plot_drip_viterbi(plotName)
            fid.write("<a href=\"%s\">%s Scan %d Charge %d %s</a><br>\n" %
                      (plotName, kind, p.scan, p.charge, p.peptide))
Exemplo n.º 3
0
def psm(p, s0, c = 2, highResMs2 = False,
        dripLearnedMeans = 'dripLearned.means',
        dripLearnedCovars = 'dripLearned.covars',
        mods = '', ntermMods = '', ctermMods = '', varModSequence = '',
        precursor_filter = False, 
        high_res_gauss_dist = 0.05):
    """Decode a single peptide-spectrum match (PSM) with DRIP and return it.

    Writes the GMTK inputs for one peptide/spectrum pair under the
    directories 'encode' and 'log' (resolved against the current working
    directory), runs gmtkDTindex and gmtkViterbi (passing max_obs_mass to
    the preprocessor as MAX_FRAGMENT_MASS), parses the Viterbi output and
    returns the decoded target PSM.

    Inputs:
           p = peptide string
           s0 = observed spectrum, instance of class MS2Spectrum.  NOTE: its
                mz and intensity attributes are overwritten in place with
                the peaks kept by preprocessing (intensities divided by the
                maximum raw intensity) so they can be plotted
           c = psm charge
           highResMs2 = if True, the means are built from this peptide's
                        theoretical ions; otherwise they are loaded from
                        dripLearnedMeans
           dripLearnedMeans = file passed to load_drip_means (low-res only)
           dripLearnedCovars = file passed to write_covar_file
           mods = static mods
           ntermMods = static nterm-mods
           ctermMods = static cterm-mods
           varModSequence = string denoting which amino acids carry
                            variable mods; must be non-empty when any of
                            the mod specs contains variable mods
           precursor_filter = selects the 'top300TightSequest' preprocessing
                              pipeline instead of 'top300Sequest'
           high_res_gauss_dist = forwarded to the high-res helpers

    Returns:
           the first PSM parsed for (spectrum id, c), after its observed
           spectrum, drip features and b/y sets have been attached

    Raises:
           AssertionError if variable mods are specified but varModSequence
           is empty.  Calls exit(-1) if any of the structure, master,
           triangulation or covariance files cannot be created.

    NOTE(review): stdo and stde are not defined in this function; they are
    presumably module-level file handles for GMTK's output -- confirm.
    """
    # preprocess a private copy so the raw peaks of s0 remain available
    s = copy.deepcopy(s0)

    args = dripGaussianCollectionNames()
    sid = s.spectrum_id

    # parse modifications
    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    if precursor_filter:
        normalize = 'top300TightSequest'
    else:
        normalize = 'top300Sequest'

    preprocess = pipeline(normalize)
    preprocess(s)

    # get original intensity values to plot.  The intensities must be
    # filtered while s0.mz still holds the raw m/z values: overwriting s0.mz
    # first pairs the surviving m/z values with the wrong raw intensities.
    # NOTE(review): assumes preprocessing keeps m/z values unchanged.
    kept_mz = set(s.mz)
    z = max(s0.intensity)
    s0.intensity = [i/z for mz, i in zip(s0.mz, s0.intensity)
                    if mz in kept_mz]
    s0.mz = list(s.mz)

    num_psms = 1
    max_obs_mass = 2001

    output_dir = os.path.abspath('encode')
    pfile_dir = os.path.join(output_dir, 'obs') # sub directory of output_dir
    log_dir = os.path.abspath('log')
    for directory in (output_dir, pfile_dir, log_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            bNy = interleave_b_y_ions_var_mods_lowres(Peptide(p), c,
                                                      mods, ntermMods, ctermMods,
                                                      varMods, ntermVarMods, ctermVarMods,
                                                      varModSequence)
        else:
            bNy = interleave_b_y_ions_lowres(Peptide(p), c, mods,
                                             ntermMods, ctermMods)
        # number of theoretical ions before filtering, recorded in pepDB.txt
        numBY = len(bNy)
        filter_theoretical_peaks_lowres(bNy,
                                        dripMeans, s.mz[0], s.mz[-1])
    else:
        # calculate b- and y-ions, filter peaks outside of spectrum range
        if hasVarMods:
            assert varModSequence, "Variable mod enzyme options specified, but empty string denoting which amino acids are var mods supplied.  Exitting"
            bNy = interleave_b_y_ions_var_mods(Peptide(p), c,
                                               mods, ntermMods, ctermMods,
                                               varMods, ntermVarMods, ctermVarMods,
                                               varModSequence)
        else:
            bNy = interleave_b_y_ions(Peptide(p), c, mods,
                                      ntermMods, ctermMods)
        # number of theoretical ions before filtering, recorded in pepDB.txt
        numBY = len(bNy)
        filter_theoretical_peaks(bNy, s.mz[0], s.mz[-1], high_res_gauss_dist)
        # now construct means based on this
        dripMeans = dict(enumerate(bNy))

    ion_to_index_map = {} # reverse mapping, from ions to indices
    for ind in dripMeans:
        ion_to_index_map[dripMeans[ind]] = ind

    # make collection per spectrum
    make_master_parameters_lowres(args, dripMeans)
    peptide_obs_file = os.path.join(pfile_dir,'pep-lengths')
    spectrum_obs_file = os.path.join(pfile_dir,'spectrum')

    dtFile = os.path.join(output_dir, 'iterable.dts')
    pepDbFile = os.path.join(output_dir, 'pepDB.txt')
    pep_num = 0
    # the with-block closes both streams even if a helper raises
    with open(dtFile, "w") as pep_dt, open(pepDbFile, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")
        # create iterable dt and peptide pfile
        peptide_sentence_flatascii(pep_dt, p, bNy,
                                   pep_num, sid, max_obs_mass,
                                   peptide_obs_file, True, len(bNy))
        # create spectrum pfile
        spectrum_sentence_flatascii(spectrum_obs_file, s.mz, s.intensity)
        pepdb_list.write("t\t%d\t%s\t%d\t%d\n" % (sid,
                                                  p, numBY, c))

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    # create structure and master files then triangulate
    try:
        create_drip_structure(highResMs2, args.structure_file,
                              max_obs_mass, False, False,
                              high_res_gauss_dist)
    except Exception:
        print("Could not create DRIP structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        create_drip_master(highResMs2, args.master_file,
                           max_obs_mass,
                           "DRIP_MZ",
                           "drip_collection/covar.txt",
                           "DRIP_GAUSSIAN_COMPONENTS",
                           "DRIP_GAUSSIAN_MIXTURES",
                           "DRIP_MZ_GAUSSIANS")
    except Exception:
        print("Could not create DRIP master file %s, exitting" % args.master_file)
        exit(-1)

    try:
        triangulate_drip(args.structure_file, args.master_file)
    except Exception:
        print("Could not create triangulate structure file %s, exitting" % args.structure_file)
        exit(-1)

    try:
        write_covar_file(highResMs2, args.covar_file,
                         dripLearnedCovars, True,
                         high_res_gauss_dist)
    except Exception:
        print("Could not create covariance file %s, exitting" % args.covar_file)
        exit(-1)

    # run GMTK.  The preprocessor defines travel as ONE argv element, and the
    # command is built as a list so that paths containing whitespace are not
    # split apart.
    cppCommand = '-DITERABLE_DT=' + dtFile \
        + ' -DMAX_FRAGMENT_MASS=' + str(max_obs_mass) \
        + ' -DDRIP_MZ=' + args.mean_file \
        + ' -DDRIP_GAUSSIAN_COMPONENTS=' + args.gauss_file \
        + ' -DDRIP_GAUSSIAN_MIXTURES=' + args.mixture_file \
        + ' -DDRIP_MZ_GAUSSIANS=' + args.collection_file

    vitValsFile = os.path.join(log_dir, 'vitVals.txt')
    # gmtkViterbi command line
    vitCommand = ['gmtkViterbi',
                  '-strFile', args.structure_file,
                  '-triFile', args.structure_file + '.trifile',
                  '-ni1', '0', '-nf1', '2', '-ni2', '1', '-nf2', '0',
                  '-fdiffact2', 'rl',
                  '-inputMasterFile', args.master_file,
                  '-inputTrainableParameters', 'trained.params',
                  '-failOnZeroClique', 'F',
                  '-vitValsFile', vitValsFile,
                  '-of1', spectrum_obs_file,
                  '-fmt1', 'flatascii',
                  '-of2', peptide_obs_file,
                  '-fmt2', 'flatascii',
                  '-cppCommand', cppCommand]
    call(vitCommand, stdout = stdo, stderr = stde)

    # parse output; only the target PSMs are needed here
    targets, _ = ppsm.parse_dripExtract(vitValsFile, pepDbFile)

    t = targets[sid,c][0]
    # calculate insertions and deletions
    t.add_obs_spectrum(s0)
    t.calculate_drip_features(dripMeans)
    t.calc_by_sets(c, mods,
                   ntermMods, ctermMods, highResMs2,
                   ion_to_index_map,
                   varMods, ntermVarMods, ctermVarMods,
                   varModSequence)
    return t
Exemplo n.º 4
0
def make_drip_data_highres(args, spectra, stdo, stde):
    """Generate the GMTK test data (pfiles, decision tree, peptide database)
       for all PSMs in args.psm_file.

       inputs:
       args - output of parsed input arguments (struct); the fields read
              here are mods_spec, nterm_peptide_mods_spec,
              cterm_peptide_mods_spec, append_to_pin, psm_file, output_dir,
              obs_dir, normalize, charges, filt_theo_peaks,
              per_spectrum_mz_bound, mz_lb, mz_ub and max_obs_mass
       spectra - dictionary of observed spectra keyed by scan ID; the
                 spectra are preprocessed in place
       stdo, stde - streams receiving gmtkDTindex's stdout and stderr

       outputs:
       spec_dict - dictionary (scan ID -> preprocessed spectrum) of the
                   spectra for which data was written
       pep_num - number of PSMs written

       side effects:
       - writes iterable.dts and pepDB.txt to args.output_dir, creates the
         pep-lengths and spectrum pfiles in its args.obs_dir subdirectory,
         and runs gmtkDTindex on iterable.dts
       - calls make_master_parameters(args, ion_dict, ions), where ions is
         the sorted list of unique theoretical fragment ions and ion_dict
         maps each ion to its index in that list
       - calls exit(-1) if variable mods are specified but the PSMs do not
         carry the Var_mod_seq field
    """
    # parse modifications
    mods, varMods = parse_var_mods(args.mods_spec, True)
    ntermMods, ntermVarMods = parse_var_mods(args.nterm_peptide_mods_spec, False)
    ctermMods, ctermVarMods = parse_var_mods(args.cterm_peptide_mods_spec, False)
    hasVarMods = bool(varMods or ntermVarMods or ctermVarMods)

    varModKey = "Var_mod_seq"

    if not args.append_to_pin:
        target,decoy,num_psms = load_psms(args.psm_file)
    else:
        target,decoy,num_psms = load_pin_file(args.psm_file)

    # check whether variable mods enzyme options were specified and 
    # necessary variable mod string specifying which amino acids are modded
    # were in the PSM files; only the first target PSM is inspected
    if hasVarMods:
        firstPsm = next((q for key in target for q in target[key]), None)
        if firstPsm is not None and varModKey not in firstPsm.other:
            print("Variable modifications enzyme options specified,")
            print("but PSM file does not contain necessary field Var_mod_seq for strings specifying which amino acids are modified.")
            print("Exitting")
            exit(-1)

    pfile_dir = os.path.join(args.output_dir, args.obs_dir)
    sid_charges = list(set(target) | set(decoy))
    # assume that we should randomize PSMs for multithreading purposes; only reason
    # why we are currently assuming this is that there is already a parameter for dripSearch
    # which signifies whether we should shuffle the data
    shuffle(sid_charges)

    if args.normalize != 'filter0':
        preprocess = pipeline(args.normalize)
    else:
        # no pipeline is built for 'filter0', so the spectra are used as
        # they are; without this fallback the calls below raise NameError
        def preprocess(spectrum):
            pass

    validcharges = args.charges

    def theoretical_ions(psm_obj, charge):
        # b/y ions of one PSM, honoring variable mods when specified
        if hasVarMods:
            return interleave_b_y_ions_var_mods(Peptide(psm_obj.peptide), charge,
                                                mods, ntermMods, ctermMods,
                                                varMods, ntermVarMods, ctermVarMods,
                                                psm_obj.other[varModKey])
        return interleave_b_y_ions(Peptide(psm_obj.peptide), charge,
                                   mods, ntermMods, ctermMods)

    used_ions = set() # all fragment ions used by any PSM
    # both dictionaries are keyed by (scan ID, peptide, charge): the ions
    # depend on the charge, so the same peptide matched at two charges must
    # not share an entry
    theo_spec_dict = {}
    numBY_dict_per_sid = {}
    for sid in spectra:
        s = spectra[sid]
        preprocess(s)
        for charge in validcharges:
            key = (s.spectrum_id, charge)
            if key not in target and key not in decoy:
                continue
            # check if we're filtering theoretical peaks outside observed m/z values
            if args.filt_theo_peaks:
                if args.per_spectrum_mz_bound:
                    minMz = s.mz[0]
                    maxMz = s.mz[-1]
                else:
                    minMz = args.mz_lb
                    maxMz = args.mz_ub

            for psms in (target, decoy):
                if key not in psms:
                    continue
                for q in psms[key]:
                    pep = q.peptide
                    bNy = theoretical_ions(q, charge)
                    # ion count before filtering, recorded in pepDB.txt
                    numBY_dict_per_sid[sid, pep, charge] = len(bNy)
                    if args.filt_theo_peaks:
                        filter_theoretical_peaks(bNy, minMz, maxMz)
                    theo_spec_dict[s.spectrum_id, pep, charge] = bNy
                    used_ions.update(bNy)

    ions = sorted(used_ions)
    ion_dict = dict((ion, i) for i, ion in enumerate(ions))

    # make collection per spectrum
    make_master_parameters(args, ion_dict, ions)
    peptide_pfile = create_pfile(pfile_dir,
                                 'pep-lengths.pfile',
                                 0, 1)

    spectrum_pfile = create_pfile(pfile_dir,
                                  'spectrum.pfile',
                                  2,0)

    dtFile = os.path.join(args.output_dir, 'iterable.dts')
    pepDbFile = os.path.join(args.output_dir, 'pepDB.txt')

    spec_dict = {}
    pep_num = 0
    # the with-block closes both streams even if a helper raises
    with open(dtFile, "w") as pep_dt, open(pepDbFile, "w") as pepdb_list:
        pep_dt.write('%d\n\n' % (num_psms))
        # write peptide database to parse and identify GMTK segments later
        pepdb_list.write("Kind\tSid\tPeptide\tNumBY\tCharge\n")

        for sid, charge in sid_charges:
            if sid not in spec_dict:
                s = spectra[sid]
                # NOTE(review): this spectrum was already preprocessed in
                # place by the loop above; confirm the pipeline is safe to
                # apply twice
                preprocess(s)
                spec_dict[sid] = s
            else:
                s = spec_dict[sid]

            for kind, psms, isTarget in (('t', target, True),
                                         ('d', decoy, False)):
                if (sid,charge) not in psms:
                    continue
                for q in psms[sid,charge]:
                    pep = q.peptide
                    # replace each ion by its index in the global ion list
                    bNy = [ion_dict[bOrY]
                           for bOrY in theo_spec_dict[s.spectrum_id, pep, charge]]
                    drip_peptide_sentence(pep_dt, pep, bNy,
                                          pep_num, s.spectrum_id, args.max_obs_mass,
                                          peptide_pfile, isTarget, len(bNy)-1)
                    drip_spectrum_sentence(spectrum_pfile, s.mz, s.intensity)
                    pepdb_list.write("%s\t%d\t%s\t%d\t%d\n" % (kind, sid,
                                                               pep,
                                                               numBY_dict_per_sid[sid, pep, charge],
                                                               charge))
                    pep_num += 1

    # compile dt using gmtkDTIndex
    call(['gmtkDTindex', '-decisionTreeFiles', dtFile],
         stdout = stdo, stderr = stde)

    return spec_dict, pep_num
Exemplo n.º 5
0
def plot_psms(psmFile, spectrumFile, plotList = 'currPsms.html',
              highResMs2 = False,
              dripLearnedMeans = 'dripLearned.means',
              dripLearnedCovars = 'dripLearned.covars',
              mods = '', ntermMods = '', ctermMods = '',
              precursor_filter = False, 
              high_res_gauss_dist = 0.05):
    """Decode the PSMs in psmFile with DRIP and plot every target/decoy PSM.

    Runs runDripExtract, attaches the observed spectrum and DRIP features
    to each returned PSM, writes one PNG per PSM via plot_drip_viterbi, and
    writes an HTML index of links to those PNGs (highest score first).

    Inputs:
           psmFile = file of PSMs handed to dripExtractParams (and to
                     psm_var_mods when variable mods are in use)
           spectrumFile = spectra file handed to dripExtractParams and
                          load_spectra_minMaxMz
           plotList = path of the HTML index file to write
           highResMs2 = if False, means are loaded from dripLearnedMeans;
                        if True, means are built from the theoretical
                        b/y ions of all decoded PSMs
           dripLearnedMeans = passed to load_drip_means / dripExtractParams
           dripLearnedCovars = passed to dripExtractParams
           mods, ntermMods, ctermMods = modification strings, split into
                        static and variable parts by parse_var_mods
           precursor_filter = selects normalization 'top300TightSequest'
                        (True) or 'top300Sequest' (False); note that
                        args.precursor_filter itself is always set False
           high_res_gauss_dist = stored on args and passed to
                        filter_theoretical_peaks

    Side effects: creates/overwrites 'gmtk_err' in the current directory
    (used as both stdout and stderr for runDripExtract), the file named by
    plotList, and one PNG per plotted PSM.
    """
    # initialize arguments for dripExtract
    args = dripExtractParams(psmFile, spectrumFile, 'all', 
                             mods, ntermMods, ctermMods, 
                             highResMs2, 
                             dripLearnedMeans, dripLearnedCovars)

    mods, varMods = parse_var_mods(mods, True)
    ntermMods, ntermVarMods = parse_var_mods(ntermMods, False)
    ctermMods, ctermVarMods = parse_var_mods(ctermMods, False)
    has_var_mods = bool(varMods or ntermVarMods or ctermVarMods)

    args.precursor_filter = False
    args.high_res_gauss_dist = high_res_gauss_dist
    if precursor_filter: 
        args.normalize = 'top300TightSequest'
    else:
        args.normalize = 'top300Sequest'

    # decode DRIP PSMs; the same stream is used for stdout and stderr and
    # is closed as soon as decoding finishes
    with open('gmtk_err', "w") as stde:
        stdo = stde
        t, d, spectra0 = runDripExtract(args, stdo, stde)

    # if variable mods, get variable mod string per PSM
    varModDict = {}
    if has_var_mods:
        varModDict = psm_var_mods(psmFile)
        assert varModDict, "Variable mods specified in enzyme options, but strings denoting variables mods per peptide are not specified in %s, exitting" % (psmFile)
    spectra, minMz, maxMz, validCharges = load_spectra_minMaxMz(spectrumFile)

    # get original intensity values to plot
    # NOTE(review): mz is replaced by the preprocessed m/z list *before* the
    # zip below, so every mz is in mz_vals (the filter never rejects) and
    # zip pairs the first len(mz) original intensities positionally.
    # Behavior is kept as-is; confirm this is the intended selection.
    for sid in spectra0:
        processed = spectra0[sid]
        original = spectra[sid]
        original.mz = list(processed.mz)
        mz_vals = set(processed.mz)
        z = max(processed.intensity)
        original.intensity = [i/z for mz, i in zip(original.mz, original.intensity)
                              if mz in mz_vals]

    def var_mod_sequence(sid, peptide):
        # variable-mod string for one PSM; '' when no variable mods are used
        if has_var_mods:
            return varModDict[sid, peptide]
        return ''

    def theoretical_ions(psm_dict):
        # set of filtered theoretical b/y ions over every PSM in psm_dict
        ions = set()
        for sid, c in psm_dict:
            for p in psm_dict[sid, c]:
                pep = p.peptide
                if has_var_mods:
                    bNy = interleave_b_y_ions_var_mods(Peptide(pep), c,
                                                       mods, ntermMods, ctermMods,
                                                       varMods, ntermVarMods, ctermVarMods,
                                                       varModDict[sid, pep])
                else:
                    bNy = interleave_b_y_ions(Peptide(pep), c, 
                                              mods, ntermMods, ctermMods)
                # return value is not used; bNy is passed on afterwards
                filter_theoretical_peaks(bNy, minMz, maxMz, high_res_gauss_dist)
                ions |= set(bNy)
        return ions

    if not highResMs2:
        dripMeans = load_drip_means(dripLearnedMeans)
    else:
        # index the union of target and decoy ions in ascending order
        dripMeans = dict(enumerate(sorted(theoretical_ions(t) | theoretical_ions(d))))

    # reverse mapping, from ions to indices
    ion_to_index_map = {dripMeans[ind]: ind for ind in dripMeans}

    def annotate(psm_dict):
        # attach spectrum + DRIP features to every PSM and return them all
        annotated = []
        for sid, c in psm_dict:
            s = spectra[sid]
            for p in psm_dict[sid, c]:
                p.add_obs_spectrum(s)
                p.calculate_drip_features(dripMeans)
                p.calc_by_sets(c,
                               mods, ntermMods, ctermMods,
                               highResMs2, 
                               ion_to_index_map,
                               varMods, ntermVarMods, ctermVarMods,
                               var_mod_sequence(sid, p.peptide))
                # collect each PSM (not just the last one per spectrum/charge)
                annotated.append(p)
        return annotated

    all_psms = annotate(t) + annotate(d)
    all_psms.sort(key = lambda r: r.score, reverse = True)

    kind_names = {'t': 'target', 'd': 'decoy'}
    with open(plotList, "w") as fid:
        for p in all_psms:
            kind = kind_names.get(p.kind)
            if kind is None:
                # neither target nor decoy: nothing to plot
                continue

            plotName = kind + 'Scan' + str(p.scan) + \
                'Charge' + str(p.charge) + \
                p.peptide + '.png'

            p.plot_drip_viterbi(plotName)
            fid.write("<a href=\"%s\">%s Scan %d Charge %d %s</a><br>\n" %
                      (plotName, kind, p.scan, p.charge, p.peptide))