def write_fdr_outputs(inFile, specFile, pepFile, protFile):
    """Write spectral-, peptide-, and (optionally) protein-level FDR files.

    Reads the csodiaq output at inFile, sorts it by descending MaCC_Score,
    computes FDR-filtered views, and writes them to the given paths. The
    protein-level analysis runs only when protFile is truthy.

    Args:
        inFile: path to the csodiaq output CSV (must contain 'MaCC_Score').
        specFile: output path for the spectral-level FDR table.
        pepFile: output path for the peptide-level FDR table.
        protFile: output path for the protein-level table, or a falsy value
            to skip protein-level analysis.
    """
    smf.print_milestone('Generating FDR Analysis Files:')

    overallDf = (pd.read_csv(inFile)
                 .sort_values('MaCC_Score', ascending=False)
                 .reset_index(drop=True))

    spectralDf = add_fdr_to_csodiaq_output(overallDf)
    peptideDf = add_fdr_to_csodiaq_output(overallDf, filterType='peptide')
    spectralDf.to_csv(specFile, index=False)
    peptideDf.to_csv(pepFile, index=False)

    if not protFile:
        return

    # Protein-level analysis: connect peptides to proteins, validate the
    # protein set, then annotate and filter the peptide table accordingly.
    connections = format_peptide_protein_connections(peptideDf)
    validProteinDict = idp.find_valid_proteins(connections)
    proteinDf = add_leading_protein_column(peptideDf, validProteinDict)
    proteinFdrDf = add_fdr_to_csodiaq_output(proteinDf,
                                             filterType='leadingProtein')
    # Per-protein metadata keyed by leading protein, for annotation below.
    proteinMetaInfoDict = proteinFdrDf.set_index('leadingProtein').T.to_dict()
    proteinDf = remove_invalid_peptides_and_add_metadata(proteinDf,
                                                         proteinMetaInfoDict)
    proteinDf = mark_peptides_unique_to_proteins(proteinDf)
    proteinDf.to_csv(protFile, index=False)
def mgf_library_upload(fileName):
    """Load an MGF spectral library into a csodiaq library dictionary.

    Args:
        fileName: path to the .mgf library file.

    Returns:
        dict mapping (precursor m/z, peptide sequence) -> spectrum metadata
        with keys 'PrecursorCharge', 'transition_group_id', 'ProteinName',
        'Peaks' (up to 10 (mz, sqrt-intensity, id) tuples in m/z order),
        'ID', and 'Decoy' (1 if 'DECOY' appears in the spectrum title).
    """
    libMGF = mgf.read(fileName)
    smf.print_milestone('Enter library dictionary upload: ')
    lib = {}
    # enumerate replaces the hand-rolled counter and avoids shadowing the
    # builtin `id`.
    for specId, spec in enumerate(libMGF, start=1):
        params = spec['params']
        key = (params['pepmass'][0], params['seq'])
        # Strip the '+'/'-' sign from the charge annotation, e.g. '2+' -> 2.
        charge = int(re.sub('[+-]', '', str(params['charge'][0])))
        name = params['title']
        protein = params.get('protein', '')
        decoy = 1 if 'DECOY' in name else 0
        mz = spec['m/z array']
        # Square-root transform de-emphasizes dominant peak intensities.
        intensity = [x**0.5 for x in spec['intensity array']]
        peaks = list(zip(mz, intensity, [specId] * len(mz)))
        # Keep the 10 most intense peaks, then restore ascending m/z order.
        peaks.sort(key=lambda x: x[1], reverse=True)
        peaks = peaks[:10]
        peaks.sort(key=lambda x: x[0])
        lib[key] = {
            'PrecursorCharge': charge,
            'transition_group_id': name,
            'ProteinName': protein,
            'Peaks': peaks,
            'ID': specId,
            'Decoy': decoy,
        }
    return lib
def filter_fdr_output_for_targeted_reanalysis(fdrFile, proteins, heavy):
    """Filter an FDR output file down to targets for DISPA reanalysis.

    Args:
        fdrFile: path to an FDR output CSV (needs 'protein' and 'peptide'
            columns; 'uniquePeptide', 'ionCount', 'leadingProtein' when
            `proteins` is set).
        proteins: if truthy, keep at most this many unique peptides per
            leading protein (best ionCount first); falsy disables the filter.
        heavy: if truthy, keep only peptides ending in K or R (the residues
            labeled in heavy/SILAC experiments).

    Returns:
        The filtered DataFrame.
    """
    smf.print_milestone('Generate DISPA Targeted Reanalysis Files:')
    fdrDf = pd.read_csv(fdrFile)
    # Drop decoy hits; na=False treats missing protein entries as non-decoy.
    fdrDf = fdrDf[~fdrDf['protein'].str.contains('DECOY', na=False
                                                 )].reset_index(drop=True)
    if heavy:
        # str.endswith accepts a tuple of suffixes, replacing two chained
        # calls OR-ed together.
        fdrDf = fdrDf[fdrDf['peptide'].str.endswith(('R', 'K'))].reset_index(
            drop=True)
    if proteins:
        fdrDf = fdrDf[fdrDf['uniquePeptide'] == 1].sort_values(
            'ionCount', ascending=False).reset_index(drop=True)
        # NOTE(review): reset_index() here lacks drop=True, so the old index
        # is added as an 'index' column — unlike every other reset_index in
        # this function. Kept as-is; confirm downstream consumers expect it.
        fdrDf = fdrDf.groupby(['leadingProtein']).head(proteins).reset_index()
    return fdrDf
def connect_mzxml_to_csodiaq_and_library(idFile, libFile, mzxmlFiles,
                                         maxPeaks):
    """Build the scan-indexed dictionaries used for quantification.

    Args:
        idFile: csodiaq identification file (.csv comma-separated, anything
            else tab-separated).
        libFile: spectral library file path.
        mzxmlFiles: list of mzXML paths; only the first is used to map
            metadata to scans.
        maxPeaks: cap on peaks kept per pooled library spectrum.

    Returns:
        (scanToCsodiaqDict, scanToLibPeaksDict) tuple.
    """
    smf.print_milestone('Preparing Quantification Dictionaries:')
    metadataToScanDict = create_mzxml_to_csodiaq_dict(mzxmlFiles[0])
    # The ID file may be comma- or tab-separated; pick by file extension.
    separator = ',' if idFile.split('.')[-1] == 'csv' else '\t'
    fragDf = pd.read_csv(idFile, sep=separator)

    scanToCsodiaqDict, libMetadataToScanDict = connect_csodiaq_data_to_scans(
        idFile, metadataToScanDict, fragDf)
    scanToLibPeaksDict = pool_library_spectra_by_scan(
        libFile, libMetadataToScanDict, fragDf, maxPeaks)
    return scanToCsodiaqDict, scanToLibPeaksDict
def heavy_light_quantification(fragDict, libDict, mzxmlFiles, outDir, massTol,
                               minMatch, ratioType, correction, hist):
    """Compute heavy/light (SILAC) ratios for each mzXML file.

    For every file, each scan listed in libDict is matched against its
    library spectra; per-(scan, peptide) ratios are then written into a new
    column of the output DataFrame (one column per file, NaN when no ratio
    was determined).

    Args:
        fragDict: fragment dictionary used to initialize the output table.
        libDict: maps scan id -> library spectra for that scan.
        mzxmlFiles: mzXML file paths; each becomes a ratio column.
        outDir: unused here; kept for interface compatibility with callers.
        massTol: m/z tolerance forwarded to compare_spectra.
        minMatch: minimum match count forwarded to the matcher calls.
        ratioType: ratio mode forwarded to determine_ratios.
        correction: -1 disables the ppm-window correction pass.
        hist: histogram argument forwarded to the correction pass.

    Returns:
        finalDf with one ratio column added per mzXML file.
    """
    finalDf = initialize_quantification_output(fragDict, libDict)

    for f in mzxmlFiles:
        # Removed unused `ppmDiffs` accumulator from the original.
        allSpectraMatch = QuantificationSpectraMatcher.QuantificationSpectraMatcher(
        )
        scanToNoiseIntensityCutoffDict = {}
        with mzxml.read(f, use_index=True) as file:
            for scan in sorted(libDict.keys()):
                spec = file.get_by_id(scan)

                # Noise floor: half the mean of the 10 weakest intensities.
                scanToNoiseIntensityCutoffDict[int(scan)] = np.mean(
                    sorted(spec['intensity array'])[:10]) / 2

                expSpectrum = smf.format_spectra_for_pooling(spec,
                                                             scan,
                                                             sqrt=False)
                expSpectrum.sort()

                libSpectra = sorted(libDict[scan])

                quantSpectraMatch = QuantificationSpectraMatcher.QuantificationSpectraMatcher(
                )
                quantSpectraMatch.compare_spectra(libSpectra, expSpectrum,
                                                  massTol, minMatch)
                allSpectraMatch.extend_all_spectra(quantSpectraMatch)

        if correction != -1:
            allSpectraMatch.filter_by_corrected_ppm_window(
                correction, hist, minMatch)

        # Missing (scan, peptide) pairs default to NaN; a lambda factory
        # replaces the original's nested named function.
        ratioDict = defaultdict(lambda: np.nan)
        if len(allSpectraMatch.libraryIntensities) != 0:
            ratioDict = allSpectraMatch.determine_ratios(
                ratioDict, scanToNoiseIntensityCutoffDict, ratioType, minMatch)

        # One ratio per output row, keyed by (scan number, peptide).
        finalDf[f] = [
            ratioDict[(int(row['scan']), row['peptide'])]
            for index, row in finalDf.iterrows()
        ]
    smf.print_milestone('Finish SILAC Quantification')
    return finalDf
def perform_spectra_pooling_and_analysis(querySpectraFile, outFile, lib,
                                         tolerance, maxQuerySpectraToPool,
                                         corrected, histFile):
    """Match pooled query spectra against pooled library spectra, apply an
    FDR cutoff, and write identifications to outFile.

    Query scans are grouped by precursor m/z isolation window; library
    spectra falling in each window are pooled and compared against pooled
    query spectra in batches of at most maxQuerySpectraToPool scans.

    Args:
        querySpectraFile: path to the query mzXML file.
        outFile: output path for the identification results.
        lib: library dictionary (from the library upload functions).
        tolerance: matching tolerance forwarded to compare_spectra.
        maxQuerySpectraToPool: max query scans pooled per comparison batch.
        corrected: -1 disables the ppm-correction pass; other values are
            forwarded to filter_by_corrected_ppm_window.
        histFile: histogram output path used by the correction pass.
    """
    smf.print_milestone('Begin Grouping Scans by m/z Windows:')
    queWindowDict, queScanValuesDict = pool_scans_by_mz_windows(
        querySpectraFile)

    print('Number of Unpooled MS/MS Query Spectra: ' +
          str(len(queScanValuesDict)))
    print('Number of Pooled MS/MS Query Spectra/Mz Windows: ' +
          str(len(queWindowDict)),
          flush=True)

    # To enhance the print experience, status prints will be given at intervals tailored to the number of identified windows.
    #  example: if there are 1-99 pooled query spectra, print statements are made after every pooled query spectra analysis is complete.
    #           if there are 100-999, print after every 10 pooled spectra. And so on.
    printFriendlyCounter = 100
    while printFriendlyCounter < len(queWindowDict):
        printFriendlyCounter *= 10
    printFriendlyCounter /= 100

    allLibKeys, libIdToKeyDict, libIdToDecoyDict = gather_library_metadata(lib)
    allSpectraMatches = IdentificationSpectraMatcher.IdentificationSpectraMatcher(
    )
    numWindowsAnalyzed = 0

    prevtime = timer()
    smf.print_milestone('Begin Pooled Spectra Analysis:')
    with mzxml.read(querySpectraFile, use_index=True) as spectra:

        for precMz_win, scans in queWindowDict.items():
            # precMz_win appears to be (window center m/z, window width);
            # the window edges are center +/- width/2 — TODO confirm against
            # pool_scans_by_mz_windows.
            top_mz = precMz_win[0] + precMz_win[1] / 2
            bottom_mz = precMz_win[0] - precMz_win[1] / 2
            libKeys = identify_lib_spectra_in_window(top_mz, bottom_mz,
                                                     allLibKeys)
            # Skip windows with no library spectra to match against.
            if len(libKeys) == 0: continue
            pooledLibSpectra = pool_lib_spectra(lib, libKeys)
            pooledQueSpectra = []

            for i in range(len(scans)):
                scanNumber = scans[i]
                queSpectrum = spectra.get_by_id(scanNumber)
                pooledQueSpectra += smf.format_spectra_for_pooling(
                    queSpectrum, scanNumber)

                # Flush the pool every maxQuerySpectraToPool scans (but not
                # at i == 0) and always at the window's last scan.
                if (i % maxQuerySpectraToPool == 0
                        and i != 0) or i == len(scans) - 1:
                    pooledQueSpectra.sort()
                    windowSpectraMatches = IdentificationSpectraMatcher.IdentificationSpectraMatcher(
                    )
                    windowSpectraMatches.compare_spectra(
                        pooledLibSpectra, pooledQueSpectra, tolerance,
                        libIdToDecoyDict)
                    allSpectraMatches.extend_all_spectra(windowSpectraMatches)
                    pooledQueSpectra.clear()

            numWindowsAnalyzed += 1
            # Periodic progress checkpoint (interval chosen above).
            if numWindowsAnalyzed % printFriendlyCounter == 0:
                time = timer()
                print('\nNumber of Pooled Experimental Spectra Analyzed: ' +
                      str(numWindowsAnalyzed))
                print('Number of Spectra in Current Pooled Spectra: ' +
                      str(len(scans)))
                print('Time Since Last Checkpoint: ' +
                      str(round(time - prevtime, 2)) + ' Seconds',
                      flush=True)
                prevtime = time

    smf.print_milestone('Begin FDR Analysis:')
    maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    if corrected != -1:
        # Optional second pass: narrow matches to a corrected ppm window,
        # then recompute the FDR cutoff on the filtered matches.
        smf.print_milestone('Begin Correction Process:')
        allSpectraMatches.filter_by_corrected_ppm_window(
            corrected, maccCutoff, histFile)

        smf.print_milestone('Begin Corrected FDR Analysis:')
        maccCutoff = allSpectraMatches.find_score_fdr_cutoff()

    smf.print_milestone('\nBegin Writing to File: ')
    allSpectraMatches.write_output(outFile, querySpectraFile, maccCutoff,
                                   queScanValuesDict, libIdToKeyDict, lib)
def traml_library_upload(fileName):
    """Load a TraML-style (tsv/csv) spectral library into a csodiaq dict.

    Args:
        fileName: path to the library file; .tsv is read tab-separated,
            anything else comma-separated.

    Returns:
        dict mapping (precursor m/z, peptide name) -> spectrum metadata with
        keys 'PrecursorCharge', 'transition_group_id', 'ProteinName',
        'Peaks' (up to 10 (mz, sqrt-intensity, id) tuples in m/z order),
        'ID', and 'Decoy'.
    """
    if fileName.endswith('.tsv'):
        lib_df = pd.read_csv(fileName, sep='\t')
    else:
        lib_df = pd.read_csv(fileName)
    smf.print_milestone('Enter library dictionary upload: ')

    # Pan human and spectraST libraries have different column names. This normalizes the columns.
    headings = traml_column_headings(lib_df.columns)
    # Single ordered column list replaces the original's duplicated literals
    # for the intersection and the reselection.
    orderedColumns = [
        headings['PrecursorMz'], headings['FullUniModPeptideName'],
        headings['PrecursorCharge'], headings['ProductMz'],
        headings['LibraryIntensity'], headings['transition_group_id'],
        headings['ProteinName']
    ]
    lib_df = lib_df.loc[:, lib_df.columns.intersection(orderedColumns)]
    lib_df = lib_df[orderedColumns]
    lib_df.columns = [
        'PrecursorMz', 'FullUniModPeptideName', 'PrecursorCharge', 'ProductMz',
        'LibraryIntensity', 'transition_group_id', 'ProteinName'
    ]

    # Square-root transform de-emphasizes dominant peak intensities.
    lib_df['LibraryIntensity'] = [
        x**0.5 for x in list(lib_df['LibraryIntensity'])
    ]
    lib_df['ID'] = list(
        zip(lib_df['PrecursorMz'].tolist(),
            lib_df['FullUniModPeptideName'].tolist()))

    # Collect all product m/z and intensity values per ID before
    # de-duplicating to one row per ID.
    mz_dict = lib_df.groupby("ID")['ProductMz'].apply(list).to_dict()
    intensity_dict = lib_df.groupby("ID")['LibraryIntensity'].apply(
        list).to_dict()
    lib_df.drop_duplicates(subset="ID", inplace=True)
    lib_df = lib_df.loc[:,
                        lib_df.columns.intersection([
                            'ID', 'PrecursorCharge', 'transition_group_id',
                            'ProteinName'
                        ])]
    lib_df.set_index("ID", drop=True, inplace=True)
    lib = lib_df.to_dict(orient="index")

    # pan human library formats are different, including how the peptides are matched to proteins (esp. decoys). This section of code adjusts for this discrepancy.
    if headings['type'] == 'PanHuman':
        # Iterate values directly instead of items() with repeated lib[key]
        # lookups; only the entry dicts are mutated, never the key set.
        for entry in lib.values():
            proteins = entry['ProteinName'].split('/')
            num = proteins.pop(0)
            newProteins = [x for x in proteins if 'DECOY' not in x]
            proteinStr = str(len(newProteins))
            for x in newProteins:
                if 'DECOY' in num: proteinStr += ('/DECOY_' + x)
                else: proteinStr += ('/' + x)
            entry['ProteinName'] = proteinStr

    # enumerate replaces the hand-rolled counter and avoids shadowing the
    # builtin `id`.
    for entryId, (key, entry) in enumerate(lib.items(), start=1):
        # Sort peak pairs by m/z (zip/sort/unzip keeps pairs aligned).
        mz, intensity = (list(t) for t in zip(
            *sorted(zip(mz_dict[key], intensity_dict[key]))))
        peaks = list(zip(mz, intensity, [entryId] * len(mz)))

        # Keep the 10 most intense peaks, then restore ascending m/z order.
        peaks.sort(key=lambda x: x[1], reverse=True)
        peaks = peaks[:10]
        peaks.sort(key=lambda x: x[0])

        entry['Peaks'] = peaks
        entry['ID'] = entryId
        entry['Decoy'] = 1 if 'DECOY' in entry['ProteinName'] else 0
    return lib