Exemplo n.º 1
0
def mzxml_import(file_path):
    """Read centroided mzXML data into a flat peak table.

    Every MS1 spectrum in ``file_path`` is expanded into one row per peak
    whose intensity is at least ``config.intensity_cutoff``. Spectra of other
    MS levels are ignored.

    Args:
        file_path: Path of the mzXML file to read.

    Returns:
        A ``pandas.DataFrame`` with the columns ``scan``, ``rt``, ``mz``,
        ``drift`` and ``intensity``, or ``None`` when no peak passes the
        cutoff. ``scan`` is the position of the spectrum in the file
        (counting spectra of all MS levels), ``rt`` is the retention time
        rounded to two decimals, ``drift`` is always ``None`` and
        ``intensity`` is converted with ``int()``.
    """
    headers = ["scan", "rt", "mz", "drift", "intensity"]
    input_data = []

    intensity_cutoff = config.intensity_cutoff

    # The context manager closes the underlying file once parsing is done.
    with mzxml.MzXML(file_path) as reader:
        for index, spectrum in enumerate(reader):
            mz_array = spectrum["m/z array"]
            intensity_array = spectrum["intensity array"]
            if len(mz_array) != len(intensity_array):
                print(
                    "ERROR: mzXML import; m/z and intensity arrays different lengths"
                )
            if spectrum["msLevel"] != 1:
                continue
            rt = round(spectrum["retentionTime"], 2)
            # zip() stops at the shorter array, so a malformed spectrum
            # (reported above) contributes its common prefix instead of
            # raising IndexError half-way through the import.
            for mz, intensity in zip(mz_array, intensity_array):
                if intensity >= intensity_cutoff:
                    input_data.append([index, rt, mz, None, int(intensity)])

    if not input_data:
        print("No mass peaks found for " + file_path)
        return None

    mzxml_dataframe = pd.DataFrame.from_records(input_data, columns=headers)
    print("Completed mzXML import")
    return mzxml_dataframe
Exemplo n.º 2
0
def read_data(file):
    """
    Open an mzXML file with pyteomics.mzxml and return the reader object.

    A confirmation line naming the file is printed once the reader has been
    created.
    """
    reader = mzxml.MzXML(file)
    print(f"{file!s} has been accepted")
    return reader
def getRT(runs, idTxt):
    """Infer a retention time for every peptide-charge pair in every run.

    For each run, the PSMs whose ``Outfile`` contains the run name are grouped
    by their ``<Peptide>_<charge>`` key. The precursor peak of every PSM in a
    group is looked up with ``getPrecursorPeak`` and the group's RT is the
    intensity-weighted mean of the precursor RTs.

    Args:
        runs: Iterable of mzXML file paths.
        idTxt: Path of the ID.txt file containing all identified PSMs. The
            file is semicolon-delimited and its first line is skipped.

    Returns:
        The result of ``reformatRtTable`` applied to a DataFrame with the
        columns ``key``, ``run``, ``RT`` and ``nPSMs`` (one row per key and
        run).
    """
    # Parameter(s)
    params = {"isolation_window": 1}  # isolation window size 1= +/-0.5

    # Read ID.txt files to extract PSM information
    print("  Read ID.txt file: to extract PSM information")
    psms = pd.read_csv(
        idTxt, skiprows=1,
        sep=";")  # Note that ID.txt file is delimited by semicolon
    psms = psms[["Peptide", "Outfile", "XCorr"]].drop_duplicates()
    # The charge is the second-to-last dot-separated token of the outfile name
    psms["charge"] = [
        outfile.split("/")[-1].split(".")[-2] for outfile in psms["Outfile"]
    ]
    psms["key"] = psms["Peptide"] + "_" + psms["charge"]
    print("  Done ...\n")

    # RT extraction/assignment for each mzXML file
    res = []
    for run in runs:
        runName = os.path.basename(run).split(".")[0]

        # The context manager closes the mzXML file once the run is processed.
        with mzxml.MzXML(run) as reader:
            ms2ToSurvey = getMs2ToSurvey(reader)
            # regex=False: the run name is a literal substring, so characters
            # such as "+" or "(" in a file name must not be read as a pattern.
            subPsms = psms[psms["Outfile"].str.contains(runName, regex=False)]

            # Unique key is peptide-charge pair
            print(
                "  RT of every identified peptide in {} is being inferred and assigned"
                .format(runName))
            # groupby splits the table once (instead of re-filtering it for
            # every key); sort=False keeps keys in order of first appearance.
            groups = subPsms.groupby("key", sort=False)
            progress = progressBar(groups.ngroups)
            for key, keyPsms in groups:
                progress.increment()
                rts = []
                intensities = []
                for outfile in keyPsms["Outfile"]:
                    # Outfile base name has five dot-separated tokens; the
                    # second one is the MS2 scan number of the PSM.
                    [_, psmScanNum, _, _,
                     _] = os.path.basename(outfile).split(".")
                    psmScanNum = int(psmScanNum)
                    surveyScanNum = ms2ToSurvey[psmScanNum]
                    _, precIntensity, precRt = getPrecursorPeak(
                        reader, psmScanNum, surveyScanNum, params)
                    rts.append(precRt)
                    intensities.append(precIntensity)
                rtArray = np.array(rts, dtype=float)
                intArray = np.array(intensities, dtype=float)
                # Intensity-weighted mean RT
                rt = sum(rtArray * intArray) / sum(intArray)  # Unit of minute
                res.append([key, runName, rt, len(rtArray)])
        print("  Done ...\n")

    res = pd.DataFrame(res, columns=["key", "run", "RT", "nPSMs"])
    res = reformatRtTable(res)
    return res
Exemplo n.º 4
0
def mzxml_to_pandas_df(filename):
    """Read an mzXML file and return its spectra as one pandas.DataFrame.

    Every spectrum yielded by the reader is converted with ``pd.DataFrame``
    and the per-spectrum frames are concatenated. The ``intensity array``
    column is cast to ``float64``.

    Args:
        filename: Path of the mzXML file.

    Returns:
        The concatenated ``pandas.DataFrame``.

    Raises:
        ValueError: From ``pd.concat`` when the file yields no spectra.
        Exception: Any error raised while parsing a spectrum or building its
            frame is propagated instead of silently truncating the result.
    """
    # Iterating the reader stops cleanly at the end of the file; the context
    # manager closes the file handle afterwards.
    with mzxml.MzXML(filename) as reader:
        print("Reading:", filename)
        slices = [pd.DataFrame(spectrum) for spectrum in reader]
    df = pd.concat(slices)
    # Return value is discarded, so this helper presumably converts in place
    # (TODO confirm against df_to_numeric).
    df_to_numeric(df)
    df["intensity array"] = df["intensity array"].astype(np.float64)
    return df
Exemplo n.º 5
0
def mzxml_to_pandas_df(filename):
    '''
    Reads mzXML file and returns a pandas.DataFrame.

    Every spectrum yielded by the reader is converted with pd.DataFrame, the
    per-spectrum frames are concatenated and only the columns
    'retentionTime', 'm/z array' and 'intensity array' are kept.

    Raises ValueError (from pd.concat) when the file yields no spectra. Errors
    raised while parsing a spectrum or building its frame are propagated
    instead of silently truncating the result.
    '''
    cols = ['retentionTime', 'm/z array', 'intensity array']
    # Iterating the reader stops cleanly at the end of the file; the context
    # manager closes the file handle afterwards.
    with mzxml.MzXML(filename) as reader:
        slices = [pd.DataFrame(spectrum) for spectrum in reader]
    df = pd.concat(slices)[cols]
    # Return value is discarded, so this helper presumably converts in place
    # (TODO confirm against df_to_numeric).
    df_to_numeric(df)
    return df
Exemplo n.º 6
0
def mzxml_to_df(fn):
    '''
    Load an mzXML file and return its scans as a single pandas.DataFrame.

    Each scan is turned into a frame with pd.DataFrame, reduced to the scan
    number, MS level, polarity, retention time, m/z and intensity columns,
    and the per-scan frames are concatenated. The result has the columns
    scan_id, ms_level, polarity, scan_time_min, mz and intensity and a fresh
    0..n-1 index.
    '''
    wanted = [
        'num', 'msLevel', 'polarity', 'retentionTime', 'm/z array',
        'intensity array'
    ]
    renamed = {
        'num': 'scan_id',
        'msLevel': 'ms_level',
        'retentionTime': 'scan_time_min',
        'm/z array': 'mz',
        'intensity array': 'intensity'
    }
    final_order = [
        'scan_id', 'ms_level', 'polarity', 'scan_time_min', 'mz', 'intensity'
    ]

    frames = []
    with mzxml.MzXML(fn) as ms_data:
        for scan in ms_data:
            frame = pd.DataFrame(scan)
            # Fix byteorder issue
            frame.loc[:, :] = frame.values.byteswap().newbyteorder()
            frames.append(frame[wanted])

    combined = pd.concat(frames)
    combined = combined.astype({
        'retentionTime': np.float32,
        'm/z array': np.float32,
        'intensity array': int
    })
    combined = combined.rename(columns=renamed).reset_index(drop=True)
    return combined[final_order]
Exemplo n.º 7
0
        assert (subset_csv.split('.')[-1] == 'csv')
    except:
        print("subset csv file is not provided, will not be used")
        subset_csv = None

    guide_file_path = '/'.join(guide.split('.')[0].split('/')[:-1])
    if guide_file_type == 'txt':
        print('accessing encycolpedia file:', guide)
        rt_fit_file = glob.glob(guide_file_path+'./*.rt_fit.txt')[0]
        print('accessing encycolpedia file rt file:', rt_fit_file)
    elif guide_file_type == 'xml':
        print('accessing pepxml file:', pepxmlfilename)
    print("################################################\n")

    print('reading mzXML file...')
    mzxml_it = mzxml.MzXML(mzXMLfilename)
    print('reading guide file...')
    if guide_file_type =='txt':
        encyc_parsed = guide_parsers.parse_encyclopedia(guide_file_path, filename=guide_file_name, q_value=q_cut, IO=PARSERIO)
        guide_parsed = guide_parsers.parse_rt_fit(guide_file_path, encyc_parsed, filename = rt_fit_file, IO=PARSERIO)
    elif guide_file_type=='xml':
        pepxml_it = pepxml.PepXML(pepxmlfilename)
    elif guide_file_type=='csv':
        sys.exit('csv not yet implemented')

    os.makedirs(os.path.dirname('misc/NPULSE.batch'), exist_ok=True)  # create NPULSE file
    t1 = time.time()
    run_program(mzxml_it, pepxml_it)
    t2 = time.time()
    
    dt = t2 - t1
Exemplo n.º 8
0
#    It needs to be considered in the script

# << ReAdW-based mzXML >>
# A mzXML file from ReAdW has the following characteristics
# 1. MS3 scan is not always MS2 + 1 scan (it depends on MS instrument, not on MSconvert)
#    For example,
#    scan#100: MS2 -> scan#101: MS3 (generally)
#    scan#1000: MS2, scan#1001: MS2 -> scan#1003:MS3, scan#1004: MS3 (some cases)
# 2. "msLevel" of MS2 scan is set to 0
# 3. There's no tag representing precursor m/z in MS2 and MS3
# 4. Precursor m/z can be inferred from "filterLine" tag in MS2 and MS3
# 5. Precursor m/z value is identical to .raw file (no re-evaluation/re-calculation)

# Inferred relationship between MS2 and MS3 using mzXML file
mzxmlFile = "NCI-11plex-1-F1-f10268.mzXML"
reader = mzxml.MzXML(mzxmlFile)
nTotScans = len(reader)
nMS1, nMS2, nMS3 = 0, 0, 0
progress = progressBar(nTotScans)
f = open("MS2_MS3_mzXML.txt", "w")
with reader:
    ms2ToMs3 = {}
    for spec in reader:
        progress.increment()
        if spec["msLevel"] == 1:
            nMS1 += 1
            precMzToMs2 = {
            }  # This dictionary is re-initiated for every MS1-scan cycle
        if spec["msLevel"] == 2:
            nMS2 += 1
            precMz = spec["precursorMz"][0]["precursorMz"]
params["last_scan_extraction"] = "100000"  # the last scan used for search
params["isolation_window"] = "1"  # isolation window size 1= +/-0.5
params["mass_correction"] = "0"  # 0 = no correction, 1 = MS1-based
params["signal_noise_ratio"] = "0"  # fold of the minimum signal noise ratio
params[
    "max_percentage_RT_range"] = "100"  # threshold maximum percentage of the range of retention time of a peak
params["min_peak_intensity"] = "10000"  # threshold of a peak intensity
params["skipping_scans"] = "3"  # number of skipping scans during 3D formation
params[
    "mass_tolerance_peak_matching"] = "3"  # mass tolerance for peak matching during 3D formation
features, ms1ToFeatures = detectFeatures(mzXML, params)
# features = pd.read_pickle("FTLD_Batch2_F50_Features_SN0_Gap3_3ppm.pickle")
# ms1ToFeatures = pd.read_pickle("FTLD_Batch2_F50_MS1_to_Features.pickle")

# Read mzXML file
reader = mzxml.MzXML(mzXML)
ms2ToSurvey = getMs2ToSurvey(reader)
mzXMLBaseName = os.path.basename(mzXML).split(".")[0]

# Read ID.txt files to extract PSM information
print("  Read ID.txt file and feature file")
psms = pd.read_csv(idTxt, skiprows=1,
                   sep=";")  # Note that ID.txt file is delimited by semicolon
psms = psms[["Peptide", "Outfile", "measuredMH", "XCorr"]]
psms = psms.loc[psms["Outfile"].str.contains(
    mzXMLBaseName)]  # Extract PSMs from FTLD_Batch2_F50.mzXML
psms["precMz"] = np.nan
psms["charge"] = np.nan
psms["featureIndex"] = np.nan
psms["category"] = ""
psms = psms.drop_duplicates()
Exemplo n.º 10
0
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights("SteroidXtract_model.h5")
loaded_model.compile(optimizer='adam', loss='binary_crossentropy')

os.chdir(input_dir)
files = [f for f in os.listdir(input_dir) if f.endswith('.mzXML')]

for l in range(len(files)):
    print('New file loaded')
    os.chdir(input_dir)
    # read mzxml file
    mzxml_file = files[l]
    print(files[l])
    file = mzxml.MzXML(mzxml_file)  # dict
    feature_df = pd.DataFrame(
        np.nan,
        index=range(len(file)),
        columns=['mzxml_index', 'precursor_MZ', 'rt', 'precursor_intensity'])

    # fill in precursorMZ and RT information
    h = 0
    for i in range(len(file)):
        if (file[i]['msLevel'] != 2): continue  # only MS2 recorded
        if (file[i]['retentionTime'] > rt_threshold): continue
        feature_df.iloc[h, 0] = int(file[i]['num'])
        feature_df.iloc[h, 1] = float(file[i]['precursorMz'][0]['precursorMz'])
        feature_df.iloc[h, 2] = float(file[i]['retentionTime'])
        feature_df.iloc[h, 3] = int(
            file[i]['precursorMz'][0]['precursorIntensity'])
Exemplo n.º 11
0
def ms2ForFeatures(full, mzxmlFiles, paramFile):
    """Assign MS2 spectra to aligned features and consolidate them.

    For every mzXML run, the scans between the smallest ``minMS1`` and the
    largest ``maxMS1`` of the run's features are visited. An MS2 scan is a
    candidate for a feature when the most recent MS1 (survey) scan lies in the
    feature's MS1 scan range, the feature m/z lies within the isolation
    tolerance of the precursor m/z parsed from the scan's ``filterLine``, and
    the feature's ``PercentageTF`` does not exceed the configured maximum.
    Candidates are then filtered by their share of precursor intensity (PPI)
    in the survey scan. Matched MS2 spectra are merged within each run
    (``intraConsolidation``) and across runs (``interConsolidation``), written
    to ``align_<output_name>/MS2/f<N>.MS2`` under the current working
    directory, and attached to the feature summary table.

    Args:
        full: pandas.DataFrame of aligned features. Per-run columns are
            selected by the prefix ``<mzXML base name>_``; the part after the
            last underscore is used as the field name (``minMS1``, ``maxMS1``,
            ``mz`` and ``PercentageTF`` are read here).
        mzxmlFiles: Sequence of mzXML file paths.
        paramFile: Path of the parameter file read by ``utils.getParams``.

    Returns:
        A tuple ``(df, featureToScan)``. ``df`` is the table returned by
        ``utils.summarizeFeatures`` with an added ``MS2`` column, sorted by
        ``feature_m/z`` and with a leading 1-based ``feature_num`` column.
        ``featureToScan`` is an object array of shape (features, files)
        holding ``;``-joined MS2 scan numbers, or ``None`` where nothing
        matched.

    Side effects:
        Writes the ``.MS2`` files and a hidden ``.fully_aligned_feature.pickle``
        file; when ``skip_feature_detection`` is ``"0"`` the mzXML files are
        moved into per-run directories under the current working directory.
    """
    print("  Identification of MS2 spectra for the features")
    print("  ==============================================")
    logging.info("  Identification of MS2 spectra for the features")
    logging.info("  ==============================================")
    # Change pd.DataFrame to np.RecArray for internal computation (speed issue)
    full = full.to_records(index=False)

    ######################################
    # Load parameters and initialization #
    ######################################
    params = utils.getParams(paramFile)
    # Either the string "max" or a numeric percentage threshold
    ppiThreshold = params["ppi_threshold_of_features"]
    pctTfThreshold = float(params["max_percentage_RT_range"])
    tolIsolation = float(params["isolation_window"])
    tolPrecursor = float(params["tol_precursor"])
    tolIntraMS2Consolidation = float(params["tol_intra_ms2_consolidation"])
    tolInterMS2Consolidation = float(params["tol_inter_ms2_consolidation"])
    nFeatures = len(full)
    nFiles = len(mzxmlFiles)
    # Object arrays start out filled with None
    featureToScan = np.empty((nFeatures, nFiles), dtype=object)
    featureToSpec = np.empty((nFeatures, nFiles), dtype=object)

    #################################################
    # Assignment of MS2 spectra to features         #
    # Consolidation of MS2 spectra for each feature #
    #################################################
    for m, file in enumerate(mzxmlFiles):  # m = index for input files
        fileBasename, _ = os.path.splitext(os.path.basename(file))
        colNames = [
            item for item in full.dtype.names
            if item.startswith(fileBasename + "_")
        ]
        subset = full[colNames]
        # Strip the run prefix so fields can be addressed as "mz", "minMS1", ...
        subset.dtype.names = [s.split("_")[-1] for s in subset.dtype.names]
        ms2Dict = {}
        minScan, maxScan = int(np.nanmin(subset["minMS1"])), int(
            np.nanmax(subset["maxMS1"]))
        progress = utils.progressBar(maxScan - minScan + 1)
        print("  %s is being processed" % os.path.basename(file))
        print("  Looking for MS2 scan(s) responsible for each feature")
        logging.info("  %s is being processed" % os.path.basename(file))
        logging.info("  Looking for MS2 scan(s) responsible for each feature")
        # The reader is closed as soon as the scans have been visited, so the
        # file is no longer held open when it is moved at the end.
        with mzxml.MzXML(file) as reader:
            for scanNum in range(minScan, maxScan + 1):
                progress.increment()
                spec = reader[str(scanNum)]
                msLevel = spec["msLevel"]
                if msLevel == 1:
                    # Remember the latest survey scan for the MS2 scans after it
                    surveyNum = scanNum
                    continue
                if msLevel != 2:
                    continue

                # Find MS2 scans which satisfy the following conditions

                # From the discussion around June 2020,
                # 1. In ReAdW-derived mzXML files, precursor m/z values are in two tags: "precursorMz" and "filterLine"
                # 2. Through Haiyan's manual inspection, the real precursor m/z value is closer to one in "filterLine" tag
                # 3. So, in this script, precursor m/z of MS2 scan is obtained from "filterLine" tag
                # 4. Note that it may be specific to ReAdW-derived mzXML files since MSConvert-derived mzXML files do not have "filterLine" tag
                # 4.1. In this case, maybe the use of mzML (instead of mzXML) would be a solution (to-do later)
                p = re.search("([0-9.]+)\\@", spec["filterLine"])
                precMz = float(p.group(1))
                survey = reader[str(surveyNum)]
                fInd = np.where((surveyNum >= subset["minMS1"])
                                & (surveyNum <= subset["maxMS1"])
                                & (subset["mz"] >= (precMz - tolIsolation))
                                & (subset["mz"] <= (precMz + tolIsolation)) &
                                (subset["PercentageTF"] <= pctTfThreshold))[0]
                if len(fInd) == 0:
                    continue

                # Strongest survey-scan peak within the ppm tolerance of each
                # candidate feature (0 when no peak is found)
                ppi = []
                for featureIdx in fInd:
                    mz = subset["mz"][featureIdx]
                    lL = mz - mz * tolPrecursor / 1e6
                    uL = mz + mz * tolPrecursor / 1e6
                    ind = np.where((survey["m/z array"] >= lL)
                                   & (survey["m/z array"] <= uL))[0]
                    if len(ind) > 0:
                        ppi.append(np.max(survey["intensity array"][ind]))
                    else:
                        ppi.append(0)

                if sum(ppi) == 0:
                    continue
                # Convert intensities to percentage values
                ppi = np.asarray(ppi) / np.sum(ppi) * 100
                if ppiThreshold == "max":
                    fInd = np.array([fInd[np.argmax(ppi)]])
                else:
                    # ppiThreshold should be a numeric value
                    ppiThreshold = float(ppiThreshold)
                    fInd = fInd[np.where(ppi > ppiThreshold)]
                if len(fInd) == 0:  # Last check of candidate feature indexes
                    continue

                # Add this MS2 scan information to ms2Dict
                scanId = spec["num"]
                ms2Dict[scanId] = {
                    "mz": spec["m/z array"],
                    "intensity": spec["intensity array"],
                }

                # Mapping between features and MS2 scan numbers
                for featureIdx in fInd:
                    if featureToScan[featureIdx, m] is None:
                        featureToScan[featureIdx, m] = scanId
                    else:
                        featureToScan[featureIdx, m] += ";" + scanId

        print(
            "  Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        logging.info(
            "  Merging MS2 spectra for each feature within a run (it may take a while)"
        )
        progress = utils.progressBar(nFeatures)
        for i in range(nFeatures):
            progress.increment()
            if featureToScan[i, m] is not None:
                featureToSpec[i, m] = intraConsolidation(
                    ms2Dict, featureToScan[i, m], tolIntraMS2Consolidation)

    print(
        "  Merging MS2 spectra for each feature between runs when there are multiple runs"
    )
    print(
        "  Simplification of MS2 spectrum for each feature by retaining the most strongest 100 peaks"
    )
    logging.info(
        "  Merging MS2 spectra for each feature between runs when there are multiple runs"
    )
    logging.info(
        "  Simplification of MS2 spectrum for each feature by retaining the most strongest 100 peaks"
    )
    # Pre-sized object array (initially all None) filled by index; this avoids
    # re-allocating the whole array for every feature.
    specArray = np.empty(nFeatures, dtype=object)
    progress = utils.progressBar(nFeatures)
    for i in range(nFeatures):
        progress.increment()
        # Features without an MS2 spectrum in any run keep None
        if any(s is not None for s in featureToSpec[i]):
            specArray[i] = interConsolidation(featureToSpec[i, :],
                                              tolInterMS2Consolidation)

    ###############################
    # MS2 processing for features #
    ###############################
    # "specArray" is the list of (consolidated) MS2 spectra
    # specArray[i] is the MS2 spectrum corresponding to the i-th feature
    # If there's no MS2 spectrum, then specArray[i] is None
    df = utils.summarizeFeatures(full, params)
    # Add the mean m/z of feature and its charge state to the beginning of MS2 spectrum (similar to .dta file)
    for i in range(nFeatures):
        if specArray[i] is not None:
            specArray[i]["mz"] = np.insert(specArray[i]["mz"], 0,
                                           df["feature_m/z"].iloc[i])
            specArray[i]["intensity"] = np.insert(specArray[i]["intensity"], 0,
                                                  df["feature_z"].iloc[i])
    df["MS2"] = specArray
    df = df.sort_values(
        by="feature_m/z",
        ignore_index=True)  # Features are sorted by "feature_m/z"
    # "feature_num" follows the ascending order of "feature_m/z" (as sorted)
    df.insert(loc=0, column="feature_num", value=df.index + 1)

    # Write MS2 spectra to files
    filePath = os.path.join(os.getcwd(), "align_" + params["output_name"])
    ms2Path = os.path.join(filePath, "MS2")
    # makedirs also creates the parent "align_*" directory when it is missing
    os.makedirs(ms2Path, exist_ok=True)
    for i in range(df.shape[0]):
        if df["MS2"].iloc[i] is not None:
            fileName = os.path.join(ms2Path, "f" + str(i + 1) + ".MS2")
            dfMS2 = pd.DataFrame.from_dict(df["MS2"].iloc[i])
            dfMS2.to_csv(fileName, index=False, header=False, sep="\t")

    # Save fully-aligned features with their MS2 spectra (i.e. res) for debugging purpose
    # When the pipeline gets mature, this part needs to be removed
    # The leading dot makes the file hidden
    picklePath = os.path.join(filePath, ".fully_aligned_feature.pickle")
    with open(picklePath, "wb") as handle:
        pickle.dump(df, handle)

    ##########################
    # Handling mzXML file(s) #
    ##########################
    # Move mzXML files to the directory(ies) where individual .feature files are located
    if params["skip_feature_detection"] == "0":
        for file in mzxmlFiles:
            baseFilename = os.path.basename(file)
            featureDirectory = os.path.join(os.getcwd(),
                                            os.path.splitext(baseFilename)[0])
            os.rename(file, os.path.join(featureDirectory, baseFilename))

    return df, featureToScan
Exemplo n.º 12
0
# Script section: load the parameters and the ID.txt PSM table, then extract
# TMT reporter ion intensities from the MS3 scans of every fraction.
# mzxmlFile = "NCI-11plex-1-F1-f10268.mzXML"
paramFile = sys.argv[1]  # Parameter file path is the first command-line argument
params = getParams(paramFile)
idTxt = params["idtxt"]
print("  Loading ID.txt file")
psms, pep2psm, prot2psm, jumpfPath = parseIdtxt(idTxt, params)

####################################################
# Extract TMT reporter ion intensities - 1st round #
####################################################
print("  Extraction of TMT reporter ion intensities")
ms2ToMs3 = {}  # Dictionary; key = fraction (mzXML path), value = MS2 -> MS3 mapping of that fraction
qdict = {
}  # Dictionary; key = "<mzXML base name>_<MS2 scan number>", value = TMT reporter intensities (array)
# The keys of psms are used as mzXML paths; fractions are processed in sorted order
for frac in sorted(psms.keys()):
    reader = mzxml.MzXML(frac)
    print("  Processing %s" % os.path.basename(frac))
    print("  Looking for MS2 precursor scans of MS3 scans")
    ms2ToMs3[frac] = matchMs2ToMs3(psms[frac], reader)
    print("  Reporter intensities are being extracted from MS3 scans")
    progress = progressBar(len(ms2ToMs3[frac]))
    for ms2, ms3 in ms2ToMs3[frac].items():
        progress.increment()
        reporterIntensity = getReporterIntensity(ms3, reader, params)
        # ms2 is concatenated to a string here, so it must itself be a string
        key = os.path.basename(frac) + "_" + ms2
        qdict[key] = reporterIntensity
    print()

# Create a dataFrame after the first extraction of reporter m/z and intensity values
reporters = params["tmt_reporters_used"].split(";")
# One "mz" column name per reporter (every "sig" replaced by "mz"), followed by the reporter names themselves
columnNames = [re.sub("sig", "mz", i) for i in reporters] + reporters