Пример #1
0
def calculate_sample_means(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Calculate and add the mean intensity of each sample's replicates
    to the input dataframe.

    One new column per sample is appended to 'data'. With a single
    replicate per sample the column is named after the replicate column
    plus "_mean"; otherwise it is named after the first replicate column
    of the sample, without its trailing digits, plus "_mean".

    Zero intensities are left out of the mean. If every replicate of a
    sample is zero, the mean is 0. Means are rounded to the nearest
    integer and stored as 64-bit integers.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    numTechReps = parameters['numTechReps']
    # Column index of first sample replicate
    startIndex = parameters['firstSampleIndex'] - 1
    # Column index right after the last sample replicate
    endIndex = startIndex + (parameters['numSamples'] * numTechReps)
    if (numTechReps == 1):
        # The mean of a single replicate is the replicate itself, so the
        # mean column will have a (rounded) copy of the sample replicate
        for firstIndex in range(startIndex, endIndex):
            colName = data.columns[firstIndex] + '_mean'
            # Use an explicit 64-bit integer so the result does not
            # depend on the platform's default integer size
            data[colName] = data.iloc[:, firstIndex].astype(float).round(
                0).astype('int64')
    else:
        # New columns are appended at the end of the dataframe, so the
        # positions of the replicate columns stay valid while looping
        for firstIndex in range(startIndex, endIndex, numTechReps):
            lastIndex = firstIndex + numTechReps
            # Create the column name for the mean of the current sample
            colName = re.sub(r'\d+$', '', data.columns[firstIndex]) + '_mean'
            replicates = data.iloc[:, firstIndex:lastIndex].astype(float)
            # Number of non-zero replicates per row; rows where every
            # replicate is zero are divided by 1 to get a mean of 0
            nonZeroReps = replicates.astype(bool).sum(axis=1)
            rawMeans = replicates.sum(axis=1) / nonZeroReps.where(
                nonZeroReps > 0, 1)
            # Round to nearest integer, cast to integer and insert
            # sample means into the dataframe
            data[colName] = rawMeans.round(0).astype('int64')
Пример #2
0
def remove_isotopes(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Tag (and optionally remove) isotopes of parent analytes.

    The last 'numSamples' columns of 'data' are treated as the sample
    intensity columns. For each of them a "<column>_isotopes" column
    with the parent/isotope tags is appended to 'data'. If
    'removeIsotopes' is set in 'parameters', the intensity of every
    frame tagged as an isotope ("M+n") is set to 0 in that sample and
    the frames left empty are dropped afterwards.

    The isotope detection performs binary searches over the m/z values,
    so 'data' is expected to be sorted by m/z.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    removeIsotopes = parameters['removeIsotopes']
    # Calculate the location of sample columns based on the current
    # state of the dataframe (before adding isotope annotation)
    lastSampleCol = len(data.columns)
    firstSampleCol = lastSampleCol - parameters['numSamples']
    # m/z, retention time and frame index (first column) are the same
    # for every sample, so they are extracted only once
    mzValues = data[mzCol].values
    rtValues = data[rtCol].values
    frameIndex = data.iloc[:, 0].values
    for i in range(firstSampleCol, lastSampleCol):
        colName = data.columns[i]
        # Create an array from 'data' with m/z, retention time, the
        # sample's intensity and index per row
        array = numpy.stack(
            (mzValues, rtValues, data.iloc[:, i].values, frameIndex), axis=-1)
        tagArray = _detect_sample_isotopes(array, parameters)
        # Annotation columns are appended at the end, so the positions
        # of the sample columns are not altered
        isoColName = colName + '_isotopes'
        data.insert(len(data.columns), isoColName, tagArray)
        if (removeIsotopes):
            # Set the intensity of the sample detected isotopes to 0
            data.loc[data[isoColName].str.contains(r'M\+'), colName] = 0.0
    if (removeIsotopes):
        # Drop empty frames, i.e. isotope frames found in every sample
        data.drop_empty_frames(
                'Isotope removal (isotopes found in every sample)', parameters,
                True)
Пример #3
0
def __correct_columns__(featureCluster, rtArray, columns, parameters,
                        repsPerGroup):
    # type: (pandas.DataFrame, numpy.ndarray, slice, LFParameters, int) -> None
    """Correct retention time misalignment (in place) in one group of
    intensity columns of the given feature cluster.

    Keyword Arguments:
        featureCluster -- frames with the same feature cluster ID
        rtArray        -- retention time of each frame in the cluster
        columns        -- positional slice of the columns to correct
        parameters     -- LipidFinder's PeakFilter parameters instance
        repsPerGroup   -- number of columns that form the group
    """
    tmpData = featureCluster.iloc[:, columns].copy()
    # Get the index of frames with at least 1 column with a non-zero
    # intensity
    nonZeroIndices = numpy.where(tmpData.sum(axis=1) > 0)[0]
    if (nonZeroIndices.size < 2):
        # There are no adjacent frames to swap intensities with
        return
    # Get an array of the time difference to next frame. The last value
    # wraps around to the first frame, but it is never read by
    # __process_sample__().
    rtNonZero = rtArray[nonZeroIndices]
    rtDiff = numpy.roll(rtNonZero, -1) - rtNonZero
    # Fancy indexing returns a copy, which is corrected in place
    intensity = tmpData.values[nonZeroIndices]
    __process_sample__(intensity, rtDiff, parameters, repsPerGroup)
    # Replace old values with the new ones. Write through the indexer
    # rather than through 'tmpData.values': the latter is a temporary
    # copy when columns have mixed dtypes (the update would be lost) and
    # is read-only under pandas' Copy-on-Write mode.
    tmpData.iloc[nonZeroIndices, :] = intensity
    featureCluster.iloc[:, columns] = tmpData


def __process_feature__(featureCluster, parameters, means):
    # type: (pandas.DataFrame, LFParameters, bool) -> pandas.DataFrame
    """Correct retention time misalignment in the given feature cluster.

    The cluster is modified in place and also returned. Clusters with a
    single frame are returned untouched.

    Keyword Arguments:
        featureCluster -- frames with the same feature cluster ID
        parameters     -- LipidFinder's PeakFilter parameters instance
        means          -- perform the correction over mean columns
                          instead of each sample replicate?
    """
    if (len(featureCluster) == 1):
        return featureCluster
    # Get array of retention times (RT)
    rtArray = featureCluster[parameters['rtCol']].values
    if (means):
        # The sample means are the last 'numSamples' columns
        numSamples = parameters['numSamples']
        __correct_columns__(featureCluster, rtArray, slice(-numSamples, None),
                            parameters, numSamples)
    else:
        numTechReps = parameters['numTechReps']
        firstSampleIndex = parameters['firstSampleIndex'] - 1
        lastSampleIndex = firstSampleIndex + (parameters['numSamples']
                                              * numTechReps)
        # Loop through each set of replicates per sample
        for firstIndex in range(firstSampleIndex, lastSampleIndex,
                                numTechReps):
            __correct_columns__(
                    featureCluster, rtArray,
                    slice(firstIndex, firstIndex + numTechReps), parameters,
                    numTechReps)
    return featureCluster
Пример #4
0
def remove_outliers(data, parameters, src='samples'):
    # type: (LFDataFrame, LFParameters, str) -> None
    """Removes outliers from a set of replicates on a row by row basis.

    All sample replicates may be discarded if the relative standard
    deviation (RSD) of the remaining replicates cannot be reduced below
    the established threshold.

    The replicate columns of 'data' are updated in place and the frames
    left empty are dropped afterwards.

    Keyword Arguments:
        data         -- LFDataFrame instance
        parameters   -- LipidFinder's PeakFilter parameters instance
        src          -- columns where to check for outliers: "samples"
                        or "blanks" [default: "samples"]

    Raises:
        ValueError -- if 'src' is neither "samples" nor "blanks"
    """
    # Local import: pandas.concat() replaces DataFrame.append(), which
    # was removed in pandas 2.0
    import pandas

    if (src not in ['samples', 'blanks']):
        raise ValueError('Unexpected value. Options: samples, blanks')
    # Set the corresponding values regarding the columns to evaluate
    if (src == 'samples'):
        startIndex = parameters['firstSampleIndex'] - 1
        endIndex = startIndex + (parameters['numSamples'] *
                                 parameters['numTechReps'])
        repsPerGroup = parameters['numTechReps']
    else:
        # Blank (solvent) replicates are located after the sample and
        # the quality control (QC) replicates
        startIndex = parameters['firstSampleIndex'] \
                + (parameters['numSamples'] * parameters['numTechReps']) \
                + parameters['numQCReps'] - 1
        endIndex = startIndex + parameters['numSolventReps']
        repsPerGroup = parameters['numSolventReps']
    # Add dummy row to avoid unexpected behavior when using apply(): "In
    # the current implementation, apply calls func twice on the first
    # column/row to decide whether it can take a fast or slow code
    # path."
    dummyRow = data.iloc[0, :].to_frame().transpose()
    tmpData = pandas.concat([dummyRow, data], ignore_index=True)
    # Loop through each set of replicates per sample, in each case
    # slicing out and processing 1 sample's replicate
    for firstIndex in range(startIndex, endIndex, repsPerGroup):
        lastIndex = firstIndex + repsPerGroup
        tmpData.iloc[:, firstIndex : lastIndex] = \
                tmpData.iloc[:, firstIndex : lastIndex].apply(
                        __reps_frame__, axis=1, parameters=parameters)
    # Copy to data the new replicates values after removing the first
    # dummy row. The index is shifted back so the row labels start at 0
    # again.
    tmpData = tmpData.iloc[1:]
    tmpData.index = tmpData.index - 1
    data.iloc[:, startIndex : endIndex] = \
            tmpData.iloc[:, startIndex : endIndex]
    # Drop empty frames (if any)
    data.drop_empty_frames('Empty frames after Outlier Correction', parameters)
Пример #5
0
def rm_full_frags(
        array,  # type: numpy.ndarray
        fragments,  # type: pandas.DataFrame
        parameters  # type: LFParameters
):
    # type: (...) -> list[int]
    """Return the positions in 'array' of the features identified as
    common in-source fragments.

    A feature is reported when its m/z matches one of the m/z values in
    'fragments' and at least one feature with an m/z equal to or greater
    than the corresponding m/z cut-off elutes at the same retention time
    (RT). All m/z and RT matching are computed within tolerance. The
    m/z matching uses a binary search, so 'array' is expected to be
    sorted by m/z.

    Keyword arguments:
        array      -- array with m/z, RT and index of the original
                      dataframe
        fragments  -- in-source fragments to be removed ("MZ" and
                      "MZCutOff" columns)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzValues = array[:, 0]
    rtValues = array[:, 1]
    # One row per in-source fragment: fragment m/z and its m/z cut-off
    fragTable = numpy.stack(
        (fragments['MZ'].values, fragments['MZCutOff'].values), axis=-1)
    matched = []
    for fragMZ, mzCutOff in fragTable:
        # Positions of the first and last (exclusive) features whose m/z
        # is within tolerance of the fragment m/z
        firstPos, lastPos = numpy.searchsorted(
            mzValues, mz_tol_range(fragMZ, parameters['mzFixedError'],
                                   parameters['mzPPMError']))
        for pos in range(firstPos, lastPos):
            # The candidate is an in-source fragment if any feature over
            # the cut-off elutes at the same RT
            minRT, maxRT = rt_tol_range(rtValues[pos], RT_TOLERANCE)
            sameRT = (rtValues >= minRT) & (rtValues <= maxRT)
            if (numpy.any((mzValues >= mzCutOff) & sameRT)):
                matched.append(pos)
    return matched
Пример #6
0
def cluster_by_features(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Cluster contiguous ions within the same mass cluster where each
    member is separated by a retention time difference of less than
    'maxRTDiffAdjFrame' (in 'parameters').

    Feature clusters are identified and each assigned an arbitrary
    unique integer identifier, saved in the "FeatureClusterID" column
    (as floating point values). 'data' is modified in place: it is
    sorted by mass cluster ID, retention time and m/z, and its index is
    reset.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzCol = parameters['mzCol']
    rtCol = parameters['rtCol']
    # Re-sort dataframe ready for feature clustering
    data.sort_values(by=['mzClusterID', rtCol, mzCol], inplace=True,
                     kind='mergesort')
    # Reset index
    data.reset_index(inplace=True, drop=True)
    mzClusterIDs = data['mzClusterID'].values
    rtValues = data[rtCol].values
    # A frame starts a new feature cluster if it belongs to a different
    # mass cluster than the previous frame or if the retention time
    # difference with the previous frame is greater than
    # 'maxRTDiffAdjFrame'
    startsNewCluster = (
        (mzClusterIDs[1:] != mzClusterIDs[:-1])
        | (numpy.diff(rtValues) > parameters['maxRTDiffAdjFrame']))
    # The first frame gets ID 1 and the ID grows by one on each cluster
    # boundary. The IDs are computed in a separate array and assigned
    # explicitly instead of writing through the '.values' of the column,
    # which is not guaranteed to be a writable view of the dataframe.
    featureClusterIDs = numpy.ones(len(data), dtype=float)
    featureClusterIDs[1:] += numpy.cumsum(startsNewCluster)
    data['FeatureClusterID'] = featureClusterIDs
Пример #7
0
def get_fdr(data, parameters):
    # type: (LFDataFrame, LFParameters) -> float
    """Return the False Discovery Rate (FDR) of the dataset following a
    target-decoy strategy.

    The unique m/z values of 'data' are searched, in batches of
    BATCH_SIZE, both in the target database (COMP_DB, from LIPID MAPS)
    and in a decoy database (COMP_DB_5, created adding 0.5 Da to every
    m/z in COMP_DB, a very rare lipid mass defect). The FDR is the
    total number of decoy hits divided by the total number of target
    hits.

    Keyword arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance

    Raises:
        ValueError -- if there are no matches in the target database
    """
    positiveAdducts = (
        "M+H,M+H-H2O,M+2H,M+3H,M+4H,M+NH4,M+Ag,M+Na,M+2Na,M+K,"
        "M+2K,M+Li,M+2Li")
    negativeAdducts = 'M-H,M-CH3,M-2H,M-3H,M-4H,M.F,M.HF2,M.Cl,M.OAc,M.HCOO'
    # Every m/z is searched only once
    uniqueMZ = data[parameters['mzCol']].unique().tolist()
    # The adducts to search for depend on the polarity of the data
    if (parameters['polarity'] == 'Positive'):
        adducts = positiveAdducts
    else:
        adducts = negativeAdducts
    # Query the databases in batches to balance the number of requests
    # and the amount of information requested
    targetHits = 0
    decoyHits = 0
    for batchStart in range(0, len(uniqueMZ), BATCH_SIZE):
        batchEnd = batchStart + BATCH_SIZE
        # One m/z per line, like in a text file
        query = os.linesep.join(str(mz) for mz in uniqueMZ[batchStart:batchEnd])
        targetHits += _get_num_matches('COMP_DB', query, adducts)
        decoyHits += _get_num_matches('COMP_DB_5', query, adducts)
    if (targetHits != 0):
        return float(decoyHits) / targetHits
    # The ratio is undefined without matches in the target database
    raise ValueError(("No matches found in the target database. The FDR "
                      "cannot be computed."))
Пример #8
0
def _detect_sample_isotopes(array, parameters):
    # type: (numpy.ndarray, LFParameters) -> numpy.ndarray
    """Return an array with the tagged parents and their corresponding
    isotopes in the same order as in the given sample array.

    Parents are tagged as "[<id>][M]<sign>" and their isotopes as
    "[<id>][M+<n>]<sign>", where <id> is the index of the parent in the
    original dataframe, <n> the isotope number and <sign> the polarity
    of the data. Frames that are neither a parent nor an isotope get an
    empty string. A frame is only tagged as parent if its first isotope
    has been found.

    The m/z matching uses a binary search, so 'array' must be sorted by
    m/z in ascending order.

    Keyword Arguments:
        array      -- array with m/z, retention time (RT), sample's
                      intensity mean and index of the original dataframe
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    # Get the corresponding symbol for the polarity of the data (+ or -)
    polSign = '+' if (parameters['polarity'].lower() == 'positive') else '-'
    mzValues = array[:, 0]
    rtValues = array[:, 1]
    intensities = array[:, 2]
    frameIDs = array[:, 3]
    # Create an array of empty strings that will contain the tagged
    # parents and their corresponding isotopes
    tagArray = numpy.full(len(array), '', dtype=object)
    # Loop over each m/z to search for isotopes
    isotopesIndex = set()
    for index in range(0, len(array)):
        # Skip if frame has already been identified as an isotope
        if (frameIDs[index] in isotopesIndex):
            continue
        # Parent information is the same for every isotope peak. It is
        # read before the loop so 'tagID' is always defined when the
        # parent is tagged, even if no isotope peak is searched.
        parentMZ = mzValues[index]
        parentRT = rtValues[index]
        parentInten = intensities[index]
        tagID = int(frameIDs[index])
        for isoPeak in range(1, parameters['numIsotopes'] + 1):
            # Get the first and last indexes of the frames that are
            # within the isotope m/z range for the current analyte
            isotopeMZ = parentMZ + ISO_OFFSET * isoPeak
            minMZ, maxMZ = mz_tol_range(isotopeMZ, parameters['mzFixedError'],
                                        parameters['mzPPMError'])
            mzMatches = numpy.searchsorted(mzValues, [minMZ, maxMZ])
            if (mzMatches[0] == mzMatches[1]):
                # Have not found any analyte with an isotope-like m/z
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                else:
                    continue
            # Filter m/z matches with the same RT as the parent
            minRT, maxRT = rt_tol_range(parentRT,
                                        parameters['maxRTDiffAdjFrame'])
            candidateRT = rtValues[mzMatches[0] : mzMatches[1]]
            rtMatches = numpy.where((candidateRT >= minRT)
                                    & (candidateRT <= maxRT))[0]
            if (len(rtMatches) == 0):
                # No candidates are within the same RT
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                else:
                    continue
            # Resultant indexes are relative to the (contiguous) m/z
            # range, so shift them to get the indexes in 'array'
            rtMatches += mzMatches[0]
            # Filter the candidate isotopes by intensity. The intensity
            # range coefficients vary depending on the isotope number.
            if (isoPeak <= 2):
                # Get an estimated maximum number of C in the molecule
                numC = round(parentMZ / 12)
                # Calculate isotopic distribution based on polynomial
                # expansion
                if (isoPeak == 1):
                    baseIntensity = parentInten * (numC ** 1.3) * 0.002
                else:
                    baseIntensity = parentInten * (numC ** 1.7) * 0.0001
                minIntensity = baseIntensity * parameters['isoIntensityCoef'][0]
                maxIntensity = baseIntensity * parameters['isoIntensityCoef'][1]
            else:
                # Calculate isotopic distribution with the same formula
                # as CAMERA (from XCMS)
                minIntensity = parentInten * float('1e-{0}'.format(isoPeak + 2))
                maxIntensity = parentInten * 2
            isotopes = numpy.where(
                    (intensities[rtMatches] >= minIntensity)
                    & (intensities[rtMatches] <= maxIntensity))[0]
            if (len(isotopes) == 0):
                # No candidates have an intensity within expected range
                if (isoPeak == 1):
                    # The first isotope must exist to search for others
                    break
                else:
                    continue
            # Resultant indexes are positions within 'rtMatches', which
            # may not be contiguous: map them back through 'rtMatches'
            # instead of adding an offset
            isotopes = rtMatches[isotopes]
            # Tag the analyte as isotope and save its index to avoid
            # checking it as parent of other analytes
            tagArray[isotopes] = '[{0}][M+{1}]{2}'.format(tagID, isoPeak,
                                                          polSign)
            isotopesIndex.update(frameIDs[isotopes])
        else:
            # The search was not aborted, that is, the first isotope was
            # found: tag the analyte as parent
            tagArray[index] = '[{0}][M]{1}'.format(tagID, polSign)
    return tagArray
Пример #9
0
def __process_sample__(intensity, rtDiff, parameters, repsPerGroup):
    # type: (numpy.ndarray, numpy.ndarray, LFParameters, int) -> None
    """Correct retention time misalignment in the given sample.

    'intensity' is modified in place. A zero intensity in a frame in
    which at least half of the replicates are non-zero may be filled
    with the non-zero intensity of the same replicate in the previous
    or next frame (which is then set to zero), provided that:
        - the retention time difference between both frames is less
          than 'maxRTDiffAdjFrame',
        - the adjacent intensity is within 'intensityStDev' standard
          deviations of the mean of the non-zero intensities of the
          current frame, and
        - less than half of the replicates of the adjacent frame are
          non-zero, or exactly half of them are and the mean of the
          current frame is greater than or equal to the mean of the
          adjacent frame.
    The whole array is traversed repeatedly until a complete pass
    leaves it unchanged.

    Keyword Arguments:
        intensity    -- intensity per frame and sample's replicate
        rtDiff       -- time differences between consecutive frames
        parameters   -- LipidFinder's PeakFilter parameters instance
        repsPerGroup -- number of replicates per sample
    """
    while True:
        # Copy 'intensity' array to check later if it has been modified
        oldIntensity = numpy.copy(intensity)
        # Number of frames and replicates in the given feature cluster
        numRows, numCols = intensity.shape
        for rep in range(0, numCols):
            for row in range(0, numRows):
                # Only zero intensities are candidates to be filled
                if (intensity[row][rep] != 0):
                    continue
                # Require at least half non-zero intensity values
                elif ((2 * numpy.count_nonzero(intensity[row]))
                      >= repsPerGroup):
                    # Adjacent frame (row -/+ 1) intensity values
                    adjFrameValues = [0, 0]
                    # 'rtDiff[row - 1]' is the time difference between
                    # the previous frame and the current one
                    if ((row > 0) and (intensity[row - 1][rep] != 0)
                        and (rtDiff[row - 1]
                             < parameters['maxRTDiffAdjFrame'])):
                        # The frame above has a non-zero intensity and
                        # is within the allowed retention time (RT)
                        # threshold
                        adjFrameValues[0] = intensity[row - 1][rep]
                    # 'rtDiff[row]' is the time difference between the
                    # current frame and the next one. It is only read
                    # for rows before the last one, so the last value
                    # of 'rtDiff' is never used.
                    if ((row < (numRows - 1)) and (intensity[row + 1][rep] != 0)
                        and (rtDiff[row] < parameters['maxRTDiffAdjFrame'])):
                        # The frame below has a non-zero intensity and
                        # is within the allowed RT threshold
                        adjFrameValues[1] = intensity[row + 1][rep]
                    if (any(adjFrameValues)):
                        # Save the contiguous frame (if any) where to
                        # swap the intensity values: -1 for the previous
                        # frame, 1 for the next frame, 0 for no swap
                        swapIndex = 0
                        # At least one contiguous intensity is greater
                        # than zero. Get mean and standard deviation of
                        # current frame (non-zero values).
                        repMean = intensity[row][numpy.nonzero(
                                intensity[row])[0]].mean()
                        repStdDev = intensity[row][numpy.nonzero(
                                intensity[row])[0]].std()
                        # Calculate the maximum standard deviation
                        stDev = parameters['intensityStDev'] * repStdDev
                        # Check the previous frame: its intensity must
                        # be within the allowed deviation from the mean
                        if ((adjFrameValues[0] != 0)
                            and (adjFrameValues[0] >= repMean - stDev)
                            and (adjFrameValues[0] <= repMean + stDev)):
                            if ((2 * numpy.count_nonzero(intensity[row - 1]))
                                < repsPerGroup):
                                # Less than half of the replicates of
                                # the previous frame are non-zero
                                swapIndex = -1
                            elif ((2 * numpy.count_nonzero(intensity[row - 1]))
                                  == repsPerGroup):
                                # Exactly half of the replicates of the
                                # previous frame are non-zero: swap only
                                # if the mean of the current frame is
                                # not lower than the previous frame's
                                prevFrameMean = intensity[row - 1][
                                        numpy.nonzero(intensity[row - 1])[0]
                                        ].mean()
                                if (repMean >= prevFrameMean):
                                    swapIndex = -1
                        # Check the next frame under the same conditions
                        if ((adjFrameValues[1] != 0)
                            and (adjFrameValues[1] >= repMean - stDev)
                            and (adjFrameValues[1] <= repMean + stDev)):
                            # If 'swapIndex' is not 0, swap with the
                            # closest intensity value to the mean of the
                            # current frame
                            if ((swapIndex == 0)
                                or ((swapIndex != 0)
                                    and (abs(repMean - adjFrameValues[1])
                                         < abs(repMean - adjFrameValues[0])))):
                                nextNonZeroReps = numpy.count_nonzero(
                                        intensity[row + 1])
                                if ((2 * nextNonZeroReps) < repsPerGroup):
                                    swapIndex = 1
                                elif ((2 * nextNonZeroReps) == repsPerGroup):
                                    nextFrameMean = intensity[row + 1][
                                            numpy.nonzero(intensity[row + 1])[0]
                                            ].mean()
                                    if (repMean >= nextFrameMean):
                                        swapIndex = 1
                        if (swapIndex != 0):
                            # Swap with the chosen contiguous frame: the
                            # intensity is moved to the current frame
                            # and removed from the adjacent one
                            intensity[row][rep] = \
                                    intensity[row + swapIndex][rep]
                            intensity[row + swapIndex][rep] = 0
        # Repeat the process until no more modifications are performed.
        # The whole array is compared to its state before the pass, so
        # a pass whose swaps cancel each other out also ends the loop.
        if (numpy.array_equal(intensity, oldIntensity)):
            break
Пример #10
0
def cluster_by_mz(data, parameters):
    # type: (LFDataFrame, LFParameters) -> None
    """Cluster m/z artifacts that differ from each other by a mass less
    than the defined tolerance.

    The frames are first split into contiguous cluster sections and
    hierarchical clustering is then employed within each section to
    group the ions into the most appropriate groups. Mass clusters are
    assigned an arbitrary unique integer identifier, numbered by order
    of appearance and saved in the "mzClusterID" column of 'data' (as
    floating point values).

    Rows are processed by position and the m/z difference between
    consecutive rows drives the sectioning, so 'data' is expected to be
    sorted by m/z.

    Keyword Arguments:
        data       -- LFDataFrame instance
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzFixedError = parameters['mzFixedError']
    mzPPMError = parameters['mzPPMError']
    # Work on positional arrays so the result does not depend on the
    # index labels of the dataframe
    mzValues = data[parameters['mzCol']].values
    numRowsData = len(mzValues)
    # m/z difference between each frame and the next one
    mzDiffNextFrame = numpy.diff(mzValues)
    # Calculate the cluster sections as (first row, last row + 1) pairs
    sectionBounds = []
    sectionBegin = 0
    # Minimum amount of m/z that will belong to the same cluster section
    sectionMinSize = 49
    while ((numRowsData - sectionBegin) >= sectionMinSize):
        sectionEnd = sectionBegin + sectionMinSize
        while (sectionEnd < (numRowsData - 1)):
            # If the m/z difference to the next frame is greater than
            # the sum of the m/z delta of the largest mass in the
            # current group and the smallest mass in the next group, we
            # can close this cluster section and start a new one
            currentDelta = mz_delta(mzValues[sectionEnd], mzFixedError,
                                    mzPPMError)
            nextDelta = mz_delta(mzValues[sectionEnd + 1], mzFixedError,
                                 mzPPMError)
            if (mzDiffNextFrame[sectionEnd] > (currentDelta + nextDelta)):
                break
            sectionEnd += 1
        # The section includes the frame at 'sectionEnd' (if any)
        sectionEnd = min(sectionEnd + 1, numRowsData)
        sectionBounds.append((sectionBegin, sectionEnd))
        sectionBegin = sectionEnd
    if (sectionBegin < numRowsData):
        # Group the remaining masses in another cluster section
        sectionBounds.append((sectionBegin, numRowsData))
    # Assign a provisional mass cluster ID to every frame
    rawClusterIDs = numpy.zeros(numRowsData, dtype=float)
    currentMaxClusterID = 0
    for sectionBegin, sectionEnd in sectionBounds:
        sectionMZ = mzValues[sectionBegin : sectionEnd]
        if (len(sectionMZ) == 1):
            # Give the next cluster ID to the item and move to next
            # cluster section
            currentMaxClusterID += 1
            rawClusterIDs[sectionBegin : sectionEnd] = currentMaxClusterID
        else:
            # Perform hierarchical clustering:
            # Get maximum m/z error in current cluster (based on maximum
            # m/z). This will be the cut off for hierarchical clustering.
            currentMaxMZError = 2 * mz_delta(sectionMZ.max(), mzFixedError,
                                             mzPPMError)
            # Calculate distance between every mass in cluster section
            # (pdist() expects one observation per row)
            mzDistMatrix = distance.pdist(sectionMZ.reshape((-1, 1)))
            # Calculate linkage
            mzLinkage = hierarchy.complete(mzDistMatrix)
            # Return a list of flat cluster IDs for each mass, shifting
            # the numbers by the last assigned cluster ID
            mzClusters = hierarchy.fcluster(mzLinkage, currentMaxMZError,
                                            'distance') + currentMaxClusterID
            rawClusterIDs[sectionBegin : sectionEnd] = mzClusters
            # Increment the current cluster ID by the number of unique
            # clusters in the current mass section
            currentMaxClusterID += len(set(mzClusters))
    # Renumber cluster IDs based on their appearance in the dataframe:
    # the first frame gets ID 1 and the ID grows by one every time the
    # provisional ID changes. The comparison is made over the
    # provisional IDs only, never against an already renumbered value.
    startsNewCluster = rawClusterIDs[1:] != rawClusterIDs[:-1]
    clusterIDs = numpy.ones(numRowsData, dtype=float)
    clusterIDs[1:] += numpy.cumsum(startsNewCluster)
    # Add a column to dataframe where the mass cluster IDs are saved
    data['mzClusterID'] = clusterIDs
Пример #11
0
def rm_neutral_loss_frags(
        array,  # type: numpy.ndarray
        losses,  # type: pandas.DataFrame
        parameters  # type: LFParameters
):
    # type: (...) -> list[int]
    """Return the positions in 'array' of the features identified as
    neutral loss fragments.

    Every feature with an m/z equal to or greater than an m/z cut-off
    in 'losses' is considered a potential parent. For each neutral loss
    associated to that cut-off, the features whose m/z matches the
    parent's m/z minus the neutral loss and that elute at the parent's
    retention time (RT) are reported. All m/z and RT matching are
    computed within tolerance. The m/z matching uses a binary search,
    so 'array' is expected to be sorted by m/z.

    Keyword arguments:
        array      -- array with m/z, RT and index of the original
                      dataframe
        losses     -- neutral losses to subtract in order to detect
                      fragmented features ("MZCutOff" and "MZ" columns)
        parameters -- LipidFinder's PeakFilter parameters instance
    """
    mzValues = array[:, 0]
    rtValues = array[:, 1]
    numFeatures = len(array)
    # Group the neutral loss m/z values by their m/z cut-off
    lossesByCutOff = {}
    lossTable = numpy.stack((losses['MZCutOff'].values, losses['MZ'].values),
                            axis=-1)
    for cutOff, lossMZ in lossTable:
        lossesByCutOff.setdefault(cutOff, []).append(lossMZ)
    fragments = set()
    for cutOff, cutOffLosses in lossesByCutOff.items():
        # Position of the first feature with an m/z equal to or greater
        # than the m/z cut-off
        firstParent = numpy.searchsorted(mzValues, cutOff)
        for parent in range(firstParent, numFeatures):
            for lossMZ in cutOffLosses:
                # In-source fragments are the features found at the
                # parent's m/z minus the neutral loss...
                mzRange = mz_tol_range(mzValues[parent] - lossMZ,
                                       parameters['mzFixedError'],
                                       parameters['mzPPMError'])
                first, last = numpy.searchsorted(mzValues, mzRange)
                if (first == last):
                    continue
                # ...that elute at the same RT as the parent
                minRT, maxRT = rt_tol_range(rtValues[parent], RT_TOLERANCE)
                candidateRT = rtValues[first:last]
                hits = numpy.where((candidateRT >= minRT)
                                   & (candidateRT <= maxRT))[0]
                if (hits.size > 0):
                    # 'hits' is relative to the start of the m/z range.
                    # The set handles any index repetition.
                    fragments.update(set(hits + first))
    return list(fragments)