Exemplo n.º 1
0
def msms_per_intensity_histogram():
    """
    Example of how to make a histogram of the MS/MS spectra per intensity.

    Reads example_files/input/mzML.csv (a csv file of spectra with an 'ms level' column and a
    'base peak intensity' column), keeps the rows with an ms level of 2 or higher, takes the
    log10 of their base peak intensities and writes a histogram of those values to
    example_files/output/msms_per_intensity_histogram.png. The histogram is drawn from a plain
    vector of values, so with little tweaking this approach can be used any time that you can
    make a vector out of your data.

    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.histogram}

    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv', head=True, sep='\\t', na='N/A')
    >>> precursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro >= 2, True)
    >>> intensityVector = precursorDataframe[rFunctions.index(precursorDataframe, 'base.peak.intensity')]
    >>> logIntensityVector = rFunctions.takeLog(intensityVector, 10)
    >>> plots = rPlots.Plots()
    >>> plots.histogram('example_files/output/msms_per_intensity_histogram.png', logIntensityVector, width=400, height=400,
    ...                 title='test #features per intensity', xlab = 'log10 of intensity', ylab = '# of test features')
    """
    inputCsv = 'example_files/input/mzML.csv'
    outputPng = 'example_files/output/msms_per_intensity_histogram.png'

    # Load the spectra table. The file is read as tab separated (sep), with head=True and with
    # 'N/A' passed as the na argument; see the rFunctions.readCsvFile documentation for the
    # other arguments it accepts.
    spectra = rFunctions.readCsvFile(inputCsv, head=True, sep='\t', na='N/A')

    # rFunctions.index looks up the position of a column by its name, which is then used to
    # select that column from the rpy2 dataframe. The 'ms level' column is addressed as
    # 'ms.level' here.
    msLevelColumn = rFunctions.index(spectra, 'ms.level')

    # .ro applies the comparison element-wise on the R side; .rx(rows, True) then keeps the
    # rows for which it holds and all of the columns. The result holds the MS/MS spectra only.
    isMsms = spectra[msLevelColumn].ro >= 2
    msmsSpectra = spectra.rx(isMsms, True)

    # Pull the base peak intensities of the MS/MS spectra out as a vector and log10 them.
    intensityColumn = rFunctions.index(msmsSpectra, 'base.peak.intensity')
    msmsIntensities = msmsSpectra[intensityColumn]
    logMsmsIntensities = rFunctions.takeLog(msmsIntensities, 10)

    # Draw the histogram. The output file and the vector are positional, the remaining
    # keyword arguments are optional (see the rPlots.Plots.histogram documentation).
    # NOTE(review): the title and ylab mention "test" and "features" although MS/MS spectra
    # are plotted here - they look like leftovers; confirm before changing them.
    plotter = rPlots.Plots()
    plotter.histogram(
        outputPng,
        logMsmsIntensities,
        width=400,
        height=400,
        title='test #features per intensity',
        xlab='log10 of intensity',
        ylab='# of test features',
    )
Exemplo n.º 2
0
def feature_per_intensity_histogram():
    """
    Example of how to make a histogram of the features per intensity.

    Takes a csv file which contains information on all the features; the important information
    are the intensities. Because every row is one feature, the features per intensity can be
    obtained by counting the occurrence of each intensity. The rows of
    example_files/input/feature.csv with an 'id' that was already seen are removed, the
    'intensity' values of the remaining (unique) features are logged with base 10 for better
    viewing, and a histogram of them is written to
    example_files/output/feature_per_intensity_histogram.png. The histogram is drawn from a
    plain vector of values, so with little tweaking this method can be used any time that you
    can make a vector out of your data.

    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.getRowsWithUniqColumn}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.histogram}

    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')                         # Read a csv file into a rpy2 dataframe with features and intensities
    >>> featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')                         # Remove the rows with redundant id's from featDataframe
    >>> featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]         # Retrieve a vector of intensities from the unique features
    >>> featLogIntensityVector = rFunctions.takeLog(featIntensityVector, 10)                              # Take the logarithm of all the values in the vector with base 10
    >>> plots = rPlots.Plots()                                                                            # Instantiate rPlots.Plots()
    >>> plots.histogram('example_files/output/feature_per_intensity_histogram.png', featLogIntensityVector, width=400, height=400,
    ...                 title='#features per intensity',  ylab = '# of features', xlab = 'intensity')     # draw a plot with 1 histogram
    """
    # Read the feature csv file into a dataframe. A feature.csv file contains at least a column
    # 'id' with the id's of all features and a column 'intensity' with the intensity of each
    # feature. A different delimiter can be given with the sep argument, e.g.
    # rFunctions.readCsvFile('example_files/input/feature.csv', sep=','); additional arguments
    # can be found in the documentation of rFunctions.readCsvFile().
    featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')

    # Remove all the rows of which the value in the column 'id' already exists.
    # featDataframeUniq is a sub-dataframe of featDataframe with only rows with unique id's.
    featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')

    # Retrieve the 'intensity' column of the de-duplicated dataframe as a vector. The column
    # has to be taken from featDataframeUniq (not from featDataframe), otherwise the features
    # with a redundant id would still be counted in the histogram.
    intensityColumn = rFunctions.index(featDataframeUniq, 'intensity')
    featIntensityVector = featDataframeUniq[intensityColumn]

    # Take the log with base 10 of all the values in featIntensityVector.
    featLogIntensityVector = rFunctions.takeLog(featIntensityVector, 10)

    # make an instance of rPlots.Plots()
    plots = rPlots.Plots()

    # Plot the histogram. First argument is the outfile, second argument is the vector for the
    # histogram; title, xlab (text under the x-axis), ylab (text next to the y-axis), width and
    # height are optional keyword arguments. For more possible arguments see the
    # rPlots.Plots.histogram documentation.
    plots.histogram('example_files/output/feature_per_intensity_histogram.png', featLogIntensityVector, width=400, height=400,
                    title='#features per intensity', ylab='# of features', xlab='intensity')
Exemplo n.º 3
0
def plot_trafoXML_plusFeatureIntensity():
    """
    Plot points for the change in retention time of the pairs in a .trafoXML file, and color
    each point according to the intensity of the feature of the original featureXML file
    that corresponds to that point.

    Use the trafoXML file named 'linear', not 'identity'. Identity is the one that linear maps
    to, so identity doesn't have any changes; when the identity file is given the function
    stops through sys.exit.

    This example script uses the following classes and functions:
        - L{elementFunctions.getItems}
        - L{rPlotGenerics.Plots}
        - L{rPlotGenerics.Plots.plot}
        - L{pyMSA.parseFeatureXML.Reader}
        - L{parseFeatureXML.Reader.getSimpleFeatureInfo}
        - L{rFunctions.takeLog}

    B{Example (see source code for additional comments):}

    Plot the shift in retention time of one of the featureXML files and color each point a
    red/blue ratio dependent on its intensity. The example assumes that collections, sys and
    cElementTree are already imported. NOTE(review): the example instantiates rPlots.Plots()
    while the function body uses rPlotGenerics.Plots() - confirm which module is current.

    >>> from pyMSA import elementFunctions
    >>> from pyMSA import rPlots
    >>> from pyMSA import parseFeatureXML
    >>> from pyMSA import rFunctions
    >>> import rpy2.robjects as R
    >>> features_C2_01 = parseFeatureXML.Reader('/homes/ndeklein/Doreen data/featureXML/JG_TiO2_C2_01.featureXML')
    >>> mzDict = {}
    >>> for feature in features_C2_01.getSimpleFeatureInfo():
    ...    mzDict[str(round(float(features_C2_01['retention time']),8))] = features_C2_01['intensity']
    >>> changeDict = collections.defaultdict(int)
    >>> for event, element in cElementTree.iterparse('/homes/ndeklein/Doreen data/trafoXML/JG_TiO2-C2_01-C2_01A-file_1.trafoXML'):
    ...    if element.tag == 'Transformation':
    ...        if elementFunctions.getItems(element)['name'] == 'identity':
    ...            sys.exit("This is the trafoXML identity file. Use the 'linear' file as input")
    ...    if element.tag == 'Pair':
    ...        retentionTime = str(round(float(elementFunctions.getItems(element)['from']),8))
    ...        changeDict[float(elementFunctions.getItems(element)['to'])-float(elementFunctions.getItems(element)['from'])] = mzDict[retentionTime]
    >>> colorPalette = R.r['colorRampPalette'](R.StrVector(['red','blue']))(10)
    >>> changeList = []
    >>> intensityList = []
    >>> for changeAndIntensity in sorted(changeDict.items()):
    ...    changeList.append(changeAndIntensity[0])
    ...    intensityList.append(changeAndIntensity[1])
    >>> for index, item in enumerate(intensityList):
    ...     intensityList[index] = colorPalette[int(rFunctions.takeLog(float(item),10)[0])]
    >>> colorVector = R.StrVector(intensityList)
    >>> floatVector = R.FloatVector(changeList)
    >>> plots = rPlots.Plots()
    >>> extraInput = {'col':colorVector,'pch':20}
    >>> plots.plot('example_files/output/test plot_trafoXML intensity.png',floatVector, width=1000, height=1000,title='Change in retention time per feature', xlab='Feature number.', ylab='change in retention time', plotArgs=extraInput)
    """
    featureXMLPath = '/homes/ndeklein/Doreen data/featureXML/JG_TiO2_C2_01.featureXML'
    trafoXMLPath = '/homes/ndeklein/Doreen data/trafoXML/JG_TiO2-C2_01-C2_01A-file_1.trafoXML'

    featureReader = parseFeatureXML.Reader(featureXMLPath)

    # Build a lookup from retention time (rounded to 8 decimals, as a string) to intensity.
    # This costs more memory than searching the feature list for every pair, but makes each
    # lookup a single dictionary access.
    # The reader itself is indexed inside the loop, not the loop variable.
    # NOTE(review): presumably the reader exposes the values of the feature that is currently
    # being iterated - confirm against parseFeatureXML.Reader.
    intensityByRetentionTime = {}
    for _feature in featureReader.getSimpleFeatureInfo():
        featureIntensity = featureReader['intensity']
        retentionTimeKey = str(round(float(featureReader['retention time']), 8))
        intensityByRetentionTime[retentionTimeKey] = featureIntensity

    # Map the retention time change ('to' minus 'from') of every Pair element to the intensity
    # of the feature found at the pair's 'from' retention time.
    # NOTE(review): pairs with exactly the same change overwrite each other, and a 'from' value
    # without a matching feature raises a KeyError.
    intensityByChange = collections.defaultdict(int)
    for _event, element in cElementTree.iterparse(trafoXMLPath):
        if element.tag == 'Transformation' and elementFunctions.getItems(element)['name'] == 'identity':
            sys.exit('This it he trafoXML identity file (see the Transformation node). There is no information in the identity file. Use the \'linear\' file as input')
        if element.tag == 'Pair':
            pairAttributes = elementFunctions.getItems(element)
            fromKey = str(round(float(pairAttributes['from']), 8))
            pairIntensity = intensityByRetentionTime[fromKey]
            intensityByChange[float(pairAttributes['to']) - float(pairAttributes['from'])] = pairIntensity

    # colorRampPalette returns an R function; calling it with 10 gives 10 colors going from
    # red to blue.
    makePalette = R.r['colorRampPalette'](R.StrVector(['red', 'blue']))
    palette = makePalette(10)

    # Order the points by their change and pick a color for each point, using the integer part
    # of the log10 of its intensity as position in the palette.
    # NOTE(review): assumes 0 <= int(log10(intensity)) < 10 - confirm for the input data.
    orderedPairs = sorted(intensityByChange.items())
    changes = [change for change, _intensity in orderedPairs]
    pointColors = [palette[int(rFunctions.takeLog(float(intensity), 10)[0])]
                   for _change, intensity in orderedPairs]

    colorVector = R.StrVector(pointColors)
    changeVector = R.FloatVector(changes)
    plotter = rPlotGenerics.Plots()
    # 'col' and 'pch' are handed to the plot call through plotArgs
    pointStyle = {'col': colorVector, 'pch': 20}
    plotter.plot(
        'example_files/output/test plot_trafoXML intensity.png',
        changeVector,
        width=1000,
        height=1000,
        title='Change in retention time per feature',
        xlab='Feature number.',
        ylab='change in retention time',
        plotArgs=pointStyle,
    )
Exemplo n.º 4
0
def plot_mapped_and_unmapped_intensities():
    """
    Plot boxplots of the log10 intensities of the mapped (aligned) and unmapped (not aligned)
    features of two featureXML files that are mapped through a trafoXML file.

    This example script uses the following classes and functions:
        - L{featureMapping.Map}
        - L{featureMapping.Map.unmappedIntensities}
        - L{featureMapping.Map.mappedIntensities}
        - L{rPlots.PlotGenerics.boxplotDataframe}
        - L{parseFeatureXML.Reader}
        - L{rFunctions.takeLog}
        - L{rFunctions.fillNA}

    B{Example:}

    NOTE(review): the example instantiates rPlots.Plots() while the function body uses
    rPlotGenerics.Plots() - confirm which module is current.

    >>> from pyMSA import featureMapping
    >>> from pyMSA import parseFeatureXML
    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> import rpy2.robjects as R
    >>> featureXML_1 = parseFeatureXML.Reader('/homes/ndeklein/Doreen data/featureXML/JG_TiO2_C2_01.featureXML')
    >>> featureXML_2 = parseFeatureXML.Reader('/homes/ndeklein/Doreen data/featureXML/JG_TiO2_C2_01A.featureXML')
    >>> featuremap = featureMapping.Map(featureXML_1, featureXML_2, '/homes/ndeklein/Doreen data/trafoXML/JG_TiO2-C2_01-C2_01A-file_1.trafoXML')
    >>> unmapped_1, unmapped_2 = featuremap.unmappedIntensities()
    >>> mapped_1, mapped_2 = featuremap.mappedIntensities()
    >>> maxLength = len(max([unmapped_1, unmapped_2, mapped_1, mapped_2], key = len))
    >>> unmappedVector_1 = rFunctions.takeLog(R.FloatVector(rFunctions.fillNA(unmapped_1, maxLength-len(unmapped_1),'na_real')),10)
    >>> unmappedVector_2 = rFunctions.takeLog(R.FloatVector(rFunctions.fillNA(unmapped_2, maxLength-len(unmapped_2), 'na_real')),10)
    >>> mappedVector_1 = rFunctions.takeLog(R.FloatVector(rFunctions.fillNA(mapped_1, maxLength-len(mapped_1),'na_real')),10)
    >>> mappedVector_2 = rFunctions.takeLog(R.FloatVector(rFunctions.fillNA(mapped_2, maxLength-len(mapped_2), 'na_real')),10)
    >>> dataDict = {'not aligned file 1':unmappedVector_1, 'not aligned file 2':unmappedVector_2,
    ...            'aligned file 1':mappedVector_1, 'aligned file 2':mappedVector_2}
    >>> dataframe = R.DataFrame(dataDict)
    >>> plots = rPlots.Plots()
    >>> plots.boxplotDataframe('example_files/output/intensity_of_mapped_and_unmapped.png', dataframe, title='Intensity of each aligned or non-aligned feature of two mapped featureXML files',
    ...                        xlab='', ylab='intensity', width=600, height=600)
    """
    # read the two featureXML files that were mapped onto each other
    firstFeatures = parseFeatureXML.Reader('/homes/ndeklein/Doreen data/featureXML/JG_TiO2_C2_01.featureXML')
    secondFeatures = parseFeatureXML.Reader('/homes/ndeklein/Doreen data/featureXML/JG_TiO2_C2_01A.featureXML')

    # the Map instance combines both readers with the trafoXML file that describes the mapping
    mapping = featureMapping.Map(firstFeatures, secondFeatures,
                                 '/homes/ndeklein/Doreen data/trafoXML/JG_TiO2-C2_01-C2_01A-file_1.trafoXML')

    # intensities of the unmapped and of the mapped features, one collection per file
    unmapped_1, unmapped_2 = mapping.unmappedIntensities()
    mapped_1, mapped_2 = mapping.mappedIntensities()

    # the columns of a dataframe need the same length, so everything is padded up to the
    # length of the longest collection
    longest = max(len(values) for values in (unmapped_1, unmapped_2, mapped_1, mapped_2))

    def paddedLogVector(values):
        # rFunctions.fillNA is asked for the number of missing entries with 'na_real' as fill
        # type; the result is turned into an R float vector and logged with base 10
        padded = rFunctions.fillNA(values, longest - len(values), 'na_real')
        return rFunctions.takeLog(R.FloatVector(padded), 10)

    unmappedVector_1 = paddedLogVector(unmapped_1)
    unmappedVector_2 = paddedLogVector(unmapped_2)
    mappedVector_1 = paddedLogVector(mapped_1)
    mappedVector_2 = paddedLogVector(mapped_2)

    # one dataframe column (and therefore one box in the plot) per vector
    columns = {
        'not aligned file 1': unmappedVector_1,
        'not aligned file 2': unmappedVector_2,
        'aligned file 1': mappedVector_1,
        'aligned file 2': mappedVector_2,
    }
    dataframe = R.DataFrame(columns)

    plotter = rPlotGenerics.Plots()
    plotter.boxplotDataframe(
        'example_files/output/intensity_of_mapped_and_unmapped.png',
        dataframe,
        title='Intensity of each aligned or non-aligned feature of two mapped featureXML files',
        xlab='',
        ylab='intensity',
        width=600,
        height=600,
    )
Exemplo n.º 5
0
def msms_per_feature_per_intensity_boxplot():
    """
    Example of how to make a boxplot of the # of MS/MS per feature per intensity.

    Reads example_files/input/feature.csv (features with an 'id' and an 'intensity' column) and
    example_files/input/feature_precursor.csv (the # of MS/MS precursors per feature), merges
    them with R's merge function, replaces the intensities of the merged dataframe with their
    rounded log10 and writes a boxplot of the '# precursors' values per rounded intensity to
    example_files/output/msms_per_feature_per_intensity_boxplot.png.

    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.getRowsWithUniqColumn}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.boxplotFormulae}

    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> import rpy2.robjects as R
    >>> featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')                                                   # Read a csv file into a rpy2 dataframe with a column containing intensities
    >>> featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')                                                   # Remove the rows with redundant id's from featDataframe
    >>> precursorPerFeatureDataframe = rFunctions.readCsvFile('example_files/input/feature_precursor.csv', head=True, sep='\\t')    # Read a csv file into a rpy2 dataframe with a column containing # MS/MS per feature
    >>> mergedFeatureDataframe = R.r['merge'](featDataframeUniq, precursorPerFeatureDataframe)                                      # Merge the two dataframes, so that each feature has an intensity and an #MS/MS per feature
    >>> intensityColumn = rFunctions.index(mergedFeatureDataframe, 'intensity')                                                     # Position of the 'intensity' column in the merged dataframe
    >>> mergedFeatureDataframe[intensityColumn] = R.r['round'](rFunctions.takeLog(mergedFeatureDataframe[intensityColumn], 10))     # Take the log10 and round all the values in the 'intensity' column
    >>> vector1 = mergedFeatureDataframe[rFunctions.index(mergedFeatureDataframe, 'X..precursors')]                                 # Retrieve a vector of all values in the column '# precursors'
    >>> vector2 = mergedFeatureDataframe[intensityColumn]                                                                           # Retrieve a vector of all values in the column 'intensity'
    >>> plots = rPlots.Plots()                                                                                                      # Instantiate rPlots.Plots()
    >>> plots.boxplotFormulae('example_files/output/msms_per_feature_per_intensity_boxplot.png', vector1, vector2, mergedFeatureDataframe,
    ...                 title = 'MS/MS per feature per intensity', ylab = '# of MS/MS per feature', xlab = 'Rounded log10 of intensity')
    """

    # Read the feature csv file into a dataframe. A different delimiter can be given with the
    # sep argument, e.g. rFunctions.readCsvFile('example_files/input/feature.csv', sep=',').
    # Additional arguments can be found in the documentation of rFunctions.readCsvFile()
    featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')

    # Remove all the rows from featDataframe of which the value in the column 'id' already exists. featDataframeUniq is a sub-dataframe of
    # featDataframe with only rows with unique id's.
    featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')

    # Read the tab separated csv file with the # of MS/MS precursors per feature.
    precursorPerFeatureDataframe = rFunctions.readCsvFile('example_files/input/feature_precursor.csv', head=True, sep='\t')

    # Merge precursorPerFeatureDataframe and featDataframeUniq. Because both dataframes have a column named 'id', R's merge function
    # puts the '# precursors' value of precursorPerFeatureDataframe on the row of featDataframeUniq with the same 'id'.
    mergedFeatureDataframe = R.r['merge'](featDataframeUniq, precursorPerFeatureDataframe)

    # Take the log10 of all the values in the 'intensity' column and round them to their nearest full number (because discrete values
    # are needed for the boxplot). The values are taken from the merged dataframe itself and not from featDataframeUniq: with its
    # default arguments R's merge only keeps rows that are present in both inputs and sorts the result on the key, so the rows of
    # featDataframeUniq are not guaranteed to line up with (or be as many as) the rows of mergedFeatureDataframe.
    intensityColumn = rFunctions.index(mergedFeatureDataframe, 'intensity')
    mergedFeatureDataframe[intensityColumn] = R.r['round'](rFunctions.takeLog(mergedFeatureDataframe[intensityColumn], 10))

    # Retrieves a vector of all the values in the column '# precursors' of mergedFeatureDataframe (addressed as 'X..precursors')
    vector1 = mergedFeatureDataframe[rFunctions.index(mergedFeatureDataframe, 'X..precursors')]

    # Retrieves a vector of all the (rounded log10) values in the column 'intensity' of mergedFeatureDataframe
    vector2 = mergedFeatureDataframe[intensityColumn]

    # make an instance of rPlots.Plots()
    plots = rPlots.Plots()

    # Use plots.boxplotFormulae to plot a boxplot. Because the boxplot shows the values in the column '# precursors' per value in the
    # column 'intensity', plots.boxplotFormulae is used instead of plots.boxplotDataframe. First argument is the name of the output
    # file. Second and third argument are 2 vectors which correspond to the x and y explained in rpy2's robjects_formulae
    # documentation: http://rpy.sourceforge.net/rpy2/doc-2.2/html/robjects_formulae.html. The fourth argument is the dataframe.
    # title is the title of the graph, xlab is the description that will go under the x-axis, ylab is the description that will go
    # to the y-axis; these keyword arguments are optional. For more possible arguments see the rPlots.Plots.boxplotFormulae
    # documentation.
    plots.boxplotFormulae('example_files/output/msms_per_feature_per_intensity_boxplot.png', vector1, vector2, mergedFeatureDataframe,
                          title='MS/MS per feature per intensity', ylab='# of MS/MS per feature', xlab='Rounded log10 of intensity')
Exemplo n.º 6
0
def feature_and_MSMS_per_intensity_histogram():
    """
    Example of how to make an overlapping histogram of the features and MS/MS precursors per intensity.

    Takes a feature.csv file and a mzML.csv file. It makes a vector out of the intensities of all
    the unique features in example_files/input/feature.csv and a vector of the base peak
    intensities of all the spectra with ms level >= 2 in example_files/input/mzML.csv. Both
    vectors are logged with base 10 for better viewing and drawn as two histograms with a legend
    in example_files/output/feature_and_msms_per_intensity_histogram.png. The histograms are
    drawn from plain vectors, so with little tweaking this method can be used any time that you
    can make an n amount of vectors out of your data.

    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.getRowsWithUniqColumn}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.histogram}

    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> import rpy2.robjects as R
    >>> featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')                                         # Read a csv file into a rpy2 dataframe with features and intensities
    >>> featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')                                         # Remove the rows with redundant id's from featDataframe
    >>> featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]                         # Retrieve a vector of intensities from the unique features
    >>> featLogIntensityVector = rFunctions.takeLog(featIntensityVector,10)                                               # Take the logarithm of all the values in the vector with base 10
    >>> mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv')                                            # Read a csv file into a rpy2 dataframe with precursors and intensities
    >>> precursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro >= 2, True)   # Retrieve a subset dataframe of mzmlDataframe where values in column 'ms level' >= 2
    >>> mzmlIntensityVector = precursorDataframe[rFunctions.index(precursorDataframe, 'base.peak.intensity')]             # Retrieve the intensity of all the MS/MS precursors in precursorDataframe
    >>> mzmlLogIntensityVector = rFunctions.takeLog(mzmlIntensityVector, 10)                                              # Take the logarithm with base 10 of all the values in mzmlIntensityVector
    >>> plots = rPlots.Plots()                                                                                            # Instantiate rPlots.Plots()
    >>> plots.histogram('example_files/output/feature_and_msms_per_intensity_histogram.png', featLogIntensityVector,      # draw a plot with 2 histograms and a legend
    ...                 mzmlLogIntensityVector, title='feature and MSMS per intensity', xlab='intensity',
    ...                 ylab='frequency of MS/MS and Intensity',
    ...                 legend= {'x':'topright','legend':R.StrVector(['features', 'MS/MS precursors']),
    ...                          'lty':R.IntVector([1,1]), 'lwd':R.FloatVector([2.5,2.5])})
    """

    # Read the feature csv file into a dataframe. A feature.csv file contains at least a column 'id' with the id's of all features
    # and a column 'intensity' with the intensity of each feature. A different delimiter can be given with the sep argument, e.g.
    # rFunctions.readCsvFile('example_files/input/feature.csv', sep=','). Additional arguments can be found in the documentation of
    # rFunctions.readCsvFile()
    featDataframe = rFunctions.readCsvFile('example_files/input/feature.csv')

    # Remove all the rows from featDataframe of which the value in the column 'id' already exists. featDataframeUniq is a sub-dataframe of
    # featDataframe with only rows with unique id's.
    featDataframeUniq = rFunctions.getRowsWithUniqColumn(featDataframe, 'id')

    # Retrieve the 'intensity' column of the de-duplicated dataframe as a vector. The column has to be taken from featDataframeUniq
    # (not from featDataframe), otherwise the features with a redundant id would still be counted in the histogram.
    featIntensityVector = featDataframeUniq[rFunctions.index(featDataframeUniq, 'intensity')]

    # Take the log with base 10 of all the values in featIntensityVector.
    featLogIntensityVector = rFunctions.takeLog(featIntensityVector, 10)

    # Read the spectra csv file into a dataframe. The .csv file contains at least a column 'ms level' with ms levels, a column 'id'
    # with the id's of the spectra and a column 'base peak intensity' with the base peak intensities.
    mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv')

    # Get a subset of mzmlDataframe with only rows that have a value of 2 or higher in the 'ms level' column (the rows that have an
    # MS/MS precursor). mzmlDataframe.rx and .ro are rpy2 delegators; it is possible to use these because mzmlDataframe is a
    # rpy2.robjects.DataFrame object. rFunctions.index(mzmlDataframe, 'ms.level') returns the position of the 'ms level' column
    # (addressed as 'ms.level'). The same would have been: mzmlDataframe.rx(mzmlDataframe[0].ro >= 2, True), if 'ms level' is the
    # first column of mzmlDataframe.
    precursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro >= 2, True)

    # Retrieve all the values of the column 'base.peak.intensity' from precursorDataframe as a vector. The vector contains the
    # intensities of the MS/MS spectra only (because in the previous step only 'ms level' >= 2 was retrieved)
    mzmlIntensityVector = precursorDataframe[rFunctions.index(precursorDataframe, 'base.peak.intensity')]

    # Take the log with base 10 of all the values in mzmlIntensityVector
    mzmlLogIntensityVector = rFunctions.takeLog(mzmlIntensityVector, 10)

    # make an instance of rPlots.Plots()
    plots = rPlots.Plots()

    # Arguments for the legend (see R's ?legend documentation for more). The line widths are 2.5, which is not an integer, so they
    # are passed as a FloatVector instead of an IntVector.
    legendArgs = {'x': 'topright',
                  'legend': R.StrVector(['features', 'MS/MS precursors']),
                  'lty': R.IntVector([1, 1]),
                  'lwd': R.FloatVector([2.5, 2.5])}

    # Use plots.histogram to plot 2 histograms in one figure. First argument is the outfile, second argument is the vector for one
    # of the histograms, third argument is the vector for the second histogram. title is the title of the graph, xlab is the
    # description that will go under the x-axis, ylab is the description that will go to the y-axis and legend holds the arguments
    # for the legend; these keyword arguments are optional. For more possible arguments see the rPlots.Plots.histogram documentation.
    plots.histogram('example_files/output/feature_and_msms_per_intensity_histogram.png', featLogIntensityVector, mzmlLogIntensityVector,
                    title='feature and MSMS per intensity', xlab='intensity', ylab='frequency of MS/MS and Intensity',
                    legend=legendArgs)
Exemplo n.º 7
0
def msms_and_spectrum_per_ioncurrent_histogram():
    """
    Example of how to make a histogram of the ion current per ms level (1 and 2).
    
    Takes a mzml.csv file. It makes a vector out of all the total ion currents per spectrum in example_files/input/mzML.csv, once for the rows with 
    'ms level' >= 2 and once for the rows with 'ms level' == 1. It retrieves vectors of information about the spectra, so with little tweaking this 
    method can be used any time that you can make a vector out of your data. For better viewing the log with base 10 is taken of the data. 
    The figure is written to example_files/output/msms_and_spectrum_per_ionCurrent_histogram.png.
    
    This example script uses the following functions:
      - L{rFunctions.readCsvFile}
      - L{rFunctions.index}
      - L{rFunctions.takeLog}
      - L{rPlots.Plots.histogram}
    
    
    >>> from pyMSA import rFunctions
    >>> from pyMSA import rPlots
    >>> import rpy2.robjects as R
    >>> mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv')                                              # Read a csv file into a rpy2 dataframe with ms levels and ion currents
    >>> precursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro >= 2, True)     # Retrieve a subset dataframe of mzmlDataframe where values in column 'ms level' >= 2
    >>> nonPrecursorDataframe = mzmlDataframe.rx(mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')].ro == 1, True)  # Retrieve a subset dataframe of mzmlDataframe where values in column 'ms level' == 1
    >>> precursorIoncount = precursorDataframe[rFunctions.index(precursorDataframe, 'total.ion.current')]                   # Retrieve a vector of all the data in the column 'total ion current' of precursorDataframe
    >>> nonPrecursorIoncount = nonPrecursorDataframe[rFunctions.index(nonPrecursorDataframe, 'total.ion.current')]          # Retrieve a vector of all the data in the column 'total ion current' of nonPrecursorDataframe
    >>> logPrecursorIoncount = rFunctions.takeLog(precursorIoncount, 10)                                                    # Take the logarithm of all the values in the vector with base 10
    >>> logNonPrecursorIoncount = rFunctions.takeLog(nonPrecursorIoncount, 10)                                              # Take the logarithm of all the values in the vector with base 10
    >>> plots = rPlots.Plots()                                                                                              # Instantiate rPlots.Plots()
    >>> plots.histogram('example_files/output/msms_and_spectrum_per_ionCurrent_histogram.png', logPrecursorIoncount, 
    ...                 logNonPrecursorIoncount, title='ion current for ms level 1 and ms level 2', 
    ...                 xlab='ion current', ylab='frequency of spectrum or MS/MS precursor', 
    ...                 legend= {'x':'topright','legend':R.StrVector(['ms level 1', 'ms level 2']), 'lty':R.IntVector([1,1]), 
    ...                         'lwd':R.FloatVector([2.5,2.5])})

    """
    # Reading in a csv file, separated by tabs, into a dataFrame. The .csv file contains at least a column 'ms level' with ms levels and a column 
    # 'total ion current' with the total ion current per spectrum. A file that is separated by a different delimiter can be given using the
    # sep argument. So rFunctions.readCsvFile('example_files/input/mzML.csv', sep=',') would separate on commas. Additional arguments can
    # be found in the documentation of rFunctions.readCsvFile().
    mzmlDataframe = rFunctions.readCsvFile('example_files/input/mzML.csv')

    # rFunctions.index(mzmlDataframe, 'ms.level') returns the number of the 'ms level' column (the space in the column name shows up as a '.' in the
    # dataframe). The column is looked up once and used for both subsets below.
    # The same would have been: mzmlDataframe[0], if 'ms level' is the first column of mzmlDataframe.
    msLevelColumn = mzmlDataframe[rFunctions.index(mzmlDataframe, 'ms.level')]

    # Get a subset of mzmlDataframe with only rows that have a value of 2 or higher in the 'ms level' column (the rows that have an MS/MS precursor).
    # .rx (R-style subsetting) and .ro (R-style operators) are provided by rpy2. It is possible to use these because mzmlDataframe is an rpy2 object.
    precursorDataframe = mzmlDataframe.rx(msLevelColumn.ro >= 2, True)

    # Get a subset of mzmlDataframe with only rows that have a value of exactly 1 in the 'ms level' column.
    nonPrecursorDataframe = mzmlDataframe.rx(msLevelColumn.ro == 1, True)

    # Retrieve all the values of the column 'total.ion.current' from precursorDataframe and return them as a vector. The vector precursorIoncount 
    # contains the ion currents of all the rows with 'ms level' >= 2
    precursorIoncount = precursorDataframe[rFunctions.index(precursorDataframe, 'total.ion.current')]

    # Retrieve all the values of the column 'total.ion.current' from nonPrecursorDataframe and return them as a vector. The vector nonPrecursorIoncount 
    # contains the ion currents of all the rows with 'ms level' == 1
    nonPrecursorIoncount = nonPrecursorDataframe[rFunctions.index(nonPrecursorDataframe, 'total.ion.current')]

    # Take the log with base 10 of all the values in precursorIoncount.
    logPrecursorIoncount = rFunctions.takeLog(precursorIoncount, 10)

    # Take the log with base 10 of all the values in nonPrecursorIoncount.
    logNonPrecursorIoncount = rFunctions.takeLog(nonPrecursorIoncount, 10)

    # make an instance of rPlots.Plots()
    plots = rPlots.Plots()

    # The arguments for the legend. 'lwd' (line width) has to be a FloatVector: an IntVector can only hold whole numbers, so it cannot represent 
    # the line width of 2.5. See R's ?legend documentation for more arguments to give to legend.
    # NOTE(review): the legend labels are listed as 'ms level 1', 'ms level 2' while the vectors below are passed in the order ms level >= 2, 
    # ms level 1. Confirm against rPlots.Plots.histogram that the labels end up on the right histogram.
    legendArguments = {'x':'topright',
                       'legend':R.StrVector(['ms level 1', 'ms level 2']),
                       'lty':R.IntVector([1,1]),
                       'lwd':R.FloatVector([2.5,2.5])}

    # using plots.histogram to plot 2 histograms in one figure. First argument is the outfile, second argument is the vector for one of the histograms,
    # third argument is the vector for the second histogram, title is the title of the graph, xlab is the description that will go under the x-axis,
    # ylab is the description that will go to the y-axis and legend are the arguments given to make the legend. The first 3 arguments are given 
    # positionally, the other arguments are given as keywords. For more possible arguments see the rPlots.Plots.histogram documentation.
    plots.histogram('example_files/output/msms_and_spectrum_per_ionCurrent_histogram.png', logPrecursorIoncount, logNonPrecursorIoncount, 
                    title='ion current for ms level 1 and ms level 2', 
                    xlab='ion current', ylab='frequency of spectrum or MS/MS precursor', 
                    legend=legendArguments)