def getMutData(self, category, study, plot, cutoff):
    """Count mutations of each type per `category` value, save and optionally plot.

    Args:
        category: Column to group by. 'ontology' is read from
            ``self.ontologyDF``; any other column from ``self.rawMutationDF``.
        study: Study name; used as a folder under the output root.
        plot: When equal to True, the three mutation plots are drawn.
        cutoff: Accepted for signature parity with the other ``get*Data``
            methods; not forwarded (see the note next to ``mutPlot`` below).

    Side effects:
        Writes ``<root>/<study>/<category>/<category>_mutation_data.txt``.
    """
    source = self.ontologyDF if category == 'ontology' else self.rawMutationDF

    # rows: category values, columns: mutation types, cells: occurrence counts
    counts = pd.crosstab(source[category], source['mutation_type'])
    counts['sum'] = counts.sum(axis=1)

    # getMutationSupertype is defined elsewhere; presumably it adds the
    # aggregated supertype columns the supertype plots read -- confirm.
    counts = getMutationSupertype(counts)
    counts.sort_values(by=['sum'], ascending=False, inplace=True)

    outDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category +
              '/')
    ensureDirectory(outDir)
    counts.to_csv(outDir + category + '_mutation_data.txt')

    if plot == True:
        # Keep only the top 50 rows; this only matters for gene mutation data
        # and does not affect the other (smaller) categories.
        top = counts.head(50).drop('sum', axis=1)

        # NOTE(review): the `cutoff` argument is not passed on here; the
        # unweighted plot is always drawn with cutoff=0.
        uplot.mutPlot(top, study, category, cutoff=0, weighted=False)
        uplot.mutSupertypeSumPlot(top, study, category)
        uplot.mutSupertypeIndividualPlot(top, study, category)
예제 #2
0
def genePlot(df, study, category, subcat, cutoff, weighted):
    """Save a stacked bar chart of per-gene mutation counts for one subcategory.

    Args:
        df: Table with one row per gene and one column per mutation type.
        study: Study name; used as a folder under the output root.
        category: Grouping column name; used in the title and output path.
        subcat: The value of `category` this table belongs to; used in the
            title and as the image file-name prefix.
        cutoff: Columns whose total is below this value are left out.
        weighted: When equal to True the values are labelled as normalized
            counts and the output goes to the '..._geneweight_images' folder.
    """
    if weighted == True:
        metric, m = 'normalized mutation count', 'weight'
    else:
        metric, m = 'mutation count', ''

    # Boolean per column: True where the column total falls below the cutoff.
    belowCutoff = df.apply(lambda col: col.sum() < cutoff)
    # Drop those columns, then keep only the first 50 rows (genes).
    trimmed = df.drop(df.columns[belowCutoff], axis=1)[:50]

    axes = trimmed.plot(kind='bar',
                        stacked=True,
                        figsize=(35, 20),
                        fontsize=15,
                        color=colorList)

    axes.set_title('\n'.join([category, subcat, 'gene vs ' + metric]),
                   fontsize=15)
    axes.set_ylabel(metric, fontsize=15)
    axes.set_xlabel('gene', fontsize=15)

    # Put the legend outside the axes, to the right.
    axes.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 15})

    figure = axes.get_figure()

    imageDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' +
                category + '/' + category + '_gene' + m + '_images/')
    ensureDirectory(imageDir)

    figure.savefig(imageDir + subcat + '_gene' + m + '_image.png',
                   bbox_inches='tight')

    plt.close()
예제 #3
0
def mutSupertypeSumPlot(df, study, category):
    """Save a stacked bar chart of mutation-supertype totals per `category` value.

    Args:
        df: Table whose index is named `category` (so that ``reset_index()``
            exposes it as a column) and which contains the columns
            'base_substitution', 'small_indel', 'chromosomal_instability'
            and 'fusion'.
        study: Study name; used as a folder under the output root.
        category: Name of the x-axis column and of the output sub-folder.

    Side effects:
        Writes ``<root>/<study>/<category>/<category>MutSupertypeSumData.png``.
    """
    mutSupertypePlot = df.reset_index().plot(
        x=category,
        y=[
            'base_substitution', 'small_indel', 'chromosomal_instability',
            'fusion'
        ],
        kind='bar',
        stacked=True,
        figsize=(25, 15),
        title=category + ' vs #mutations',
        fontsize=15,
        color=['yellow', 'green', 'red', 'pink'])

    mutSupertypePlot.set_ylabel('#mutations')
    mutSupertypePlot.set_xlabel(category, size=15)

    # Put the legend outside the axes, to the right.
    mutSupertypePlot.legend(loc='center left',
                            bbox_to_anchor=(1, 0.5),
                            prop={'size': 15})

    mutSupertypeFig = mutSupertypePlot.get_figure()

    outDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category +
              '/')
    ensureDirectory(outDir)

    mutSupertypeFig.savefig(outDir + category + 'MutSupertypeSumData.png',
                            bbox_inches='tight')

    # Close (not just clear) the figure: a cleared figure stays registered
    # with pyplot, so one large figure would otherwise leak per call.
    plt.close(mutSupertypeFig)
예제 #4
0
def mutSupertypeIndividualPlot(df, study, category):
    """Save one stacked bar chart per mutation supertype.

    For each supertype (base substitution, small indel, chromosomal
    instability) the columns of `df` that belong to it are selected, the rows
    are ordered by their total over those columns (largest first) and the
    result is saved as ``<root>/<study>/<category>/<category>_<supertype>.png``.

    Args:
        df: Table whose index is named `category` and whose columns are
            individual mutation types. It is not modified.
        study: Study name; used as a folder under the output root.
        category: Name of the x-axis column and of the output sub-folder.
    """
    supertypeDict = {
        'base_substitution': df.columns[df.columns.isin(baseSubList)],
        'small_indel': df.columns[df.columns.isin(smallIndelList)],
        'chromosomal_instability':
        df.columns[df.columns.isin(chromInstabilityList)]
    }

    outDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category +
              '/')
    ensureDirectory(outDir)

    for key, value in supertypeDict.items():

        # Nothing to draw for a supertype with no matching columns (or when
        # there are no rows); move on to the next supertype rather than
        # abandoning the remaining ones.
        if df[value].empty:
            continue

        # Work on a copy so the caller's frame keeps its columns and order.
        ranked = df[value].copy()
        # Row-wise total (axis=1) so every category value gets its own sum.
        ranked['sum'] = ranked.sum(axis=1)
        ranked = ranked.sort_values(by=['sum'], ascending=False)
        ranked = ranked.drop('sum', axis=1)

        supertypePlot = ranked.reset_index().plot(x=category,
                                                  y=list(value),
                                                  kind='bar',
                                                  stacked=True,
                                                  figsize=(25, 15),
                                                  title=category + ' vs #' +
                                                  key + ' mutations',
                                                  fontsize=15)

        supertypePlot.set_ylabel('#mutations', size=15)
        supertypePlot.set_xlabel(category, size=15)

        # Put the legend outside the axes, to the right.
        supertypePlot.legend(loc='center left',
                             bbox_to_anchor=(1, 0.5),
                             prop={'size': 15})

        supertypeFig = supertypePlot.get_figure()

        supertypeFig.savefig(outDir + category + '_' + key + '.png',
                             bbox_inches='tight')

        # Close the figure so large figures do not pile up across iterations.
        plt.close(supertypeFig)
    def assimilateRawMutationDF(self, study):
        """Build ``self.rawMutationDF`` from the point-mutation and CNA tables.

        ``self.pointDF`` and ``self.cnaDF`` are stacked row-wise, reduced to
        the columns used downstream, sorted by gene symbol and passed through
        ``combineRepeatMutations`` (defined elsewhere). The study's output
        folder is created as a final step.

        Args:
            study: Study name; used as a folder under the output root.
        """
        # pd.concat replaces DataFrame.append, which newer pandas releases no
        # longer provide.
        combined = pd.concat([self.pointDF, self.cnaDF], ignore_index=True)

        combined = combined[[
            'gene_symbol', 'pointORcna', 'mutation_type', 'amino_acid_change',
            'functional_impact_score', 'sample_id', 'study_id', 'ontology'
        ]]
        # Reassign instead of sorting in place: the column selection above may
        # be treated as a derived frame, and in-place edits on it can warn.
        combined = combined.sort_values('gene_symbol')

        self.rawMutationDF = combineRepeatMutations(combined)

        ensureDirectory('/Users/Rohil/Documents/Young Dawgs/' + study + '/')
    def getGeneData(self, category, study, plot, cutoff):
        """Tabulate per-gene mutation counts for every value of `category`.

        For each distinct value (subcategory) two tables are produced and
        saved: raw counts of each mutation type per gene (with a 'sum' column,
        sorted by it in descending order) and, per gene, the fraction of its
        mutations that fall into each mutation type (same gene order).

        Each subcategory is written to its own pair of files,
        ``<category>_<subcat>_gene_data.txt`` and
        ``<category>_<subcat>_gene_weight_data.txt``, inside
        ``<root>/<study>/<category>/``. Using a file name without the
        subcategory would make every iteration overwrite the previous one.

        Args:
            category: Column to split by. 'ontology' is read from
                ``self.ontologyDF``; any other column from
                ``self.rawMutationDF``.
            study: Study name; used as a folder under the output root.
            plot: When truthy, both tables are also plotted via
                ``uplot.genePlot``.
            cutoff: Forwarded to ``uplot.genePlot``.
        """
        source = (self.ontologyDF
                  if category == 'ontology' else self.rawMutationDF)

        outDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' +
                  category + '/')
        ensureDirectory(outDir)

        for subcat in source[category].unique():

            df = source.loc[source[category] == subcat]

            # A missing value (NaN/None) among the unique values never matches
            # the equality test above; skip it instead of tabulating nothing.
            if df.empty:
                continue

            geneDF = pd.crosstab(df['gene_symbol'], df['mutation_type'])
            geneDF['sum'] = geneDF.sum(axis=1)
            geneDF = geneDF.sort_values(by='sum', ascending=False)

            # Share of each mutation type within a gene; absent types -> 0.
            geneWeight = df.groupby('gene_symbol').mutation_type.value_counts(
                normalize=True)
            geneWeightDF = geneWeight.unstack().fillna(0.0)
            # Same gene order as the count table.
            geneWeightDF = geneWeightDF.reindex(geneDF.index)

            filePrefix = outDir + category + '_' + str(subcat)
            geneDF.to_csv(filePrefix + '_gene_data.txt')
            geneWeightDF.to_csv(filePrefix + '_gene_weight_data.txt')

            if plot:
                uplot.genePlot(geneDF.drop('sum', axis=1),
                               study,
                               category,
                               subcat,
                               cutoff,
                               weighted=False)
                uplot.genePlot(geneWeightDF,
                               study,
                               category,
                               subcat,
                               cutoff,
                               weighted=True)
    def groupByOntology(self, study):
        """Build ``self.ontologyDF``: one row per (mutation, ontology) pair.

        Starting from ``self.rawMutationDF``, every row gets the list of
        ontologies that ``ontologyDict`` holds for its gene symbol (or
        ``['no_ontology_found']`` when the gene is not in the dictionary). The
        row is then repeated once per list entry, with the 'ontology' column
        holding a single entry each. The result is also written to
        ``<root>/<study>/mappedDF.txt``.

        Args:
            study: Study name; used as a folder under the output root.
        """
        # Fresh frame with a clean 0..n-1 index.
        frame = self.rawMutationDF.reset_index(drop=True)

        # Look up the ontology list of each row by its gene symbol.
        frame['ontology'] = frame['gene_symbol'].map(ontologyDict)

        # Rows whose gene had no entry get a one-element placeholder list.
        missing = frame['ontology'].isnull()
        frame.loc[missing, ['ontology']] = frame.loc[
            missing, 'ontology'].apply(lambda _: ['no_ontology_found'])

        # Number of ontologies per row, and the lists themselves as an array.
        perRowCount = frame['ontology'].str.len()
        ontologyLists = frame['ontology'].values

        # Each index label repeated as many times as its row has ontologies.
        repeatedIndex = np.repeat(frame.index, perRowCount)

        # Duplicate the rows accordingly, then overwrite the list column with
        # the flattened values (same order as the repeated rows).
        frame = frame.loc[repeatedIndex]
        frame['ontology'] = np.concatenate(ontologyLists)

        # The index now contains repeats; renumber it.
        self.ontologyDF = frame.reset_index(drop=True)

        studyDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/'
        ensureDirectory(studyDir)

        self.ontologyDF.to_csv(studyDir[:-1] + '/mappedDF.txt', index=False)
    def getOntologyData(self, category, study,
                        plot):  # DEPRECATED... NO LONGER IN USE (TENTATIVE)
        """Tabulate per-ontology mutation data for every value of `category`.

        Marked as deprecated by its author. For each subcategory it builds a
        count table (ontology x mutation type) and a weighted table in which
        each count is divided by the number of distinct genes in that
        ontology. Both get a 'sum' column and are sorted by it in descending
        order before being saved.

        Args:
            category: Column whose distinct values define the subcategories.
            study: Study name; used as a folder under the output root.
            plot: When equal to True, both tables are plotted via ``uplot``.
        """
        # NOTE(review): the subcategories are taken from rawMutationDF while
        # the rows are filtered from ontologyDF -- confirm this is intended.
        for subcat in self.rawMutationDF[category].unique():

            rows = self.ontologyDF.loc[self.ontologyDF[category] == subcat]

            countTable = pd.crosstab(rows['ontology'], rows['mutation_type'])
            countTable['sum'] = countTable.sum(axis=1)
            countTable.sort_values(by='sum', ascending=False, inplace=True)

            byOntology = rows.groupby('ontology')
            typeCounts = byOntology.mutation_type.value_counts()
            genesPerOntology = byOntology.gene_symbol.nunique()

            # Missing (ontology, mutation type) pairs become 0.
            weightTable = (typeCounts / genesPerOntology).unstack().fillna(0.0)
            weightTable['sum'] = weightTable.sum(axis=1)
            weightTable.sort_values(by='sum', ascending=False, inplace=True)

            outDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' +
                      category + '/')
            ensureDirectory(outDir)

            # NOTE(review): the file names do not include `subcat`, so each
            # iteration writes to the same two files.
            countTable.to_csv(outDir + category + '_ontology_data.txt')
            weightTable.to_csv(outDir + category +
                               '_ontology_weight_data.txt')

            if plot == True:

                countTable.drop('sum', axis=1, inplace=True)
                weightTable.drop('sum', axis=1, inplace=True)

                uplot.ontologyPlot(countTable, study, category, subcat)
                uplot.ontologyWeightPlot(weightTable, study, category, subcat)
    def getMutWeightData(self, category, study, plot, cutoff):
        """Compute average mutation counts per `category` value, save and plot.

        The number of mutations of each type within a category value is
        divided by the number of distinct genes (for 'ontology') or distinct
        samples (for every other category) in that value.

        Args:
            category: Column to group by. 'ontology' is read from
                ``self.ontologyDF``; any other column from
                ``self.rawMutationDF``.
            study: Study name; used as a folder under the output root.
            plot: When equal to True, the table is plotted via
                ``uplot.mutPlot`` with ``weighted=True``.
            cutoff: Forwarded to ``uplot.mutPlot``.

        Side effects:
            Writes
            ``<root>/<study>/<category>/<category>_mut_weight_data.txt``.
        """
        # Pick the source table and the column whose distinct count is the
        # denominator of the average.
        if category == 'ontology':
            source, unitColumn = self.ontologyDF, 'gene_symbol'
        else:
            source, unitColumn = self.rawMutationDF, 'sample_id'

        grouped = source.groupby(category)
        typeCounts = grouped.mutation_type.value_counts()
        mutWeight = typeCounts / grouped[unitColumn].nunique()

        # One column per mutation type. A NaN here means the category value
        # had no mutation of that type, so it becomes 0.
        mutWeightDF = mutWeight.unstack().fillna(0.0)
        mutWeightDF['sum'] = mutWeightDF.sum(axis=1)
        mutWeightDF.sort_values(by='sum', ascending=False, inplace=True)

        outDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' +
                  category + '/')
        ensureDirectory(outDir)

        mutWeightDF.to_csv(outDir + category + '_mut_weight_data.txt')

        if plot == True:

            mutWeightDF.drop('sum', axis=1, inplace=True)

            uplot.mutPlot(mutWeightDF, study, category, cutoff, weighted=True)