def getMutData(self, category, study, plot, cutoff):
    """Count mutation types per `category` value, save the table, optionally plot it.

    category: column to group by; 'ontology' reads from self.ontologyDF,
        anything else reads from self.rawMutationDF.
    study: name used as a folder component of the output path.
    plot: plots are produced only when `plot == True`.
    cutoff: accepted but not used by this method.
        NOTE(review): mutPlot is always called with cutoff=0 -- confirm that
        ignoring the caller's cutoff is intentional.
    """
    sourceDF = self.ontologyDF if category == 'ontology' else self.rawMutationDF

    # rows: category values, columns: mutation types, cells: occurrence counts
    counts = pd.crosstab(sourceDF[category], sourceDF['mutation_type'])
    counts['sum'] = counts.sum(axis=1)
    # presumably adds the supertype columns that the supertype plots read -- verify in helper
    counts = getMutationSupertype(counts)
    counts = counts.sort_values(by=['sum'], ascending=False)

    outDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category + '/'
    ensureDirectory(outDir)
    counts.to_csv(outDir + category + '_mutation_data.txt')

    # '== True' is kept on purpose so the set of values that enable plotting is unchanged
    if plot == True:
        # only the 50 largest rows are plotted; this mainly matters for gene-level
        # data and leaves smaller tables untouched
        top = counts.head(50).drop('sum', axis=1)
        uplot.mutPlot(top, study, category, cutoff=0, weighted=False)
        uplot.mutSupertypeSumPlot(top, study, category)
        uplot.mutSupertypeIndividualPlot(top, study, category)
def genePlot(df, study, category, subcat, cutoff, weighted):
    """Save a stacked bar chart of the per-gene table `df` for one subcategory.

    df: table with one row per gene and one column per plotted series.
    study, category, subcat: used to build the output path and the title.
    cutoff: columns whose total is below this value are left out of the chart.
    weighted: when `weighted == True` the axis/title text says
        'normalized mutation count' and 'weight' is inserted into the file names.
    """
    if weighted == True:
        metric, m = 'normalized mutation count', 'weight'
    else:
        metric, m = 'mutation count', ''

    # boolean per column: True where the column total falls below the cutoff
    belowCutoff = df.apply(lambda col: col.sum() < cutoff)
    # drop those columns, then keep at most the first 50 rows
    trimmed = df.drop(df.columns[belowCutoff], axis=1)[:50]

    axes = trimmed.plot(kind='bar',
                        stacked=True,
                        figsize=(35, 20),
                        fontsize=15,
                        color=colorList)
    axes.set_title('\n'.join([category, subcat, 'gene vs ' + metric]),
                   fontsize=15)
    axes.set_ylabel(metric, fontsize=15)
    axes.set_xlabel('gene', fontsize=15)
    # legend sits outside the axes, to the right
    axes.legend(loc='center left', bbox_to_anchor=(1, 0.5), prop={'size': 15})

    imageDir = ('/Users/Rohil/Documents/Young Dawgs/' + study + '/' +
                category + '/' + category + '_gene' + m + '_images/')
    ensureDirectory(imageDir)
    axes.get_figure().savefig(imageDir + subcat + '_gene' + m + '_image.png',
                              bbox_inches='tight')
    plt.close()
def mutSupertypeSumPlot(df, study, category):
    """Save a stacked bar chart of the four mutation supertypes per `category` value.

    df: table indexed by `category`; it must contain the columns
        'base_substitution', 'small_indel', 'chromosomal_instability' and 'fusion'.
    study, category: used to build the output path; `category` is also the
        x-axis column after the index is reset.
    """
    mutSupertypePlot = df.reset_index().plot(
        x=category,
        y=[
            'base_substitution', 'small_indel', 'chromosomal_instability',
            'fusion'
        ],
        kind='bar',
        stacked=True,
        figsize=(25, 15),
        title=category + ' vs #mutations',
        fontsize=15,
        color=['yellow', 'green', 'red', 'pink'])
    mutSupertypePlot.set_ylabel('#mutations')
    mutSupertypePlot.set_xlabel(category, size=15)
    # legend sits outside the axes, to the right
    mutSupertypePlot.legend(loc='center left',
                            bbox_to_anchor=(1, 0.5),
                            prop={'size': 15})

    outDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category + '/'
    ensureDirectory(outDir)
    mutSupertypeFig = mutSupertypePlot.get_figure()
    mutSupertypeFig.savefig(outDir + category + 'MutSupertypeSumData.png',
                            bbox_inches='tight')
    # Close (not just clear) the figure: a cleared figure stays registered with
    # pyplot, so repeated calls would otherwise accumulate open figures.
    plt.close(mutSupertypeFig)
def mutSupertypeIndividualPlot(df, study, category):
    """Save one stacked bar chart per mutation supertype, broken down by mutation type.

    For each supertype (base substitution, small indel, chromosomal
    instability) the columns of `df` that belong to it are plotted against
    `category`, with rows ordered by their total within that supertype
    (largest first). Supertypes with no matching columns are skipped.

    df: table indexed by `category`, one column per mutation type. It is not
        modified; sorting happens on a copy.
    study, category: used to build the output path; `category` is also the
        x-axis column after the index is reset.
    """
    supertypeDict = {
        'base_substitution': df.columns[df.columns.isin(baseSubList)],
        'small_indel': df.columns[df.columns.isin(smallIndelList)],
        'chromosomal_instability':
        df.columns[df.columns.isin(chromInstabilityList)],
    }

    outDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category + '/'
    ensureDirectory(outDir)

    for key, value in supertypeDict.items():
        # Nothing to draw for this supertype; move on to the next one rather
        # than abandoning the remaining supertypes.
        if df[value].empty:
            continue

        # Work on a copy so the caller's table keeps its order and columns.
        ordered = df.copy()
        # axis=1 gives one total per row; a column-wise sum would be indexed by
        # column name and would not line up with the rows.
        ordered['sum'] = ordered[value].sum(axis=1)
        ordered = ordered.sort_values(by=['sum'],
                                      ascending=False).drop('sum', axis=1)

        supertypePlot = ordered.reset_index().plot(
            x=category,
            y=list(value),
            kind='bar',
            stacked=True,
            figsize=(25, 15),
            title=category + ' vs #' + key + ' mutations',
            fontsize=15)
        supertypePlot.set_ylabel('#mutations', size=15)
        supertypePlot.set_xlabel(category, size=15)
        # legend sits outside the axes, to the right
        supertypePlot.legend(loc='center left',
                             bbox_to_anchor=(1, 0.5),
                             prop={'size': 15})

        supertypeFig = supertypePlot.get_figure()
        supertypeFig.savefig(outDir + category + '_' + key + '.png',
                             bbox_inches='tight')
        # Close (not just clear) the figure so figures do not pile up across
        # iterations and calls.
        plt.close(supertypeFig)
def assimilateRawMutationDF(self, study):
    """Build self.rawMutationDF from the point-mutation and CNA tables.

    The rows of self.pointDF and self.cnaDF are stacked, reduced to a fixed
    set of columns, sorted by gene symbol, and passed through
    combineRepeatMutations. The study's output directory is created as well.

    study: name used as a folder component of the output path.
    """
    # pd.concat is used because DataFrame.append no longer exists in current pandas.
    combined = pd.concat([self.pointDF, self.cnaDF], ignore_index=True)
    combined = combined[[
        'gene_symbol', 'pointORcna', 'mutation_type', 'amino_acid_change',
        'functional_impact_score', 'sample_id', 'study_id', 'ontology'
    ]]
    # Non-in-place sort: avoids mutating a frame derived by column selection.
    combined = combined.sort_values('gene_symbol')
    self.rawMutationDF = combineRepeatMutations(combined)
    ensureDirectory('/Users/Rohil/Documents/Young Dawgs/' + study + '/')
def getGeneData(self, category, study, plot, cutoff):
    """Write (and optionally plot) per-gene mutation tables for every value of `category`.

    For each distinct value (`subcat`) of the category column, two tables
    indexed by gene symbol are produced:
      * raw counts per mutation type, plus a 'sum' column, sorted by 'sum'
        descending;
      * per-gene mutation-type fractions (value_counts(normalize=True)), in
        the same gene order.
    Each subcategory gets its own pair of files, so one subcategory does not
    overwrite another.

    category: column to split on; 'ontology' reads from self.ontologyDF,
        anything else reads from self.rawMutationDF.
    study: name used as a folder component of the output path.
    plot: when truthy, both tables are passed to uplot.genePlot.
    cutoff: forwarded to uplot.genePlot.
    """
    # The source frame does not change between iterations, so pick it once.
    sourceDF = self.ontologyDF if category == 'ontology' else self.rawMutationDF

    outDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category + '/'
    ensureDirectory(outDir)

    for subcat in sourceDF[category].unique():
        df = sourceDF.loc[sourceDF[category] == subcat]

        geneDF = pd.crosstab(df['gene_symbol'], df['mutation_type'])
        geneDF['sum'] = geneDF.sum(axis=1)
        geneDF = geneDF.sort_values(by='sum', ascending=False)

        geneWeight = df.groupby('gene_symbol').mutation_type.value_counts(
            normalize=True)
        # missing (gene, mutation type) pairs become 0.0; rows follow geneDF's order
        geneWeightDF = geneWeight.unstack().fillna(0.0).reindex(geneDF.index)

        # The subcategory is part of the file name; a name built from the
        # category alone would be rewritten on every pass through the loop.
        filePrefix = outDir + category + '_' + str(subcat)
        geneDF.to_csv(filePrefix + '_gene_data.txt')
        geneWeightDF.to_csv(filePrefix + '_gene_weight_data.txt')

        if plot:
            uplot.genePlot(geneDF.drop('sum', axis=1),
                           study,
                           category,
                           subcat,
                           cutoff,
                           weighted=False)
            uplot.genePlot(geneWeightDF,
                           study,
                           category,
                           subcat,
                           cutoff,
                           weighted=True)
def groupByOntology(self, study):
    """Build self.ontologyDF: one row per (mutation row, ontology) pair.

    Every row of self.rawMutationDF gets the ontology list that ontologyDict
    holds for its gene symbol (or ['no_ontology_found'] when there is none),
    and is then repeated once per entry of that list. The result is stored in
    self.ontologyDF and written to '<study>/mappedDF.txt'.

    study: name used as a folder component of the output path.
    """
    # start from a copy of the raw table with a fresh 0..n-1 index
    self.ontologyDF = self.rawMutationDF.reset_index(drop=True)

    # look up the ontology list for each gene symbol
    self.ontologyDF['ontology'] = self.ontologyDF['gene_symbol'].map(
        ontologyDict)

    # genes without an entry get a one-element placeholder list
    unmapped = self.ontologyDF['ontology'].isnull()
    self.ontologyDF.loc[unmapped, ['ontology']] = self.ontologyDF.loc[
        unmapped, 'ontology'].apply(lambda _: ['no_ontology_found'])

    # how many ontologies each row carries
    ontologyCounts = self.ontologyDF['ontology'].str.len()
    # the per-row lists as a numpy object array
    ontologyLists = self.ontologyDF['ontology'].values
    # each index label repeated as many times as its row has ontologies
    repeatedIndex = np.repeat(self.ontologyDF.index, ontologyCounts)

    # duplicate the rows accordingly, then overwrite the list column with the
    # flattened values so each duplicated row holds a single ontology
    self.ontologyDF = self.ontologyDF.loc[repeatedIndex]
    self.ontologyDF['ontology'] = np.concatenate(ontologyLists)

    # the index now contains repeats, so renumber it
    self.ontologyDF = self.ontologyDF.reset_index(drop=True)

    studyDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/'
    ensureDirectory(studyDir)
    self.ontologyDF.to_csv(studyDir + 'mappedDF.txt', index=False)
def getOntologyData(self, category, study, plot):
    """DEPRECATED... NO LONGER IN USE (TENTATIVE).

    For every distinct value of `category`, tabulates mutation types per
    ontology (raw counts, and counts divided by the number of distinct genes
    in the ontology), writes both tables, and optionally plots them.

    category: column whose values define the subcategories.
    study: name used as a folder component of the output path.
    plot: plots are produced only when `plot == True`.

    NOTE(review): subcategories are taken from self.rawMutationDF while rows
    are filtered from self.ontologyDF -- confirm both carry the same values.
    NOTE(review): the output file names do not include the subcategory, so
    each iteration writes to the same two files.
    """
    outDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category + '/'

    for subcat in self.rawMutationDF[category].unique():
        df = self.ontologyDF.loc[self.ontologyDF[category] == subcat]

        # raw counts: ontology x mutation type, largest row total first
        ontologyDF = pd.crosstab(df['ontology'], df['mutation_type'])
        ontologyDF['sum'] = ontologyDF.sum(axis=1)
        ontologyDF = ontologyDF.sort_values(by='sum', ascending=False)

        # weighted counts: divide by the number of distinct genes per ontology
        ontologyGrouped = df.groupby('ontology')
        ontologyWeight = (ontologyGrouped['mutation_type'].value_counts() /
                          ontologyGrouped['gene_symbol'].nunique())
        ontologyWeightDF = ontologyWeight.unstack().fillna(0.0)
        ontologyWeightDF['sum'] = ontologyWeightDF.sum(axis=1)
        ontologyWeightDF = ontologyWeightDF.sort_values(by='sum',
                                                        ascending=False)

        ensureDirectory(outDir)
        ontologyDF.to_csv(outDir + category + '_ontology_data.txt')
        ontologyWeightDF.to_csv(outDir + category +
                                '_ontology_weight_data.txt')

        if plot == True:
            # the helper 'sum' column is not part of the plotted data
            uplot.ontologyPlot(ontologyDF.drop('sum', axis=1), study,
                               category, subcat)
            uplot.ontologyWeightPlot(ontologyWeightDF.drop('sum', axis=1),
                                     study, category, subcat)
def getMutWeightData(self, category, study, plot, cutoff):
    """Compute normalized mutation-type counts per `category` value, save and optionally plot.

    Counts of each mutation type within a category value are divided by the
    number of distinct genes (for 'ontology') or distinct samples (for any
    other category) in that value.

    category: column to group by; 'ontology' reads from self.ontologyDF,
        anything else reads from self.rawMutationDF.
    study: name used as a folder component of the output path.
    plot: the plot is produced only when `plot == True`.
    cutoff: forwarded to uplot.mutPlot.
    """
    # pick the table to group and the column whose distinct count is the divisor
    if category == 'ontology':
        grouped = self.ontologyDF.groupby(category)
        divisorColumn = 'gene_symbol'
    else:
        grouped = self.rawMutationDF.groupby(category)
        divisorColumn = 'sample_id'

    mutWeight = (grouped['mutation_type'].value_counts() /
                 grouped[divisorColumn].nunique())

    # one row per category value, one column per mutation type; a NaN here
    # means the category value had no mutation of that type, hence 0.0
    mutWeightDF = mutWeight.unstack().fillna(0.0)
    mutWeightDF['sum'] = mutWeightDF.sum(axis=1)
    mutWeightDF = mutWeightDF.sort_values(by='sum', ascending=False)

    outDir = '/Users/Rohil/Documents/Young Dawgs/' + study + '/' + category + '/'
    ensureDirectory(outDir)
    mutWeightDF.to_csv(outDir + category + '_mut_weight_data.txt')

    if plot == True:
        # the helper 'sum' column is not part of the plotted data
        uplot.mutPlot(mutWeightDF.drop('sum', axis=1),
                      study,
                      category,
                      cutoff,
                      weighted=True)