def getCosmicGenes(self):
    """
    Collect the names of the causal (COSMIC) genes and their annotation.

    Reads the file configured as settings.files['causalGenesFile'] through
    InputParser.readCausalGeneFile.

    Returns:
    - cosmicGeneNames: list with the .name of the object at index 3 of every
      entry, in the order the parser returned the entries.
    - cancerTypes: dictionary mapping each of those names to the value at
      index 4 of the same entry (presumably the associated cancer type(s) --
      confirm against readCausalGeneFile). For repeated names the last entry wins.
    """
    parser = InputParser()
    geneEntries = parser.readCausalGeneFile(settings.files['causalGenesFile'])

    cosmicGeneNames = []
    cancerTypes = {}
    for entry in geneEntries:
        name = entry[3].name
        cosmicGeneNames.append(name)
        cancerTypes[name] = entry[4]

    return cosmicGeneNames, cancerTypes
def getCosmicGenes(self):
    """
    Read the causal (COSMIC) gene file and return the gene names plus a lookup table.

    Returns a tuple (cosmicGeneNames, cancerTypes):
    - cosmicGeneNames: list of gene names (entry[3].name), one per entry read
      from settings.files['causalGenesFile'].
    - cancerTypes: dictionary with the gene name as key and entry[4] as value
      (presumably the cancer type annotation of that gene -- confirm against
      InputParser.readCausalGeneFile).
    """
    #one pass over the parsed entries, keeping (name, annotation) together
    nameAndType = [
        (entry[3].name, entry[4])
        for entry in InputParser().readCausalGeneFile(
            settings.files['causalGenesFile'])
    ]

    cosmicGeneNames = [name for name, _ in nameAndType]
    #dict() keeps the last annotation when a name occurs more than once
    cancerTypes = dict(nameAndType)

    return cosmicGeneNames, cancerTypes
import random from scipy import stats path = sys.argv[2] sys.path.insert(1, path) sys.path.insert(1, 'linkSVsGenes/') import settings from inputParser import InputParser outDir = sys.argv[1] svTypes = ['DEL', 'DUP', 'INV', 'ITX'] #svTypes = ['DEL'] #get the cosmic genes cosmicGenes = InputParser().readCausalGeneFile( settings.files['causalGenesFile']) cosmicGeneNames = [] for gene in cosmicGenes: cosmicGeneNames.append(gene[3].name) nonCausalGenes = InputParser().readNonCausalGeneFile( settings.files['nonCausalGenesFile'], cosmicGenes) #In the same format as the causal genes. #Combine the genes into one set. allGenes = np.concatenate((cosmicGenes, nonCausalGenes), axis=0) bcGeneNames = [] bcGenesFile = '../data/genes/breastCancerCausalGenes.txt' #make setting with open(bcGenesFile, 'r') as inF:
def generateFrequencyScatterPlot(self, cancerTypes, pathogenicSNVCounts):
    """
    Create figures S3A+B and figure 2A, and write the significant genes for table S3.

    Parameters:
    - cancerTypes: cancer types to include in the plot. Each name is split on '_';
      the second part is used as the label in the plots.
    - pathogenicSNVCounts: dictionary with the cancer type as key, and as value a
      dictionary with each gene as key, and as value the count of high-impact SNVs
      affecting that gene in this cancer type.

    Uses self.getCosmicGenes(), self.getCorrectlyPredictedCosmicPairs() and
    self.cancerTypeNames. Each predicted pair is a string that is split on '_';
    index 0 is used as the gene, index 1 as the patient and index 2 as the SV type.

    Output (nothing is returned):
    - output/figures/figureS3A.svg and output/figures/figureS3B.svg
    - output/significantGenes.txt
    - output/figures/figure2A.svg

    NOTE(review): the nesting of this function was reconstructed from its
    statements; in particular the placement of the multiple-testing correction
    after (rather than inside) the per-cancer-type loop should be confirmed.
    """

    #Get the predicted positive SV-gene pairs (no longer cosmic-specific)
    #NOTE(review): getCosmicGenes() is called once per cancer type although its
    #arguments do not change; cosmicGeneCancerTypes is only defined if cancerTypes
    #is non-empty.
    allCosmicPairs = dict()
    for cancerType in cancerTypes:
        cosmicGeneNames, cosmicGeneCancerTypes = self.getCosmicGenes()
        correctCosmicPairs = self.getCorrectlyPredictedCosmicPairs(
            cancerType, cosmicGeneNames)
        allCosmicPairs[cancerType] = correctCosmicPairs

    #Create an order for the genes and cancer types
    cancerTypesIndex = dict()
    cosmicGenesIndex = dict()
    geneFrequencies = dict()
    geneInd = 0
    cancerTypePlotNames = []
    for cancerTypeInd in range(0, len(allCosmicPairs)):
        cancerType = list(allCosmicPairs.keys())[cancerTypeInd]
        cancerTypesIndex[cancerType] = cancerTypeInd
        #the plot label is the second '_'-separated part of the cancer type name
        splitCancerType = cancerType.split('_')
        cancerType2 = '_'.join(splitCancerType[1:2])
        cancerTypePlotNames.append(cancerType2)
        if cancerType not in geneFrequencies:
            geneFrequencies[cancerType] = dict()
        #count how often each gene occurs in the pairs of this cancer type
        for pair in allCosmicPairs[cancerType]:
            splitPair = pair.split('_')
            gene = splitPair[0]
            if gene not in cosmicGenesIndex:
                cosmicGenesIndex[gene] = geneInd
                geneInd += 1
            if gene not in geneFrequencies[cancerType]:
                geneFrequencies[cancerType][gene] = 0
            geneFrequencies[cancerType][gene] += 1

    #check distribution of genes/cosmic etc
    #the dictionaries below are used as sets (the values are always 0); the ones
    #ending in C are reset for every cancer type.
    uniqueGenes = dict()
    uniqueCosmicGenes = dict()
    uniqueSpecificGenes = dict()
    plotData = []
    plotDataAllGenes = []
    for cancerTypeInd in range(0, len(allCosmicPairs)):
        cancerType = list(allCosmicPairs.keys())[cancerTypeInd]
        #keywords that are matched (as regular expressions, case-insensitive)
        #against the cancer type annotation of a COSMIC gene
        cancerTypeNames = self.cancerTypeNames[cancerType]
        uniqueGenesC = dict()
        uniqueCosmicGenesC = dict()
        uniqueSpecificGenesC = dict()
        uniquePatients = dict()
        genesPerPatient = dict()
        for pair in allCosmicPairs[cancerType]:
            splitPair = pair.split('_')
            gene = splitPair[0]
            uniquePatients[splitPair[1]] = 0
            uniqueGenes[gene] = 0
            uniqueGenesC[gene] = 0
            #geneType is assigned but not read anywhere in this function
            geneType = 'Predicted driver gene'
            if gene in cosmicGeneCancerTypes:
                geneType = 'CGC gene'
                uniqueCosmicGenes[gene] = 0
                uniqueCosmicGenesC[gene] = 0
                for keyword in cancerTypeNames:
                    if re.search(keyword, cosmicGeneCancerTypes[gene],
                                 re.IGNORECASE):
                        geneType = 'Cancer-type specific CGC gene'
                        uniqueSpecificGenes[gene] = 0
                        uniqueSpecificGenesC[gene] = 0
            #a gene is appended once per pair, so it can occur more than once per patient
            if splitPair[1] not in genesPerPatient:
                genesPerPatient[splitPair[1]] = []
            genesPerPatient[splitPair[1]].append(gene)
        print('cancer type: ', cancerType)
        print('genes: ', len(uniqueGenesC))
        print('cosmic genes: ', len(uniqueCosmicGenesC))
        print('specific genes: ', len(uniqueSpecificGenesC))
        print(uniqueSpecificGenesC)
        print('number of patients: ', len(uniquePatients))
        #NOTE(review): raises ZeroDivisionError if a cancer type has no pairs
        print('genes per patient: ',
              len(uniqueGenesC) / len(uniquePatients))
        #the three per-patient distributions are filled but not read afterwards
        perPatientGeneDistribution = []
        perPatientCosmicGeneDistribution = []
        perPatientSCosmicGeneDistribution = []
        for patient in genesPerPatient:
            geneCount = 0
            cosmicGeneCount = 0
            sCosmicGeneCount = 0
            for gene in genesPerPatient[patient]:
                geneCount += 1
                if gene in cosmicGeneCancerTypes:
                    cosmicGeneCount += 1
                    #NOTE(review): the count goes up once per matching keyword,
                    #so a gene matching several keywords is counted several
                    #times -- confirm that this is intended.
                    for keyword in cancerTypeNames:
                        if re.search(keyword, cosmicGeneCancerTypes[gene],
                                     re.IGNORECASE):
                            sCosmicGeneCount += 1
            perPatientGeneDistribution.append(geneCount)
            perPatientCosmicGeneDistribution.append(cosmicGeneCount)
            perPatientSCosmicGeneDistribution.append(sCosmicGeneCount)
            #rows: cancer type, gene type, gene count per patient, patient
            plotDataAllGenes.append(
                [cancerType, 'Predicted driver genes', geneCount, patient])
            plotData.append(
                [cancerType, 'CGC genes', cosmicGeneCount, patient])
            plotData.append([
                cancerType, 'Cancer type-specific CGC genes',
                sCosmicGeneCount, patient
            ])
    print('total drivers: ', len(uniqueGenes))
    print('total known drivers: ', len(uniqueCosmicGenes))
    print('total specific drivers: ', len(uniqueSpecificGenes))

    #plot Fig S3A and S3B
    #S3A: CGC and cancer type-specific CGC gene counts per patient
    data = pd.DataFrame(plotData)
    data.columns = [
        'Cancer type', 'Gene type', 'Gene count per patient', 'Patient'
    ]
    v = sns.boxplot(y='Gene count per patient',
                    x='Cancer type',
                    data=data,
                    hue='Gene type',
                    palette=['#57db5f', '#5f57db'])
    plt.xticks(np.arange(0, len(cancerTypes)),
               cancerTypePlotNames,
               rotation='vertical')
    plt.tight_layout()
    plt.savefig('output/figures/figureS3A.svg')
    #S3B: all predicted driver genes per patient
    #NOTE(review): the figure is not cleared between saving S3A and drawing
    #S3B, so the second boxplot is drawn on the same current figure -- confirm
    #that this is intended.
    data = pd.DataFrame(plotDataAllGenes)
    data.columns = [
        'Cancer type', 'Gene type', 'Gene count per patient', 'Patient'
    ]
    v = sns.boxplot(y='Gene count per patient',
                    x='Cancer type',
                    data=data,
                    hue='Gene type',
                    palette=['#db5f57'])
    plt.xticks(np.arange(0, len(cancerTypes)),
               cancerTypePlotNames,
               rotation='vertical')
    plt.tight_layout()
    plt.savefig('output/figures/figureS3B.svg')

    ####Then use the same information to output figure 2A

    #instead of frequency by non-coding SVs, use number of coding events as size
    print('Calculating coding events...')
    #codingFrequency, normalizedCodingFrequency, patientCounts and
    #iterationCount are not read anywhere else in this function
    codingFrequency = dict()
    normalizedCodingFrequency = dict()
    patientCounts = dict()

    #aside from normal coding events, also sample random genes to compare to
    iterationCount = 1
    #get all genes to sample from
    causalGenes = InputParser().readCausalGeneFile(
        settings.files['causalGenesFile'])
    nonCausalGenes = InputParser().readNonCausalGeneFile(
        settings.files['nonCausalGenesFile'],
        causalGenes)  #In the same format as the causal genes.

    #Combine the genes into one set.
    allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0)
    allGeneNames = []
    for gene in allGenes:
        allGeneNames.append(gene[3].name)
    #rebuilt here, but not read again below
    cosmicGeneNames = []
    for gene in causalGenes:
        cosmicGeneNames.append(gene[3].name)
    #read the first column of the intogen compendium, skipping the first (header) line
    intogenDrivers = []
    intogenFile = '../data/genes/Compendium_Cancer_Genes.tsv'
    with open(intogenFile, 'r') as inF:
        lineCount = 0
        for line in inF:
            if lineCount < 1:
                lineCount = 1
                continue
            splitLine = line.split('\t')
            intogenDrivers.append(splitLine[0])

    #instead, sample X genes of the same set size a number of times
    #(the original comment said 10.000 times; randomSampleIterations is 100)
    #and use the mean and standard deviation of the sampled SNV counts.
    np.random.seed(1)  #fixed seed so that the random sampling is reproducible
    randomSampleIterations = 100
    #geneFrequencies is reset here; from now on it holds the z-scores of significant genes
    geneFrequencies = dict()
    nonCodingOnlyGenes = dict()
    #collected across all cancer types
    allPValues = []
    for cancerType in cancerTypes:
        #if checking results for CTCF, make sure that we can find the results
        #in the pathogenic SNV pairs data.
        if cancerType == 'HMF_Breast_CTCF':
            cancerType2 = 'HMF_Breast'
        elif cancerType == 'HMF_Colorectal_CTCF':
            cancerType2 = 'HMF_Colorectal'
        elif cancerType == 'HMF_Lung_CTCF':
            cancerType2 = 'HMF_Lung'
        else:
            #otherwise the first two '_'-separated parts are the lookup key
            splitCancerType = cancerType.split('_')
            cancerType2 = '_'.join(splitCancerType[0:2])
        nonCodingOnlyGenes[cancerType] = dict()
        geneFrequencies[cancerType] = dict()
        #the unique predicted genes of this cancer type
        trueGenes = dict()
        for pair in allCosmicPairs[cancerType]:
            splitPair = pair.split('_')
            gene = splitPair[0]
            trueGenes[gene] = 0
        randomDistribution = []
        for iteration in range(0, randomSampleIterations):
            #sample random genes of the same size.
            randomGenes = np.random.choice(allGeneNames, len(trueGenes))
            for gene in randomGenes:
                if gene in pathogenicSNVCounts[cancerType2]:
                    #genes with SNV counts that are not in the intogen list are
                    #skipped entirely (they do not contribute a value)
                    if gene not in intogenDrivers:
                        continue
                    randomDistribution.append(
                        pathogenicSNVCounts[cancerType2][gene])
                else:
                    randomDistribution.append(0)
        randomMean = np.mean(randomDistribution)
        randomStd = np.std(randomDistribution)
        #pValues is filled but not read afterwards
        pValues = []
        #allPValues = []
        seenGenes = []
        for pair in allCosmicPairs[cancerType]:
            splitPair = pair.split('_')
            patient = splitPair[1]
            svType = splitPair[2]
            gene = splitPair[0]
            #only the first pair of every gene is scored
            if gene in seenGenes:
                continue
            seenGenes.append(gene)
            score = 0
            if gene in pathogenicSNVCounts[cancerType2]:
                score = pathogenicSNVCounts[cancerType2][gene]
            else:
                #don't count duplicates, that would be more than 1 per patient
                nonCodingOnlyGenes[cancerType][gene] = 0
            #NOTE(review): randomStd is 0 when all sampled values are equal, in
            #which case this division does not give a usable z-score -- confirm
            #this cannot happen for the data used.
            z = (score - randomMean) / randomStd
            pValue = stats.norm.sf(abs(z))
            pValues.append([gene, z, pValue])
            #columns: gene, cancer type, z, p-value, score, patient
            allPValues.append(
                [gene, cancerType, z, pValue, score, patient])
        if len(allPValues) < 1:
            continue
    uncorrectedPValues = np.array(allPValues, dtype='object')
    #sort by most significant first
    uncorrectedPValues = uncorrectedPValues[np.argsort(
        uncorrectedPValues[:, 3])]
    #reject, pAdjusted, _, _ = multipletests(uncorrectedPValues[:,3], method='fdr_bh', alpha=0.1) #fdr_bh or bonferroni
    reject, pAdjusted, _, _ = multipletests(uncorrectedPValues[:, 3],
                                            method='bonferroni')
    #keep the genes that are rejected after correction and have a positive z-score
    signPatients = []
    for pValueInd in range(0, len(uncorrectedPValues[:, 3])):
        gene = uncorrectedPValues[pValueInd, 0]
        cancerType = uncorrectedPValues[pValueInd, 1]
        if reject[pValueInd] == True and uncorrectedPValues[pValueInd,
                                                             2] > 0:
            geneFrequencies[cancerType][gene] = uncorrectedPValues[
                pValueInd, 2]
            #columns: gene, z, adjusted p-value, uncorrected p-value, score, patient
            signPatients.append([
                uncorrectedPValues[pValueInd][0],
                uncorrectedPValues[pValueInd][2], pAdjusted[pValueInd],
                uncorrectedPValues[pValueInd][3],
                uncorrectedPValues[pValueInd][4],
                uncorrectedPValues[pValueInd][5]
            ])
    signPatients = np.array(signPatients, dtype='object')
    print(signPatients)
    print(signPatients.shape)
    #NOTE(review): the column indexing below fails if no gene is significant,
    #because the array is then 1-dimensional.
    cosmicCountSignificantGenes = 0
    for gene in signPatients[:, 0]:
        if gene in cosmicGeneCancerTypes:
            cosmicCountSignificantGenes += 1
    print('Number of Cosmic genes in significant genes: ',
          cosmicCountSignificantGenes)
    #sort on the adjusted p-value and keep at most 50 rows for the plot
    sortedPatients = signPatients[np.argsort(signPatients[:, 2])]
    signPatients = sortedPatients[0:50]
    print(signPatients)

    #save the significant genes to a file for table S3
    tableS3Data = []
    for row in sortedPatients:
        #find which cancer type had this gene.
        for cancerType in cancerTypes:
            for pair in allCosmicPairs[cancerType]:
                splitPair = pair.split('_')
                patient = splitPair[1]
                svType = splitPair[2]
                gene = splitPair[0]
                if gene == row[0]:
                    #gene, row[2], row[3], patient, sv type, cancer type
                    #NOTE(review): the original comment listed "uncorrected,
                    #corrected", but row[2] holds the adjusted p-value and
                    #row[3] the uncorrected one -- confirm the column order
                    #expected for table S3.
                    splitCancerType = cancerType.split('_')
                    tableS3Data.append([
                        gene, row[2], row[3], patient, svType,
                        splitCancerType[1]
                    ])
    tableS3Data = np.array(tableS3Data)
    np.savetxt('output/significantGenes.txt',
               tableS3Data,
               fmt='%s',
               delimiter='\t')

    #create the scatter plot in this order, use the frequency as point size
    genePlotIndices = dict()
    currentGenePlotIndex = 0
    plotData = []
    #plotFrequencies and pointColors are not used below
    plotFrequencies = []
    pointColors = []
    cancerTypePlotNames = []
    for cancerType in allCosmicPairs:
        splitCancerType = cancerType.split('_')
        cancerType2 = '_'.join(splitCancerType[1:2])
        cancerTypePlotNames.append(cancerType2)
        cancerTypeIndex = cancerTypesIndex[cancerType]
        cancerTypeNames = self.cancerTypeNames[cancerType]
        for pair in allCosmicPairs[cancerType]:
            splitPair = pair.split('_')
            gene = splitPair[0]
            #get frequency of this gene
            #only genes that are significant in this cancer type and that are
            #part of the (at most 50) selected rows are plotted
            if gene in geneFrequencies[
                    cancerType] and gene in signPatients[:, 0]:
                geneFrequency = geneFrequencies[cancerType][gene]
                #genes get an x-position in the order in which they are first seen
                if gene not in genePlotIndices:
                    genePlotIndices[gene] = currentGenePlotIndex
                    currentGenePlotIndex += 1
                #determine the color based on if this gene is cancer-type specific
                #edgecolors ends up in the 'color' column that is used as marker
                #style: 1 = not in COSMIC, 3 = COSMIC, 2 = cancer-type specific
                #COSMIC. facecolors is assigned but not used for the plot.
                edgecolors = 1
                facecolors = 'black'
                if gene in cosmicGeneCancerTypes:
                    facecolors = 'green'
                    edgecolors = 3
                    for keyword in cancerTypeNames:
                        if re.search(keyword, cosmicGeneCancerTypes[gene],
                                     re.IGNORECASE):
                            print('match', cancerType, gene)
                            edgecolors = 2
                            facecolors = 'red'
                #the z-score (times 500) is used as point size
                plotData.append([
                    genePlotIndices[gene], cancerTypeIndex, edgecolors,
                    geneFrequency * 500
                ])
    plotData = np.array(plotData)
    print(plotData)
    print(plotData.shape)
    data = pd.DataFrame(plotData)
    data.columns = ['Gene', 'Cancer type', 'color', 'frequency']
    #a gene occurs once per pair, so remove the repeated rows
    data = data.drop_duplicates()
    #make sure to use the same colors as in the other plots, and not skip colors because
    #not all cancer types have significant genes.
    customPalette = sns.color_palette("hls", len(cancerTypes))
    finalPalette = []
    for colorInd in range(0, len(customPalette)):
        #only keep the colors of cancer type indices that are in the plot data
        if colorInd not in plotData[:, 1]:
            continue
        else:
            finalPalette.append(customPalette[colorInd])
    plt.figure(figsize=(10, 6))
    sns.scatterplot(data=data,
                    x='Gene',
                    y='Cancer type',
                    size=data.frequency,
                    hue=data['Cancer type'],
                    legend=False,
                    style=data.color,
                    edgecolor='k',
                    sizes=(20, 300),
                    palette=finalPalette)
    plt.xticks(np.arange(0, len(genePlotIndices)),
               list(genePlotIndices.keys()),
               rotation='vertical')
    plt.yticks(np.arange(0, len(cancerTypesIndex)), cancerTypePlotNames)
    #NOTE(review): depending on the matplotlib version, plt.axes() without
    #arguments may create a new axes instead of returning the current one --
    #confirm that the grid ends up on the scatter plot.
    ax = plt.axes()
    ax.grid(which='minor', axis='y', linestyle='-')
    plt.tight_layout()
    plt.savefig('output/figures/figure2A.svg')
    plt.clf()
###parameters geneNameConversionFile = settings.files['geneNameConversionFile'] expressionFile = settings.files['normalizedExpressionFile'] outDir = sys.argv[2] randomize = sys.argv[3] #shuffle expression to get random z-scores? specificOutDir = outDir + '/tadDisruptionsZScores/' if not os.path.exists(specificOutDir): os.makedirs(specificOutDir) #For each TAD, determine which genes are there #first get all genes and their positions causalGenes = InputParser().readCausalGeneFile( settings.files['causalGenesFile']) nonCausalGenes = InputParser().readNonCausalGeneFile( settings.files['nonCausalGenesFile'], causalGenes) #In the same format as the causal genes. #Combine the genes into one set. allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0) genes = [] for gene in allGenes: genes.append([gene[0], gene[1], gene[2], gene[3].name]) genes = np.array(genes, dtype='object') #also use a map for the gene names geneNameConversionMap = dict()
def __init__(self, genes, svData):
    """
    Define the neighborhood: load every regulatory data type, attach it to the
    TADs/genes, and finally associate the effects of the SVs with the genes.

    genes: (numpy array) genes and their information. Per the original
           documentation the columns are chr, start, end, geneObject.
    svData: (numpy array) SVs and their information. Per the original
            documentation the columns are chr1, s1, e1, chr2, s2, e2,
            cancerType, sampleName, svObject.
    """

    #Step 1: read the TADs and link them to the genes (left/right TAD).
    tadPath = settings.files['tadFile']
    print("Getting TADs")
    tads = InputParser().getTADsFromFile(tadPath)
    print("original number of svs:", svData.shape)

    if settings.general['shuffleTads'] == True:
        #Shuffling is meant to give the TADs random genomic positions while keeping their length.
        tads = GenomicShuffler().shuffleTADs(tads)

    print("mapping TADs to genes")
    self.mapTADsToGenes(genes[:, 3], tads)

    #Step 2: eQTLs. These are mapped to the TADs, so that the affected elements
    #are easy to find later on when looking at disrupted TADs.
    eQTLPath = settings.files['eQTLFile']
    print("getting eQTLs")
    eQTLs = InputParser().getEQTLsFromFile(eQTLPath, genes[:, 3], self)
    tads = self.mapElementsToTads(eQTLs, tads)

    #The gene objects are added to the TADs as well.
    tads = self.mapGenesToTads(genes, tads)

    #Step 3: enhancers
    print("getting enhancers")
    enhancers = InputParser().getEnhancersFromFile(
        settings.files['enhancerFile'], genes[:, 3], self)
    tads = self.mapElementsToTads(enhancers, tads)

    #Step 4: promoters
    print("getting promoters")
    promoters = InputParser().getPromotersFromFile(
        settings.files['promoterFile'], genes[:, 3], self)
    tads = self.mapElementsToTads(promoters, tads)

    #Step 5: CpG islands
    print("Getting cpg islands")
    cpgIslands = InputParser().getCpgIslandsFromFile(settings.files['cpgFile'])
    tads = self.mapElementsToTads(cpgIslands, tads)

    #Step 6: transcription factors
    print("Getting transcription factors")
    transcriptionFactors = InputParser().getTranscriptionFactorsFromFile(
        settings.files['tfFile'])
    tads = self.mapElementsToTads(transcriptionFactors, tads)

    #Step 7 (Hi-C interactions) is disabled and therefore not loaded here.

    #Step 8: histone marks. Only the mark types that matter are used.
    print("Getting histone marks")
    histoneTypes = ['h3k4me3', 'h3k27ac', 'h3k27me3', 'h3k4me1']
    histoneFiles = [settings.files[histoneType] for histoneType in histoneTypes]
    for histonePath, histoneType in zip(histoneFiles, histoneTypes):
        histoneMarks = InputParser().getHistoneMarksFromFile(
            histonePath, histoneType)
        tads = self.mapElementsToTads(histoneMarks, tads)

    #Step 9: DNAse I hypersensitivity sites
    print("Getting DNAse I hypersensitivity sites")
    dnaseSites = InputParser().getDNAseIFromFile(settings.files['dnaseIFile'])
    tads = self.mapElementsToTads(dnaseSites, tads)

    #Step 10: chromHMM states
    print("Getting chromHMM states")
    chromHmmStates = InputParser().getChromHmmFromFile(
        settings.files['chromHmmFile'])
    tads = self.mapElementsToTads(chromHmmStates, tads)

    #Step 11: RNAPolII peaks
    print("Getting rnaPol binding sites")
    rnaPolPeaks = InputParser().getRnaPolFromFile(settings.files['rnaPolFile'])
    tads = self.mapElementsToTads(rnaPolPeaks, tads)

    #Step 12: super enhancers
    print("Getting super enhancers")
    superEnhancers = InputParser().getSuperEnhancersFromFile(
        settings.files['superEnhancerFile'])
    tads = self.mapElementsToTads(superEnhancers, tads)

    #Step 13: CTCF sites, mapped both as elements and as CTCF strength
    print("Getting ctcf sites")
    ctcfSites = InputParser().getCTCFSitesFromFile(settings.files['ctcfFile'])
    tads = self.mapElementsToTads(ctcfSites, tads)
    tads = self.mapCTCFStrengthToTads(ctcfSites, tads)

    #Final step: determine the effect of the SVs on the neighborhood/regulator set
    print("Mapping SVs to the neighborhood")
    self.mapSVsToNeighborhood(genes, svData, tads)
def __init__(self):
    """
    Create the helper objects that this instance works with and start with an empty output.
    """
    #one instance of each helper is created and stored on the object
    self.inputParser = InputParser()
    self.networkParser = NetworkParser()
    self.connection = Connection()
    #output starts as an empty string
    self.output = ""
def getBinScores(zScores, rules, cosmic, expressionCutoff, randomExpression, svType, elementType): """ Get the z-scores in each bin in the TADs. zScores (numpy array): z-scores as calculated in computeZScoresDisruptedTads.py rules (str): True or False, do we only plot genes that have an SV-gene pair identified by the rules? cosmic (str): True or False, do we only focus on COSMIC genes? expressionCutoff (str): obsolete randomExpression (str): True or False, do we use randomized z-scores (across & between patients/genes) to plot? svType (str): which SV type are we plotting for elementType (str): eQTL_se_enh: run with only SV-gene pairs that gain/lose an enhancer, eQTL or super enhancer, enh: only enhancers, promoter: only promoters, se: only super enhancers. """ splitZScores = [] allPatients = [] for zScore in zScores: splitScore = zScore[0].split("_") splitZScores.append([splitScore[0], splitScore[1], float(zScore[5])]) if splitScore[0] not in allPatients: allPatients.append(splitScore[0]) zScores = np.array(splitZScores, dtype='object') causalGenes = InputParser().readCausalGeneFile(settings.files['causalGenesFile']) nonCausalGenes = InputParser().readNonCausalGeneFile(settings.files['nonCausalGenesFile'], causalGenes) #In the same format as the causal genes. #Combine the genes into one set. allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0) if cosmic == 'True': allGenes = causalGenes causalGeneList = [] for gene in causalGenes: causalGeneList.append(gene[3].name) #then go through the TADs that are disrupted by a non-coding SV. #Get all SVs svDir = settings.files['svDir'] svData = InputParser().getSVsFromFile_hmf(svDir) #Filter for the right SV type that we make this plot for. 
filteredSVs = [] types = [] for sv in svData: if svType != 'ALL': if sv[8].svType != svType: continue svEntry = sv[0] + "_" + str(sv[1]) + "_" + str(sv[2]) + "_" + sv[3] + "_" + str(sv[4]) + "_" + str(sv[5]) + "_" + sv[8].sampleName filteredSVs.append(sv) if sv[8].svType not in types: types.append(sv[8].svType) print(types) filteredSVs = np.array(filteredSVs, dtype='object') #For each SV, determine which TAD it starts and ends in. #Keep this as a TAD pair. tadFile = settings.files['tadFile'] tadData = InputParser().getTADsFromFile(tadFile) tadPairs = dict() #keep the pair as name, and the patients as value. for sv in filteredSVs: #get the left and rightmost TAD. #if intrachromosomal, check overlap if sv[0] == sv[3]: tadChrSubsetInd = sv[0] == tadData[:,0] tadChrSubset = tadData[tadChrSubsetInd] #If the SV start is before the end of the TAD, and the SV end after the start of the TAD, the TAD is overlapped. startMatches = sv[1] <= tadChrSubset[:,2] endMatches = sv[5] >= tadChrSubset[:,1] tadMatches = tadChrSubset[startMatches * endMatches] if tadMatches.shape[0] < 2: #no matches, or overlapping just 1 TAD. continue #Get the leftmost and rightmost TADs farLeftTad = tadMatches[0] #This list is sorted farRightTad = tadMatches[tadMatches.shape[0]-1] tadPair = farLeftTad[0] + '_' + str(farLeftTad[1]) + '_' + str(farLeftTad[2]) + '_' + farRightTad[0] + '_' + str(farRightTad[1]) + '_' + str(farRightTad[2]) if tadPair not in tadPairs: tadPairs[tadPair] = [] tadPairs[tadPair].append(sv[7]) else: #if interchromosomal, determine the TAD based on breakpoints on either chromosome. tadChr1SubsetInd = sv[0] == tadData[:,0] tadChr1Subset = tadData[tadChr1SubsetInd] #If the SV start is before the end of the TAD, and the SV end after the start of the TAD, the TAD is overlapped. 
startMatches = sv[1] <= tadChr1Subset[:,2] endMatches = sv[5] >= tadChr1Subset[:,1] tadMatches = tadChr1Subset[startMatches * endMatches] if tadMatches.shape[0] < 1: #no matches continue #Get the leftmost and rightmost TADs farLeftTad = tadMatches[0] #This list is sorted #repeat for right TAD tadChr2SubsetInd = sv[0] == tadData[:,0] tadChr2Subset = tadData[tadChr2SubsetInd] #If the SV start is before the end of the TAD, and the SV end after the start of the TAD, the TAD is overlapped. startMatches = sv[1] <= tadChr2Subset[:,2] endMatches = sv[5] >= tadChr2Subset[:,1] tadMatches = tadChr2Subset[startMatches * endMatches] if tadMatches.shape[0] < 1: #no matches continue farRightTad = tadMatches[0] tadPair = farLeftTad[0] + '_' + str(farLeftTad[1]) + '_' + str(farLeftTad[2]) + '_' + farRightTad[0] + '_' + str(farRightTad[1]) + '_' + str(farRightTad[2]) if tadPair not in tadPairs: tadPairs[tadPair] = [] tadPairs[tadPair].append(sv[7]) #have an additional filter here for the TADs; if there is one TAD pair where we also see the same TAD boundary disrupted again in the same patient, but on another side, we should ignore it for now. #if the start of the left TAD is also the end of another pair, or te end of the right TAD is the start of another pair, then we should remove this pair. splitPairs = [] for pair in tadPairs: splitPair = pair.split('_') splitPairs.append([splitPair[0], int(splitPair[1]), int(splitPair[2]), splitPair[3], int(splitPair[4]), int(splitPair[5])]) splitPairs = np.array(splitPairs, dtype='object') tadPairsFiltered = dict() for pair in splitPairs: pairChrSubset = splitPairs[splitPairs[:,3] == pair[0]] pairStr = '_'.join([str(i) for i in pair]) pairPatients = tadPairs[pairStr] matched = False if pair[1] in pairChrSubset[:,5]: matchingPairs = pairChrSubset[pairChrSubset[:,5] == pair[1]] #for these matches, check if they are also disrupted in the same patient. 
for matchedPair in matchingPairs: matchedPairStr = '_'.join([str(i) for i in matchedPair]) matchedPairPatients = tadPairs[matchedPairStr] for patient in matchedPairPatients: if patient in pairPatients: #print(pair, ' has match in : ', matchedPairStr, ' patient: ', patient) matched = True if pair[5] in pairChrSubset[:,1]: matchingPairs = pairChrSubset[pairChrSubset[:,1] == pair[5]] #for these matches, check if they are also disrupted in the same patient. for matchedPair in matchingPairs: matchedPairStr = '_'.join([str(i) for i in matchedPair]) matchedPairPatients = tadPairs[matchedPairStr] for patient in matchedPairPatients: if patient in pairPatients: matched = True #print(pairStr, ' has match in : ', matchedPairStr, ' patient: ', patient) windowOverlap = False if matched == False and windowOverlap == False: if pairStr not in tadPairsFiltered: tadPairsFiltered[pairStr] = pairPatients #also use a map for the gene names, because these are different in the expression data. geneNameConversionMap = dict() geneNameConversionFile = settings.files['geneNameConversionFile'] with open(geneNameConversionFile, 'r') as inF: lineCount = 0 for line in inF: if lineCount < 1: lineCount += 1 continue line = line.strip() splitLine = line.split("\t") ensgId = splitLine[3] splitEnsgId = ensgId.split('.') #we only keep everything before the dot geneName = splitLine[4] geneNameConversionMap[splitEnsgId[0]] = geneName #always get the rules so that we can do the filter out genes overlapped by CNV amplifications that are not affected by non-coding duplications. ruleBasedCombinations = np.loadtxt(sys.argv[1] + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_', dtype='object') ruleBasedPairs = [] ruleBasedPairsSVs = [] for combination in ruleBasedCombinations: splitPair = combination[0].split('_') #use for the CNV amp check ruleBasedPairsSVs.append(splitPair[0] + '_' + splitPair[7] + '_' + splitPair[12]) #use to exclude based on rules #now check for specific features. 
#eQTLs: 0 and 26, enhancers: 1 and 27, SEs: 24, 50 if elementType == 'eQTL_se_enh': if combination[1] == '1.0' or combination[27] == '1.0' or combination[2] == '1.0' or combination[28] == '1.0' or combination[25] == '1.0' or combination[51] == '1.0': ruleBasedPairs.append(splitPair[0] + '_' + splitPair[7]) elif elementType == 'enh': #enhancers only if combination[2] == '1.0' or combination[28] == '1.0': ruleBasedPairs.append(splitPair[0] + '_' + splitPair[7]) elif elementType == 'se': #se only if combination[25] == '1.0' or combination[51] == '1.0': ruleBasedPairs.append(splitPair[0] + '_' + splitPair[7]) elif elementType == 'promoter': if combination[3] == '1.0' or combination[29] == '1.0': ruleBasedPairs.append(splitPair[0] + '_' + splitPair[7]) else: #add everything, without filter. ruleBasedPairs.append(splitPair[0] + '_' + splitPair[7]) #Collect all patients with mutations, easier in the adjacent TAds to just filter all patienst with ANY mutations witout having to go through all types individually. mutDir = sys.argv[1] + '/patientGeneMutationPairs/' #svPatients = np.load(mutDir + 'svPatients.npy', allow_pickle=True, encoding='latin1').item() snvPatients = np.load(mutDir + 'snvPatients.npy', allow_pickle=True, encoding='latin1').item() #cnvPatients = np.load(mutDir + 'cnvPatients.npy', allow_pickle=True, encoding='latin1').item() svPatientsDel = np.load(mutDir + 'svPatientsDel.npy', allow_pickle=True, encoding='latin1').item() svPatientsDup = np.load(mutDir + 'svPatientsDup.npy', allow_pickle=True, encoding='latin1').item() svPatientsInv = np.load(mutDir + 'svPatientsInv.npy', allow_pickle=True, encoding='latin1').item() svPatientsItx = np.load(mutDir + 'svPatientsItx.npy', allow_pickle=True, encoding='latin1').item() cnvPatientsDel = np.load(mutDir + 'cnvPatientsDel.npy', allow_pickle=True, encoding='latin1').item() cnvPatientsAmp = np.load(mutDir + 'cnvPatientsAmp.npy', allow_pickle=True, encoding='latin1').item() bins = 10 #have 10 on each side. 
binZScores = dict() for binInd in range(0, bins*2): if binInd not in binZScores: binZScores[binInd] = [] binZScoresPerPatient = dict() for patient in allPatients: binZScoresPerPatient[patient] = dict() for binInd in range(0, bins*2): binZScoresPerPatient[patient][binInd] = [] perTadPositivePatients = dict() #now for each TAD, get the z-scores for tad in tadPairs: perTadPositivePatients[tad] = [] splitTad = tad.split('_') #Make a mapping for positions to the right bin. #determine the size and how large each bin should be binSizeTad1 = (float(splitTad[2]) - float(splitTad[1])) / bins currentStart = float(splitTad[1]) #start at the TAD start #currentStart = float(splitTad[1]) - offset binStartsTad1 = [currentStart] #list at which position each bin should start. for binInd in range(0, bins): currentStart += binSizeTad1 binStartsTad1.append(currentStart) #repeat for TAD 2 binSizeTad2 = (float(splitTad[5]) - float(splitTad[4])) / bins #binSizeTad2 = ((float(splitTad[5]) + offset) - float(splitTad[4])) / bins currentStart = float(splitTad[4]) #start at the TAD start binStartsTad2 = [currentStart] #list at which position each bin should start. for binInd in range(0, bins): currentStart += binSizeTad2 binStartsTad2.append(currentStart) #Go through the genes of the first TAD; find the genes that will be in this bin geneChrSubset = allGenes[allGenes[:,0] == splitTad[0]] for binInd in range(0, len(binStartsTad1)-1): #get the genes in this bin genes = geneChrSubset[(geneChrSubset[:,2] >= binStartsTad1[binInd]) * (geneChrSubset[:,1] <= binStartsTad1[binInd+1])] #get the z-scores of these genes allGeneZScores = [] geneZScoresPerPatient = dict() for gene in genes: geneName = gene[3].name if geneName in zScores[:,1]: #only add the gene if it has a match. 
geneZScores = zScores[zScores[:,1] == geneName] #keep the z-scores separate for each patient for patient in range(0, len(geneZScores[:,0])): if geneZScores[patient,0] not in tadPairs[tad]: continue if geneZScores[patient,0] not in perTadPositivePatients[tad]: perTadPositivePatients[tad].append(geneZScores[patient,0]) sample = geneZScores[patient,0] if rules == 'True': #if rules, do not include this gene if there was no SV linked to it in this patient. if geneName + '_' + sample not in ruleBasedPairs: continue #check cnv amp if gene[3].name in cnvPatientsAmp[sample] and gene[3].name + '_' + sample + '_DUP' not in ruleBasedPairsSVs: continue if svType == 'DEL': #only for a deletion, we do not need to print the deleted genes. #if a gene is deleted, the deletion will never result in the gain effect. #this is only true for deletions. if gene[3].name in svPatientsDel[sample] or gene[3].name in cnvPatientsDel[sample]: continue if str(float(geneZScores[patient,2])) == 'nan': continue finalScore = 0 if randomExpression == 'True': randInd = random.sample(range(0, zScores.shape[0]), 1)[0] finalScore = float(zScores[randInd,2]) else: finalScore = float(geneZScores[patient,2]) allGeneZScores.append(finalScore) print('LT: ', binInd, geneName, geneZScores[patient,0], finalScore) if len(allGeneZScores) > 0: binZScores[binInd] += allGeneZScores #now for TAD 2, start from where the TAD 1 indices left off. 
geneChrSubset = allGenes[allGenes[:,0] == splitTad[3]] for binInd in range(0, len(binStartsTad2)-1): #get the genes in this bin genes = geneChrSubset[(geneChrSubset[:,2] >= binStartsTad2[binInd]) * (geneChrSubset[:,1] <= binStartsTad2[binInd+1])] #get the z-scores of these genes allGeneZScores = [] geneZScoresPerPatient = dict() for gene in genes: geneName = gene[3].name if geneName in zScores[:,1]: geneZScores = zScores[zScores[:,1] == geneName] #keep the z-scores separate for each patient for patient in range(0, len(geneZScores[:,0])): if geneZScores[patient,0] not in tadPairs[tad]: continue if geneZScores[patient,0] not in perTadPositivePatients[tad]: perTadPositivePatients[tad].append(geneZScores[patient,0]) sample = geneZScores[patient,0] if rules == 'True': if geneName + '_' + sample not in ruleBasedPairs: continue #remove incorrect CNV amplification effects. if gene[3].name in cnvPatientsAmp[sample] and gene[3].name + '_' + sample + '_DUP' not in ruleBasedPairsSVs: continue #do the check per SV type, depending on which SV we are looking at. #this is because if we have a deletion, there could still be effects from duplications in the same TAD, because we exclude genes overlapped by duplications to see dup effects. #but for deletions, this is not relevant, and we should remove all such mutations. if svType == 'DEL': #only for a deletion, we do not need to print the deleted genes. #if a gene is deleted, the deletion will never result in the gain effect. #this is only true for deletions. 
if gene[3].name in svPatientsDel[sample] or gene[3].name in cnvPatientsDel[sample]: continue if str(float(geneZScores[patient,2])) == 'nan': continue finalScore = 0 if randomExpression == 'True': randInd = random.sample(range(0, zScores.shape[0]), 1)[0] finalScore = float(zScores[randInd,2]) else: finalScore = float(geneZScores[patient,2]) allGeneZScores.append(finalScore) print('RT: ', binInd, geneName, geneZScores[patient,0], finalScore) if len(allGeneZScores) > 0: binZScores[binInd+bins] += allGeneZScores #divide the region into 3 bins on each side. #so, get the coordinates on each side depending on where the TAD pair starts and ends #determine which genes are in these regions #add the additional bins. binZScoresOffset = dict() for binInd in range(0, 40): if binInd not in binZScoresOffset: binZScoresOffset[binInd] = [] for binInd in range(0, bins*2): binZScoresOffset[binInd+10] = binZScores[binInd] binZScoresPerPatientOffset = dict() for patient in allPatients: binZScoresPerPatientOffset[patient] = dict() for binInd in range(0, 40): binZScoresPerPatientOffset[patient][binInd] = [] for binInd in range(0, bins*2): binZScoresPerPatientOffset[patient][binInd+10] = binZScoresPerPatient[patient][binInd] #get the expression data expressionFile = settings.files['expressionFile'] expressionData = [] samples = [] with open(expressionFile, 'r') as inF: lineCount = 0 for line in inF: line = line.strip() if lineCount == 0: samples = [''] samples += line.split("\t") lineCount += 1 continue splitLine = line.split("\t") fullGeneName = splitLine[0] if fullGeneName not in geneNameConversionMap: continue geneName = geneNameConversionMap[fullGeneName] #get the gene name rather than the ENSG ID data = splitLine[1:len(splitLine)] fixedData = [geneName] fixedData += data expressionData.append(fixedData) expressionData = np.array(expressionData, dtype="object") #generate the randomized expression for the adjacent TADs, where we cannot use the z-scores anymore since these were not 
computed for the adjacent TADs. if randomExpression == 'True': from copy import deepcopy randomizedExpressionMatrices = [] shuffleIterations = 1 for i in range(0,shuffleIterations): genes = expressionData[:,0] expression = deepcopy(expressionData[:,1:]) expressionT = expression.T np.random.shuffle(expressionT) shuffledExpression = expressionT.T shuffledExpressionData = np.empty(expressionData.shape, dtype='object') shuffledExpressionData[:,0] = genes shuffledExpressionData[:,1:] = shuffledExpression randomizedExpressionMatrices.append(shuffledExpressionData) expressionData = randomizedExpressionMatrices[0] #pre-filter expression data, for the positive and negative set in the adjacent TADs. #this makes it quicker to search through filteredExpressionData = dict() for sampleInd in range(0, len(samples)): sample = samples[sampleInd] if sample == '': continue if sample not in filteredExpressionData: filteredExpressionData[sample] = dict() for row in expressionData: geneName = row[0] filteredExpressionData[sample][geneName] = float(row[sampleInd]) #Get all TADs that are affected by SVs (positive) and that are not (negative) affectedCount = 0 tadPositiveAndNegativeSet = [] with open(sys.argv[1] + '/tadDisruptionsZScores/tadPositiveAndNegativeSet.txt', 'r') as inF: for line in inF: splitLine = line.split('\t') tad = splitLine[0] positiveSet = ast.literal_eval(splitLine[1]) negativeSet = ast.literal_eval(splitLine[2]) svTypes = ast.literal_eval(splitLine[3]) if len(positiveSet) > 0: affectedCount += 1 tadPositiveAndNegativeSet.append([tad, positiveSet, negativeSet, svTypes]) tadPositiveAndNegativeSet = np.array(tadPositiveAndNegativeSet, dtype='object') print('affected tads: ', affectedCount) #so instead of looking at a region around the TADs, use the TADs that are not affected. 
#so per pair, find where it is in the positive/negative set file #get the previous or next one #check if this tad is affected or not #if the tad is not affected, add the same amount of bins as the affected tads and plot these on the left and right. for tad in tadPairs: splitTad = tad.split('_') leftTad = splitTad[0] + '_' + splitTad[1] + '_' + splitTad[2] #get the TAD to the left of this tad pair leftTadPosition = np.where(tadPositiveAndNegativeSet[:,0] == leftTad)[0] leftAdjacentTad = tadPositiveAndNegativeSet[leftTadPosition-1][0] splitLeftAdjacentTad = leftAdjacentTad[0].split('_') leftNegativeSet = leftAdjacentTad[2] splitPos = splitLeftAdjacentTad[0].split('_') if splitPos[0] != splitTad[0]: #check if the TAD is on the next chromosome continue #otherwise, divide this tad into bins, and get the z-scores of z-scores for the genes. binSizeTad1 = (float(splitLeftAdjacentTad[2]) - float(splitLeftAdjacentTad[1])) / bins currentStart = float(splitLeftAdjacentTad[1]) #start at the TAD start binStartsTad1 = [currentStart] #list at which position each bin should start. for binInd in range(0, bins): currentStart += binSizeTad1 binStartsTad1.append(currentStart) #Go through the genes of the first TAD; find the genes that will be in this bin geneChrSubset = allGenes[allGenes[:,0] == splitLeftAdjacentTad[0]] for binInd in range(0, len(binStartsTad1)-1): #get the genes in this bin genes = geneChrSubset[(geneChrSubset[:,2] >= binStartsTad1[binInd]) * (geneChrSubset[:,1] <= binStartsTad1[binInd+1])] #get the z-scores of these genes allGeneZScores = [] geneZScoresPerPatient = dict() for gene in genes: geneName = gene[3].name #get the expression of this gene in the negative set negativeExpr = [] positiveExpr = [] if geneName not in expressionData[:,0]: continue positiveSampleInd = [] negativeSampleInd = [] positivePatients = [] negativePatients = [] for sample in range(0, len(samples)): if samples[sample] == '': continue #we use the tad itself to define the positive set. 
#based on the left adjacent tad, we define the negative set. if samples[sample] in perTadPositivePatients[tad]: if samples[sample] in leftAdjacentTad[1]: #skip if this patient has a disruption of the adjacent TAD. continue #exclude this gene if it overlaps a mutation if geneName in svPatientsDel[samples[sample]] or geneName in svPatientsDup[samples[sample]] or geneName in svPatientsInv[samples[sample]] or geneName in svPatientsItx[samples[sample]] or geneName in snvPatients[samples[sample]] or geneName in cnvPatientsAmp[samples[sample]] or geneName in cnvPatientsDel[samples[sample]]: continue positiveSampleInd.append(sample) positiveExpr.append(filteredExpressionData[samples[sample]][geneName]) positivePatients.append(samples[sample]) elif samples[sample] in leftNegativeSet: #exclude this gene if it overlaps a mutation if geneName in svPatientsDel[samples[sample]] or geneName in svPatientsDup[samples[sample]] or geneName in svPatientsInv[samples[sample]] or geneName in svPatientsItx[samples[sample]] or geneName in snvPatients[samples[sample]] or geneName in cnvPatientsAmp[samples[sample]] or geneName in cnvPatientsDel[samples[sample]]: continue negativeExpr.append(filteredExpressionData[samples[sample]][geneName]) negativePatients.append(samples[sample]) negativeSampleInd.append(sample) for patientInd in range(0, len(positiveExpr)): patient = positiveExpr[patientInd] if float(np.std(negativeExpr)) == 0: continue z = (float(patient) - np.mean(negativeExpr)) / float(np.std(negativeExpr)) if str(z) == 'nan': continue print('LAT: ', binInd, geneName, positivePatients[patientInd], z) allGeneZScores.append(z) if len(allGeneZScores) > 0: #binZScoresOffset[binInd].append(np.mean(allGeneZScores)) binZScoresOffset[binInd] += allGeneZScores #repeat for right TAD rightTad = splitTad[3] + '_' + splitTad[4] + '_' + splitTad[5] #get the TAD to the left of this tad pair rightTadPosition = np.where(tadPositiveAndNegativeSet[:,0] == rightTad)[0] if rightTadPosition+1 >= 
len(tadPositiveAndNegativeSet): continue #TAD is outside the genome. rightAdjacentTad = tadPositiveAndNegativeSet[rightTadPosition+1][0] splitRightAdjacentTad = rightAdjacentTad[0].split('_') rightNegativeSet = rightAdjacentTad[2] splitPos = splitRightAdjacentTad[0].split('_') if splitPos[0] != splitTad[3]: #check if the TAD is on the next chromosome continue #otherwise, divide this tad into bins, and get the z-scores of z-scores for the genes. binSizeTad1 = (float(splitRightAdjacentTad[2]) - float(splitRightAdjacentTad[1])) / bins currentStart = float(splitRightAdjacentTad[1]) #start at the TAD start binStartsTad1 = [currentStart] #list at which position each bin should start. for binInd in range(0, bins): currentStart += binSizeTad1 binStartsTad1.append(currentStart) #Go through the genes of the first TAD; find the genes that will be in this bin geneChrSubset = allGenes[allGenes[:,0] == splitRightAdjacentTad[0]] for binInd in range(0, len(binStartsTad1)-1): #get the genes in this bin genes = geneChrSubset[(geneChrSubset[:,2] >= binStartsTad1[binInd]) * (geneChrSubset[:,1] <= binStartsTad1[binInd+1])] #get the z-scores of these genes allGeneZScores = [] geneZScoresPerPatient = dict() for gene in genes: geneName = gene[3].name #get the expression of this gene in the negative set negativeExpr = [] positiveExpr = [] if geneName not in expressionData[:,0]: continue positiveSampleInd = [] negativeSampleInd = [] positivePatients = [] negativePatients = [] for sample in range(0, len(samples)): if samples[sample] == '': continue #we use the tad itself to define the positive set. #based on the left adjacent tad, we define the negative set. if samples[sample] in perTadPositivePatients[tad]: if samples[sample] in rightAdjacentTad[1]: #skip if this patient has a disruption of the adjacent TAD. 
continue #exclude this gene if it overlaps a mutation if geneName in svPatientsDel[samples[sample]] or geneName in svPatientsDup[samples[sample]] or geneName in svPatientsInv[samples[sample]] or geneName in svPatientsItx[samples[sample]] or geneName in snvPatients[samples[sample]] or geneName in cnvPatientsAmp[samples[sample]] or geneName in cnvPatientsDel[samples[sample]]: continue positiveSampleInd.append(sample) positiveExpr.append(filteredExpressionData[samples[sample]][geneName]) positivePatients.append(samples[sample]) elif samples[sample] in rightNegativeSet: #exclude this gene if it overlaps a mutation if geneName in svPatientsDel[samples[sample]] or geneName in svPatientsDup[samples[sample]] or geneName in svPatientsInv[samples[sample]] or geneName in svPatientsItx[samples[sample]] or geneName in snvPatients[samples[sample]] or geneName in cnvPatientsAmp[samples[sample]] or geneName in cnvPatientsDel[samples[sample]]: #if geneName in svPatients[samples[sample]] or geneName in snvPatients[samples[sample]] or geneName in cnvPatients[samples[sample]]: continue negativeExpr.append(filteredExpressionData[samples[sample]][geneName]) negativePatients.append(samples[sample]) negativeSampleInd.append(sample) for patientInd in range(0, len(positiveExpr)): patient = positiveExpr[patientInd] if float(np.std(negativeExpr)) == 0: continue z = (float(patient) - np.mean(negativeExpr)) / float(np.std(negativeExpr)) if str(z) == 'nan': continue print('RAT: ', binInd, geneName, positivePatients[patientInd], z) allGeneZScores.append(z) if len(allGeneZScores) > 0: binZScoresOffset[binInd+30] += allGeneZScores return binZScoresOffset
import sys sys.path.insert(0, './jeopardy_helpers') import kmeans import graphGenerator from inputParser import InputParser import utils import nodeInserter import datetime from random import randint if __name__ == "__main__": start = datetime.datetime.now() parsedInfo = InputParser(sys.argv) info = parsedInfo.getInfo() info = graphGenerator.graphGen(info) mainInfo = info bestPath = None bestValue = 0 while True: info = kmeans.getClusters(info) info = utils.associateInsertedMap(info) info = utils.RouteInitPhase(info) info['clusterParameter'] = [1, 1.1, 1.2, 1.3][randint(0, 3)] for i in range(0, 10): info, shouldGoAhead = nodeInserter.insertNodesInPaths(info) if not shouldGoAhead: break
def generateFrequencyScatterPlot(self, allCosmicPairs, cosmicGeneCancerTypes,
                                 pathogenicSNVCounts):
    """
    Plot, per cancer type, the genes whose pathogenic SNV count is
    significantly higher than that of 100 randomly sampled genes.

    Parameters:
    - allCosmicPairs: dictionary with the cancer type as key, and as value
      the pairs of that cancer type. The gene name is taken as the part of
      each pair before the first '_'.
    - cosmicGeneCancerTypes: dictionary with the gene name as key, and as
      value the cancer type text of that gene. This text is searched with
      the keywords in self.cancerTypeNames[cancerType].
    - pathogenicSNVCounts: dictionary with the cancer type as key, and as
      value a dictionary with each gene as key and its SNV count as value.

    Also reads self.cancerTypes (the cancer types that are tested) and
    self.cancerTypeNames.

    Prints summary statistics and the significant genes, saves the plot as
    'frequency_scatter.svg' and shows it. Returns 0.
    """

    #Fix the position of each cancer type on the y-axis.
    cancerTypesIndex = {
        cancerType: cancerTypeInd
        for cancerTypeInd, cancerType in enumerate(allCosmicPairs)
    }

    #check distribution of genes/cosmic etc
    #(dictionaries are kept here so that the printed output stays the same)
    uniqueGenes = dict()
    uniqueCosmicGenes = dict()
    uniqueSpecificGenes = dict()
    for cancerType in allCosmicPairs:
        cancerTypeNames = self.cancerTypeNames[cancerType]

        uniqueGenesC = dict()
        uniqueCosmicGenesC = dict()
        uniqueSpecificGenesC = dict()

        for pair in allCosmicPairs[cancerType]:
            gene = pair.split('_')[0]

            uniqueGenes[gene] = 0
            uniqueGenesC[gene] = 0
            if gene not in cosmicGeneCancerTypes:
                continue

            uniqueCosmicGenes[gene] = 0
            uniqueCosmicGenesC[gene] = 0
            for keyword in cancerTypeNames:
                if re.search(keyword, cosmicGeneCancerTypes[gene],
                             re.IGNORECASE):
                    uniqueSpecificGenes[gene] = 0
                    uniqueSpecificGenesC[gene] = 0

        print('cancer type: ', cancerType)
        print('genes: ', len(uniqueGenesC))
        print('cosmic genes: ', len(uniqueCosmicGenesC))
        print('specific genes: ', len(uniqueSpecificGenesC))
        print(uniqueSpecificGenesC)

    print('total drivers: ', len(uniqueGenes))
    print('total known drivers: ', len(uniqueCosmicGenes))
    print('total specific drivers: ', len(uniqueSpecificGenes))

    #instead of frequency by non-coding SVs, use number of coding events as size
    print('Calculating coding events...')

    #aside from normal coding events, also sample random genes to compare to.
    #get all genes to sample from
    causalGenes = InputParser().readCausalGeneFile(
        settings.files['causalGenesFile'])
    #In the same format as the causal genes.
    nonCausalGenes = InputParser().readNonCausalGeneFile(
        settings.files['nonCausalGenesFile'], causalGenes)

    #Combine the genes into one set.
    allGenes = np.concatenate((causalGenes, nonCausalGenes), axis=0)
    allGeneNames = [gene[3].name for gene in allGenes]

    #fixed seed so that the same random genes are drawn on every run.
    #NOTE: this re-seeds the global numpy random state.
    np.random.seed(42)
    randomGenes = np.random.choice(allGeneNames, 100)

    #compute a z-score and p-value for each gene against the random genes.
    #entries are [gene, cancerType, z, pValue]
    testedGenes = []
    for cancerType in self.cancerTypes:
        snvCounts = pathogenicSNVCounts[cancerType]

        randomDistribution = [snvCounts.get(gene, 0) for gene in randomGenes]
        randomMean = np.mean(randomDistribution)
        randomStd = np.std(randomDistribution)
        if randomStd == 0:
            #no variation in the random genes, so a z-score is undefined.
            #skip, as is done for the expression z-scores elsewhere.
            continue

        for pair in allCosmicPairs[cancerType]:
            gene = pair.split('_')[0]
            score = snvCounts.get(gene, 0)

            z = (score - randomMean) / randomStd
            pValue = stats.norm.sf(abs(z))
            testedGenes.append([gene, cancerType, z, pValue])

    #adjust across cancer types, and keep the significantly *higher* genes.
    geneFrequencies = {cancerType: dict() for cancerType in self.cancerTypes}
    signPatients = []
    if testedGenes:  #multiple testing correction needs at least 1 p-value
        rawPValues = [float(entry[3]) for entry in testedGenes]
        reject, pAdjusted, _, _ = multipletests(
            rawPValues, method='bonferroni')  #fdr_bh or bonferroni

        for entry, rejected, adjusted in zip(testedGenes, reject, pAdjusted):
            gene, cancerType, z = entry[0], entry[1], entry[2]
            if rejected and z > 0:
                geneFrequencies[cancerType][gene] = z
                signPatients.append([gene, z, adjusted])

    signPatients = np.array(signPatients, dtype='object')
    print(signPatients)
    #(a debug exit() used to stop the whole program here, which made the
    #plot below unreachable.)

    #create the scatter plot in this order, use the z-score as point size
    genePlotIndices = dict()
    plotData = []
    for cancerType in allCosmicPairs:
        cancerTypeIndex = cancerTypesIndex[cancerType]
        cancerTypeNames = self.cancerTypeNames[cancerType]
        significantGenes = geneFrequencies.get(cancerType, dict())

        for pair in allCosmicPairs[cancerType]:
            gene = pair.split('_')[0]

            #only plot the genes that are significant in this cancer type
            if gene not in significantGenes:
                continue
            geneFrequency = significantGenes[gene]

            #genes get an x-position in order of first appearance
            if gene not in genePlotIndices:
                genePlotIndices[gene] = len(genePlotIndices)

            #determine the marker style: 1 = not in COSMIC, 3 = in COSMIC,
            #2 = in COSMIC and specific for this cancer type.
            markerStyle = 1
            if gene in cosmicGeneCancerTypes:
                markerStyle = 3
                for keyword in cancerTypeNames:
                    if re.search(keyword, cosmicGeneCancerTypes[gene],
                                 re.IGNORECASE):
                        print('match', cancerType, gene)
                        markerStyle = 2

            plotData.append([
                genePlotIndices[gene], cancerTypeIndex, markerStyle,
                geneFrequency * 500
            ])

    if not plotData:
        print('No significant genes to plot')
        return 0

    data = pd.DataFrame(np.array(plotData, dtype=float),
                        columns=['gene', 'cancerType', 'color', 'frequency'])
    #the same gene can be in multiple pairs of 1 cancer type
    data = data.drop_duplicates()
    print(data)

    sns.scatterplot(data=data,
                    x='gene',
                    y='cancerType',
                    size=data.frequency,
                    hue=data.cancerType,
                    legend=False,
                    style=data.color,
                    edgecolor='k',
                    sizes=(20, 300),
                    palette=sns.color_palette("hls",
                                              data.cancerType.nunique()))

    plt.xticks(np.arange(0, len(genePlotIndices)),
               list(genePlotIndices.keys()),
               rotation='vertical')
    plt.yticks(np.arange(0, len(cancerTypesIndex)),
               list(cancerTypesIndex.keys()))

    plt.tight_layout()
    plt.savefig('frequency_scatter.svg')
    plt.show()

    return 0
def plotPathogenicSVFrequency(self):
    """
    Show summary plots of the SVs of each cancer type in self.cancerTypes.

    The following plots are shown one after another:
    1. bar plot of the number of lines (pathogenic SVs) in the pathogenic
       pair file of each cancer type
    2. bar plot of the total number of SVs per cancer type
    3. bar plot of the number of unique values in column 7 of the SV data
       (presumably the sample name - verify against InputParser.getSVs_hmf)
    4. violin plot of the number of pathogenic SVs per sample
    5. stacked bar chart with the percentage of each SV type among the
       pathogenic SVs

    Reads self.cancerTypes and self.cancerTypeMetadataNames. Nothing is
    returned or written to disk.
    """

    svTypeOrder = ['DEL', 'DUP', 'INV', 'ITX']
    xPositions = np.arange(0, len(self.cancerTypes))

    def pathogenicSVFilePath(cancerType):
        #Location of the pathogenic SV-gene pairs of a cancer type.
        return 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_pathogenicPairsFeatures.txt'

    def showBarPlot(rows, valueColumn, color):
        #Show one bar per cancer type; rows are [cancerType, value].
        data = pd.DataFrame(rows, columns=['cancerType', valueColumn])
        sns.barplot(x="cancerType", y=valueColumn, data=data, color=color)
        plt.xticks(xPositions, self.cancerTypes, rotation='vertical')
        plt.tight_layout()
        plt.show()

    #1. count how many pathogenic SVs we have (1 per line)
    plotData = []
    for cancerType in self.cancerTypes:
        with open(pathogenicSVFilePath(cancerType), 'r') as inF:
            pathogenicSVCount = sum(1 for _ in inF)
        plotData.append([cancerType, pathogenicSVCount])

    showBarPlot(plotData, 'svCount', '#a2d5f2')
    #(a debug exit() used to stop the whole program here and again after
    #plot 3; these made the plots below unreachable.)

    #2 + 3. count the total number of SVs and the samples
    plotData = []
    samplePlotData = []
    for cancerType in self.cancerTypes:
        svDir = settings.files['svDir']
        svData = InputParser().getSVs_hmf(
            svDir, self.cancerTypeMetadataNames[cancerType])

        plotData.append([cancerType, svData.shape[0]])
        samplePlotData.append([cancerType, len(np.unique(svData[:, 7]))])

    showBarPlot(plotData, 'svCount', '#07689f')
    showBarPlot(samplePlotData, 'sampleCount', '#ff7e67')

    #4. read the pathogenic SV files again, now keeping the count per
    #sample and the count per SV type.
    pathogenicSVCounts = dict()
    svTypeDistribution = dict()
    plotData = []
    for cancerType in self.cancerTypes:
        pathogenicSVCounts[cancerType] = 0
        svTypeDistribution[cancerType] = {svType: 0 for svType in svTypeOrder}
        countsPerSample = dict()

        with open(pathogenicSVFilePath(cancerType), 'r') as inF:
            for line in inF:
                pathogenicSVCounts[cancerType] += 1

                #the pair is the first column. Split by '_', field 7 is
                #used as the sample and field 12 as the SV type.
                splitPair = line.split('\t')[0].split('_')
                sample = splitPair[7]
                countsPerSample[sample] = countsPerSample.get(sample, 0) + 1

                svType = splitPair[12]
                svTypeDistribution[cancerType][svType] += 1

        print(cancerType)
        print(len(countsPerSample))

        for sample in countsPerSample:
            plotData.append([
                cancerType, sample, countsPerSample[sample],
                pathogenicSVCounts[cancerType]
            ])

    data = pd.DataFrame(
        plotData, columns=['cancerType', 'sample', 'sampleCount', 'svCount'])
    sns.violinplot(data=data, x='cancerType', y='sampleCount', legend=False)

    # add N = X to show total count.
    cancerTypesWithCounts = [
        cancerType + ' (N = ' + str(pathogenicSVCounts[cancerType]) + ')'
        for cancerType in self.cancerTypes
    ]
    plt.xticks(xPositions, cancerTypesWithCounts, rotation='vertical')
    plt.ylim([0, 200])
    plt.tight_layout()
    plt.show()

    #5. make the SV type bar chart.
    #1 column per SV type, 1 row per cancer type.
    counts = pd.DataFrame({
        svType: [
            svTypeDistribution[cancerType][svType]
            for cancerType in svTypeDistribution
        ]
        for svType in svTypeOrder
    })

    # From raw value to percentage of the row total
    percentages = counts.div(counts.sum(axis=1), axis=0) * 100

    barWidth = 0.85
    svTypeColors = {
        'DEL': '#e41a1c',  #red
        'DUP': '#377eb8',  #blue
        'INV': '#4daf4a',  #green
        'ITX': '#984ea3',  #purple
    }

    #stack the bars: each SV type starts where the previous one ended.
    bottoms = np.zeros(len(percentages))
    for svType in svTypeOrder:
        heights = percentages[svType].to_numpy()
        plt.bar(xPositions,
                heights,
                bottom=bottoms,
                color=svTypeColors[svType],
                edgecolor='white',
                width=barWidth,
                label=svType)
        bottoms = bottoms + heights

    # Custom x axis
    plt.xticks(xPositions, self.cancerTypes, rotation='vertical')

    plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)
    plt.tight_layout()
    # Show graphic
    plt.show()
#maybe output the .npy files to a tmp dir to make them easier to find outFilePrefix = gainLossName + '_' + cosmicName + '_' if gainLossName != 'loss': svTypes = ['DEL', 'DUP', 'INV', 'ITX'] else: svTypes = ['INV', 'ITX'] usedSVTypes = [ ] #use this to later determine which colors need to be used in the plot in case we skip an sv type, e.g. #for cosmic when translocations are not linked to any cosmic gene. if cosmicName != 'all': #read the cosmic files to split instances into cosmic/non-cosmic. cosmicGenes = InputParser().readCausalGeneFile( settings.files['causalGenesFile']) cosmicGeneNames = [] for gene in cosmicGenes: cosmicGeneNames.append(gene[3].name) if generatePlottingData == "True": adjustedPValues = dict() allFeatureZScores = dict() svTypeInd = 0 for svType in svTypes: #define the classifiers to use (from optimization) #would be nicer if these are in 1 file somewhere, since they are also used in another script if svType == 'DEL': clf = RandomForestClassifier(random_state=785,
def plotSVStatsPanels(cancerTypes, loopType):
    """
    Plot panels A, B, C and D of Figure S2. For any other loopType than
    'TAD', only the pathogenic SV count and the relative pathogenic SV
    frequency are saved, as figure4A_A and figure4A_B.

    Parameters:
    - cancerTypes: cancerTypes to output for. Should equal output folder names
    - loopType: used to determine the output figure names. 'TAD' writes
      figureS2A-D, anything else writes figure4A_A and figure4A_B and adds
      '_CTCF' to the cancer type names on the x-axis.

    Figures are written to output/figures/. Nothing is returned.
    """

    def saveBarPlot(plotData, valueColumn, color, tadFigure, otherFigure):
        #Make 1 bar per cancer type of plotData ([cancerType, value] rows),
        #save it under the name that fits loopType (None: do not save) and
        #clear the figure again.
        data = pd.DataFrame(plotData, columns=['Cancer type', valueColumn])
        sns.barplot(data=data, x="Cancer type", y=valueColumn, color=color)
        plt.xticks(np.arange(0, len(cancerTypes)),
                   cancerTypePlotNames,
                   rotation='vertical')
        plt.tight_layout()

        figureFile = tadFigure if loopType == 'TAD' else otherFigure
        if figureFile is not None:
            plt.savefig(figureFile)
        plt.clf()

    #1. plot pathogenic SV count
    plotData = []
    cancerTypePlotNames = []
    pathogenicSVCounts = dict()  #kept as plain numbers for step 4
    for cancerType in cancerTypes:

        #only the 2nd part of the folder name is shown in the plot
        splitCancerType = cancerType.split('_')
        cancerType2 = '_'.join(splitCancerType[1:2])
        if loopType == 'TAD':
            cancerTypePlotNames.append(cancerType2)
        else:
            cancerTypePlotNames.append(cancerType2 + '_CTCF')

        #count how many pathogenic SVs we have (1 per line)
        pathogenicSVFile = 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_pathogenicPairsFeatures.txt'
        with open(pathogenicSVFile, 'r') as inF:
            pathogenicSVCount = sum(1 for _ in inF)

        pathogenicSVCounts[cancerType] = pathogenicSVCount
        plotData.append([cancerType, pathogenicSVCount])

    saveBarPlot(plotData, 'Number of pathogenic SVs', '#a2d5f2',
                'output/figures/figureS2C.svg',
                'output/figures/figure4A_A.svg')

    #2. Make plot of the total SV counts
    plotData = []
    samplePlotData = []
    for cancerType in cancerTypes:

        #count the total number of SVs
        #NOTE(review): cancerTypeMetadataNames is not defined in this
        #function, it has to be available at module level.
        svDir = settings.files['svDir']
        svData = InputParser().getSVs_hmf(svDir,
                                          cancerTypeMetadataNames[cancerType])

        plotData.append([cancerType, svData.shape[0]])
        samplePlotData.append([cancerType, len(np.unique(svData[:, 7]))])

    #the total SV and sample counts are only saved for TADs.
    saveBarPlot(plotData, 'Number of SVs', '#07689f',
                'output/figures/figureS2B.svg', None)

    #3. Make plot of the sample counts
    saveBarPlot(samplePlotData, 'Number of samples', '#ff7e67',
                'output/figures/figureS2A.svg', None)

    #4. Show the relative % of pathogenic compared to all SV-gene pairs.
    plotData = []
    for cancerType in cancerTypes:

        svGenePairFile = 'output/' + cancerType + '/linkedSVGenePairs/nonCoding_geneSVPairs.txt_'
        with open(svGenePairFile, 'r') as inF:
            svGenePairCount = sum(1 for _ in inF)

        #use the plain count, a lookup in a dataframe would give a Series
        #instead of a number.
        if svGenePairCount == 0:
            #no pairs at all, the frequency is undefined
            relativeFrequency = float('nan')
        else:
            relativeFrequency = (pathogenicSVCounts[cancerType] /
                                 float(svGenePairCount)) * 100

        plotData.append([cancerType, relativeFrequency])

    saveBarPlot(plotData, 'Relative pathogenic SV frequency', 'black',
                'output/figures/figureS2D.svg',
                'output/figures/figure4A_B.svg')