def convertToStandardInput(bedInputFilePath, writeManager: WriteManager, onlySingleBaseSubs, includeIndels):
    """Rewrite each line of a custom bed file into the standard input format.

    Every tab-separated line of the input is normalized and then handed to
    writeManager.writeData:
      - Single-base alterations arising from a purine are reverse complemented
        and their strand designation is flipped.
      - Lines with '*' in the reference or alteration column are dropped
        unless includeIndels is truthy.
      - Lines that are not single base substitutions are dropped when
        onlySingleBaseSubs is truthy; otherwise their coordinates are
        collapsed onto their center position (possibly a half position).
    """

    print("Converting custom bed file to standard bed input...")

    singleBases = ('A', 'C', 'G', 'T')
    flippedStrand = {'+': '-', '-': '+'}

    with open(bedInputFilePath, 'r') as bedInputFile:
        for rawLine in bedInputFile:

            fields = str(rawLine).strip().split('\t')

            # A single-base change listed from a purine is moved to the pyrimidine-containing strand.
            if isPurine(fields[3]) and fields[4].upper() in singleBases:
                fields[3] = reverseCompliment(fields[3])
                fields[4] = reverseCompliment(fields[4])
                # Any strand value other than '+' or '-' is left untouched.
                fields[5] = flippedStrand.get(fields[5], fields[5])

            # Indels are only kept on request.
            if (fields[3] == '*' or fields[4] == '*') and not includeIndels:
                continue

            if not isSingleBaseSubstitution(fields):
                if onlySingleBaseSubs:
                    continue
                # Center the feature so that it occupies a single nucleotide position (or half position).
                midpoint = (float(fields[1]) + float(fields[2]) - 1) / 2
                if int(midpoint) == midpoint:
                    midpoint = int(midpoint)  # Drop the decimal when the center is a whole number.
                fields[1] = str(midpoint)
                fields[2] = str(midpoint + 1)

            # Pass along the seventh column only when the line has exactly seven columns.
            columnCount = 7 if len(fields) == 7 else 6
            writeManager.writeData(*[fields[column] for column in range(columnCount)])
def copyBedData(preBedDirectory, bedDirectory, fileName):
    """Convert a gzipped, whitespace-delimited mutation file into a gzipped bed file.

    Reads preBedDirectory/fileName, builds one bed line per input line, and
    writes the result to bedDirectory/fileName.  Input columns are used as:
    0 - chromosome, 1 - 1-based position, 2 - reference base, 3 - mutated base.
    Mutations are assumed to have arisen in pyrimidines, so purine reference
    bases are reverse complemented and assigned to the '-' strand.
    """

    print("Reading from " + fileName + ".")

    # Every converted line is held here until the input has been read completely.
    bedLines = []

    with gzip.open(preBedDirectory + "/" + fileName, "r") as preBedData:
        for rawRecord in preBedData.readlines():

            # gzip hands back bytes in this mode, so decode every column.
            columns = [str(token, "utf-8") for token in rawRecord.strip().split()]

            chromosome = columns[0]
            zeroBasedStart = str(int(columns[1]) - 1)  # base-0 start (hence the "-1")
            oneBasedEnd = columns[1]

            if isPurine(columns[2]):
                mutatedFrom = reverseCompliment(columns[2])
                mutation = mutatedFrom + ">" + reverseCompliment(columns[3])
                strand = "-"
            else:
                mutatedFrom = columns[2]
                mutation = mutatedFrom + ">" + columns[3]
                strand = "+"

            bedLines.append("\t".join((chromosome, zeroBasedStart, oneBasedEnd,
                                       mutatedFrom, mutation, strand)) + "\n")

    # Write (gzipped) the bed formatted mutation data to the bed directory.
    print("Writing bed data.")
    with gzip.open(bedDirectory + "/" + fileName, "w") as bedFile:
        for bedLine in bedLines:
            bedFile.write(bedLine.encode())
def getGenomeContextCounts(genomeContextFrequencyFilePath, countPlusStrand=True, countMinusStrand=True):
    """Sum the counts of each sequence context listed in a context frequency file.

    The file is read as tab-separated text.  The first two lines are treated
    as headers and skipped; on every later line the first column is the
    context sequence and the second column is its integer count.  Blank lines
    are ignored.

    Args:
        genomeContextFrequencyFilePath: Path to the context frequency file.
        countPlusStrand: If truthy, add each line's counts to its context.
        countMinusStrand: If truthy, add each line's counts to the reverse
            complement of its context.

    Returns:
        A dict mapping context sequence to its summed counts.

    Raises:
        ValueError: If neither strand is selected.
    """

    # Make sure one strand is actually being counted.
    if not (countMinusStrand or countPlusStrand):
        raise ValueError("Error: At least one strand must be selected.")

    contextCounts = dict()  # A dictionary to store the context counts.

    with open(genomeContextFrequencyFilePath, 'r') as genomeContextFrequencyFile:
        for lineNum, line in enumerate(genomeContextFrequencyFile):

            # The first two lines are headers, so ignore them.
            if lineNum < 2:
                continue

            # Tolerate blank lines (e.g. a trailing newline) instead of failing on a missing column.
            strippedLine = line.strip()
            if not strippedLine:
                continue

            # Split once and reuse the columns.
            columns = strippedLine.split('\t')
            context = columns[0]
            counts = int(columns[1])

            # Add counts to the dictionary based on the parameters set.
            if countPlusStrand:
                contextCounts[context] = contextCounts.get(context, 0) + counts
            if countMinusStrand:
                # Only derive the reverse complement when it is actually needed.
                reverseContext = reverseCompliment(context)
                contextCounts[reverseContext] = contextCounts.get(reverseContext, 0) + counts

    return contextCounts
def generateNucleosomeMutationBackgroundFile(dyadPosContextCountsFilePath, mutationBackgroundFilePath,
                                             nucleosomeMutationBackgroundFilePath, dyadRadius, linkerOffset):
    """Write the expected mutation totals for every dyad position to a tab-separated file.

    For each dyad position, the expected value on the plus strand is the sum
    over contexts of (context's background mutation rate * context's count at
    that position); the minus strand uses the rate of the reverse complement
    context instead.  The output has one row per dyad position with the plus,
    minus, both-strand, and aligned-strand totals.
    """

    # An even context number means features sit on half positions; an odd one
    # means there is one extra valid whole position in the dyad range.
    contextIsEven = getContext(mutationBackgroundFilePath, asInt=True) % 2 == 0
    halfBaseOffset = 0.5 if contextIsEven else 0
    extraDyadPos = 0 if contextIsEven else 1

    # Every dyad position covered by the analysis, in ascending order.
    dyadPositions = [index + halfBaseOffset
                     for index in range(-dyadRadius - linkerOffset,
                                        dyadRadius + linkerOffset + extraDyadPos)]

    # Running totals of expected mutations per dyad position, one dictionary per strand.
    plusStrandExpected = dict.fromkeys(dyadPositions, 0)
    minusStrandExpected = dict.fromkeys(dyadPositions, 0)

    # Get the corresponding mutation background and context counts dictionaries.
    backgroundMutationRate = getGenomeBackgroundMutationRates(mutationBackgroundFilePath)
    dyadPosContextCounts = getDyadPosContextCounts(dyadPosContextCountsFilePath)

    # Accumulate (rate * count) for every context observed at every dyad position.
    for dyadPos in dyadPosContextCounts:
        countsAtPosition = dyadPosContextCounts[dyadPos]
        for context in countsAtPosition:
            reverseContext = reverseCompliment(context)
            plusStrandExpected[dyadPos] += backgroundMutationRate[context] * countsAtPosition[context]
            minusStrandExpected[dyadPos] += backgroundMutationRate[reverseContext] * countsAtPosition[context]

    columnNames = ("Dyad_Position", "Expected_Mutations_Plus_Strand",
                   "Expected_Mutations_Minus_Strand", "Expected_Mutations_Both_Strands",
                   "Expected_Mutations_Aligned_Strands")

    with open(nucleosomeMutationBackgroundFilePath, 'w') as nucleosomeMutationBackgroundFile:

        nucleosomeMutationBackgroundFile.write('\t'.join(columnNames) + '\n')

        for dyadPos in dyadPositions:
            plusValue = plusStrandExpected[dyadPos]
            minusValue = minusStrandExpected[dyadPos]
            # The "aligned" column pairs the plus strand with the minus strand at the mirrored position.
            rowValues = (dyadPos, plusValue, minusValue,
                         plusValue + minusValue,
                         plusValue + minusStrandExpected[-dyadPos])
            nucleosomeMutationBackgroundFile.write('\t'.join(map(str, rowValues)) + '\n')
def _advanceToEquivalentFastaEntry(fastaEntry, autoAcquireFastaIterator, choppedUpLine):
    """Read forward through the fasta iterator until the entry equivalent to choppedUpLine is reached."""
    while not equivalentEntries(fastaEntry, choppedUpLine):
        assert not autoAcquireFastaIterator.eof, (
            "Reached end of fasta file without finding a match for: ", ' '.join(choppedUpLine))
        fastaEntry = autoAcquireFastaIterator.readEntry()
    return fastaEntry


def autoAcquireAndQACheck(bedInputFilePath: str, genomeFilePath, autoAcquiredFilePath, onlySingleBaseSubs, includeIndels):
    """Validate a custom bed file, fill in auto-acquire requests, and determine its sequence context.

    Each tab-separated line is passed to checkForErrors.  A '.' in the
    reference column (index 3) or strand column (index 5) is filled in from a
    fasta file generated by bedToFasta on first need, and a '.' in the
    alteration column (index 4) becomes "OTHER".  All lines are written to a
    temporary file (bedInputFilePath + ".tmp") which replaces the input file
    if any auto-acquiring took place, and is deleted otherwise.

    Returns:
        0 if includeIndels is truthy or the counted lines disagree on the
        length of their reference column; that shared length otherwise, with
        lengths above 6 reported as float("inf").  None is returned when no
        line contributed a context (e.g. an empty file without indels).

    Raises:
        AssertionError: If a requested sequence cannot be found in the fasta
            file, or a given sequence matches neither strand of the genome.
    """

    print("Checking custom bed file for formatting and auto-acquire requests...")

    # To start, assume that no sequences need to be acquired, and do it on the fly if need be.
    autoAcquiring = False
    autoAcquiredFile = None
    autoAcquireFastaIterator = None
    fastaEntry = None
    cohortDesignationPresent = None

    # Unless indels are included, determine the context of the features in the file.
    if includeIndels:
        context = 0
    else:
        context = None

    # Get the list of acceptable chromosomes
    acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)
    acceptableChromosomesFilePath = getAcceptableChromosomes(genomeFilePath, True)

    # Create a temporary file to write the data to (potentially after auto-acquiring).
    # Will replace original file at the end if auto-acquiring occurred.
    temporaryBedFilePath = bedInputFilePath + ".tmp"

    # The try/finally guarantees the auto-acquired fasta file is closed, even if a check fails part way through.
    try:
        with open(bedInputFilePath, 'r') as bedInputFile:
            with open(temporaryBedFilePath, 'w') as temporaryBedFile:
                for line in bedInputFile:

                    choppedUpLine = str(line).strip().split('\t')

                    # The first line decides whether a seventh (cohort designation) column is expected.
                    if cohortDesignationPresent is None:
                        cohortDesignationPresent = len(choppedUpLine) == 7

                    # Check for possible error states.
                    checkForErrors(choppedUpLine, cohortDesignationPresent,
                                   acceptableChromosomes, acceptableChromosomesFilePath)

                    # If this is the first entry requiring auto-acquiring, generate the required fasta file.
                    if (not autoAcquiring and
                            (choppedUpLine[3] == '.' or choppedUpLine[4] == '.' or
                             (choppedUpLine[5] == '.' and choppedUpLine[3] != '*'))):
                        print("Found line with auto-acquire requested. Generating fasta...")
                        autoAcquiring = True
                        bedToFasta(bedInputFilePath, genomeFilePath, autoAcquiredFilePath)
                        autoAcquiredFile = open(autoAcquiredFilePath, 'r')
                        autoAcquireFastaIterator = FastaFileIterator(autoAcquiredFile)
                        fastaEntry = autoAcquireFastaIterator.readEntry()
                        print("Continuing...")

                    # Check for any base identities that need to be auto-acquired.
                    if choppedUpLine[3] == '.':
                        fastaEntry = _advanceToEquivalentFastaEntry(
                            fastaEntry, autoAcquireFastaIterator, choppedUpLine)
                        choppedUpLine[3] = fastaEntry.sequence

                    # Check for any strand designations that need to be auto-acquired.
                    # Also, make sure this isn't an insertion, in which case the strand designation cannot be determined.
                    if choppedUpLine[5] == '.' and choppedUpLine[3] != '*':
                        fastaEntry = _advanceToEquivalentFastaEntry(
                            fastaEntry, autoAcquireFastaIterator, choppedUpLine)

                        # Determine which strand is represented.
                        if fastaEntry.sequence == choppedUpLine[3]:
                            choppedUpLine[5] = '+'
                        elif fastaEntry.sequence == reverseCompliment(choppedUpLine[3]):
                            choppedUpLine[5] = '-'
                        else:
                            assert False, (
                                "The given sequence " + choppedUpLine[3] + " for location " +
                                fastaEntry.sequenceName + ' ' +
                                "does not match the corresponding sequence in the given genome, or its reverse compliment.")

                    # Change any '.' characters in the "altered to" column to "OTHER"
                    if choppedUpLine[4] == '.':
                        choppedUpLine[4] = "OTHER"

                    # Determine the sequence context of the line and whether or not it matches the context of other lines.
                    # Skip this if the file is "mixed", this line is an indel, or only single base substitutions
                    # are allowed and this line isn't one.
                    if (not context == 0 and
                            not (choppedUpLine[3] == '*' or choppedUpLine[4] == '*') and
                            (not onlySingleBaseSubs or isSingleBaseSubstitution(choppedUpLine))):
                        thisContext = len(choppedUpLine[3])
                        if context is None:
                            context = thisContext
                        elif thisContext != context:
                            context = 0

                    # Write the current line to the temporary bed file.
                    temporaryBedFile.write('\t'.join(choppedUpLine) + '\n')
    finally:
        if autoAcquiredFile is not None:
            autoAcquiredFile.close()

    # If any lines were auto-acquired, replace the input bed file with the temporary bed file. (Which has auto-acquires)
    if autoAcquiring:
        print("Overwriting custom bed input with auto-acquired bases/strand designations.")
        os.replace(temporaryBedFilePath, bedInputFilePath)
    # Otherwise, just delete the temporary file.
    else:
        os.remove(temporaryBedFilePath)

    # context is still None when no line contributed a context; comparing None to an int would raise TypeError.
    if context is not None and context > 6:
        context = float("inf")
    return context
def parseKucabCompendium(kucabSubstitutionsFilePaths: List[str], genomeFilePath, nucPosFilePath, includeAllPAHs):
    """Convert Kucab substitution files into sorted trinucleotide-context bed files.

    For every input path (each must end in "final.txt"), the mutations whose
    mutagen designation belongs to the selected group are written to a bed
    file in an "all_PAHs" or "smoker_lung" subdirectory next to the input,
    metadata is generated for that directory, and the bed file is sorted in
    place with the external "sort" command.

    Args:
        kucabSubstitutionsFilePaths: Paths to the tab-separated input files.
        genomeFilePath: Genome file used to look up acceptable chromosomes
            (and recorded in the metadata).
        nucPosFilePath: Nucleosome positioning file recorded in the metadata.
        includeAllPAHs: If truthy, keep every PAH designation; otherwise keep
            only the lung-cancer-specific subset.

    Raises:
        InvalidPathError: If an input path does not end in "final.txt".
        subprocess.CalledProcessError: If the external sort fails.
    """

    # These are the designations for PAH mutation signatures, the ones related to tobacco smoke that we want to study.
    # NOTE: every entry needs its own comma; adjacent string literals are silently
    # concatenated into a single designation that never matches anything.
    PAHDesignations = ("MSM0.54", "MSM0.26", "MSM0.92", "MSM0.2", "MSM0.42", "MSM0.74", "MSM0.103",
                       "MSM0.14", "MSM0.82", "MSM0.130", "MSM0.12", "MSM0.132", "MSM0.13", "MSM0.96")
    # These designations specifically mimic the indel signature in smokers' lung cancer tumors.
    LungCancerSpecificDesignations = ("MSM0.26", "MSM0.92", "MSM0.2", "MSM0.103", "MSM0.14")

    # Set the designations that will be used to collect data based on the input to the function.
    if includeAllPAHs:
        relevantDesignations = PAHDesignations
    else:
        relevantDesignations = LungCancerSpecificDesignations

    for kucabSubstitutionsFilePath in kucabSubstitutionsFilePaths:

        print("\nWorking in:", os.path.basename(kucabSubstitutionsFilePath))

        if not kucabSubstitutionsFilePath.endswith("final.txt"):
            raise InvalidPathError(kucabSubstitutionsFilePath,
                                   "Given kucab input file does not end in \"final.txt\":")

        # Prepare the output file path.
        localRootDirectory = os.path.dirname(kucabSubstitutionsFilePath)
        dataGroupName = getIsolatedParentDir(kucabSubstitutionsFilePath)
        if includeAllPAHs:
            outputDirectory = os.path.join(localRootDirectory, "all_PAHs")
            dataGroupName += "_all_PAHs"
        else:
            dataGroupName += "_smoker_lung"
            outputDirectory = os.path.join(localRootDirectory, "smoker_lung")

        # Make sure the data directory exists.
        os.makedirs(outputDirectory, exist_ok=True)

        # Generate the output file path and metadata
        outputTrinucBedFilePath = generateFilePath(directory=outputDirectory, dataGroup=dataGroupName,
                                                   context="trinuc", dataType=DataTypeStr.mutations,
                                                   fileExtension=".bed")
        generateMetadata(dataGroupName, getIsolatedParentDir(genomeFilePath), getIsolatedParentDir(nucPosFilePath),
                         os.path.join("..", os.path.basename(kucabSubstitutionsFilePath)), outputDirectory)

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Reading data and writing to trinuc bed file...")
        with open(kucabSubstitutionsFilePath, 'r') as kucabSubstitutionsFile:
            with open(outputTrinucBedFilePath, 'w') as outputTrinucBedFile:

                # Skip the first line with headers.
                next(kucabSubstitutionsFile, None)

                for line in kucabSubstitutionsFile:

                    # The lines are separated by tabs.  The relevant data have the following indices in a tab-separated list:
                    # 15: mutagen designation
                    # 4: Chromosome
                    # 5: Start Pos (1 base)
                    # 6: Reference base
                    # 7: Mutated base
                    # 13: pre-base context
                    # 14: post-base context
                    choppedUpLine = line.strip().split('\t')

                    # Skip the mutation if it does not belong to the relevant group.
                    if choppedUpLine[15] not in relevantDesignations:
                        continue

                    # Compile the necessary information for the bed file.
                    chromosome = "chr" + choppedUpLine[4]

                    # Handle the weird chromosome formatting and then check for invalid chromosomes.
                    if chromosome == "chr23":
                        chromosome = "chrX"
                    if chromosome == "chr24":
                        chromosome = "chrY"
                    if chromosome not in acceptableChromosomes:
                        continue

                    startPos1Base = choppedUpLine[5]
                    startPos0Base = str(int(startPos1Base) - 1)

                    mutatedFrom = choppedUpLine[6]
                    mutatedTo = choppedUpLine[7]

                    trinucContext = ''.join((choppedUpLine[13], mutatedFrom, choppedUpLine[14]))

                    # If the mutated base is listed as arising from a purine, flip the mutation and the strand.
                    if isPurine(mutatedFrom):
                        mutation = reverseCompliment(mutatedFrom) + '>' + reverseCompliment(mutatedTo)
                        strand = '-'
                        trinucContext = reverseCompliment(trinucContext)
                    else:
                        mutation = mutatedFrom + '>' + mutatedTo
                        strand = '+'

                    # Write the information to the trinuc bed file.
                    outputTrinucBedFile.write('\t'.join((chromosome, startPos0Base, startPos1Base,
                                                         trinucContext, mutation, strand)) + '\n')

        # Sort the output file.
        print("Sorting output file...")
        subprocess.run(("sort", "-k1,1", "-k2,2n", outputTrinucBedFilePath, "-o", outputTrinucBedFilePath),
                       check=True)