Пример #1
0
    def setUpForMSStratification(self) -> MSIIdentifier:
        """
        Enable microsatellite (MS) stratification and prepare its output files.

        Creates "microsatellite_analysis/MSS" and "microsatellite_analysis/MSI"
        under the root metadata directory, generates metadata for both, opens
        one aggregate mutations ".bed" file per group for writing, and builds
        the MSIIdentifier that is kept on the instance.

        The two aggregate files are opened here and left open on the instance;
        this method does not close them.

        Returns:
            The MSIIdentifier stored as self.myMSIIdentifier.
        """

        self.stratifyByMS = True
        self.MSICohorts = {}  # Hashtable of the individual cohorts with MSI

        rootMeta = self.rootMetadata
        mssDataGroup = "MSS_" + rootMeta.dataGroupName
        msiDataGroup = "MSI_" + rootMeta.dataGroupName

        # Lay out <root metadata directory>/microsatellite_analysis/{MSS,MSI}.
        msAnalysisDir = os.path.join(rootMeta.directory,
                                     "microsatellite_analysis")
        mssDir = os.path.join(msAnalysisDir, "MSS")
        msiDir = os.path.join(msAnalysisDir, "MSI")
        checkDirs(mssDir, msiDir)

        # The parent data path is prefixed with two ".." components, one for
        # each directory level added beneath the root metadata directory.
        parentDataPath = os.path.join('..', '..', rootMeta.localParentDataPath)
        generateMetadata(mssDataGroup, rootMeta.genomeName, parentDataPath,
                         rootMeta.inputFormat, mssDir, "MSS")
        generateMetadata(msiDataGroup, rootMeta.genomeName, parentDataPath,
                         rootMeta.inputFormat, msiDir, "MSI")
        self.aggregateMSSMutCounts = 0
        self.aggregateMSIMutCounts = 0

        # Open the aggregate output file of each group (MSS first, then MSI).
        self.aggregateMSSFilePath = generateFilePath(
            directory=mssDir,
            dataGroup=mssDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSSFile = open(self.aggregateMSSFilePath, 'w')
        self.aggregateMSIFilePath = generateFilePath(
            directory=msiDir,
            dataGroup=msiDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.aggregateMSIFile = open(self.aggregateMSIFilePath, 'w')

        # Paths handed to the MSIIdentifier: its input data lives with the
        # intermediate files, the cohort list with the MS analysis output.
        scratchDir = os.path.join(self.rootDataDir, "intermediate_files")
        checkDirs(scratchDir)
        msiSeqInputPath = generateFilePath(
            directory=scratchDir,
            dataGroup=rootMeta.dataGroupName,
            dataType="MSISeq_data",
            fileExtension=".tsv")
        self.MSICohortsFilePath = generateFilePath(
            directory=msAnalysisDir,
            dataGroup=rootMeta.dataGroupName,
            dataType="MSI_cohorts",
            fileExtension=".txt")

        self.myMSIIdentifier = MSIIdentifier(msiSeqInputPath,
                                             self.MSICohortsFilePath)
        return self.myMSIIdentifier
            def generateBackgroundBasedOnRadius(usesNucGroup):
                """
                Generate one nucleosome mutation background file and record its path.

                Uses a dyad radius of 1000 with no linker offset when usesNucGroup
                is true, otherwise a radius of 73 with the enclosing scope's
                linkerOffset.  The dyad position context counts file is generated
                first if it is not already on disk.

                Reads linkerOffset, metadata, contextText, contextNum and
                mutationBackgroundFilePath from the enclosing scope, and appends
                the output path to nucleosomeMutationBackgroundFilePaths.
                """

                # Pick the dyad radius and linker offset for this mode.
                radius, offset = (1000, 0) if usesNucGroup else (73, linkerOffset)

                # Where the per-dyad-position context counts are expected to be.
                countsPath = generateFilePath(
                    directory=os.path.dirname(metadata.baseNucPosFilePath),
                    dataGroup=metadata.nucPosName,
                    context=contextText,
                    linkerOffset=offset,
                    usesNucGroup=usesNucGroup,
                    dataType="dyad_pos_counts",
                    fileExtension=".tsv")

                # Build the counts file (via a nucleosome fasta file) when missing.
                if not os.path.exists(countsPath):
                    print(
                        "Dyad position " + contextText +
                        " counts file not found at", countsPath)
                    print("Generating genome wide dyad position " +
                          contextText + " counts file...")
                    fastaPath = generateNucleosomeFasta(
                        metadata.baseNucPosFilePath, metadata.genomeFilePath,
                        radius, offset)
                    generateDyadPosContextCounts(fastaPath, countsPath,
                                                 contextNum, radius, offset)

                # The final output file for this mode.
                outputPath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    context=contextText,
                    linkerOffset=offset,
                    usesNucGroup=usesNucGroup,
                    dataType=DataTypeStr.nucMutBackground,
                    fileExtension=".tsv")

                generateNucleosomeMutationBackgroundFile(
                    countsPath, mutationBackgroundFilePath, outputPath, radius,
                    offset)

                nucleosomeMutationBackgroundFilePaths.append(outputPath)
def parseAlexandrov(bedInputFilePaths, genomeFilePath, nucPosFilePath):
    """
    Convert each tab-separated input file to custom bed format and pass the
    converted files to parseCustomBed.

    A line is kept only when "chr" + column 2 is an acceptable chromosome for
    the genome and column 5 contains no '/'.  Kept lines are written as:
    "chr" + column 2, column 3 minus one, column 4, column 5, column 6, '.',
    column 0 (columns counted from zero), with a '-' in column 5 or 6
    replaced by '*'.  The converted file is written to an
    "intermediate_files" directory beside the input file.

    NOTE(review): the parseCustomBed signature visible elsewhere in this file
    takes stratifyByMS as its third parameter, so nucPosFilePath would bind to
    it; the generateMetadata call here also has a different shape from the
    other call sites.  Confirm this function matches the current helpers.
    """

    convertedFilePaths = []

    for inputPath in bedInputFilePaths:

        inputName = os.path.basename(inputPath)
        print("\nWorking in:", inputName)

        # Metadata is generated in the directory holding the input file.
        inputDir = os.path.dirname(inputPath)
        generateMetadata(os.path.basename(inputDir),
                         getIsolatedParentDir(genomeFilePath),
                         getIsolatedParentDir(nucPosFilePath), inputName,
                         InputFormat.customBed, inputDir)

        scratchDir = os.path.join(inputDir, "intermediate_files")
        checkDirs(scratchDir)

        # Chromosomes accepted for this genome.
        validChromosomes = getAcceptableChromosomes(genomeFilePath)

        convertedPath = generateFilePath(
            directory=scratchDir,
            dataGroup=getIsolatedParentDir(inputPath),
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        # Rewrite the accepted lines in custom bed column order.
        with open(inputPath, 'r') as source, \
                open(convertedPath, 'w') as sink:

            for rawLine in source:

                fields = rawLine.strip().split('\t')
                chromosome = "chr" + fields[2]

                # Skip invalid chromosomes and entries with a '/' in column 5.
                if chromosome not in validChromosomes or '/' in fields[5]:
                    continue

                # A lone '-' is written as '*' in the custom bed format.
                fifthField = '*' if fields[5] == '-' else fields[5]
                sixthField = '*' if fields[6] == '-' else fields[6]

                outputFields = (chromosome, str(int(fields[3]) - 1),
                                fields[4], fifthField, sixthField, '.',
                                fields[0])
                sink.write('\t'.join(outputFields) + '\n')

        convertedFilePaths.append(convertedPath)

    # Pass the data to the custom bed parser.
    print("\nPassing data to custom bed parser.\n")
    parseCustomBed(convertedFilePaths, genomeFilePath, nucPosFilePath, False,
                   False, False)
Пример #4
0
    def setUpFileSystem(self):
        """
        Derive the intermediate file paths for this input and generate metadata.

        Ensures an "intermediate_files" directory exists beside the input data
        file, then sets on the instance:
          - bedGraphReadsFilePathPair (only when bigWigReadsFilePathPair is not
            None): one ".bedGraph" path per bigWig path, named after the
            bigWig file with its last extension removed,
          - trimmedReadsFilePath and fastaReadsFilePath,
          - lesionsBedFilePath (built by generateFilePath).
        Finally generates the metadata for the input's directory.
        """

        # Store useful paths and names.
        localRootDirectory = os.path.dirname(self.inputDataFilePath)
        dataGroupName = getIsolatedParentDir(self.inputDataFilePath)

        # Create the intermediate files directory if necessary.  exist_ok
        # avoids the race between checking for the directory and creating it.
        intermediateFilesDirectory = os.path.join(localRootDirectory,
                                                  "intermediate_files")
        os.makedirs(intermediateFilesDirectory, exist_ok=True)

        # If the given input data is in bigWig form, use it to generate the
        # file paths to the intermediate bedGraph files.
        if self.bigWigReadsFilePathPair is not None:
            self.bedGraphReadsFilePathPair = [
                os.path.join(
                    intermediateFilesDirectory,
                    os.path.basename(bigWigReadsFilePath).rsplit('.', 1)[0] +
                    ".bedGraph")
                for bigWigReadsFilePath in self.bigWigReadsFilePathPair
            ]

        # Generate the trimmed reads output, the fasta output, and bed output
        # file paths.
        self.trimmedReadsFilePath = os.path.join(
            intermediateFilesDirectory, dataGroupName + "_trimmed_reads.bed")
        self.fastaReadsFilePath = os.path.join(
            intermediateFilesDirectory, dataGroupName + "_trimmed_reads.fa")
        self.lesionsBedFilePath = generateFilePath(
            directory=intermediateFilesDirectory,
            dataGroup=dataGroupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        # Generate metadata
        generateMetadata(dataGroupName,
                         getIsolatedParentDir(self.genomeFilePath),
                         os.path.basename(self.inputDataFilePath),
                         InputFormat.xRSeq, localRootDirectory,
                         callParamsFilePath=self.callParamsFilePath)
def getBackgroundRawPairs(backgroundCountsFilePaths):
    """
    Pair every background counts file with its expected raw counts file.

    The raw counts path is derived from the background file's metadata
    (directory and data group) together with the linker offset and nuc-group
    setting read from the background file's path.

    Args:
        backgroundCountsFilePaths: iterable of background counts file paths.

    Returns:
        A dict mapping each background counts file path to a list of the raw
        counts file paths derived for it.

    Raises:
        InvalidPathError: a basename lacks DataTypeStr.nucMutBackground.
        ValueError: the expected raw counts file does not exist.
    """

    # Match each background file path to its respective raw counts file path.
    backgroundRawPairs: Dict[str, List[str]] = dict()
    for backgroundCountsFilePath in backgroundCountsFilePaths:

        if DataTypeStr.nucMutBackground not in os.path.basename(
                backgroundCountsFilePath):
            # Pass the offending path first, then the message, as the other
            # InvalidPathError call sites in this project do; previously the
            # path was left out of the error entirely.
            raise InvalidPathError(
                backgroundCountsFilePath,
                "Background counts file should have \"" +
                DataTypeStr.nucMutBackground + "\" in the name.")

        # Generate the expected raw counts file path
        metadata = Metadata(backgroundCountsFilePath)
        rawCountsFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            linkerOffset=getLinkerOffset(backgroundCountsFilePath),
            usesNucGroup=checkForNucGroup(backgroundCountsFilePath),
            dataType=DataTypeStr.rawNucCounts,
            fileExtension=".tsv")

        # Make sure it exists
        if not os.path.exists(rawCountsFilePath):
            raise ValueError("No raw counts file found to pair with " +
                             backgroundCountsFilePath +
                             "\nExpected file with path: " + rawCountsFilePath)

        backgroundRawPairs.setdefault(backgroundCountsFilePath,
                                      list()).append(rawCountsFilePath)

    return backgroundRawPairs
Пример #6
0
    def setUpNewIndividualCohort(self, cohortID):
        """
        Finish the previously opened cohort file (if any) and start a new one.

        The previous file is closed, sorted in place with the external "sort"
        command, and its mutation count is added to its metadata.  A directory,
        output file and metadata are then created for cohortID, and the
        per-cohort mutation counter is reset to zero.

        Raises:
            UserInputError: cohortID is already in
                self.completedIndividualCohorts.
        """

        # Close, sort and record the mutation count of the last cohort file.
        previousFile = self.currentIndividualCohortFile
        if previousFile is not None:
            previousFile.close()
            sortCommand = ("sort", "-k1,1", "-k2,2n",
                           self.individualCohortFilePath, "-s", "-o",
                           self.individualCohortFilePath)
            subprocess.run(sortCommand, check=True)
            previousMetadata = Metadata(self.individualCohortFilePath)
            previousMetadata.addMetadata(Metadata.AddableKeys.mutCounts,
                                         self.currentIndividualCohortMutCounts)

        # A cohort may only appear in one contiguous block of data.
        if cohortID in self.completedIndividualCohorts:
            raise UserInputError(
                "The cohort " + cohortID +
                " was encountered in more than one distinct block of data.")
        self.currentIndividualCohortID = cohortID
        currentID = self.currentIndividualCohortID

        cohortDirectory = os.path.join(self.rootIndividualCohortsDirectory,
                                       currentID)
        cohortDataGroup = currentID + "_" + self.rootMetadata.dataGroupName

        checkDirs(cohortDirectory)

        # Work out which "umbrella" cohorts this cohort also belongs to.
        memberships = [currentID]
        if self.stratifyByMS:
            memberships.append(
                "MSI" if currentID in self.MSICohorts else "MSS")
        if self.stratifyByMutSig and currentID in self.mutSigDesignations:
            memberships.extend(
                "mut_sig_" + mutSig
                for mutSig in self.mutSigDesignations[currentID])

        # Build the output path, open it for writing, then write the metadata.
        self.individualCohortFilePath = generateFilePath(
            directory=cohortDirectory,
            dataGroup=cohortDataGroup,
            context=self.context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.currentIndividualCohortFile = open(self.individualCohortFilePath,
                                                'w')
        generateMetadata(
            cohortDataGroup, self.rootMetadata.genomeName,
            os.path.join("..", self.rootMetadata.localParentDataPath),
            self.rootMetadata.inputFormat, cohortDirectory, *memberships)
        self.currentIndividualCohortMutCounts = 0
Пример #7
0
def parseStandardBed(standardBedFilePaths: List[str], genomeFilePath):
    """
    Convert standard bed files to custom bed input and pass them to
    parseCustomBed.

    For each input file, a converted copy is written to an
    "intermediate_files" directory beside it and metadata is generated in the
    input's directory.  Only lines whose first column is an acceptable
    chromosome are kept; column 3 is overwritten with '.', column 4 with
    "OTHER" (columns counted from zero), and at most the first six columns
    are written.

    Raises:
        InvalidPathError: an input file name does not end in ".bed".
    """

    # The file paths to be passed to the custom bed parser.
    convertedPaths = []

    for bedPath in standardBedFilePaths:

        bedName = os.path.basename(bedPath)
        print("\nWorking in:", bedName)
        if not bedName.endswith(".bed"):
            raise InvalidPathError(
                bedPath,
                "Given file does not appear to be in bed format. (missing \".bed\" extension)"
            )

        # Useful paths and names.
        rootDir = os.path.dirname(bedPath)
        scratchDir = os.path.join(rootDir, "intermediate_files")
        checkDirs(scratchDir)
        groupName = getIsolatedParentDir(bedPath)

        # Output file path and metadata.
        convertedPath = generateFilePath(directory=scratchDir,
                                         dataGroup=groupName,
                                         dataType=DataTypeStr.customInput,
                                         fileExtension=".bed")
        convertedPaths.append(convertedPath)
        generateMetadata(groupName, getIsolatedParentDir(genomeFilePath),
                         bedName, InputFormat.standardBed, rootDir)

        # Chromosomes accepted for this genome.
        validChromosomes = getAcceptableChromosomes(genomeFilePath)

        # Rewrite each accepted entry in the form the custom bed parser takes.
        print("Converting entries for custom bed input...")
        with open(bedPath, 'r') as bedFile, \
                open(convertedPath, 'w') as convertedFile:

            for entry in bedFile:

                columns = entry.strip().split("\t")

                # Entries on other chromosomes are skipped.
                if columns[0] not in validChromosomes:
                    continue

                columns[3], columns[4] = '.', "OTHER"

                convertedFile.write('\t'.join(columns[:6]) + '\n')

    # Pass the generated files to the custom bed parser.
    parseCustomBed(convertedPaths, genomeFilePath, False, False, False,
                   False)
def countInBindingMotifs(mutationFilePaths, bindingMotifsFilePaths):
    """
    Count mutations in binding motifs for every mutation/motifs file pairing.

    One counts file is produced per (mutation file, binding motifs file)
    combination, in the directory given by the mutation file's metadata.  The
    part of the motifs file's basename preceding the last "binding_motifs" is
    used as a prefix in the output's data type.

    Returns:
        The list of generated counts file paths, in processing order.

    Raises:
        ValueError: a mutation file's basename lacks DataTypeStr.mutations.
    """

    # Paths to the output files generated by this function.
    countsFilePaths = []

    for mutationFilePath in mutationFilePaths:
        for motifsFilePath in bindingMotifsFilePaths:

            mutationFileName = os.path.basename(mutationFilePath)
            motifsFileName = os.path.basename(motifsFilePath)
            print("\nWorking with", mutationFileName, "and", motifsFileName)

            # Make sure we have the expected file type.
            if DataTypeStr.mutations not in mutationFileName:
                raise ValueError("Mutation file should have \"" +
                                 DataTypeStr.mutations + "\" in the name.")

            # Metadata supplies the output directory, data group and genome.
            metadata = Metadata(mutationFilePath)

            # Prefix for the output's data type; without "binding_motifs" in
            # the name, the whole basename ends up as the prefix.
            binder = motifsFileName.rsplit("binding_motifs", 1)[0]
            if "binding_motifs" not in motifsFileName:
                warnings.warn(
                    "\"binding_motifs\" not found in basename of binding motifs file.  The output file's name is probably a garbled mess."
                )

            countsFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                fileExtension=".tsv",
                dataType=binder + "binding_motif_mutation_counts")
            countsFilePaths.append(countsFilePath)

            # Count, then write the results.
            counter = CountsFileGenerator(
                mutationFilePath, motifsFilePath, countsFilePath,
                getAcceptableChromosomes(metadata.genomeFilePath))
            counter.count()
            counter.writeResults()

    return countsFilePaths
def getCustomBackgroundRawPairs(customRawCountsFilePaths,
                                customBackgroundCountsDir):
    """
    Match each custom raw counts file to the counts file used as its
    background.

    For every given raw counts file, the background is looked up in the
    sub-directory of customBackgroundCountsDir named after the raw file's
    metadata nucPosName.  The background file path is built from that
    directory's metadata, the raw file's linker offset and nuc-group setting,
    and the DataTypeStr.rawNucCounts data type.

    Returns:
        A dict mapping each background counts file path to the list of given
        raw counts file paths that use it.

    Raises:
        UserInputError: the background directory or file does not exist.
    """

    pairs: Dict[str, List[str]] = {}

    for rawPath in customRawCountsFilePaths:

        # Locate the directory expected to hold this file's background.
        backgroundDir = os.path.join(customBackgroundCountsDir,
                                     Metadata(rawPath).nucPosName)
        if not os.path.exists(backgroundDir):
            raise UserInputError(
                "Expected a directory at " + backgroundDir +
                " to contain the background for " + rawPath +
                " but the directory does not exist.  Have you forgotten to run "
                "the analysis for the related nucleosome map?")
        backgroundMetadata = Metadata(backgroundDir)

        # Build the path of the background counts file and require it on disk.
        backgroundPath = generateFilePath(
            directory=backgroundMetadata.directory,
            dataGroup=backgroundMetadata.dataGroupName,
            linkerOffset=getLinkerOffset(rawPath),
            usesNucGroup=checkForNucGroup(rawPath),
            dataType=DataTypeStr.rawNucCounts,
            fileExtension=".tsv")
        if not os.path.exists(backgroundPath):
            raise UserInputError(
                "Expected file at " + backgroundPath +
                " to use as custom background for " + rawPath +
                " but this file does not exist.  Have you forgotten to "
                "run the relevant analysis to generate it?")

        pairs.setdefault(backgroundPath, []).append(rawPath)

    return pairs
Пример #10
0
    def __init__(self, rootDataDir, context):
        """
        Load the root metadata and open the root mutations output file.

        The output ".bed" file is created in rootDataDir, opened for writing,
        and left open on the instance.  Every stratification option starts
        switched off.
        """

        # Remember the inputs and load the metadata of the root directory.
        self.rootDataDir = rootDataDir
        self.context = context
        self.rootMetadata = Metadata(rootDataDir)

        # Create and open the output file beside the root data.
        outputPath = generateFilePath(
            directory=rootDataDir,
            dataGroup=self.rootMetadata.dataGroupName,
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        self.rootOutputFilePath = outputPath
        self.rootOutputFile = open(outputPath, 'w')
        self.rootMutCounts = 0

        # All other write options are off unless otherwise specified.
        self.stratifyByIndividualCohorts = False
        self.stratifyByMS = False
        self.stratifyByMutSig = False
        self.stratifyBySignature = False
Пример #11
0
    def setUpForMutSigStratification(self) -> MutSigIdentifier:
        """
        Enable mutation signature stratification and prepare its output files.

        For each signature ("1A", "1B", "2" through "21", "R1"-"R3", "U1",
        "U2") this creates a directory under "mut_sig_analysis", generates its
        metadata, zeroes its mutation counter and opens its mutations ".bed"
        file for writing.  The files are left open on the instance.

        Returns:
            The MutSigIdentifier stored as self.mutSigIdentifier.
        """

        self.stratifyByMutSig = True
        # The mutation signatures assigned to each cohort.
        self.mutSigDesignations = {}

        signatureNames = ["1A", "1B", *map(str, range(2, 22)),
                          "R1", "R2", "R3", "U1", "U2"]

        rootMeta = self.rootMetadata
        mutSigRootDir = os.path.join(rootMeta.directory, "mut_sig_analysis")
        self.mutSigFilePaths = {}
        self.mutSigFiles = {}
        self.mutSigMutCounts = {}

        for mutSig in signatureNames:

            dataGroup = "mut_sig_" + mutSig + '_' + rootMeta.dataGroupName

            # Directory for this signature.
            signatureDir = os.path.join(mutSigRootDir, mutSig)
            checkDirs(signatureDir)

            # Metadata for this signature.
            # NOTE(review): the label here is "mutSig" + mutSig, while
            # setUpNewIndividualCohort uses "mut_sig_" + mutSig - confirm
            # whether the two are meant to match.
            generateMetadata(
                dataGroup, rootMeta.genomeName,
                os.path.join('..', '..', rootMeta.localParentDataPath),
                rootMeta.inputFormat, signatureDir, "mutSig" + mutSig)

            # Mutation counter.
            self.mutSigMutCounts[mutSig] = 0

            # Output file path and handle.
            outputPath = generateFilePath(directory=signatureDir,
                                          dataGroup=dataGroup,
                                          context=self.context,
                                          dataType=DataTypeStr.mutations,
                                          fileExtension=".bed")
            self.mutSigFilePaths[mutSig] = outputPath
            self.mutSigFiles[mutSig] = open(outputPath, 'w')

        # Paths handed to the MutSigIdentifier: its input data lives with the
        # intermediate files, the assignments with the analysis output.
        scratchDir = os.path.join(self.rootDataDir, "intermediate_files")
        checkDirs(scratchDir)

        deconstructSigsInputPath = generateFilePath(
            directory=scratchDir,
            dataGroup=rootMeta.dataGroupName,
            dataType="deconstructSigs_data",
            fileExtension=".tsv")
        self.mutSigDesignationsFilePath = generateFilePath(
            directory=mutSigRootDir,
            dataGroup=rootMeta.dataGroupName,
            dataType="mut_sig_assignments",
            fileExtension=".tsv")

        self.mutSigIdentifier = MutSigIdentifier(
            deconstructSigsInputPath, self.mutSigDesignationsFilePath)
        return self.mutSigIdentifier
Пример #12
0
def generateMutationBackground(mutationFilePaths, backgroundContextNum):
    """
    Generate a mutation background file for each given mutation file.

    For every mutation file this ensures a genome context frequency file
    exists (generating it when missing), generates the mutation context
    frequency file in the data's "intermediate_files" directory, and combines
    the two into the mutation background file.

    Args:
        mutationFilePaths: iterable of mutation file paths; each basename must
            contain DataTypeStr.mutations.
        backgroundContextNum: context size as an int (1-6 are named).  Files
            for which getContext(..., asInt=True) is even use this value plus
            one.

    Returns:
        The list of generated mutation background file paths.

    Raises:
        AssertionError: the (possibly adjusted) context number is not named.
        InvalidPathError: a basename lacks DataTypeStr.mutations.
    """

    # Paths to the output files generated by this function.
    mutationBackgroundFilePaths = list()

    # A dictionary for converting context numbers to text.
    contextNumToText = {
        1: "singlenuc",
        2: "dinuc",
        3: "trinuc",
        4: "quadrunuc",
        5: "pentanuc",
        6: "hexanuc"
    }

    for mutationFilePath in mutationFilePaths:

        # Retrieve metadata
        metadata = Metadata(mutationFilePath)
        intermediateFilesDirectory = os.path.join(metadata.directory,
                                                  "intermediate_files")

        # If necessary, adjust the context for files with even-length features.
        if getContext(mutationFilePath, asInt=True) % 2 == 0:
            thisBackgroundContextNum = backgroundContextNum + 1
        else:
            thisBackgroundContextNum = backgroundContextNum

        # Set the name of the type of context being used.  Raised explicitly
        # (rather than via an assert statement) so the check also runs under
        # "python -O"; the exception type is unchanged.
        if thisBackgroundContextNum not in contextNumToText:
            raise AssertionError("Unexpected background context number: " +
                                 str(thisBackgroundContextNum))
        contextText = contextNumToText[thisBackgroundContextNum]

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(
            metadata.genomeFilePath)

        mutationFileName = os.path.basename(mutationFilePath)
        print("\nWorking in:", mutationFileName)
        if DataTypeStr.mutations not in mutationFileName:
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        # Generate the file path for the genome context frequency file.
        genomeContextFrequencyFilePath = generateFilePath(
            directory=os.path.dirname(metadata.genomeFilePath),
            dataGroup=metadata.genomeName,
            context=contextText,
            dataType="frequency",
            fileExtension=".tsv")

        # Generate the file path for the mutation context frequency file.
        mutationContextFrequencyFilePath = generateFilePath(
            directory=intermediateFilesDirectory,
            dataGroup=metadata.dataGroupName,
            context=contextText,
            dataType="mutation_frequencies",
            fileExtension=".tsv")

        # Generate the file path for the background mutation rate file.
        mutationBackgroundFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            context=contextText,
            dataType=DataTypeStr.mutBackground,
            fileExtension=".tsv")

        # If the genome context frequency file doesn't exist, create it.
        if not os.path.exists(genomeContextFrequencyFilePath):
            print("Genome", contextText,
                  "context frequency file not found at path:",
                  genomeContextFrequencyFilePath)
            print("Generating genome " + contextText +
                  " context frequency file...")
            generateGenomeContextFrequencyFile(metadata.genomeFilePath,
                                               genomeContextFrequencyFilePath,
                                               thisBackgroundContextNum,
                                               contextText,
                                               acceptableChromosomes)

        # Create a directory for intermediate files if it does not already
        # exist.  exist_ok avoids the race between checking and creating.
        os.makedirs(intermediateFilesDirectory, exist_ok=True)

        # Create the mutation context frequency file.
        print("Generating mutation context frequency file...")
        generateMutationContextFrequencyFile(mutationFilePath,
                                             mutationContextFrequencyFilePath,
                                             thisBackgroundContextNum,
                                             contextText,
                                             acceptableChromosomes)

        # Generate the mutation background file.
        generateMutationBackgroundFile(genomeContextFrequencyFilePath,
                                       mutationContextFrequencyFilePath,
                                       mutationBackgroundFilePath, contextText)

        mutationBackgroundFilePaths.append(mutationBackgroundFilePath)

    return mutationBackgroundFilePaths
def normalizeCounts(backgroundCountsFilePaths: List[str],
                    customRawCountsFilePaths: List[str] = list(),
                    customBackgroundCountsDir=None,
                    includeAlternativeScaling=False):
    """
    Normalize raw nucleosome counts against their backgrounds via an R script.

    Each background file is paired with its raw counts file(s), and
    "NormalizeNucleosomeMutationCounts.R" is run once per pair to write the
    normalized counts file.  When a custom background directory is given, the
    custom raw counts files are paired against it as well, and a text file
    recording the directory and date is written to each custom raw counts
    directory.

    Args:
        backgroundCountsFilePaths: background counts files to pair with raw
            counts files through getBackgroundRawPairs.
        customRawCountsFilePaths: raw counts files to normalize against the
            custom background; only used when customBackgroundCountsDir is
            given.  The default list is never mutated here.
        customBackgroundCountsDir: directory of custom backgrounds, or None.
        includeAlternativeScaling: when True, a scaling factor is appended to
            the R script's arguments.

    Returns:
        The list of normalized counts file paths, in processing order.

    Raises:
        subprocess.CalledProcessError: the R script exits with a non-zero
            status.
    """

    normalizedCountsFilePaths = list()

    backgroundRawPairs = getBackgroundRawPairs(backgroundCountsFilePaths)

    # Get the background-raw pairs from the custom directories, if they were given.
    if customBackgroundCountsDir is not None:
        customBackgroundRawPairs = getCustomBackgroundRawPairs(
            customRawCountsFilePaths, customBackgroundCountsDir)
        for customBackgroundCountsFilePath in customBackgroundRawPairs:
            assert customBackgroundCountsFilePath not in backgroundRawPairs, "Unexpected intersection!"
            backgroundRawPairs[
                customBackgroundCountsFilePath] = customBackgroundRawPairs[
                    customBackgroundCountsFilePath]

    # Iterate through each background + raw counts pair
    for backgroundCountsFilePath in backgroundRawPairs:
        for rawCountsFilePath in backgroundRawPairs[backgroundCountsFilePath]:

            print("\nWorking with", os.path.basename(rawCountsFilePath), "and",
                  os.path.basename(backgroundCountsFilePath))

            metadata = Metadata(rawCountsFilePath)

            # Generate the path to the normalized file.  A background that is
            # itself a raw counts file is labelled as a custom context.
            if DataTypeStr.rawNucCounts in backgroundCountsFilePath:
                context = "custom_context"
            else:
                context = getContext(backgroundCountsFilePath)
            normalizedCountsFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                context=context,
                linkerOffset=getLinkerOffset(backgroundCountsFilePath),
                usesNucGroup=checkForNucGroup(backgroundCountsFilePath),
                dataType=DataTypeStr.normNucCounts,
                fileExtension=".tsv")

            # Prepare the arguments to the subprocess call.
            args = [
                "Rscript",
                os.path.join(rScriptsDirectory,
                             "NormalizeNucleosomeMutationCounts.R"),
                rawCountsFilePath, backgroundCountsFilePath,
                normalizedCountsFilePath
            ]

            # If alternative scaling is requested, determine the appropriate
            # scaling factor and add it to the arguments.
            if includeAlternativeScaling:

                if customBackgroundCountsDir is None:
                    # Normalizing by sequence context: just revert the
                    # automatic scaling.  The factor must be a string, since
                    # subprocess rejects non-string arguments with a TypeError.
                    args.append("1")
                else:
                    # Normalizing by a custom context: scale based on the
                    # relative sizes of the parent background and raw data.
                    # NOTE(review): with a custom directory this ratio is
                    # applied to every pair, including those coming from
                    # backgroundCountsFilePaths - confirm that is intended.
                    args.append(
                        str(
                            getParentDataFeatureCounts(
                                backgroundCountsFilePath) /
                            getParentDataFeatureCounts(rawCountsFilePath)))

            # Pass the file paths to the R script to generate the normalized counts file.
            print("Calling R script to generate normalized counts...")
            subprocess.run(args, check=True)

            normalizedCountsFilePaths.append(normalizedCountsFilePath)

    # Document where the custom background counts came from in each relevant directory.
    if customBackgroundCountsDir is not None:
        customRawCountsDirs = {
            os.path.dirname(customRawCountsFilePath)
            for customRawCountsFilePath in customRawCountsFilePaths
        }
        for customRawCountsDir in customRawCountsDirs:
            metadata = Metadata(customRawCountsDir)
            customBackgroundInfoFilePath = generateFilePath(
                directory=metadata.directory,
                dataGroup=metadata.dataGroupName,
                dataType=DataTypeStr.customBackgroundInfo,
                fileExtension=".txt")
            with open(customBackgroundInfoFilePath,
                      'w') as customBackgroundInfoFile:
                customBackgroundInfoFile.write(
                    "Custom background directory: " +
                    customBackgroundCountsDir + '\n')
                # Drop the seconds (everything after the last ':').
                customBackgroundInfoFile.write(
                    "Last date used: " +
                    str(datetime.datetime.now()).rsplit(':', 1)[0] + '\n')

    return normalizedCountsFilePaths
Пример #14
0
def parseCustomBed(bedInputFilePaths,
                   genomeFilePath,
                   stratifyByMS,
                   stratifyByMutSig,
                   separateIndividualCohorts,
                   onlySingleBaseSubs=False,
                   includeIndels=False):
    """
    Run every given custom bed file through QA, sorting, and conversion.

    For each input path, this function generates metadata (unless the file
    lives in an "intermediate_files" directory), runs autoAcquireAndQACheck,
    sorts the input file in place with the system "sort" utility, optionally
    prepares the WriteManager for stratification, and finally calls
    convertToStandardInput.

    bedInputFilePaths: The custom bed files to parse.
    genomeFilePath: The genome file path forwarded to the QA check.
    stratifyByMS, stratifyByMutSig: Whether to set up the extra
        stratifications.  Both require a cohort designation in the input.
    separateIndividualCohorts: Whether to set up the write manager for
        individual cohorts.  Also requires a cohort designation.
    onlySingleBaseSubs, includeIndels: Mutually exclusive options forwarded
        to the QA check and the converter.

    Raises UserInputError if the two filtering options are both set, if no
    input files are given, or if stratification/cohort separation is requested
    for a file whose first line does not have exactly 7 tab-separated fields.
    """

    # Reject contradictory or empty requests before doing any work.
    if onlySingleBaseSubs and includeIndels:
        raise UserInputError(
            "Indels are incompatible with single nucleotide substitutions.")
    if len(bedInputFilePaths) == 0:
        raise UserInputError("No bed files were found to parse.")

    for givenBedFilePath in bedInputFilePaths:

        print("\nWorking in:", os.path.basename(givenBedFilePath))

        # Locate the data group's root directory.  Intermediate files sit one level
        # below it and are expected to already have metadata; anything else gets
        # its metadata generated here.
        containingDirectory = os.path.dirname(givenBedFilePath)
        if getIsolatedParentDir(givenBedFilePath) == "intermediate_files":
            dataDirectory = os.path.dirname(containingDirectory)
        else:
            dataDirectory = containingDirectory
            generateMetadata(os.path.basename(dataDirectory),
                             getIsolatedParentDir(genomeFilePath),
                             os.path.basename(givenBedFilePath),
                             InputFormat.customBed, containingDirectory)

        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        context = autoAcquireAndQACheck(
            givenBedFilePath, genomeFilePath,
            os.path.join(intermediateFilesDir, "auto_acquire.fa"),
            onlySingleBaseSubs, includeIndels)

        # If the input file occupies the path the output will be written to, read
        # from a copy in the intermediate_files directory instead so that the
        # input is still available while the new output file is being written.
        expectedOutputFilePath = generateFilePath(
            directory=dataDirectory,
            dataGroup=os.path.basename(dataDirectory),
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        workingBedFilePath = givenBedFilePath
        if givenBedFilePath == expectedOutputFilePath:
            inputFilePathCopy = os.path.join(
                intermediateFilesDir, os.path.basename(givenBedFilePath))
            print(
                "Input file path is identical to generated output file path and will be overwritten. ",
                "Creating a copy of the input file at:", inputFilePathCopy,
                "to use for reading.")
            shutil.copy2(givenBedFilePath, inputFilePathCopy)
            workingBedFilePath = inputFilePathCopy

        # The WriteManager handles all of the writing from here on.
        with WriteManager(dataDirectory, context) as writeManager:

            # Peek at the first line: a 7th column means cohort designations are present.
            with open(workingBedFilePath, 'r') as bedInputFile:
                firstLine = bedInputFile.readline()
            cohortDesignationPresent = (len(
                firstLine.strip().split('\t')) == 7)

            leadingSortKeys = tuple()
            if cohortDesignationPresent:
                # Sort on the cohort column ahead of the positional columns.
                leadingSortKeys = ("-k7,7", )
                if separateIndividualCohorts:
                    writeManager.setUpForIndividualCohorts()
            elif stratifyByMS or stratifyByMutSig:
                raise UserInputError(
                    "Additional stratification given, but no cohort designation given."
                )
            elif separateIndividualCohorts:
                raise UserInputError(
                    "Separation by individual cohorts requested, but no cohort designation given."
                )

            # Sort the input data in place (should also ensure that the output data is sorted)
            sortCommand = ("sort", ) + leadingSortKeys + (
                "-k1,1", "-k2,2n", "-k3,3n", workingBedFilePath, "-s", "-o",
                workingBedFilePath)
            subprocess.run(sortCommand, check=True)

            # Prepare any additional stratification that was requested.
            if stratifyByMS:
                setUpForMSStratification(writeManager, workingBedFilePath)
            if stratifyByMutSig:
                setUpForMutSigStratification(writeManager, workingBedFilePath)

            # Everything is in place; perform the actual conversion.
            convertToStandardInput(workingBedFilePath, writeManager,
                                   onlySingleBaseSubs, includeIndels)
Пример #15
0
def expandContext(inputBedFilePaths, expansionContextNum):
    """
    Expand the nucleotide context of each given mutation bed file.

    For every input file, the bed positions are expanded into an intermediate
    bed file, the corresponding sequences are retrieved in fasta format, and
    a new mutation bed file with the expanded context is generated in the
    directory reported by the file's metadata.  The input file is deleted
    after its expanded counterpart has been generated.

    inputBedFilePaths: Paths to the mutation bed files to expand.  Each file
        name must contain DataTypeStr.mutations.
    expansionContextNum: The desired context; must be 3 or 5.  It is
        incremented by one for files whose current context is even.

    Returns a list of paths to the expanded-context files, in input order.

    Raises AssertionError for an unexpected expansion context, and
    InvalidPathError if a file name looks invalid or the file's context is
    not lower than the requested one.
    """

    assert expansionContextNum in (
        3, 5), "Unexpected expansion context: " + str(expansionContextNum)

    expandedContextFilePaths = [
    ]  # A list of paths to the output files generated by the function

    for inputBedFilePath in inputBedFilePaths:

        # Retrieve metadata
        metadata = Metadata(inputBedFilePath)

        # Look up the file's current context once; it is needed for both checks below.
        inputContextNum = getContext(inputBedFilePath, asInt=True)

        # If necessary, adjust the context for files with even-length features.
        if inputContextNum % 2 == 0:
            thisExpansionContextNum = expansionContextNum + 1
        else:
            thisExpansionContextNum = expansionContextNum

        # Make sure file names look valid.
        inputBedFileName = os.path.basename(inputBedFilePath)
        print("\nWorking in:", inputBedFileName)
        if DataTypeStr.mutations not in inputBedFileName:
            raise InvalidPathError(
                inputBedFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        # Make sure the context of the input bed file is less than the expansion context.
        if inputContextNum >= thisExpansionContextNum:
            raise InvalidPathError(
                inputBedFilePath,
                "The given mutation file does not have a lower context "
                "than the desired output context.",
                postPathMessage="There is nothing to expand.")

        # Generate paths to intermediate data files.
        intermediateFilesDirectory = os.path.join(metadata.directory,
                                                  "intermediate_files")

        bedExpansionFilePath = generateFilePath(
            directory=intermediateFilesDirectory,
            dataGroup=metadata.dataGroupName,
            dataType="intermediate_expansion",
            fileExtension=".bed")

        fastaReadsFilePath = generateFilePath(
            directory=intermediateFilesDirectory,
            dataGroup=metadata.dataGroupName,
            dataType="expanded_reads",
            fileExtension=".fa")

        # Generate a path to the final output file.
        expandedContextFilePath = generateFilePath(
            directory=metadata.directory,
            dataGroup=metadata.dataGroupName,
            context=thisExpansionContextNum,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")

        # Create the directory for intermediate files.  exist_ok avoids the race
        # between checking for the directory and creating it.
        os.makedirs(intermediateFilesDirectory, exist_ok=True)

        # Expand the nucleotide coordinates in the input bed file as requested.
        expandBedPositions(inputBedFilePath, bedExpansionFilePath,
                           thisExpansionContextNum)

        # Convert the expanded coordinates in the bed file to the referenced nucleotides in fasta format.
        bedToFasta(bedExpansionFilePath, metadata.genomeFilePath,
                   fastaReadsFilePath)

        # Using the newly generated fasta file, create a new bed file with the expanded context.
        generateExpandedContext(inputBedFilePath, fastaReadsFilePath,
                                expandedContextFilePath,
                                thisExpansionContextNum)

        expandedContextFilePaths.append(expandedContextFilePath)

        # Delete the input file, which has the same mutation information, but a smaller context.
        print("Deleting old mutation context file...")
        os.remove(inputBedFilePath)

    return expandedContextFilePaths
Пример #16
0
def countNucleosomePositionMutations(mutationFilePaths, nucleosomeMapNames,
                                     countSingleNuc, countNucGroup,
                                     linkerOffset):
    """
    Count mutations relative to nucleosome positions for every combination
    of mutation file and nucleosome map.

    mutationFilePaths: Paths to mutation files; each file name must contain
        DataTypeStr.mutations (except in the self-counting special case).
    nucleosomeMapNames: Names of the nucleosome maps to count against.  A
        directory with each map's name is used beneath the mutation file's
        directory.
    countSingleNuc: Count in a radius of 73 bp plus linkerOffset.
    countNucGroup: Count in a radius of 1000 bp.
    linkerOffset: Extra bp added to the single-nucleosome radius.

    Special case: when exactly one file and one map name are given and the
    file's name (minus its extension) equals the map name, the nucleosome
    map is counted against itself in a 1000 bp radius and only that counts
    file is returned.

    Returns a list of paths to the counts files that were generated.
    Raises UserInputError if neither radius is requested, and
    InvalidPathError for a mutation file with an unexpected name.
    """

    # Check for the special case where a nucleosome map is being counted against
    # itself to determine the nucleosome repeat length.
    countingMapAgainstItself = False
    if len(mutationFilePaths) == 1 and len(nucleosomeMapNames) == 1:
        loneFileStem = os.path.basename(mutationFilePaths[0]).rsplit('.',
                                                                     1)[0]
        countingMapAgainstItself = loneFileStem == nucleosomeMapNames[0]

    if countingMapAgainstItself:

        nucleosomeMapFilePath = mutationFilePaths[0]
        nucleosomeMapName = nucleosomeMapNames[0]

        print("Counting nucleosome map", nucleosomeMapName,
              "against itself in a 1000 bp radius.")

        countsFilePath = generateFilePath(
            directory=os.path.dirname(nucleosomeMapFilePath),
            dataGroup=nucleosomeMapName,
            usesNucGroup=True,
            fileExtension=".tsv",
            dataType="self_" + DataTypeStr.rawNucCounts)
        acceptableChromosomes = getAcceptableChromosomes(
            os.path.dirname(os.path.dirname(nucleosomeMapFilePath)))

        selfCounter = NucleosomesInNucleosomesCounter(
            nucleosomeMapFilePath,
            nucleosomeMapFilePath,
            countsFilePath,
            encompassingFeatureExtraRadius=1000,
            acceptableChromosomes=acceptableChromosomes)
        selfCounter.count()

        return [countsFilePath]

    if not countSingleNuc and not countNucGroup:
        raise UserInputError(
            "Must count in either a single nucleosome or group nucleosome radius."
        )

    # Paths to the output files generated by the function
    nucleosomeMutationCountsFilePaths = []
    # Flipped after the first mutation file so that nucleosome maps are not
    # re-checked for sorting with every subsequent mutation file.
    nucleosomeMapSortingChecked = False

    # Each mutation file gets a counts file per nucleosome map (and per requested radius).
    for mutationFilePath in mutationFilePaths:

        print("\nWorking with", os.path.split(mutationFilePath)[1])

        # Make sure we have the expected file type.
        if DataTypeStr.mutations not in os.path.basename(mutationFilePath):
            raise InvalidPathError(
                mutationFilePath,
                "Given mutation file does not have \"" +
                DataTypeStr.mutations + "\" in the name.",
                postPathMessage=
                "Are you sure you inputted a file from the mutperiod pipeline?"
            )

        for nucleosomeMapName in nucleosomeMapNames:

            print("Counting with nucleosome map:", nucleosomeMapName)

            # The nucleosome-map-specific directory lives beside the mutation file.
            nucleosomeMapDataDirectory = os.path.join(
                os.path.dirname(mutationFilePath), nucleosomeMapName)
            checkDirs(nucleosomeMapDataDirectory)

            # Set up metadata for this directory if it has not been generated before.
            metadataMarkerPath = os.path.join(nucleosomeMapDataDirectory,
                                              ".metadata")
            if not os.path.exists(metadataMarkerPath):

                print("No metadata found.  Generating...")

                parentMetadata = Metadata(mutationFilePath)

                # The nucleosome map may supply a suffix for the data group name.
                dataGroupName = parentMetadata.dataGroupName
                nameSuffixFilePath = os.path.join(
                    os.path.dirname(parentMetadata.genomeFilePath),
                    nucleosomeMapName, "append_to_data_name.txt")
                if os.path.exists(nameSuffixFilePath):
                    with open(nameSuffixFilePath) as nameSuffixFile:
                        dataGroupName += nameSuffixFile.readline().strip()

                generateMetadata(
                    dataGroupName,
                    parentMetadata.genomeName,
                    os.path.join("..", parentMetadata.localParentDataPath),
                    parentMetadata.inputFormat,
                    nucleosomeMapDataDirectory,
                    *parentMetadata.cohorts,
                    callParamsFilePath=parentMetadata.callParamsFilePath,
                    associatedNucleosomePositions=nucleosomeMapName)

            # The directory's metadata provides the nucleosome positions file and genome.
            metadata = Metadata(nucleosomeMapDataDirectory)
            acceptableChromosomes = getAcceptableChromosomes(
                metadata.genomeFilePath)

            # Single nucleosome region, if requested.
            if countSingleNuc:
                singleNucCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    linkerOffset=linkerOffset,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)
                print(
                    "Counting mutations at each nucleosome position in a 73 bp radius +",
                    str(linkerOffset), "bp linker DNA.")
                _countMutationsAroundNucleosomes(
                    mutationFilePath, metadata.baseNucPosFilePath,
                    singleNucCountsFilePath, 73 + linkerOffset,
                    acceptableChromosomes, not nucleosomeMapSortingChecked)
                nucleosomeMutationCountsFilePaths.append(
                    singleNucCountsFilePath)

            # Nucleosome group region, if requested.
            if countNucGroup:
                nucGroupCountsFilePath = generateFilePath(
                    directory=metadata.directory,
                    dataGroup=metadata.dataGroupName,
                    usesNucGroup=True,
                    fileExtension=".tsv",
                    dataType=DataTypeStr.rawNucCounts)
                print(
                    "Counting mutations at each nucleosome position in a 1000 bp radius."
                )
                _countMutationsAroundNucleosomes(
                    mutationFilePath, metadata.baseNucPosFilePath,
                    nucGroupCountsFilePath, 1000, acceptableChromosomes,
                    not nucleosomeMapSortingChecked)
                nucleosomeMutationCountsFilePaths.append(
                    nucGroupCountsFilePath)

        nucleosomeMapSortingChecked = True

    return nucleosomeMutationCountsFilePaths


def _countMutationsAroundNucleosomes(mutationFilePath, baseNucPosFilePath,
                                     countsFilePath, extraRadius,
                                     acceptableChromosomes,
                                     checkNucleosomeMapSorting):
    # Build and run one MutationsInNucleosomesCounter.  The mutation file is
    # always checked for sorting; the nucleosome map only when requested.
    counter = MutationsInNucleosomesCounter(
        mutationFilePath,
        baseNucPosFilePath,
        countsFilePath,
        encompassingFeatureExtraRadius=extraRadius,
        acceptableChromosomes=acceptableChromosomes,
        checkForSortedFiles=(True, checkNucleosomeMapSorting))
    counter.count()
def generateNucleosomeFasta(baseNucPosFilePath, genomeFilePath, dyadRadius,
                            linkerOffset):
    """
    Return the path to a fasta file of sequences surrounding each nucleosome
    position, generating the file if it does not exist yet.

    baseNucPosFilePath: Path to the tab-separated nucleosome positions file.
        Its second and third columns are read as integer coordinates.
    genomeFilePath: Genome file handed to bedToFasta.
    dyadRadius: Must be 73 or 1000; selects how the fasta file is named.
    linkerOffset: Extra bp added on each side on top of the dyad radius.

    Each position is widened by dyadRadius + linkerOffset + 2 on both sides.
    Positions whose widened start is negative are reported and skipped.

    Raises ValueError for any other dyad radius.
    """

    # Ensure that an intermediate files directory exists for the current nucleosome map.
    intermediateFilesDir = os.path.join(os.path.dirname(baseNucPosFilePath),
                                        "intermediate_files")
    checkDirs(intermediateFilesDir)

    # The nucleosome map's file name (minus extension) doubles as the data group name.
    nucleosomeMapName = os.path.basename(baseNucPosFilePath).rsplit('.', 1)[0]

    # The fasta file's name depends on the radius (and potentially the linker DNA).
    if dyadRadius == 73:
        radiusSpecificArguments = {"linkerOffset": linkerOffset}
    elif dyadRadius == 1000:
        radiusSpecificArguments = {"usesNucGroup": True}
    else:
        raise ValueError("Invalid dyad radius: " + str(dyadRadius) +
                         ".  Expected 73 or 1000.")
    nucPosFastaFilePath = generateFilePath(directory=intermediateFilesDir,
                                           dataGroup=nucleosomeMapName,
                                           fileExtension=".fa",
                                           **radiusSpecificArguments)

    # If the fasta file already exists, there is nothing left to do.
    if os.path.exists(nucPosFastaFilePath):
        print("Found relevant nucleosome fasta file:",
              os.path.basename(nucPosFastaFilePath))
        return nucPosFastaFilePath

    print("Nucleosome fasta file not found at: ",
          nucPosFastaFilePath,
          "\nGenerating...",
          sep='')

    # The expanded coordinates go into a (temporary) bed file.
    expandedNucPosBedFilePath = generateFilePath(
        directory=intermediateFilesDir,
        dataGroup=nucleosomeMapName,
        dataType="expanded",
        fileExtension=".bed")

    print("Expanding nucleosome coordinates...")
    with open(baseNucPosFilePath, 'r') as baseNucPosFile, open(
            expandedNucPosBedFilePath, 'w') as expandedNucPosBedFile:

        for line in baseNucPosFile:
            bedFields = line.strip().split('\t')

            # Widen the region on both sides.
            bedFields[1:3] = (
                str(int(bedFields[1]) - dyadRadius - linkerOffset - 2),
                str(int(bedFields[2]) + dyadRadius + linkerOffset + 2),
            )

            # Regions starting before the beginning of the chromosome are not written.
            if int(bedFields[1]) <= -1:
                print("Nucleosome at chromosome", bedFields[0],
                      "with expanded start pos", bedFields[1],
                      "extends into invalid positions.  Skipping.")
                continue

            expandedNucPosBedFile.write('\t'.join(bedFields) + '\n')

    # Convert the expanded bed file to fasta format.
    print("Converting expanded coordinates to fasta file...")
    bedToFasta(expandedNucPosBedFilePath,
               genomeFilePath,
               nucPosFastaFilePath,
               includeStrand=False)

    return nucPosFastaFilePath
Пример #18
0
def parseICGC(ICGCFilePaths: List[str], genomeFilePath, separateDonors,
              stratifyByMS, stratifyByMutSig):
    """
    Convert gzipped ICGC files to custom bed format and pass the results on
    to parseCustomBed.

    ICGCFilePaths: Paths to the ICGC files.  Each must end in ".gz" and have
        "simple_somatic_mutation" in its file name.
    genomeFilePath: The genome file path, used for metadata, the ICGC
        iterator, and the custom bed parser.
    separateDonors, stratifyByMS, stratifyByMutSig: Forwarded to
        parseCustomBed.

    One bed file per input is written to an "intermediate_files" directory
    beside the input.  Each row holds the chromosome, start, end, reference,
    alternate, strand, and donor ID, with '-' alleles rewritten as '*'.

    Raises UserInputError if no files are given and InvalidPathError for a
    file that fails the naming checks.
    """

    if len(ICGCFilePaths) == 0:
        raise UserInputError("No ICGC files were found to parse.")

    outputBedFilePaths = []

    # Each ICGC file is converted to its own custom bed file.
    for ICGCFilePath in ICGCFilePaths:

        ICGCFileName = os.path.basename(ICGCFilePath)
        print("\nWorking in:", ICGCFileName)

        # Validate the file's name before doing anything with it.
        if not ICGCFilePath.endswith(".gz"):
            raise InvalidPathError(
                ICGCFilePath,
                "Given ICGC file is not gzipped (.gz file format):")
        if "simple_somatic_mutation" not in ICGCFileName:
            raise InvalidPathError(
                ICGCFilePath,
                "Given ICGC file path does not have \"simple_somatic_mutation\" in the name:",
                "Note: if a directory was specified to search for ICGC input files, "
                "all files ending in .tsv.gz are selected.")

        # Set up the directories and metadata for this data group.
        dataDirectory = os.path.dirname(ICGCFilePath)
        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        dataGroupName = getIsolatedParentDir(ICGCFilePath)
        generateMetadata(dataGroupName, getIsolatedParentDir(genomeFilePath),
                         ICGCFileName, InputFormat.ICGC, dataDirectory)

        # The converted data is written to the intermediate files directory.
        outputBedFilePath = generateFilePath(
            directory=intermediateFilesDir,
            dataGroup=dataGroupName,
            dataType=DataTypeStr.customInput,
            fileExtension=".bed")

        print("Writing data to custom bed format.")
        with gzip.open(ICGCFilePath, 'r') as ICGCFile, open(
                outputBedFilePath, 'w') as outputBedFile:

            for mutation in ICGCIterator(ICGCFile, genomeFilePath):

                # Insertions and deletions use '*' in place of '-'.
                if mutation.mutatedFrom == '-':
                    mutation.mutatedFrom = '*'
                    # NOTE: We are making the assumption that the given base pos (1-based) is after the insertion, not before.
                    mutation.startPos = str(int(mutation.startPos) - 1)
                elif mutation.mutatedTo == '-':
                    mutation.mutatedTo = '*'

                bedFields = (mutation.chromosome, mutation.startPos,
                             mutation.endPos, mutation.mutatedFrom,
                             mutation.mutatedTo, mutation.strand,
                             mutation.donorID)
                outputBedFile.write('\t'.join(bedFields) + '\n')

        outputBedFilePaths.append(outputBedFilePath)

    # The custom bed parser takes it from here.
    print("\nPassing data to custom bed parser...")
    parseCustomBed(outputBedFilePaths, genomeFilePath, stratifyByMS,
                   stratifyByMutSig, separateDonors, True)
def parseKucabCompendium(kucabSubstitutionsFilePaths: List[str],
                         genomeFilePath, nucPosFilePath, includeAllPAHs):
    """
    Convert Kucab compendium substitution files into sorted trinuc bed files.

    kucabSubstitutionsFilePaths: Paths to the input files; each must end in
        "final.txt".  The first line of each file is skipped as a header.
    genomeFilePath: Genome file path, used for metadata and to look up the
        acceptable chromosomes.
    nucPosFilePath: Nucleosome positions file path, used for metadata.
    includeAllPAHs: If true, keep mutations from every PAH designation and
        write to an "all_PAHs" directory; otherwise keep only the lung cancer
        specific designations and write to a "smoker_lung" directory.

    Each output row holds the chromosome, 0-based start, 1-based start, the
    trinucleotide context, the mutation ("X>Y"), and the strand.  Mutations
    listed as arising from a purine are reverse-complemented and placed on
    the '-' strand.

    Raises InvalidPathError if an input file does not end in "final.txt".
    """

    # These are the designations for PAH mutation signatures, the ones related to tobacco smoke that we want to study.
    PAHDesignations = ("MSM0.54", "MSM0.26", "MSM0.92", "MSM0.2", "MSM0.42",
                       "MSM0.74", "MSM0.103", "MSM0.14", "MSM0.82",
                       "MSM0.130", "MSM0.12", "MSM0.132", "MSM0.13",
                       "MSM0.96")
    # These designations specifically mimic the indel signature in smokers' lung cancer tumors.
    LungCancerSpecificDesignations = ("MSM0.26", "MSM0.92", "MSM0.2",
                                      "MSM0.103", "MSM0.14")

    # Set the designations that will be used to collect data based on the input to the function.
    if includeAllPAHs:
        relevantDesignations = frozenset(PAHDesignations)
    else:
        relevantDesignations = frozenset(LungCancerSpecificDesignations)

    for kucabSubstitutionsFilePath in kucabSubstitutionsFilePaths:

        print("\nWorking in:", os.path.basename(kucabSubstitutionsFilePath))

        if not kucabSubstitutionsFilePath.endswith("final.txt"):
            raise InvalidPathError(
                kucabSubstitutionsFilePath,
                "Given kucab input file does not end in \"final.txt\":")

        # Prepare the output directory and data group name.
        localRootDirectory = os.path.dirname(kucabSubstitutionsFilePath)
        dataGroupName = getIsolatedParentDir(kucabSubstitutionsFilePath)
        if includeAllPAHs:
            outputDirectory = os.path.join(localRootDirectory, "all_PAHs")
            dataGroupName += "_all_PAHs"
        else:
            dataGroupName += "_smoker_lung"
            outputDirectory = os.path.join(localRootDirectory, "smoker_lung")

        # Make sure the data directory exists (exist_ok avoids a check-then-create race).
        os.makedirs(outputDirectory, exist_ok=True)

        # Generate the output file path and metadata
        outputTrinucBedFilePath = generateFilePath(
            directory=outputDirectory,
            dataGroup=dataGroupName,
            context="trinuc",
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        # NOTE(review): these positional arguments are passed through unchanged, but
        # their order differs from other generateMetadata call sites in this file.
        # Confirm against the helper's signature.
        generateMetadata(
            dataGroupName, getIsolatedParentDir(genomeFilePath),
            getIsolatedParentDir(nucPosFilePath),
            os.path.join("..", os.path.basename(kucabSubstitutionsFilePath)),
            outputDirectory)

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Reading data and writing to trinuc bed file...")
        with open(kucabSubstitutionsFilePath, 'r') as kucabSubstitutionsFile:
            with open(outputTrinucBedFilePath, 'w') as outputTrinucBedFile:

                # Skip the first line with headers.
                next(kucabSubstitutionsFile, None)

                for line in kucabSubstitutionsFile:

                    # Blank lines (e.g. at the end of the file) carry no data.
                    if not line.strip(): continue

                    # The lines are separated by tabs.  The relevant data have the following indices in a tab-separated list:
                    # 15: mutagen designation
                    # 4: Chromosome
                    # 5: Start Pos (1 base)
                    # 6: Reference base
                    # 7: Mutated base
                    # 13: pre-base context
                    # 14: post-base context
                    choppedUpLine = line.strip().split('\t')

                    # Skip the mutation if it does not belong to the relevant group.
                    if choppedUpLine[15] not in relevantDesignations: continue

                    # Compile the necessary information for the bed file.
                    chromosome = "chr" + choppedUpLine[4]

                    # Handle the weird chromosome formatting and then check for invalid chromosomes.
                    if chromosome == "chr23": chromosome = "chrX"
                    if chromosome == "chr24": chromosome = "chrY"
                    if chromosome not in acceptableChromosomes: continue
                    startPos1Base = choppedUpLine[5]
                    startPos0Base = str(int(startPos1Base) - 1)

                    mutatedFrom = choppedUpLine[6]
                    mutatedTo = choppedUpLine[7]
                    trinucContext = ''.join(
                        (choppedUpLine[13], mutatedFrom, choppedUpLine[14]))

                    # If the mutated base is listed as arising from a purine, flip the mutation and the strand.
                    if isPurine(mutatedFrom):
                        mutation = reverseCompliment(
                            mutatedFrom) + '>' + reverseCompliment(mutatedTo)
                        strand = '-'
                        trinucContext = reverseCompliment(trinucContext)
                    else:
                        mutation = mutatedFrom + '>' + mutatedTo
                        strand = '+'

                    # Write the information to the trinuc bed file.
                    outputTrinucBedFile.write('\t'.join(
                        (chromosome, startPos0Base, startPos1Base,
                         trinucContext, mutation, strand)) + '\n')

        # Sort the output file.
        print("Sorting output file...")
        subprocess.run(("sort", "-k1,1", "-k2,2n", outputTrinucBedFilePath,
                        "-o", outputTrinucBedFilePath),
                       check=True)