Exemplo n.º 1
0
def parseCustomBed(bedInputFilePaths,
                   genomeFilePath,
                   stratifyByMS,
                   stratifyByMutSig,
                   separateIndividualCohorts,
                   onlySingleBaseSubs=False,
                   includeIndels=False):
    """Sort each custom bed file and hand it to convertToStandardInput.

    For every path in bedInputFilePaths this function:
      1. Works out the data directory (the parent of "intermediate_files" when
         the input lives there, otherwise the input's own directory) and, in
         the latter case only, calls generateMetadata.
      2. Ensures the "intermediate_files" directory via checkDirs and runs
         autoAcquireAndQACheck, whose return value is used as the context.
      3. Copies the input into "intermediate_files" if its path equals the
         generated output path, and reads from the copy instead.
      4. Inspects the first line for a 7th (cohort) column, sorts the file in
         place with the external "sort" program, sets up any requested
         stratification, and calls convertToStandardInput.

    Args:
        bedInputFilePaths: Sized collection of bed file paths to parse.
        genomeFilePath: Path to the genome file; forwarded to
            autoAcquireAndQACheck and used for metadata.
        stratifyByMS: If truthy, call setUpForMSStratification.
        stratifyByMutSig: If truthy, call setUpForMutSigStratification.
        separateIndividualCohorts: If truthy, call
            writeManager.setUpForIndividualCohorts().
        onlySingleBaseSubs: Forwarded to autoAcquireAndQACheck and
            convertToStandardInput.
        includeIndels: Forwarded to autoAcquireAndQACheck and
            convertToStandardInput.

    Raises:
        UserInputError: If onlySingleBaseSubs and includeIndels are both set,
            if no input paths are given, or if stratification / cohort
            separation is requested but the first line lacks a 7th column.
        subprocess.CalledProcessError: If the external sort fails.
    """

    # Validate the option combination and the input list up front.
    if onlySingleBaseSubs and includeIndels:
        raise UserInputError(
            "Indels are incompatible with single nucleotide substitutions.")
    if len(bedInputFilePaths) == 0:
        raise UserInputError("No bed files were found to parse.")

    for bedInputFilePath in bedInputFilePaths:

        inputFileName = os.path.basename(bedInputFilePath)
        print("\nWorking in:", inputFileName)

        # Files outside "intermediate_files" sit directly in the data directory and
        # get metadata generated here.  Intermediate files sit one level deeper, and
        # their metadata should already have been generated elsewhere.
        inputParentDir = os.path.dirname(bedInputFilePath)
        if getIsolatedParentDir(bedInputFilePath) != "intermediate_files":
            dataDirectory = inputParentDir
            generateMetadata(os.path.basename(dataDirectory),
                             getIsolatedParentDir(genomeFilePath),
                             inputFileName, InputFormat.customBed,
                             inputParentDir)
        else:
            dataDirectory = os.path.dirname(inputParentDir)

        intermediateFilesDir = os.path.join(dataDirectory,
                                            "intermediate_files")
        checkDirs(intermediateFilesDir)

        context = autoAcquireAndQACheck(
            bedInputFilePath, genomeFilePath,
            os.path.join(intermediateFilesDir, "auto_acquire.fa"),
            onlySingleBaseSubs, includeIndels)

        # The output file must not clobber the file being read.  When both paths
        # coincide, read from a copy placed in the intermediate_files directory.
        expectedOutputFilePath = generateFilePath(
            directory=dataDirectory,
            dataGroup=os.path.basename(dataDirectory),
            context=context,
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        readPath = bedInputFilePath
        if readPath == expectedOutputFilePath:
            readPath = os.path.join(intermediateFilesDir, inputFileName)
            print(
                "Input file path is identical to generated output file path and will be overwritten. ",
                "Creating a copy of the input file at:", readPath,
                "to use for reading.")
            shutil.copy2(bedInputFilePath, readPath)

        # The WriteManager handles all writing for this input file.
        with WriteManager(dataDirectory, context) as writeManager:

            # Peek at the first line: a 7th column is the cohort designation.
            cohortSortKey = ()
            with open(readPath, 'r') as bedInputFile:
                firstLineColumns = bedInputFile.readline().strip().split('\t')
                hasCohortColumn = len(firstLineColumns) == 7

                if not hasCohortColumn:
                    # Options that depend on cohorts cannot be honored.
                    if stratifyByMS or stratifyByMutSig:
                        raise UserInputError(
                            "Additional stratification given, but no cohort designation given."
                        )
                    if separateIndividualCohorts:
                        raise UserInputError(
                            "Separation by individual cohorts requested, but no cohort designation given."
                        )
                else:
                    # Cohort becomes the primary sort key.
                    cohortSortKey = ("-k7,7", )
                    if separateIndividualCohorts:
                        writeManager.setUpForIndividualCohorts()

            # Sort the input in place (should also ensure that the output data is sorted).
            sortCommand = ["sort", *cohortSortKey, "-k1,1", "-k2,2n", "-k3,3n",
                           readPath, "-s", "-o", readPath]
            subprocess.run(sortCommand, check=True)

            # Optional stratification set-up, then the actual conversion.
            if stratifyByMS:
                setUpForMSStratification(writeManager, readPath)
            if stratifyByMutSig:
                setUpForMutSigStratification(writeManager, readPath)

            convertToStandardInput(readPath, writeManager, onlySingleBaseSubs,
                                   includeIndels)
def parseKucabCompendium(kucabSubstitutionsFilePaths: List[str],
                         genomeFilePath, nucPosFilePath, includeAllPAHs):
    """Convert Kucab substitution files into sorted trinucleotide bed files.

    For each input file, the rows whose mutagen designation (column index 15)
    is in the relevant set are written as tab-separated bed lines of
    (chromosome, 0-based start, 1-based start, trinuc context, mutation,
    strand) and the output file is then sorted with the external "sort"
    program.  Mutations listed as arising from a purine are reverse
    complemented and assigned to the '-' strand.

    Args:
        kucabSubstitutionsFilePaths: Paths to the input files; each must end
            in "final.txt".
        genomeFilePath: Path to the genome file; used for metadata and passed
            to getAcceptableChromosomes.
        nucPosFilePath: Path whose parent directory name is recorded in the
            metadata.
        includeAllPAHs: If truthy, keep every PAH designation and write to an
            "all_PAHs" subdirectory; otherwise keep only the smoker-lung
            subset and write to "smoker_lung".

    Raises:
        InvalidPathError: If an input path does not end in "final.txt".
        subprocess.CalledProcessError: If the external sort fails.
    """

    # These are the designations for PAH mutation signatures, the ones related to tobacco smoke that we want to study.
    # Every entry needs its own trailing comma: adjacent string literals are silently
    # concatenated, which previously merged "MSM0.103" and "MSM0.14" into one bogus entry.
    PAHDesignations = (
        "MSM0.54",
        "MSM0.26",
        "MSM0.92",
        "MSM0.2",
        "MSM0.42",
        "MSM0.74",
        "MSM0.103",
        "MSM0.14",
        "MSM0.82",
        "MSM0.130",
        "MSM0.12",
        "MSM0.132",
        "MSM0.13",
        "MSM0.96",
    )
    # These designations specifically mimic the indel signature in smokers' lung cancer tumors.
    LungCancerSpecificDesignations = (
        "MSM0.26",
        "MSM0.92",
        "MSM0.2",
        "MSM0.103",
        "MSM0.14",
    )

    # The designation set and the output sub-group depend only on includeAllPAHs,
    # so they are decided once rather than once per input file.
    if includeAllPAHs:
        relevantDesignations = PAHDesignations
        subGroupName = "all_PAHs"
    else:
        relevantDesignations = LungCancerSpecificDesignations
        subGroupName = "smoker_lung"

    for kucabSubstitutionsFilePath in kucabSubstitutionsFilePaths:

        print("\nWorking in:", os.path.basename(kucabSubstitutionsFilePath))

        if not kucabSubstitutionsFilePath.endswith("final.txt"):
            raise InvalidPathError(
                kucabSubstitutionsFilePath,
                "Given kucab input file does not end in \"final.txt\":")

        # Prepare the output directory and data group name.
        localRootDirectory = os.path.dirname(kucabSubstitutionsFilePath)
        outputDirectory = os.path.join(localRootDirectory, subGroupName)
        dataGroupName = (getIsolatedParentDir(kucabSubstitutionsFilePath) +
                         "_" + subGroupName)

        # Make sure the data directory exists (no error if it already does).
        os.makedirs(outputDirectory, exist_ok=True)

        # Generate the output file path and metadata
        outputTrinucBedFilePath = generateFilePath(
            directory=outputDirectory,
            dataGroup=dataGroupName,
            context="trinuc",
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        # NOTE(review): the argument list here differs from the generateMetadata
        # call in parseCustomBed - confirm against generateMetadata's signature.
        generateMetadata(
            dataGroupName, getIsolatedParentDir(genomeFilePath),
            getIsolatedParentDir(nucPosFilePath),
            os.path.join("..", os.path.basename(kucabSubstitutionsFilePath)),
            outputDirectory)

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Reading data and writing to trinuc bed file...")
        with open(kucabSubstitutionsFilePath, 'r') as kucabSubstitutionsFile:
            with open(outputTrinucBedFilePath, 'w') as outputTrinucBedFile:

                # Skip the first line with headers.
                next(kucabSubstitutionsFile, None)

                for line in kucabSubstitutionsFile:

                    # The lines are separated by tabs.  The relevant data have the following indices in a tab-separated list:
                    # 15: mutagen designation
                    # 4: Chromosome
                    # 5: Start Pos (1 base)
                    # 6: Reference base
                    # 7: Mutated base
                    # 13: pre-base context
                    # 14: post-base context
                    choppedUpLine = line.strip().split('\t')

                    # Skip the mutation if it does not belong to the relevant group.
                    if choppedUpLine[15] not in relevantDesignations:
                        continue

                    # Handle the numeric sex chromosome formatting and then check for invalid chromosomes.
                    chromosome = "chr" + choppedUpLine[4]
                    if chromosome == "chr23":
                        chromosome = "chrX"
                    elif chromosome == "chr24":
                        chromosome = "chrY"
                    if chromosome not in acceptableChromosomes:
                        continue

                    # Bed start positions are 0-based; the input position is 1-based.
                    startPos1Base = choppedUpLine[5]
                    startPos0Base = str(int(startPos1Base) - 1)

                    mutatedFrom = choppedUpLine[6]
                    mutatedTo = choppedUpLine[7]
                    trinucContext = ''.join(
                        (choppedUpLine[13], mutatedFrom, choppedUpLine[14]))

                    # If the mutated base is listed as arising from a purine, flip the mutation and the strand.
                    if isPurine(mutatedFrom):
                        mutation = reverseCompliment(
                            mutatedFrom) + '>' + reverseCompliment(mutatedTo)
                        strand = '-'
                        trinucContext = reverseCompliment(trinucContext)
                    else:
                        mutation = mutatedFrom + '>' + mutatedTo
                        strand = '+'

                    # Write the information to the trinuc bed file.
                    outputTrinucBedFile.write('\t'.join(
                        (chromosome, startPos0Base, startPos1Base,
                         trinucContext, mutation, strand)) + '\n')

        # Sort the output file in place by chromosome, then numerically by start position.
        print("Sorting output file...")
        subprocess.run(("sort", "-k1,1", "-k2,2n", outputTrinucBedFilePath,
                        "-o", outputTrinucBedFilePath),
                       check=True)