示例#1
0
def convertToStandardInput(bedInputFilePath, writeManager: WriteManager,
                           onlySingleBaseSubs, includeIndels):
    """
    Normalize a custom bed file row by row and hand each kept row to writeManager.

    Rows are tab-separated.  Columns 1 and 2 are the start and end positions,
    columns 3 and 4 are the "from" and "to" sequences ('*' marks an indel),
    column 5 is the strand, and an optional seventh column is forwarded untouched.

    bedInputFilePath: Path of the custom bed file to read.
    writeManager: Receives every kept row through writeData(), with six or seven positional values.
    onlySingleBaseSubs: When truthy, rows that are not single base substitutions are dropped.
    includeIndels: When falsy, rows with '*' in column 3 or 4 are dropped.
    """

    print("Converting custom bed file to standard bed input...")

    # Lookup used to swap strands; any other strand value is left as it is.
    oppositeStrand = {'+': '-', '-': '+'}

    with open(bedInputFilePath, 'r') as bedInputFile:
        for rawLine in bedInputFile:

            fields = str(rawLine).strip().split('\t')

            # A single-base change reported from a purine is re-expressed on the
            # opposite (pyrimidine-containing) strand.
            if isPurine(fields[3]) and fields[4].upper() in ('A', 'C', 'G',
                                                             'T'):
                fields[3] = reverseCompliment(fields[3])
                fields[4] = reverseCompliment(fields[4])
                fields[5] = oppositeStrand.get(fields[5], fields[5])

            # Drop indels unless they were explicitly requested.
            isIndel = fields[3] == '*' or fields[4] == '*'
            if isIndel and not includeIndels:
                continue

            if not isSingleBaseSubstitution(fields):

                # Anything that is not a single base substitution is either dropped...
                if onlySingleBaseSubs:
                    continue

                # ...or collapsed onto its center so it occupies a single nucleotide position (or half position).
                midpoint = (float(fields[1]) + float(fields[2]) - 1) / 2
                if int(midpoint) == midpoint:
                    midpoint = int(midpoint)  # Whole numbers are written without a decimal.
                fields[1] = str(midpoint)
                fields[2] = str(midpoint + 1)

            # Forward the six mandatory columns, plus the seventh when the row has exactly seven.
            outputColumns = [fields[columnIndex] for columnIndex in range(6)]
            if len(fields) == 7:
                outputColumns.append(fields[6])
            writeManager.writeData(*outputColumns)
示例#2
0
def copyBedData(preBedDirectory, bedDirectory, fileName):
    """
    Convert one gzipped, whitespace-separated mutation file into a gzipped bed file.

    The input is read from preBedDirectory/fileName and the result is written to
    bedDirectory/fileName.  Each non-blank input line supplies, in order: the
    chromosome, the 1-based position, the base mutated from and the base mutated to.
    Each output line holds six tab-separated columns: chromosome, 0-based start,
    1-based end, the base mutated from, the mutation as "from>to", and the strand.

    Mutations are assumed to have arisen in pyrimidines, so a mutation listed from a
    purine is reverse complimented and assigned to the '-' strand; every other
    mutation is assigned to '+'.

    Blank lines in the input are skipped.  (Previously they raised an IndexError.)
    Lines that are non-blank but have fewer than four columns still raise an IndexError.

    preBedDirectory: Directory containing the gzipped input file.
    bedDirectory: Directory that receives the gzipped bed output.
    fileName: Name of the file, used for both the input and the output.
    """

    print("Reading from " + fileName + ".")

    # A list of all of the individual mutations in bed format.
    # The output is only written once the input has been fully read and closed,
    # so the function still works if both directories are the same.
    bedData = list()

    # Use gzip to read the file contents and generate a bed formatted output.
    with gzip.open(preBedDirectory + "/" + fileName, "r") as preBedData:

        # Each line contains one mutation to be converted and added to bedData.
        # Iterating over the file directly avoids holding every raw line in memory at once.
        for mutationData in preBedData:

            # Convert the raw (bytes) line into a list of text data entries.
            preBedDataCols = [
                data.decode("utf-8") for data in mutationData.split()
            ]

            # Nothing to convert on a blank line.
            if not preBedDataCols: continue

            startPos0Base = str(int(preBedDataCols[1]) - 1)  # base-0 start (hence the "-1")

            # Based on the nature of the mutation, assign it to either the + or - strand.
            if isPurine(preBedDataCols[2]):
                mutatedFrom = reverseCompliment(preBedDataCols[2])
                mutatedTo = reverseCompliment(preBedDataCols[3])
                strand = '-'
            else:
                mutatedFrom = preBedDataCols[2]
                mutatedTo = preBedDataCols[3]
                strand = '+'

            # Add the mutation entry to the list of bed data.
            bedData.append('\t'.join((
                preBedDataCols[0],  # Chromosome
                startPos0Base,
                preBedDataCols[1],  # base-1 end
                mutatedFrom,
                mutatedFrom + ">" + mutatedTo,
                strand,
            )) + '\n')

    # Write (gzipped) the bed formatted mutation data to the bed directory.
    print("Writing bed data.")
    with gzip.open(bedDirectory + "/" + fileName, "w") as bedFile:
        for data in bedData:
            bedFile.write(data.encode())
示例#3
0
def getGenomeContextCounts(genomeContextFrequencyFilePath,
                           countPlusStrand=True,
                           countMinusStrand=True):
    """
    Tally context counts from a tab-separated genome context frequency file.

    The first two lines of the file are treated as headers and ignored.  On every
    following line, column 0 is a context sequence and column 1 is its integer count.

    genomeContextFrequencyFilePath: Path of the frequency file to read.
    countPlusStrand: When truthy, each count is added under the context as written.
    countMinusStrand: When truthy, each count is added under the context's reverse compliment.

    Returns a dictionary mapping context sequences to their summed counts.
    Raises ValueError if neither strand is selected.
    """

    # Refuse to run when there is nothing to count.
    if not countMinusStrand and not countPlusStrand:
        raise ValueError("Error: At least one strand must be selected.")

    contextCounts = dict()

    with open(genomeContextFrequencyFilePath,
              'r') as genomeContextFrequencyFile:

        for lineIndex, rawLine in enumerate(genomeContextFrequencyFile):

            # Only lines past the two header lines carry data.
            if lineIndex >= 2:

                columns = rawLine.strip().split('\t')
                plusContext = columns[0]
                minusContext = reverseCompliment(plusContext)
                occurrences = int(columns[1])

                # Collect the keys that should receive this line's count, plus strand first.
                selectedContexts = []
                if countPlusStrand:
                    selectedContexts.append(plusContext)
                if countMinusStrand:
                    selectedContexts.append(minusContext)

                # Add the count to each key, starting from zero for unseen contexts.
                for selectedContext in selectedContexts:
                    contextCounts[selectedContext] = contextCounts.get(
                        selectedContext, 0) + occurrences

    return contextCounts
def generateNucleosomeMutationBackgroundFile(
        dyadPosContextCountsFilePath, mutationBackgroundFilePath,
        nucleosomeMutationBackgroundFilePath, dyadRadius, linkerOffset):
    """
    Write the expected number of mutations at every dyad position to a tab-separated file.

    For each dyad position, the expectation is the sum over contexts of
    (count of the context at that position) * (background mutation rate).  The plus
    strand uses the rate of the context itself and the minus strand uses the rate of
    its reverse compliment.

    dyadPosContextCountsFilePath: Passed to getDyadPosContextCounts to obtain the per-position context counts.
    mutationBackgroundFilePath: Passed to getContext and getGenomeBackgroundMutationRates.
    nucleosomeMutationBackgroundFilePath: Path of the output file.
    dyadRadius, linkerOffset: Together they set how far from the dyad (position 0) the output extends.
    """

    # An even context length means features sit on half positions.
    # An odd context length means there is one extra whole position in the dyad range.
    contextIsEven = getContext(mutationBackgroundFilePath, asInt=True) % 2 == 0
    if contextIsEven:
        halfBaseOffset, extraDyadPos = 0.5, 0
    else:
        halfBaseOffset, extraDyadPos = 0, 1

    # Every dyad position covered by the analysis, in ascending order.
    outermostPos = dyadRadius + linkerOffset
    dyadPositions = [
        i + halfBaseOffset
        for i in range(-outermostPos, outermostPos + extraDyadPos)
    ]

    # Running totals of expected mutations, one dictionary per strand, all starting at zero.
    plusStrandBackground = dict.fromkeys(dyadPositions, 0)
    minusStrandBackground = dict.fromkeys(dyadPositions, 0)

    # Load the per-context mutation rates and the per-position context counts.
    backgroundMutationRate = getGenomeBackgroundMutationRates(
        mutationBackgroundFilePath)
    dyadPosContextCounts = getDyadPosContextCounts(
        dyadPosContextCountsFilePath)

    # Accumulate the expectation for both strands at each dyad position.
    for dyadPos in dyadPosContextCounts:
        countsAtPos = dyadPosContextCounts[dyadPos]

        for context in countsAtPos:
            reverseContext = reverseCompliment(context)

            plusStrandBackground[dyadPos] += (backgroundMutationRate[context] *
                                              countsAtPos[context])
            minusStrandBackground[dyadPos] += (
                backgroundMutationRate[reverseContext] * countsAtPos[context])

    columnNames = ("Dyad_Position", "Expected_Mutations_Plus_Strand",
                   "Expected_Mutations_Minus_Strand",
                   "Expected_Mutations_Both_Strands",
                   "Expected_Mutations_Aligned_Strands")

    with open(nucleosomeMutationBackgroundFilePath,
              'w') as nucleosomeMutationBackgroundFile:

        # Header row first.
        nucleosomeMutationBackgroundFile.write('\t'.join(columnNames) + '\n')

        # Then one data row per dyad position.
        for dyadPos in dyadPositions:
            plusExpected = plusStrandBackground[dyadPos]
            minusExpected = minusStrandBackground[dyadPos]

            # The "aligned" value pairs the plus strand with the minus strand at the mirrored position.
            alignedExpected = plusExpected + minusStrandBackground[-dyadPos]

            rowValues = (dyadPos, plusExpected, minusExpected,
                         plusExpected + minusExpected, alignedExpected)
            nucleosomeMutationBackgroundFile.write(
                '\t'.join(map(str, rowValues)) + '\n')
示例#5
0
def autoAcquireAndQACheck(bedInputFilePath: str, genomeFilePath,
                          autoAcquiredFilePath, onlySingleBaseSubs,
                          includeIndels):
    """
    Validate a custom bed file, fill in auto-acquire requests, and report its sequence context.

    Every tab-separated line is passed to checkForErrors.  A '.' in column 3 is replaced
    with the sequence of the matching fasta entry, a '.' in column 5 (for anything
    other than a '*' in column 3) is replaced with '+' or '-' depending on which strand
    the matching fasta sequence represents, and a '.' in column 4 is replaced with "OTHER".
    The fasta file is only generated (through bedToFasta) the first time a line requests it.

    All lines are written to bedInputFilePath + ".tmp".  If anything was auto-acquired
    that file replaces the original input; otherwise it is deleted.

    bedInputFilePath: Path of the custom bed file.  It is overwritten if auto-acquiring occurred.
    genomeFilePath: Genome file passed to getAcceptableChromosomes and bedToFasta.
    autoAcquiredFilePath: Where the fasta file used for auto-acquiring is written and read back.
    onlySingleBaseSubs: When truthy, only single base substitutions contribute to the context.
    includeIndels: When truthy, the context is fixed at 0 and never examined.

    Returns the shared length of column 3 across the contributing lines, 0 if those
    lengths differ (or indels are included), or float("inf") if the shared length exceeds 6.

    Raises ValueError if no line contributed to the context (previously this surfaced
    as a TypeError from comparing None with an int).
    Raises AssertionError if a fasta match cannot be found or a given sequence
    matches neither strand of the genome.
    """

    print(
        "Checking custom bed file for formatting and auto-acquire requests...")

    # To start, assume that no sequences need to be acquired, and do it on the fly if need be.
    autoAcquiring = False
    autoAcquiredFile = None
    autoAcquireFastaIterator = None
    fastaEntry = None
    cohortDesignationPresent = None

    # Unless indels are included, determine the context of the features in the file.
    if includeIndels: context = 0
    else: context = None

    # Get the list of acceptable chromosomes
    acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)
    acceptableChromosomesFilePath = getAcceptableChromosomes(
        genomeFilePath, True)

    # Create a temporary file to write the data to (potentially after auto-acquiring).
    # Will replace original file at the end if auto-acquiring occurred.
    temporaryBedFilePath = bedInputFilePath + ".tmp"

    # The try/finally guarantees that the fasta file opened part-way through the loop is closed,
    # whether the loop finishes normally or an error is raised.
    try:

        # Iterate through the input file one line at a time, checking the format of each entry and looking for auto-acquire requests.
        with open(bedInputFilePath, 'r') as bedInputFile:
            with open(temporaryBedFilePath, 'w') as temporaryBedFile:
                for line in bedInputFile:

                    choppedUpLine = str(line).strip().split('\t')

                    # The first line decides whether a seventh (cohort designation) column is expected.
                    if cohortDesignationPresent is None:
                        cohortDesignationPresent = len(choppedUpLine) == 7

                    # Check for possible error states.
                    checkForErrors(choppedUpLine, cohortDesignationPresent,
                                   acceptableChromosomes,
                                   acceptableChromosomesFilePath)

                    # If this is the first entry requiring auto-acquiring, generate the required fasta file.
                    if (not autoAcquiring and
                        (choppedUpLine[3] == '.' or choppedUpLine[4] == '.' or
                         (choppedUpLine[5] == '.'
                          and choppedUpLine[3] != '*'))):
                        print(
                            "Found line with auto-acquire requested.  Generating fasta..."
                        )
                        autoAcquiring = True
                        bedToFasta(bedInputFilePath, genomeFilePath,
                                   autoAcquiredFilePath)
                        autoAcquiredFile = open(autoAcquiredFilePath, 'r')
                        autoAcquireFastaIterator = FastaFileIterator(
                            autoAcquiredFile)
                        fastaEntry = autoAcquireFastaIterator.readEntry()
                        print("Continuing...")

                    # Check for any base identities that need to be auto-acquired.
                    if choppedUpLine[3] == '.':
                        fastaEntry = _findMatchingFastaEntry(
                            fastaEntry, autoAcquireFastaIterator,
                            choppedUpLine)
                        choppedUpLine[3] = fastaEntry.sequence

                    # Check for any strand designations that need to be auto-acquired.
                    # Also, make sure this isn't an insertion, in which case the strand designation cannot be determined.
                    if choppedUpLine[5] == '.' and choppedUpLine[3] != '*':
                        fastaEntry = _findMatchingFastaEntry(
                            fastaEntry, autoAcquireFastaIterator,
                            choppedUpLine)

                        # Determine which strand is represented.
                        if fastaEntry.sequence == choppedUpLine[3]:
                            choppedUpLine[5] = '+'
                        elif fastaEntry.sequence == reverseCompliment(
                                choppedUpLine[3]):
                            choppedUpLine[5] = '-'
                        else:
                            # Raised explicitly (rather than asserted) so the check survives "python -O".
                            raise AssertionError(
                                "The given sequence " + choppedUpLine[3] +
                                " for location " + fastaEntry.sequenceName +
                                ' ' +
                                "does not match the corresponding sequence in the given genome, or its reverse compliment."
                            )

                    # Change any '.' characters in the "altered to" column to "OTHER"
                    if choppedUpLine[4] == '.': choppedUpLine[4] = "OTHER"

                    # Determine the sequence context of the line and whether or not it matches the sequence context for other lines.
                    # Skip this if the file is "mixed", this line is an indel, or only single base substitutions are allowed and this line isn't one.
                    if (not context == 0 and not (choppedUpLine[3] == '*'
                                                  or choppedUpLine[4] == '*')
                            and (not onlySingleBaseSubs
                                 or isSingleBaseSubstitution(choppedUpLine))):

                        thisContext = len(choppedUpLine[3])
                        if context is None: context = thisContext
                        elif thisContext != context: context = 0

                    # Write the current line to the temporary bed file.
                    temporaryBedFile.write('\t'.join(choppedUpLine) + '\n')

    finally:
        if autoAcquiredFile is not None: autoAcquiredFile.close()

    # If any lines were auto-acquired, replace the input bed file with the temporary bed file. (Which has auto-acquires)
    if autoAcquiring:
        print(
            "Overwriting custom bed input with auto-acquired bases/strand designations."
        )
        os.replace(temporaryBedFilePath, bedInputFilePath)
    # Otherwise, just delete the temporary file.
    else:
        os.remove(temporaryBedFilePath)

    # The context is still unset if the file was empty or every line was skipped above.
    if context is None:
        raise ValueError(
            "Unable to determine a sequence context because no usable entries were found in: "
            + bedInputFilePath)

    if context > 6: context = float("inf")
    return context


def _findMatchingFastaEntry(fastaEntry, fastaIterator, choppedUpLine):
    # Starting from the given fasta entry, read forward until an entry equivalent to the bed line is found, and return it.
    # Raises AssertionError if the end of the fasta file is reached first.
    while not equivalentEntries(fastaEntry, choppedUpLine):
        if fastaIterator.eof:
            raise AssertionError(
                ("Reached end of fasta file without finding a match for: ",
                 ' '.join(choppedUpLine)))
        fastaEntry = fastaIterator.readEntry()
    return fastaEntry
def parseKucabCompendium(kucabSubstitutionsFilePaths: List[str],
                         genomeFilePath, nucPosFilePath, includeAllPAHs):
    """
    Convert kucab substitution files into sorted trinucleotide-context bed files.

    For each input file, the rows whose mutagen designation belongs to the selected
    group are written as bed lines (chromosome, 0-based start, 1-based end,
    trinucleotide context, "from>to" mutation, strand).  Mutations listed from a purine
    are reverse complimented and placed on the '-' strand.  The output is written to an
    "all_PAHs" or "smoker_lung" directory next to the input file, alongside generated
    metadata, and is then sorted in place with the external "sort" command.

    kucabSubstitutionsFilePaths: Paths of the input files.  Each must end in "final.txt".
    genomeFilePath: Used to look up the acceptable chromosomes and for the metadata.
    nucPosFilePath: Used for the metadata.
    includeAllPAHs: When truthy, every PAH designation is kept; otherwise only the lung cancer specific ones.

    Raises InvalidPathError if an input path does not end in "final.txt".
    """

    # These are the designations for PAH mutation signatures, the ones related to tobacco smoke that we want to study.
    # NOTE: Every entry needs its trailing comma.  A missing comma after "MSM0.103" previously fused it with
    # "MSM0.14" into a single bogus designation, silently dropping both from this group.
    PAHDesignations = ("MSM0.54", "MSM0.26", "MSM0.92", "MSM0.2",
                       "MSM0.42", "MSM0.74", "MSM0.103",
                       "MSM0.14", "MSM0.82", "MSM0.130", "MSM0.12",
                       "MSM0.132", "MSM0.13", "MSM0.96")
    # These designations specifically mimic the indel signature in smokers' lung cancer tumors.
    LungCancerSpecificDesignations = ("MSM0.26", "MSM0.92", "MSM0.2",
                                      "MSM0.103", "MSM0.14")

    # Set the designations that will be used to collect data based on the input to the function.
    if includeAllPAHs:
        relevantDesignations = PAHDesignations
    else:
        relevantDesignations = LungCancerSpecificDesignations

    for kucabSubstitutionsFilePath in kucabSubstitutionsFilePaths:

        print("\nWorking in:", os.path.basename(kucabSubstitutionsFilePath))

        if not kucabSubstitutionsFilePath.endswith("final.txt"):
            raise InvalidPathError(
                kucabSubstitutionsFilePath,
                "Given kucab input file does not end in \"final.txt\":")

        # Prepare the output file path.
        localRootDirectory = os.path.dirname(kucabSubstitutionsFilePath)
        dataGroupName = getIsolatedParentDir(kucabSubstitutionsFilePath)
        if includeAllPAHs:
            outputDirectory = os.path.join(localRootDirectory, "all_PAHs")
            dataGroupName += "_all_PAHs"
        else:
            dataGroupName += "_smoker_lung"
            outputDirectory = os.path.join(localRootDirectory, "smoker_lung")

        # Make sure the data directory exists.
        if not os.path.exists(outputDirectory): os.mkdir(outputDirectory)

        # Generate the output file path and metadata
        outputTrinucBedFilePath = generateFilePath(
            directory=outputDirectory,
            dataGroup=dataGroupName,
            context="trinuc",
            dataType=DataTypeStr.mutations,
            fileExtension=".bed")
        generateMetadata(
            dataGroupName, getIsolatedParentDir(genomeFilePath),
            getIsolatedParentDir(nucPosFilePath),
            os.path.join("..", os.path.basename(kucabSubstitutionsFilePath)),
            outputDirectory)

        # Get the list of acceptable chromosomes
        acceptableChromosomes = getAcceptableChromosomes(genomeFilePath)

        print("Reading data and writing to trinuc bed file...")
        with open(kucabSubstitutionsFilePath, 'r') as kucabSubstitutionsFile:
            with open(outputTrinucBedFilePath, 'w') as outputTrinucBedFile:

                # Skip the first line with headers.
                next(kucabSubstitutionsFile, None)

                for line in kucabSubstitutionsFile:

                    # The lines are separated by tabs.  The relevant data have the following indices in a tab-separated list:
                    # 15: mutagen designation
                    # 4: Chromosome
                    # 5: Start Pos (1 base)
                    # 6: Reference base
                    # 7: Mutated base
                    # 13: pre-base context
                    # 14: post-base context
                    choppedUpLine = line.strip().split('\t')

                    # Skip the mutation if it does not belong to the relevant group.
                    if choppedUpLine[15] not in relevantDesignations: continue

                    # Compile the necessary information for the bed file.
                    chromosome = "chr" + choppedUpLine[4]

                    # Handle the weird chromosome formatting and then check for invalid chromosomes.
                    if chromosome == "chr23": chromosome = "chrX"
                    if chromosome == "chr24": chromosome = "chrY"
                    if chromosome not in acceptableChromosomes: continue
                    startPos1Base = choppedUpLine[5]
                    startPos0Base = str(int(startPos1Base) - 1)

                    mutatedFrom = choppedUpLine[6]
                    mutatedTo = choppedUpLine[7]
                    trinucContext = ''.join(
                        (choppedUpLine[13], mutatedFrom, choppedUpLine[14]))

                    # If the mutated base is listed as arising from a purine, flip the mutation and the strand.
                    if isPurine(mutatedFrom):
                        mutation = reverseCompliment(
                            mutatedFrom) + '>' + reverseCompliment(mutatedTo)
                        strand = '-'
                        trinucContext = reverseCompliment(trinucContext)
                    else:
                        mutation = mutatedFrom + '>' + mutatedTo
                        strand = '+'

                    # Write the information to the trinuc bed file.
                    outputTrinucBedFile.write('\t'.join(
                        (chromosome, startPos0Base, startPos1Base,
                         trinucContext, mutation, strand)) + '\n')

        # Sort the output file by chromosome, then numerically by start position.
        print("Sorting output file...")
        subprocess.run(("sort", "-k1,1", "-k2,2n", outputTrinucBedFilePath,
                        "-o", outputTrinucBedFilePath),
                       check=True)