Exemplo n.º 1
0
def convert(sequenceData, lastTranscriptID, fivePrimeSequence, fivePrimeUTRs,
            cdss, stopCodon, direction):
    """Map the start sites listed in ``lee_TIS_dict`` onto this transcript.

    The transcript sequence is assembled as 5' UTR + CDS blocks + optional
    stop codon.  Entries are then looked up under ``(EnsemblID, n)``, starting
    at n = 1, until a key is missing or maps to a falsy value.  For each entry
    every occurrence of the entry's codon in the transcript is collected and
    ``closest_match`` picks one relative to the expected position
    (presumably the nearest occurrence -- confirm against its definition).

    Picks within 5 of the expected position go to ``writeUORFOutput``; the
    rest are written as a tab-separated line to ``tooFarleeuorfsFile``.

    Args:
        sequenceData: Sequence passed through to ``gencode.readSequence``.
        lastTranscriptID: Transcript ID; everything from the first '.' on is
            dropped before the dictionary lookup.
        fivePrimeSequence: Sequence of the 5' UTR, prepended to the CDS.
        fivePrimeUTRs: 5' UTR records; the first one supplies 'chromosome'.
        cdss: CDS records with 'start' and 'end' keys.
        stopCodon: Optional stop codon record with 'start' and 'end' keys.
        direction: Strand indicator forwarded to the helpers.
    """
    cdssAndStopCodon = cdss[:]
    cdsSequence = "".join([
        gencode.readSequence(sequenceData, block['start'], block['end'],
                             direction) for block in cdss
    ])

    stopCodonSequence = ""
    if stopCodon:
        # The stop codon is always the last record.
        cdssAndStopCodon.append(stopCodon)
        stopCodonSequence = gencode.readSequence(
            sequenceData, stopCodon['start'], stopCodon['end'], direction)

    sequence = fivePrimeSequence + cdsSequence + stopCodonSequence
    # Index of the CDS inside the assembled transcript; constant for all entries.
    cdsOffset = sequence.find(cdsSequence)

    # Strip the version suffix so the ID matches the dictionary keys.
    EnsemblID = lastTranscriptID.split('.')[0]

    uORF_number = 1
    while True:
        entry = lee_TIS_dict.get((EnsemblID, uORF_number))
        if not entry:
            break
        # The entry carries its own uORF number, which replaces the counter.
        (RefSeqID, position_to_aTIS, annotation, RLTM_RCHX, codon,
         uORF_number) = entry
        position = position_to_aTIS + cdsOffset

        # Collect every (possibly overlapping) occurrence of the codon.
        occurrences = []
        hit = sequence.find(codon)
        while hit != -1:
            occurrences.append(hit)
            hit = sequence.find(codon, hit + 1)
        best_position = closest_match(occurrences, position)

        record = fivePrimeUTRs[0]
        uORF_ID = RefSeqID + ':' + annotation + '_' + codon

        # 5 is an arbitrary threshold: most elements are expected to be off
        # by 0 or 1.
        if abs(best_position - position) > 5:
            tooFarleeuorfsFile.write('\t'.join([
                record['chromosome'],
                str(best_position - 1),
                str(best_position + 3),
                uORF_ID,
                RLTM_RCHX,
                direction,
                str(best_position - position),
            ]) + '\n')
        else:
            writeUORFOutput(uORF_ID, fivePrimeUTRs, cdssAndStopCodon,
                            best_position, best_position + 3, direction,
                            RLTM_RCHX)
        uORF_number += 1
Exemplo n.º 2
0
def transcriptCallback(records, sequenceData, lastTranscriptID, fivePrimeUTRs, fivePrimeContent, cdss, stopCodon, direction):
    """Write one transcript's CDS annotation lines and its CDS sequence.

    Each CDS record's 'line' value is written to ``cdsAnnotationFile``.  When
    the concatenated CDS sequence is non-empty, a '>'-prefixed header
    (geneID|transcriptID|geneName) followed by the CDS plus stop codon
    sequence is written to ``cdsSequenceFile``.

    ``fivePrimeUTRs`` and ``fivePrimeContent`` are accepted but not used here.
    """
    pieces = []
    for block in cdss:
        pieces.append(gencode.readSequence(sequenceData, block['start'], block['end'], direction))
        cdsAnnotationFile.write(block['line'])
    cdsSequence = "".join(pieces)

    stopCodonSequence = ""
    if stopCodon:
        stopCodonSequence = gencode.readSequence(sequenceData, stopCodon['start'], stopCodon['end'], direction)

    # Nothing to emit for transcripts without any CDS sequence.
    if not cdsSequence:
        return

    firstRecord = records[0]
    header = "|".join([str(firstRecord['geneID']), str(lastTranscriptID), str(firstRecord["geneName"])])
    cdsSequenceFile.write(">" + header + "\n")
    cdsSequenceFile.write(cdsSequence + stopCodonSequence + "\n")
Exemplo n.º 3
0
def transcriptCallback(records, sequenceData, lastTranscriptID, fivePrimeUTRs,
                       fivePrimeContent, cdss, stopCodon, direction):
    """Write NAGNAG annotations for every CDS block except the first.

    For each such block a 9-base window is read around the block boundary
    (``cds['start'] - 6 .. cds['start'] + 2`` on '+',
    ``cds['end'] - 2 .. cds['end'] + 6`` on '-').  Within the returned
    sequence, indices 4-5 are tested for the "AG" splice acceptor and the
    pairs at indices 1-2 and 7-8 are tested for a full "AG" or for a partial
    match ('A' in the first position or 'G' in the second).  A tab-separated
    line is written to ``nagAnnotationFile`` whenever either flanking pair
    matches fully or partially.

    Windows that do not fit inside ``sequenceData`` are skipped after
    printing diagnostics.  The lower bound is checked as well as the upper
    one; coordinates are treated as 1-based, which is an assumption about
    ``gencode.readSequence`` -- TODO confirm.

    Args:
        records: Transcript records; the first supplies chromosome and gene
            fields for the output line.
        sequenceData: Sequence passed through to ``gencode.readSequence``.
        lastTranscriptID: Transcript ID written to the output line.
        fivePrimeUTRs: Unused here.
        fivePrimeContent: Unused here.
        cdss: CDS records with 'start' and 'end' keys.
        stopCodon: Unused here.
        direction: '+' or '-'.
    """
    # Scan introns, ignoring the first CDS block.
    for cds in cdss[1:]:
        if direction == '+':
            startPosition = cds['start'] - 6
            endPosition = cds['start'] + 2
        elif direction == '-':
            startPosition = cds['end'] - 2
            endPosition = cds['end'] + 6

        # Window not fully inside the sequence data: report and skip.
        if startPosition < 1 or endPosition > len(sequenceData):
            # Single-argument parenthesised form prints identically on
            # Python 2 and also parses on Python 3.
            print('once')
            print(cds['start'])
            print(cds['end'])
            print(cds)
            print(len(sequenceData))
            continue

        sequence = gencode.readSequence(sequenceData, startPosition,
                                        endPosition, direction)

        # Window layout: N A G | N A G | N A G
        #                0 1 2   3 4 5   6 7 8
        firstPair = sequence[1:3]
        acceptorPair = sequence[4:6]
        lastPair = sequence[7:9]

        firstPairPartial = firstPair[0] == 'A' or firstPair[1] == 'G'
        lastPairPartial = lastPair[0] == 'A' or lastPair[1] == 'G'

        lowerPosition = 'NA'
        higherPosition = 'NA'
        if firstPairPartial:
            if direction == '+':
                lowerPosition = str(startPosition + 1)
            elif direction == '-':
                higherPosition = str(endPosition - 2)
        if lastPairPartial:
            if direction == '+':
                higherPosition = str(startPosition + 7)
            elif direction == '-':
                lowerPosition = str(endPosition - 8)

        if acceptorPair == "AG":
            spliceAcceptorCase = 'SpliceAcceptorIsAG'
        else:
            spliceAcceptorCase = 'SpliceAcceptorIsNotAG'

        agCase = None
        if firstPair == "AG" or lastPair == "AG":
            agCase = 'FoundAGOnSides'
        elif firstPairPartial or lastPairPartial:
            agCase = 'FoundEitherAOrGOnSides'

        if agCase:
            firstRecord = records[0]
            nagAnnotationFile.write("\t".join([
                firstRecord['chromosome'],
                'mayur',
                'NAGNAG',
                firstRecord['geneID'],
                firstRecord['geneName'],
                firstRecord['geneType'],
                lastTranscriptID,
                firstRecord['transcriptType'],
                direction,
                str(startPosition),
                str(endPosition),
                sequence,
                lowerPosition,
                higherPosition,
                spliceAcceptorCase,
                agCase,
            ]) + "\n")
Exemplo n.º 4
0
def findKozak2(outputFile, sequenceData, transcript, cdss, stopCodon,
               direction):
    """Report an open reading frame for every ATG found in the CDS.

    For each ATG in the concatenated CDS sequence, codons are read in frame
    (steps of 3) until TAG, TAA or TGA is met; the span is then handed to
    ``writeKozakOutput2`` under the name ``<transcript>.kozak_cds.<n>``.
    If the scan runs off the end of CDS + stop codon, sequence beyond the
    last record is read from ``sequenceData`` in growing chunks (300, 600,
    ...) and described by a synthetic 'genome' record.

    Args:
        outputFile: Destination forwarded to ``writeKozakOutput2``.
        sequenceData: Sequence passed through to ``gencode.readSequence``.
        transcript: Transcript name used as the output name prefix.
        cdss: CDS records ('start', 'end' and the descriptive keys copied
            into the genome record).
        stopCodon: Optional stop codon record; appended after the CDSs.
        direction: '+' or '-'.
    """
    cdssAndStopCodon = cdss[:]
    cdsSequence = "".join([
        gencode.readSequence(sequenceData, block['start'], block['end'],
                             direction) for block in cdss
    ])

    stopCodonSequence = ""
    if stopCodon:
        # The stop codon is always the last record.
        cdssAndStopCodon.append(stopCodon)
        stopCodonSequence = gencode.readSequence(
            sequenceData, stopCodon['start'], stopCodon['end'], direction)

    # Synthetic record describing sequence read past the last record; its
    # 'start'/'end' are filled in only once an extension is read.
    firstRecord = cdssAndStopCodon[0]
    genomeRecord = {'type': 'genome', 'levelNumber': ''}
    for key in ('chromosome', 'transcriptName', 'geneID', 'geneName',
                'transcriptStatus', 'geneStatus'):
        genomeRecord[key] = firstRecord[key]
    lastRecord = cdssAndStopCodon[-1]

    afterCDSAndStopCodonSequence = ""
    sequence = cdsSequence + stopCodonSequence + afterCDSAndStopCodonSequence
    sequenceLength = len(sequence)
    cdsLength = len(cdsSequence)
    stopCodons = ('TAG', 'TAA', 'TGA')

    transcriptNumber = 1
    genomeLength = 300
    searchFrom = 0

    while searchFrom < cdsLength - 2:
        atgIndex = cdsSequence.find('ATG', searchFrom)
        if atgIndex == -1:
            break
        searchFrom = atgIndex + 1

        endIndex = atgIndex + 3
        while endIndex < sequenceLength - 2:
            if sequence[endIndex:endIndex + 3] in stopCodons:
                writeKozakOutput2(
                    outputFile,
                    transcript + ".kozak_cds." + str(transcriptNumber),
                    cdssAndStopCodon, genomeRecord, atgIndex, endIndex + 3,
                    direction)
                transcriptNumber += 1
                break

            endIndex += 3

            # Extend only when the scan has run out of sequence and the
            # assembled sequence is still shorter than the sequence data.
            if endIndex < sequenceLength - 2 or sequenceLength >= len(
                    sequenceData):
                continue

            if direction == '+':
                afterCDSAndStopCodonSequence = gencode.readSequence(
                    sequenceData, lastRecord['end'] + 1,
                    min(lastRecord['end'] + genomeLength, len(sequenceData)),
                    direction)
                genomeRecord['start'] = lastRecord['end'] + 1
                genomeRecord['end'] = len(sequenceData)
            elif direction == '-':
                afterCDSAndStopCodonSequence = gencode.readSequence(
                    sequenceData, max(lastRecord['start'] - genomeLength, 1),
                    lastRecord['start'] - 1, direction)
                genomeRecord['start'] = 1
                genomeRecord['end'] = lastRecord['start'] - 1

            sequence = cdsSequence + stopCodonSequence + afterCDSAndStopCodonSequence
            sequenceLength = len(sequence)
            # Next extension (if any) reads a longer stretch.
            genomeLength += 300
Exemplo n.º 5
0
def findKozak(utrOutputFile, cdsOutputFile, transcript, sequenceData,
              fivePrimeSequence, fivePrimeUTRs, cdss, stopCodon, direction):
    """Report every ATG in the 5' UTR and in the CDS with flanking context.

    The searched sequence is: up to 9 bases read before a 5' UTR record,
    the 5' UTR, the concatenated CDS, and up to 7 bases read after a CDS
    record.  The flanks are read only when they fit inside ``sequenceData``.
    ATGs lying fully inside the 5' UTR are passed to ``writeKozakOutput``
    with ``utrOutputFile`` and label "kozak_utr"; ATGs lying fully inside
    the CDS go to ``cdsOutputFile`` with label "kozak_cds".  Each region
    numbers its hits from 1.

    Args:
        utrOutputFile: Destination for 5' UTR hits.
        cdsOutputFile: Destination for CDS hits.
        transcript: Transcript name forwarded to ``writeKozakOutput``.
        sequenceData: Sequence passed through to ``gencode.readSequence``.
        fivePrimeSequence: Sequence of the 5' UTR.
        fivePrimeUTRs: 5' UTR records with a 'start' key.
        cdss: CDS records with 'start' and 'end' keys.
        stopCodon: Unused here.
        direction: "+" selects the first UTR / last CDS record for the
            flanks; any other value selects the last UTR / first CDS record.
    """
    cdsSequence = "".join([
        gencode.readSequence(sequenceData, block['start'], block['end'],
                             direction) for block in cdss
    ])

    # Which records anchor the flanking reads depends on the strand.
    utrIndex, cdsIndex = (0, -1) if direction == "+" else (-1, 0)

    beforeFivePrimeSequence = ""
    utrStart = fivePrimeUTRs[utrIndex]['start']
    if utrStart - 9 > 0:
        beforeFivePrimeSequence = gencode.readSequence(
            sequenceData, utrStart - 9, utrStart - 1, direction)

    afterCDSSequence = ""
    cdsEndPosition = cdss[cdsIndex]['end']
    if cdsEndPosition + 7 <= len(sequenceData):
        afterCDSSequence = gencode.readSequence(
            sequenceData, cdsEndPosition + 1, cdsEndPosition + 7, direction)

    sequence = beforeFivePrimeSequence + fivePrimeSequence + cdsSequence + afterCDSSequence

    # Region boundaries as indices into the assembled sequence.
    utrBegin = len(beforeFivePrimeSequence)
    cdsBegin = utrBegin + len(fivePrimeSequence)
    cdsEnd = cdsBegin + len(cdsSequence)

    regions = (
        (utrOutputFile, fivePrimeUTRs[0], utrBegin, cdsBegin,
         len(fivePrimeSequence), "kozak_utr"),
        (cdsOutputFile, cdss[0], cdsBegin, cdsEnd, len(cdsSequence),
         "kozak_cds"),
    )

    for outputFile, anchor, regionBegin, regionEnd, regionLength, label in regions:
        transcriptVersion = 1
        searchFrom = regionBegin

        while searchFrom < regionEnd - 2:
            atgIndex = sequence.find('ATG', searchFrom)
            if atgIndex == -1:
                break

            # The find may land past this region; only report hits inside it.
            if atgIndex < regionEnd - 2:
                writeKozakOutput(outputFile, anchor, atgIndex,
                                 atgIndex - regionBegin + 1, regionLength,
                                 sequence, label, transcript,
                                 transcriptVersion, direction)
                transcriptVersion += 1

            searchFrom = atgIndex + 1
Exemplo n.º 6
0
def findUORF(outputFile, outputSequenceFile, sequenceData, divisibleByThree,
             transcript, fivePrimeSequence, fivePrimeUTRs, cdss, stopCodon,
             direction):
    """Report upstream ORFs that start at ``StartCodon`` inside the 5' UTR.

    For every occurrence of the module-level ``StartCodon`` in the 5' UTR,
    the transcript (5' UTR + CDS + optional stop codon) is scanned one base
    at a time for TAG, TAA or TGA.  When ``divisibleByThree`` is truthy only
    stop codons in frame with the start are accepted.  The first accepted
    stop ends the uORF: a header line and the uORF sequence are written to
    ``outputSequenceFile`` and the span is passed to ``writeUORFOutput``.

    The header's last field tells where the uORF ends: "UTRonly" (inside
    the 5' UTR), "CDSpartial" (inside the CDS) or "CDSfull" (beyond it).

    Args:
        outputFile: Destination forwarded to ``writeUORFOutput``.
        outputSequenceFile: Receives the header and sequence lines.
        sequenceData: Sequence passed through to ``gencode.readSequence``.
        divisibleByThree: Require in-frame stop codons when truthy.
        transcript: Transcript name used as the uORF name prefix.
        fivePrimeSequence: Sequence of the 5' UTR.
        fivePrimeUTRs: 5' UTR records; the first supplies header fields.
        cdss: CDS records with 'start' and 'end' keys.
        stopCodon: Optional stop codon record; appended after the CDSs.
        direction: Strand indicator forwarded to the helpers.
    """
    cdssAndStopCodon = cdss[:]
    cdsSequence = "".join([
        gencode.readSequence(sequenceData, block['start'], block['end'],
                             direction) for block in cdss
    ])

    stopCodonSequence = ""
    if stopCodon:
        # The stop codon is always the last record.
        cdssAndStopCodon.append(stopCodon)
        stopCodonSequence = gencode.readSequence(
            sequenceData, stopCodon['start'], stopCodon['end'], direction)

    sequence = fivePrimeSequence + cdsSequence + stopCodonSequence
    sequenceLength = len(sequence)
    utrLength = len(fivePrimeSequence)
    cdsEnd = utrLength + len(cdsSequence)
    stopCodons = ('TAG', 'TAA', 'TGA')

    transcriptNumber = 1
    searchFrom = 0

    while searchFrom < utrLength - 2:
        atgIndex = fivePrimeSequence.find(StartCodon, searchFrom)
        if atgIndex == -1:
            break
        searchFrom = atgIndex + 1

        # Skip starts that do not leave room for three bases in the 5' UTR.
        if atgIndex >= utrLength - 2:
            continue

        endIndex = atgIndex + 3
        while endIndex < sequenceLength - 2:
            stopEnd = endIndex + 3
            isStop = sequence[endIndex:stopEnd] in stopCodons
            if isStop and (not divisibleByThree
                           or (endIndex - atgIndex) % 3 == 0):
                if stopEnd <= utrLength:
                    howFarReached = "UTRonly"
                elif stopEnd <= cdsEnd:
                    howFarReached = "CDSpartial"
                else:
                    howFarReached = "CDSfull"

                record = fivePrimeUTRs[0]
                uorfName = (transcript + ".uORF_" + StartCodon + "." +
                            str(transcriptNumber))
                outputSequenceFile.write("|".join([
                    record["chromosome"],
                    record["geneID"],
                    uorfName,
                    direction,
                    record["geneName"],
                    howFarReached,
                ]) + "\n")
                outputSequenceFile.write(sequence[atgIndex:stopEnd] + "\n")
                writeUORFOutput(outputFile, uorfName, fivePrimeUTRs,
                                cdssAndStopCodon, atgIndex, stopEnd,
                                direction)
                transcriptNumber += 1
                break

            endIndex += 1