Пример #1
0
    print line

# Read the first record of the tab-separated coordinate file and drop the
# empty columns produced by consecutive tabs.
line = infile.readline().rstrip()
field = line.split("\t")
coord = [item for item in field if item != '']

# Column 0 is the locus name used as key into `sequences`; columns 1-2 are
# stored as the reference start/end and columns 3-4 as the locus start/end.
locus1 = coord[0]
refCoord_start1, refCoord_end1 = int(coord[1]), int(coord[2])
coord1_start, coord1_end = int(coord[3]), int(coord[4])

# A start greater than the end means the locus is used reverse-complemented.
startingScaffold = sequences[locus1]
if coord1_start > coord1_end:
    startingScaffold = biomodule.reverseComplement(startingScaffold)

# `step` is later used as the FASTA header of each intermediate scaffold
# written to `halfSteps` (see the continuation of this loop).
step = 1
# NOTE(review): the file name is spelled "halpfSteps.fasta" -- possibly a
# typo of "halfSteps.fasta"; confirm before renaming, other tools may read it.
halfSteps = open("halpfSteps.fasta", "w")
# Process the remaining records of the coordinate file one line at a time.
while True:
    line = infile.readline().rstrip()
    # An empty (or whitespace-only) line ends the loop, as does end of file.
    if not line:
        break
    # Split on tabs and discard the empty columns left by consecutive tabs.
    coord = []
    field = line.split("\t")
    for item in field:
        if not item == '':
            coord.append(item)
    # First column: name of the next locus.
    locus2 = coord[0]
Пример #2
0
def _parseCdhitClusters(clstrFile):
    """Parse an open CD-HIT ``.clstr`` file into ``{clusterName: [read ids]}``.

    Clusters are named ``cluster0``, ``cluster1``, ... in file order.  The
    first line is consumed as the header of ``cluster0``; every later line
    starting with ``>`` opens the next cluster, and every other line
    contributes the text between its first ``>`` and the following ``...``
    as a member id.  Parsing stops at the first blank line or at end of
    file.  An empty file yields an empty dict.
    """
    clusters = {}
    if not clstrFile.readline().rstrip():
        return clusters

    numCluster = 0
    clusters["cluster0"] = []
    for line in clstrFile:
        line = line.rstrip()
        if not line:
            break
        if line[0] == ">":
            # Header of the next cluster: it must not be counted as a member
            # of the cluster that just ended.
            numCluster += 1
            clusters["cluster" + str(numCluster)] = []
        else:
            clusters["cluster" + str(numCluster)].append(
                line.split(">")[1].split("...")[0])
    return clusters


def greedyElongation(seq):
    """Extend ``seq`` at its end using the reads stored in ``unmapped.fasta``.

    Steps:

    1. ``cd-hit-est`` clusters ``unmapped.fasta``; if at least one cluster
       has members, ``unmapped.fasta`` is replaced by the reads of the
       largest cluster.  When no cluster can be parsed the file is left
       untouched.
    2. In a loop, the reads are BLASTed against the database built from
       ``toElong.fasta``; hits that touch the last 100 positions of the
       current reference, span more than 40 positions and have more than
       95% identity are assembled with ``cap3`` together with the last 200
       bases of the reference.  The longest contig is oriented against the
       last 100 bases of the reference and merged with ``fuseSequences2``.
    3. When the merged sequence is longer than the reference it becomes the
       new reference, ``toElong.fasta`` is rewritten and the loop repeats.

    Relies on the module-level names ``installationDirectory``,
    ``sequenceToElong``, ``sequenceToReach``, ``startingSeq``, ``biomodule``,
    ``SeqIO`` and ``fuseSequences2``, and on ``toElong.fasta`` already
    existing in the working directory for the first round.

    Args:
        seq: sequence to extend; converted with ``str()``.

    Returns:
        The value returned by ``fuseSequences2`` in the first round in which
        it is not longer than the current reference.

    Side effects:
        Writes several scratch files in the working directory.  If the
        orientation BLAST reports no hit, a ``joined_W_WARNING_*`` file is
        written and the process terminates through ``exit()``.
    """
    reference = str(seq)

    # NOTE(review): ">null" (here and below) redirects into a file literally
    # named "null" in the working directory, not into /dev/null -- confirm
    # whether that is intended.
    os.system(
        installationDirectory +
        "src/conda/bin/cd-hit-est  -d 0  -i unmapped.fasta -o unmapped_cdhit.fasta  >null 2>&1"
    )

    with open("unmapped_cdhit.fasta.clstr") as cdhitfile:
        clusters = _parseCdhitClusters(cdhitfile)
    if not clusters:
        print("Do nothing")

    # Sequence of every unmapped read; the first occurrence of an id wins.
    unmapSeq = {}
    for seq_record in SeqIO.parse("unmapped.fasta", "fasta"):
        locus = str(seq_record.id)
        if locus not in unmapSeq:
            unmapSeq[locus] = str(seq_record.seq)

    # Select only highly represented unmapped reads: keep the cluster with
    # the most members (the first one met wins a tie).
    biggerCluster = ""
    seqInBiggerScaffold = 0
    for clusterName in clusters:
        if len(clusters[clusterName]) > seqInBiggerScaffold:
            seqInBiggerScaffold = len(clusters[clusterName])
            biggerCluster = clusterName

    print("Bigger cluster", biggerCluster, "Size", seqInBiggerScaffold)

    # Without a non-empty cluster there is nothing to filter on, so the
    # original read set is kept instead of failing on a missing key.
    if biggerCluster:
        with open("tempCdhitFile", "w") as outcdhitfile:
            for sequ in clusters[biggerCluster]:
                if sequ in unmapSeq:
                    outcdhitfile.write(">" + sequ + "\n" + unmapSeq[sequ] +
                                       "\n")
        os.system("mv tempCdhitFile unmapped.fasta")

    unmappedSequences = {}
    for seq_record in SeqIO.parse("unmapped.fasta", "fasta"):
        locus = str(seq_record.id)
        if locus not in unmappedSequences:
            unmappedSequences[locus] = str(seq_record.seq)

    while True:
        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in toElong.fasta -dbtype nucl  >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query unmapped.fasta -db toElong.fasta -outfmt 6 -num_threads 10 -dust no -soft_masking false -out outputBlast.txt >null 2>&1 "
        )

        # Subject coordinates above this value lie in the last 100 positions
        # of the current reference.
        tailThreshold = len(reference) - 100

        with open("toAssemble.fasta", "w") as toAssemble, \
                open("outputBlast.txt") as blastFile:
            toAssemble.write(">toElong\n" + reference[-200:] + "\n")
            for line in blastFile:
                line = line.rstrip()
                if not line:
                    break
                # Tabular BLAST columns: [0] query id, [2] percent identity,
                # [8]/[9] subject start/end.
                fields = line.split("\t")
                subjectStart = int(fields[8])
                subjectEnd = int(fields[9])
                touchesTail = (subjectStart > tailThreshold
                               or subjectEnd > tailThreshold)
                if not (touchesTail and abs(subjectEnd - subjectStart) > 40
                        and float(fields[2]) > 95):
                    continue

                # A hit on the minus strand (start > end) is stored
                # reverse-complemented so that all reads share the
                # orientation of the reference.
                readSeq = unmappedSequences[fields[0]]
                if subjectStart > subjectEnd:
                    readSeq = biomodule.reverseComplement(readSeq)
                toAssemble.write(">" + fields[0] + "\n" + readSeq + "\n")

        print("Perform phrap assembly step....")

        os.system(installationDirectory +
                  "src/conda/bin/cap3 toAssemble.fasta > cap3Assembly 2>null")

        # Longest contig produced by cap3 (the last one wins a tie).
        longestScaffold = ""
        for seq_record in SeqIO.parse("toAssemble.fasta.cap.contigs", "fasta"):
            if len(str(seq_record.seq)) >= len(longestScaffold):
                longestScaffold = str(seq_record.seq)

        # Check whether the produced contig is in the right orientation by
        # aligning the last 100 bases of the reference against it.
        with open("tempScaffold.fasta", "w") as tempScaffold:
            tempScaffold.write(">tempScaffold\n" + longestScaffold + "\n")
        with open("tempScaffold_query.fasta", "w") as tempScaffold:
            tempScaffold.write(">reference\n" + reference[-100:] + "\n")

        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in tempScaffold.fasta -dbtype nucl  >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query tempScaffold_query.fasta -db tempScaffold.fasta -outfmt 6 -num_threads 10 -dust no -soft_masking false -out tempScaffold_outputBlast.txt  >null 2>&1"
        )
        with open("tempScaffold_outputBlast.txt") as tempScaffold:
            fieldBlast = tempScaffold.readline().rstrip().split("\t")

        if len(fieldBlast) <= 2:
            # No hit: the contig does not overlap the reference, so save
            # what has been reconstructed so far and stop the program.
            print("WARNING! EXTENSION STOPPED FOR MISSING ELONGMENT!!")
            with open(
                    "joined_W_WARNING_" + sequenceToElong + "_" +
                    sequenceToReach, "w") as js:
                js.write(">joined_W_WARNING_" + sequenceToElong + "_" +
                         sequenceToReach + "\n" + startingSeq[:-1800] +
                         reference)
            exit()

        if int(fieldBlast[8]) > int(fieldBlast[9]):
            longestScaffold = biomodule.reverseComplement(longestScaffold)

        longestScaffold = fuseSequences2(reference, longestScaffold)
        print("Elonged sequence has now a size of", len(longestScaffold),
              "nucleotides")

        # Stop as soon as a round fails to lengthen the reference.
        if len(longestScaffold) <= len(reference):
            return longestScaffold

        reference = longestScaffold
        with open("toElong.fasta", "w") as toElong:
            toElong.write(">toElong\n" + reference + "\n")
Пример #3
0
    # An empty (or whitespace-only) line ends the loop, as does end of file.
    if not line:
        break
    # Split the record on tabs and discard empty columns.
    coord = []
    field = line.split("\t")
    for item in field:
        if not item == '':
            coord.append(item)
    # Column 0: locus name; columns 3 and 4: start and end of the locus.
    locus2 = coord[0]
    coord2_start = int(coord[3])
    coord2_end = int(coord[4])

    # Write the scaffold built so far plus the next locus into a two-record
    # FASTA file; the locus is reverse-complemented when start > end.
    # NOTE(review): `sequeunces` may be a typo of `sequences` -- confirm
    # against the definition of the dict earlier in the file.
    tempAssembly = open("tempAssembly.fasta", 'w')
    if coord2_start > coord2_end:
        tempAssembly.write(">PartialGenome" + "\n" + startingScaffold + "\n" +
                           ">" + locus2 + "\n" +
                           biomodule.reverseComplement(sequeunces[locus2]) +
                           "\n")
    else:
        tempAssembly.write(">PartialGenome" + "\n" + startingScaffold + "\n" +
                           ">" + locus2 + "\n" + sequeunces[locus2] + "\n")
    tempAssembly.close()

    # Assemble the two records with phrap (must be reachable through PATH).
    os.system("phrap tempAssembly.fasta")

    # Every contig read becomes the new starting scaffold and is appended to
    # the intermediate-steps file, so the last contig in the file wins.
    numSeq = 0
    for seq_record in SeqIO.parse("tempAssembly.fasta.contigs", "fasta"):
        startingScaffold = str(seq_record.seq)
        numSeq += 1
        halfSteps.write(">" + str(step) + "\n" + startingScaffold + "\n")
        # More than one contig means phrap did not merge everything.
        if numSeq > 1:
            print "More than one seq"
Пример #4
0
# Join the forward scaffold "finalScaffold_1_2000_f.txt" with the reverse
# scaffold selected by bestPos1/bestPos2 by running joinScaffolds_careful.py
# through the bundled Python interpreter.
os.system(
    installationDirectory +
    "/src/conda/bin/python joinScaffolds_careful.py join ../1_cleanReads/qualityFiltered_1.fq ../1_cleanReads/qualityFiltered_2.fq finalScaffold_1_2000_f.txt r finalScaffold_"
    + str(bestPos1) + "_" + str(bestPos2) + "_r.txt r " +
    installationDirectory + "  8")

# Sequence of the genome to complete (the last record of the file is kept).
for seq_record in SeqIO.parse(genomeToComplete, "fasta"):
    genomeToCompleteSeq = str(seq_record.seq)

# File expected from the join step.  The positions go through str(), exactly
# as in the command line above, so the path can also be built when they are
# integers.
joinedScaffoldFile = ("joined_finalScaffold_1_2000_f.txt_finalScaffold_" +
                      str(bestPos1) + "_" + str(bestPos2) + "_r.txt")

if os.path.isfile(joinedScaffoldFile):
    print("5' end successfully reconstructed!")

    # The joined scaffold is used reverse-complemented (last record kept).
    for seq_record in SeqIO.parse(joinedScaffoldFile, "fasta"):
        firstPortion = str(seq_record.seq)
        firstPortion = bm.reverseComplement(firstPortion)

    firtPortionReconstructed = fuseSequences2(firstPortion,
                                              genomeToCompleteSeq)
    # A result of 10 characters or fewer is treated as a failed fusion; the
    # genome is then written unchanged.
    if len(firtPortionReconstructed) > 10:
        print("firstPortion successuffly joined!")
        genomeToCompleteSeq = firtPortionReconstructed
        with open("newGenome1.fasta", "w") as outfile:
            outfile.write(">finalScaffold\n" + firtPortionReconstructed + "\n")

    else:
        print("firstPortion not joined")
        with open("newGenome1.fasta", "w") as outfile:
            outfile.write(">finalScaffold\n" + genomeToCompleteSeq + "\n")
Пример #5
0
        if len(longestScaffold) <= len(reference):
            return longestScaffold
        else:
            reference = longestScaffold
            toElong = open("toElong.fasta", "w")
            toElong.write(">toElong\n" + reference + "\n")
            toElong.close()


sequences = {}

# Sequence to extend: the last record of the file is kept, and it is
# reverse-complemented when the requested orientation is "r".
for seq_record in SeqIO.parse(sequenceToElong, "fasta"):
    id1 = str(seq_record.id)
    startingSeq = str(seq_record.seq)
    if sequenceToElongOrientation == "r":
        startingSeq = biomodule.reverseComplement(startingSeq)

# Sequence to reach: same treatment.
for seq_record in SeqIO.parse(sequenceToReach, "fasta"):
    id2 = str(seq_record.id)
    terminiSeq = str(seq_record.seq)
    if sequenceToReachOrientation == "r":
        terminiSeq = biomodule.reverseComplement(terminiSeq)

# First 500 bases of the target sequence.
with open("termini.fasta", "w") as termfile:
    termfile.write("".join((">termini\n", terminiSeq[:500], "\n")))

# Seed for the elongation: the stretch from 1800 to 300 bases before the end
# of the starting sequence.
with open("toElong.fasta", "w") as toElong:
    toElong.write("".join((">toElong\n", startingSeq[-1800:-300], "\n")))
Пример #6
0
            downstreamAlignment = fields
            lastNucl = int(fields[9])

    if len(downstreamAlignment) > 0:
        newSequence = s1[:int(downstreamAlignment[9]
                              )] + s2[int(downstreamAlignment[7]):]
        blastFile.close()
        return newSequence
    else:
        return ""


# Load the starting sequence; each record overwrites the previous one, so
# the last record of the file is kept.  Orientation "r" means the sequence
# is used reverse-complemented.
for seq_record in SeqIO.parse(start, "fasta"):
    startSeq = str(seq_record.seq)
    if start_o == "r":
        startSeq = biomodule.reverseComplement(startSeq)

# Load the sequence to reach, with the same conventions.
for seq_record in SeqIO.parse(end, "fasta"):
    terminiSeq = str(seq_record.seq)
    if end_o == "r":
        terminiSeq = biomodule.reverseComplement(terminiSeq)

# Seed of the elongation: the 500 bases located between 700 and 200 bases
# before the end of the starting sequence.
elongedSequence = startSeq[-700:-200]
# Output handle used inside the elongation loop below.
outputSeq = open("joinScaffold_trivialSeq.fasta", "w")
# Counter of elongation rounds, compared against `numCycles` in the loop.
numCycle = 0
while True:
    bestElongation = 0
    numCycle += 1
    if numCycle == numCycles:
        outputSeq.write(">trivialSeq\n" + startSeq[:-700] + elongedSequence +
                        "\n")
Пример #7
0
        print "Lunghezza migliore Scaffold:",lengthBestScaffold
        print "Overhang:",overhang
        # Check forward contigs: the contig must contain the last 15 bases
        # of the reference, and fusing it must lengthen the reference by
        # more than the best overhang found so far.
        # NOTE(review): fuseSequences() is called twice with the same
        # arguments here (and again in the reverse branch); the result could
        # be computed once if the helper has no side effects -- confirm.
        if reference[-15:] in sequence and (len(fuseSequences(reference,sequence))-len(reference)) > overhang:
            elongedSequence = fuseSequences(reference,sequence)
            overhang = len(elongedSequence)-len(reference)
            print "Sequence",elongedSequence
            print "Forward"
            print "Overhang",len(elongedSequence)-len(reference)
            #print "Where the sequence is located ",sequence.find(reference[-15:])
            #print "To search for ",reference[-15:]
            print seq_record
            #sys.stdin.read(1)
        
        # Check reverse contigs: same test on the reverse complement of the
        # contig; it replaces a forward result only when its overhang is
        # strictly larger.
        revSequence = biomodule.reverseComplement(sequence)
        if reference[-15:] in revSequence and (len(fuseSequences(reference,revSequence))-len(reference)) > overhang:
            elongedSequence = fuseSequences(reference,revSequence)
            overhang = len(elongedSequence)-len(reference)
            print "Sequence",elongedSequence
            print "Reverse"
            print "Overhang",len(elongedSequence)-len(reference)
            #print "Where the sequence is located ",sequence.find(reference[-15:])
            #print "To search for ",reference[-15:]
            print seq_record
            #sys.stdin.read(1)


    if overhang < 10:
        print "Poor elongment. Now exit...."
        os.system("cp toElong.fasta elonged.fasta")
Пример #8
0
def greedyElongation(seq):
    """Extend ``seq`` at its end using the reads stored in ``unmapped.fasta``.

    In every round the reads are BLASTed against the database built from
    ``toElong.fasta``.  Hits that touch the last 100 positions of the
    current reference, span more than 40 positions, have more than 97%
    identity and no gap openings are assembled with ``cap3`` together with
    the last 200 bases of the reference.  The longest contig is oriented
    against the last 100 bases of the reference and merged with
    ``fuseSequences2``.  When the merged sequence is longer than the
    reference it becomes the new reference, ``toElong.fasta`` is rewritten
    and another round starts.

    Relies on the module-level names ``installationDirectory``,
    ``sequenceToElong``, ``sequenceToReach``, ``startingSeq``, ``biomodule``,
    ``SeqIO`` and ``fuseSequences2``, and on ``toElong.fasta`` already
    existing in the working directory for the first round.

    Args:
        seq: sequence to extend; converted with ``str()``.

    Returns:
        The value returned by ``fuseSequences2`` in the first round in which
        it is not longer than the current reference.

    Side effects:
        Writes several scratch files in the working directory.  If the
        orientation BLAST reports no hit, a ``joined_W_WARNING_*`` file is
        written and the process terminates through ``exit()``.
    """
    reference = str(seq)

    # Sequence of every unmapped read; the first occurrence of an id wins.
    unmappedSequences = {}
    for seq_record in SeqIO.parse("unmapped.fasta", "fasta"):
        locus = str(seq_record.id)
        if locus not in unmappedSequences:
            unmappedSequences[locus] = str(seq_record.seq)

    while True:
        # NOTE(review): ">null" (here and below) redirects into a file
        # literally named "null" in the working directory, not into
        # /dev/null -- confirm whether that is intended.
        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in toElong.fasta -dbtype nucl >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query unmapped.fasta -db toElong.fasta -outfmt 6 -num_threads 8 -dust no -soft_masking false -out outputBlast.txt  >null 2>&1"
        )

        # Subject coordinates above this value lie in the last 100 positions
        # of the current reference.
        tailThreshold = len(reference) - 100

        with open("toAssemble.fasta", "w") as toAssemble, \
                open("outputBlast.txt") as blastFile:
            toAssemble.write(">toElong\n" + reference[-200:] + "\n")
            for line in blastFile:
                line = line.rstrip()
                if not line:
                    break
                # Tabular BLAST columns: [0] query id, [2] percent identity,
                # [5] gap openings, [8]/[9] subject start/end.
                fields = line.split("\t")
                subjectStart = int(fields[8])
                subjectEnd = int(fields[9])
                touchesTail = (subjectStart > tailThreshold
                               or subjectEnd > tailThreshold)
                if not (touchesTail and abs(subjectEnd - subjectStart) > 40
                        and float(fields[2]) > 97 and fields[5] == "0"):
                    continue

                # A hit on the minus strand (start > end) is stored
                # reverse-complemented so that all reads share the
                # orientation of the reference.
                readSeq = unmappedSequences[fields[0]]
                if subjectStart > subjectEnd:
                    readSeq = biomodule.reverseComplement(readSeq)
                toAssemble.write(">" + fields[0] + "\n" + readSeq + "\n")

        print("Perform second phrap assembly step.......")

        os.system(installationDirectory +
                  "src/conda/bin/cap3 toAssemble.fasta > cap3Assembly 2>null")

        # Longest contig produced by cap3 (the last one wins a tie).
        longestScaffold = ""
        for seq_record in SeqIO.parse("toAssemble.fasta.cap.contigs", "fasta"):
            if len(str(seq_record.seq)) >= len(longestScaffold):
                longestScaffold = str(seq_record.seq)

        # Check whether the produced contig is in the right orientation by
        # aligning the last 100 bases of the reference against it.
        with open("tempScaffold.fasta", "w") as tempScaffold:
            tempScaffold.write(">tempScaffold\n" + longestScaffold + "\n")
        with open("tempScaffold_query.fasta", "w") as tempScaffold:
            tempScaffold.write(">reference\n" + reference[-100:] + "\n")

        os.system(
            installationDirectory +
            "src/conda/bin/makeblastdb -in tempScaffold.fasta -dbtype nucl >null 2>&1"
        )
        os.system(
            installationDirectory +
            "src/conda/bin/blastn -query tempScaffold_query.fasta -db tempScaffold.fasta -outfmt 6 -num_threads 10 -dust no -soft_masking false -out tempScaffold_outputBlast.txt >null 2>&1"
        )
        with open("tempScaffold_outputBlast.txt") as tempScaffold:
            fieldBlast = tempScaffold.readline().rstrip().split("\t")

        if len(fieldBlast) <= 2:
            # No hit: the contig does not overlap the reference, so save
            # what has been reconstructed so far and stop the program.
            print("WARNING! EXTENSION STOPPED!!")
            with open(
                    "joined_W_WARNING_" + sequenceToElong + "_" +
                    sequenceToReach, "w") as js:
                js.write(">joined_W_WARNING_" + sequenceToElong + "_" +
                         sequenceToReach + "\n" + startingSeq[:-1800] +
                         reference)
            exit()

        if int(fieldBlast[8]) > int(fieldBlast[9]):
            longestScaffold = biomodule.reverseComplement(longestScaffold)

        longestScaffold = fuseSequences2(reference, longestScaffold)
        print("Elonged sequence has now a size of", len(longestScaffold),
              "nucleotides")

        # Stop as soon as a round fails to lengthen the reference.
        if len(longestScaffold) <= len(reference):
            return longestScaffold

        reference = longestScaffold
        with open("toElong.fasta", "w") as toElong:
            toElong.write(">toElong\n" + reference + "\n")
Пример #9
0
    for seq_record in SeqIO.parse(
            outputFolder + "/" + sampleName + "/hcmv_genome.fasta_con.fasta",
            "fasta"):
        consensusSequence = str(seq_record.seq)

else:
    for seq_record in SeqIO.parse(consensusFile, "fasta"):
        consensusSequence = str(seq_record.seq)

#**************************************************************************************************************

# Create the file with the repeat flanking regions from the consensus
# sequence (this is needed by pipeline step 2).  Four records are slices of
# the consensus (the first one reverse-complemented); two are fixed sequences.
# NOTE(review): the header ">TRLflankingEnding" is written twice (second and
# last record), so the FASTA file contains a duplicated id -- confirm which
# name the last record is supposed to carry before changing it.
with open("repeatsFlanking.fasta", "w") as flankingSequencesFile:
    flankingSequencesFile.writelines([
        ">TRLflankingStarting\n" +
        bm.reverseComplement(consensusSequence[1364:3000]) + "\n",
        ">TRLflankingEnding\nCCATTCCGGGCCGTGTGCTGGGTCCCCGAGGGGCGGGGGGGTGTTTTCTGCGGGGGGGTGAAATTTGGAGTTGCGTGTGTGGACGGCGACGGCGACTAGTTGCGTGTGCTGCGGTGGGTACGGCGACGGCGAATAAAAGCGACGTGCGGCGCGCACGGCGAAAAGCAGACGCGCGTCTGTGTCTGTTTGAGTCCCCAGGGGACGGCAGCG\n",
        ">IRflankingStarting\n" + consensusSequence[192000:193500] + "\n",
        ">IRflankingEnding\n" + consensusSequence[197000:197500] + "\n",
        ">TRSflankingEnding\n" + consensusSequence[232000:233500] + "\n",
        ">TRLflankingEnding\nCCCGGCCAACACACCCCGACACACCCGGCACACGCCCGCGACACACCCGGCCAACACACCCCGACACACCCGGCACACGCCCGCGACACACCCGCGGCACACCCTGACACACCCGCCACACCCGGCACACACCCACCCCGCCGCGCCCCCGACACACCCCGACCGCCGCCGGTGCGGGACAGGGCT\n",
    ])
# Hand the file over to the directory of the next pipeline step.
os.system("mv repeatsFlanking.fasta ./2_ElongationFlankingRepeats/")
#****************************************************************************************************************
Пример #10
0
                    sequence[startPosition:]), len(reference), str(
                        seq_record.id)

        # Forward contigs: the contig must contain the first 20 bases of the
        # reference and be longer than the best contig found so far.
        if reference[:20] in sequence:
            if len(sequence) > lengthBestScaffold:
                lengthBestScaffold = len(sequence)
                # Keep the contig from the position where the reference
                # prefix matches; the overhang is how much longer that
                # stretch is than the reference.
                startPosition = sequence.find(reference[:20])
                elongedSequence = sequence[startPosition:]
                bestScaffold = sequence
                overhang = len(sequence[startPosition:]) - len(reference)
                print "Overhang composition:", len(
                    sequence[startPosition:]), len(reference), str(
                        seq_record.id)

        # Reverse contigs: the contig contains the reverse complement of the
        # reference prefix, so the same search is done on the
        # reverse-complemented contig.  `bestScaffold` still stores the
        # contig in its original orientation.
        if biomodule.reverseComplement(reference[:20]) in sequence:
            if len(sequence) > lengthBestScaffold:
                lengthBestScaffold = len(sequence)
                startPosition = (biomodule.reverseComplement(sequence)).find(
                    reference[:20])
                elongedSequence = (
                    biomodule.reverseComplement(sequence))[startPosition:]
                bestScaffold = sequence
                overhang = len((biomodule.reverseComplement(sequence)
                                )[startPosition:]) - len(reference)
                print "Overhang composition:", len(
                    (biomodule.reverseComplement(sequence)
                     )[startPosition:]), len(reference), str(seq_record.id)

    if overhang < 10:
        print "Poor elongment. Now exit...."