Python fastaWrite 예제들, sonLib.bioio.fastaWrite Python 예제들

예제 #1

0

파일 보기

파일: cactus_makeAlphaNumericHeaders.py 프로젝트: zqingyuan/cactus

def main():
    """Copy a fasta file, passing every header through ``fixHeader``.

    Expects exactly two positional command-line arguments: the input fasta
    path and the output fasta path. Each record read with ``fastaRead`` is
    written with ``fastaWrite`` after its header has been rewritten by
    ``fixHeader`` (defined elsewhere in this file).

    Returns:
        0 on success; 1 (after printing the help text) when the number of
        positional arguments is not exactly two.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to annotate\n"
    description = "Ensure sequence names contain only alphanumeric characters\n"
    parser = OptionParser(usage=usage, description=description)

    _, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName, outputName = args

    # The context managers close both handles even when reading, header
    # fixing or writing raises part-way through the file.
    with open(inputName, "r") as inputFile, \
            open(outputName, "w") as outputFile:
        for header, seq in fastaRead(inputFile):
            fastaWrite(outputFile, fixHeader(header), seq)

    return 0

예제 #2

0

파일 보기

    def testFastaReadWriteC(self):
        """Tests consistency with C version of this function.

        For each of ``self.testNo`` rounds: writes up to nine random
        (name, sequence) records to a temp file, runs the external
        ``sonLib_fastaCTest`` command with that file and a second temp file
        as arguments, then reads the second file back and asserts that each
        record equals the corresponding record that was written.
        """
        tempFile = getTempFile()
        self.tempFiles.append(tempFile)
        tempFile2 = getTempFile()
        self.tempFiles.append(tempFile2)
        for test in range(0, self.testNo):
            fastaNumber = random.choice(range(10))
            expected = [getRandomSequence() for i in range(fastaNumber)]
            # 'with' guarantees the handle is flushed and closed before the
            # external command reads the file, even if a write fails.
            with open(tempFile, 'w') as fileHandle:
                for name, seq in expected:
                    fastaWrite(fileHandle, name, seq)

            command = "sonLib_fastaCTest %s %s" % (tempFile, tempFile2)

            print(command)

            system(command)

            # Reverse so that pop() yields the records in the order written.
            expected.reverse()
            # Both handles are released even when the assertion below fails.
            # NOTE(review): nothing checks that 'expected' is empty afterwards,
            # so a truncated output file would still pass - confirm intent.
            with open(tempFile2, 'r') as fileHandle, io.StringIO() as outFh:
                for record in fastaRead(fileHandle):
                    name, seq = record
                    assert record == expected.pop()
                    fastaWrite(outFh, name, seq)

예제 #3

0

파일 보기

파일: blastTest.py 프로젝트: zqingyuan/cactus

 def testBlastRandom(self):
     """Make some sequences, put them in a file, call blast with random parameters
     and check it runs okay.

     Each round writes between 0 and 9 mutated copies of one random base
     sequence (each reverse-complemented with probability 0.5) to a fasta
     file and runs ``runCactusBlast`` on it with a random chunk size and
     overlap size.
     """
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     self.tempFiles.append(tempSeqFile)
     for test in xrange(self.testNo):
         seqNo = random.choice(xrange(0, 10))
         baseSeq = getRandomSequence(8000)[1]
         # Build every mutated copy first (same order of random draws as
         # before); distinct names avoid rebinding the base sequence.
         mutatedSeqs = [mutateSequence(baseSeq, 0.3 * random.random())
                        for i in xrange(seqNo)]
         # The context manager closes the file even if a helper raises.
         with open(tempSeqFile, 'w') as fileHandle:
             for i, mutatedSeq in enumerate(mutatedSeqs):
                 if random.random() > 0.5:
                     mutatedSeq = reverseComplement(mutatedSeq)
                 fastaWrite(fileHandle, str(i), mutatedSeq)
         chunkSize = random.choice(xrange(500, 9000))
         overlapSize = random.choice(xrange(2, 100))
         toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
         runCactusBlast([tempSeqFile], self.tempOutputFile, toilDir,
                        chunkSize, overlapSize)
         #runToilStatusAndFailIfNotComplete(toilDir)
         if getLogLevelString() == "DEBUG":
             system("cat %s" % self.tempOutputFile)
         system("rm -rf %s " % toilDir)

예제 #4

0

파일 보기

파일: cactus_makeAlphaNumericHeaders.py 프로젝트: benedictpaten/cactus

def main():
    """Copy a fasta file, passing every header through ``fixHeader``.

    Expects exactly two positional command-line arguments: the input fasta
    path and the output fasta path. Each record read with ``fastaRead`` is
    written with ``fastaWrite`` after its header has been rewritten by
    ``fixHeader`` (defined elsewhere in this file).

    Returns:
        0 on success; 1 (after printing the help text) when the number of
        positional arguments is not exactly two.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to annotate\n"
    description = "Ensure sequence names contain only alphanumeric characters\n"
    parser = OptionParser(usage=usage, description=description)

    _, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName, outputName = args

    # The context managers close both handles even when reading, header
    # fixing or writing raises part-way through the file.
    with open(inputName, "r") as inputFile, \
            open(outputName, "w") as outputFile:
        for header, seq in fastaRead(inputFile):
            fastaWrite(outputFile, fixHeader(header), seq)

    return 0

예제 #5

0

파일 보기

파일: test.py 프로젝트: zoumingr/cactus

def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Return the Blanchette inputs for a region with awkward fasta headers.

    Every sequence file returned by ``getCactusInputs_blanchette`` is
    re-written into ``tempDir`` (one file per input, named by its index),
    with each record's header replaced by the next name from a fixed cycle
    of headers that might get parsed wrong. The returned sequence list
    points at the re-written files.
    """
    sequences, newickTreeString = getCactusInputs_blanchette(
        regionNumber=regionNumber)

    if tempDir is None:
        tempDir = getTempDir()

    # "bar foo" / "ba rfoo" style names are not included: currently
    # everything but the first token is thrown away (probably because of
    # cigar parsing).
    awkwardHeaders = [
        'id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar'
    ]
    headerCount = len(awkwardHeaders)
    recordsWritten = 0
    for position, originalPath in enumerate(sequences):
        rewrittenPath = os.path.join(tempDir, str(position))
        for _ignoredHeader, body in fastaRead(originalPath):
            replacement = awkwardHeaders[recordsWritten % headerCount]
            recordsWritten += 1
            # Mode 'a' so that every record of this input is kept in the
            # same output file.
            fastaWrite(rewrittenPath, replacement, body, 'a')
        sequences[position] = rewrittenPath

    return sequences, newickTreeString

예제 #6

0

파일 보기

파일: last.py 프로젝트: tmfarrell/ont_dap

    def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
        """Align the reads to the reference with the LAST command-line tools.

        Writes one ``@SQ`` header line per reference sequence to
        ``self.outputSamFile``, converts ``self.readFastqFile`` to fasta in
        the local temp dir, then shells out to ``lastdb``, ``lastal`` and
        ``maf-convert.py`` and appends the converted SAM records to
        ``self.outputSamFile``.

        Args:
            params: option string inserted verbatim into the ``lastal``
                command line.
        """
        # Work on a copy of the reference inside the local temp dir.
        localReferenceFastaFile = os.path.join(self.getLocalTempDir(), "ref.fa")

        indexFile = os.path.join(self.getLocalTempDir(), "my-index") #Index file

        mafFile = os.path.join(self.getLocalTempDir(), "out.maf") #MAF file

        #Hack to make last work, creating SQ line
        # Both handles are closed by the context manager, including the
        # reference handle that used to be left open.
        with open(self.outputSamFile, 'w') as samHandle, \
                open(self.referenceFastaFile, 'r') as referenceHandle:
            for name, seq in fastaRead(referenceHandle):
                samHandle.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))

        #Make fasta file, as last fastq seems broken
        localReadFile = os.path.join(self.getLocalTempDir(), "reads.fa") #Fasta copy of the reads
        with open(localReadFile, 'w') as readHandle:
            for name, seq, quals in fastqRead(self.readFastqFile):
                fastaWrite(readHandle, name, seq)

        system("cp %s %s" % (self.referenceFastaFile, localReferenceFastaFile)) #Copy across the ref file
        system("lastdb %s %s" % (indexFile, localReferenceFastaFile)) #Build the index
        system("lastal %s %s %s > %s" % (params, indexFile, localReadFile, mafFile)) #Build the alignment
        system("maf-convert.py sam %s >> %s" % (mafFile, self.outputSamFile)) #Now convert sam file

예제 #7

0

파일 보기

파일: mutateReference.py 프로젝트: isovic/marginAlign

def main():
    """Mutate a fasta file and record the mutations that were introduced.

    Positional arguments: input fasta file, output fasta file, output
    mutations file. The sequences returned by ``mutateSequences`` are
    written as fasta; each mutation is written as one tab-separated line.

    Raises:
        SystemExit: with status 0, after printing help, when the program is
            run without any command-line arguments.
        RuntimeError: when the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputFastaFile outputFastaFile outputMutationsFile [options]",
                          version="%prog 0.1")

    parser.add_option("--snpRate", dest="snpRate",
                      help="The probability of introducing a random different base at each position",
                      default=0.2, type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(getFastaDictionary(args[0]), options.snpRate)

    #Write out the mutated sequences into the given file
    # (the context manager closes the file even if a write fails)
    with open(args[1], 'w') as fastaHandle:
        for name in mutatedSequences:
            fastaWrite(fastaHandle, name, mutatedSequences[name])

    #Write out mutations
    with open(args[2], 'w') as mutationsHandle:
        for mutation in allMutations:
            mutationsHandle.write("\t".join(map(str, mutation)) + "\n")

예제 #8

0

파일 보기

    def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
        """Align the reads to the reference with the LAST command-line tools.

        Writes one ``@SQ`` header line per reference sequence to
        ``self.outputSamFile``, converts ``self.readFastqFile`` to fasta in
        the local temp dir, then shells out to ``lastdb``, ``lastal`` and
        ``maf-convert.py`` and appends the converted SAM records to
        ``self.outputSamFile``.

        Args:
            params: option string inserted verbatim into the ``lastal``
                command line.
        """
        # Work on a copy of the reference inside the local temp dir.
        localReferenceFastaFile = os.path.join(
            self.getLocalTempDir(), "ref.fa"
        )
        indexFile = os.path.join(self.getLocalTempDir(),
                                 "my-index")  #Index file
        mafFile = os.path.join(self.getLocalTempDir(), "out.maf")  #MAF file
        #Hack to make last work, creating SQ line
        # Both handles are closed by the context manager, including the
        # reference handle that used to be left open.
        with open(self.outputSamFile, 'w') as samHandle, \
                open(self.referenceFastaFile, 'r') as referenceHandle:
            for name, seq in fastaRead(referenceHandle):
                samHandle.write("@SQ\tSN:%s\tLN:%s\n" %
                                (name.split()[0], len(seq)))

        #Make fasta file, as last fastq seems broken
        localReadFile = os.path.join(self.getLocalTempDir(),
                                     "reads.fa")  #Fasta copy of the reads
        with open(localReadFile, 'w') as readHandle:
            for name, seq, quals in fastqRead(self.readFastqFile):
                fastaWrite(readHandle, name, seq)

        system("cp %s %s" %
               (self.referenceFastaFile,
                localReferenceFastaFile))  #Copy across the ref file
        system("lastdb %s %s" %
               (indexFile, localReferenceFastaFile))  #Build the index
        system(
            "lastal %s %s %s > %s" %
            (params, indexFile, localReadFile, mafFile))  #Build the alignment
        system("maf-convert.py sam %s >> %s" %
               (mafFile, self.outputSamFile))  #Now convert sam file

예제 #9

0

파일 보기

 def prepare_oned(self, nanopore_read, oned_read_path):
     """Write the template read of ``nanopore_read`` as a fasta record.

     Args:
         nanopore_read: object providing ``read_label``, ``template_read``,
             ``version`` and ``close()``.
         oned_read_path: path of the fasta file to create.

     Returns:
         ``(True, version, False)`` on success, or ``(False, None, False)``
         when any ``Exception`` is raised along the way.
     """
     try:
         # The context manager closes the fasta file even when fastaWrite
         # or an attribute lookup raises; previously the handle leaked
         # because the exception was swallowed below.
         with open(oned_read_path, "w") as read_file:
             fastaWrite(fileHandleOrFile=read_file,
                        name=nanopore_read.read_label,
                        seq=nanopore_read.template_read)
             version = nanopore_read.version
         nanopore_read.close()
         return True, version, False
     except Exception:
         # Any failure is reported to the caller as an unsuccessful result
         # instead of being propagated.
         # NOTE(review): nanopore_read is not closed on this path - confirm
         # whether the caller is expected to do so.
         return False, None, False

예제 #10

0

파일 보기

파일: align_cgp_cds.py 프로젝트: yuzhenpeng/comparativeAnnotator

def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Align one consensus transcript against its reference transcript.

    Writes the reference transcript named ``gp.name`` to a temporary fasta,
    runs ``blat`` between the temporary target and reference files, chains
    the PSL output with ``simpleChain`` and returns
    ``[gp.id, gp.name, best_cov, best_ident]`` mapped through ``str``.
    """
    transcripts = Fasta(ref_tx_fasta)
    genome = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, genome)

    tx_seq = str(transcripts[gp.name])
    fastaWrite(tmp_ref, gp.name, tx_seq)

    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref, tmp_psl)
    system(blat_cmd)

    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    chained = popenCatch(chain_cmd)
    # Drop the empty string that follows the final newline.
    psl_lines = chained.split("\n")[:-1]

    best_cov, best_ident = evaluate_blat_results(psl_lines)
    return map(str, [gp.id, gp.name, best_cov, best_ident])

예제 #11

0

파일 보기

 def prepare_twod(self, nanopore_read, twod_read_path):
     """Write the 2D alignment-table sequence of a read as a fasta record.

     Args:
         nanopore_read: object providing ``has2D_alignment_table``,
             ``read_label``, ``alignment_table_sequence``,
             ``complement_model_id``, ``version`` and ``close()``.
         twod_read_path: path of the fasta file to create.

     Returns:
         ``(False, None, False)`` when the read has no 2D alignment table
         (the read is closed first); otherwise
         ``(True, version, pop1_complement)`` where ``pop1_complement``
         tells whether the complement model id is
         ``"complement_median68pA_pop1.model"``.
     """
     # check for table to make 'assembled' 2D alignment table fasta with
     if nanopore_read.has2D_alignment_table is False:
         nanopore_read.close()
         return False, None, False
     # The context manager closes the fasta file even if fastaWrite raises.
     with open(twod_read_path, "w") as fasta_handle:
         fastaWrite(fileHandleOrFile=fasta_handle,
                    name=nanopore_read.read_label,
                    seq=nanopore_read.alignment_table_sequence)
     pop1_complement = (nanopore_read.complement_model_id ==
                        "complement_median68pA_pop1.model")
     version = nanopore_read.version
     nanopore_read.close()
     return True, version, pop1_complement

예제 #12

0

파일 보기

def main():
    """Mutate a fasta file and record the mutations that were introduced.

    Positional arguments: input fasta file, output fasta file, output
    mutations file. The sequences returned by ``mutateSequences`` are
    written as fasta; each mutation is written as one tab-separated line.

    Raises:
        SystemExit: with status 0, after printing help, when the program is
            run without any command-line arguments.
        RuntimeError: when the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(
        usage=
        "usage: inputFastaFile outputFastaFile outputMutationsFile [options]",
        version="%prog 0.1")

    parser.add_option(
        "--snpRate",
        dest="snpRate",
        help=
        "The probability of introducing a random different base at each position",
        default=0.2,
        type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" %
                           " ".join(args))

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(
        getFastaDictionary(args[0]), options.snpRate)

    #Write out the mutated sequences into the given file
    # (the context manager closes the file even if a write fails)
    with open(args[1], 'w') as fastaHandle:
        for name in mutatedSequences:
            fastaWrite(fastaHandle, name, mutatedSequences[name])

    #Write out mutations
    with open(args[2], 'w') as mutationsHandle:
        for mutation in allMutations:
            mutationsHandle.write("\t".join(map(str, mutation)) + "\n")

예제 #13

0

파일 보기

    def testRandom(self):
        """Makes random sequences and tests that Ortheus can align them and produce a valid output.
        """
        outputFile = getTempFile()
        self.tempFiles.append(outputFile)

        MAX_SEQS = 20

        for i in xrange(MAX_SEQS):
            self.tempFiles.append(getTempFile())

        for test in xrange(0, self.testNo):
            print "test no : %i " % test
            #seqNo
            binaryTree = randomTree()
            middleSeq = getRandomSequence(250)[1]
            seqs = []
            getTreeSeqs(binaryTree, middleSeq, seqs)

            if len(seqs) <= MAX_SEQS and len(seqs) > 2:
                seqFiles = []
                for i in xrange(0, len(seqs)):
                    seqFiles.append(self.tempFiles[1 + i])
                    fileHandle = open(seqFiles[i], 'w')
                    fastaWrite(fileHandle, "%i" % i, seqs[i])
                    fileHandle.close()
                print "Have seq files ", seqFiles

                treeString = printBinaryTree(binaryTree, True)
                print "For tree ", treeString

                #align seqs and check no failure
                command = "ortheus_core -a %s -b '%s' -d %s -e" % (
                    " ".join(seqFiles), treeString, outputFile)
                print "command to call", command
                system(command)

                #check alignment is complete
                alignment = [i[:] for i in fastaAlignmentRead(outputFile)]
                #print "alignment", alignment
                checkAlignment(alignment, seqs)

                print "test no is finished : %i " % test

예제 #14

0

파일 보기

파일: cactus_filterSmallFastaSequences.py 프로젝트: zoumingr/cactus

def main():
    """Copy a fasta file, dropping the records that ``tooShort`` rejects.

    Expects exactly two positional command-line arguments: the input fasta
    path and the output fasta path. ``--prefix`` and ``--length`` are
    parsed and handed to ``tooShort`` together with the table built by
    ``containedSequences`` (both defined elsewhere in this file).

    Returns:
        0 on success; 1 (after printing the help text) when the number of
        positional arguments is not exactly two.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to filter\n"
    description = "Ensure sequences have length >= length\n"

    parser = OptionParser(usage=usage, description=description)

    parser.add_option("--prefix",
                      dest="prefix",
                      type="string",
                      help="only filter sequences with prefix in name",
                      default="")
    parser.add_option("--length",
                      dest="length",
                      type="int",
                      help="filter shorter than length [default=1000]",
                      default=1000)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName, outputName = args

    # The context managers close both handles even when a helper raises.
    with open(inputName, "r") as inputFile, \
            open(outputName, "w") as outputFile:
        contTable = containedSequences(inputFile)
        # containedSequences has consumed the handle; rewind for the
        # filtering pass.
        inputFile.seek(0)

        for header, seq in fastaRead(inputFile):
            # '== False' is kept on purpose: tooShort's return type is not
            # visible here, and 'not ...' would also accept None.
            if tooShort(header, seq, options, contTable) == False:
                fastaWrite(outputFile, header, seq)

    return 0

예제 #15

0

파일 보기

 def testFastaReadWrite(self):
     """Round-trip random records through fastaWrite and fastaRead.

     For each of ``self.testNo`` rounds, writes up to nine random
     (name, sequence) records to a temp file, reads them back and asserts
     that each record equals the corresponding record that was written.
     """
     tempFile = getTempFile()
     self.tempFiles.append(tempFile)
     for test in range(0, self.testNo):
         fastaNumber = random.choice(range(10))
         expected = [getRandomSequence() for i in range(fastaNumber)]
         # Closed (and flushed) by the context manager before reading back.
         with open(tempFile, 'w') as fileHandle:
             for name, seq in expected:
                 fastaWrite(fileHandle, name, seq)
         # Reverse so that pop() yields the records in the order written.
         expected.reverse()
         # Both handles are released even when the assertion below fails.
         # NOTE(review): nothing checks that 'expected' is empty afterwards,
         # so a truncated file would still pass - confirm intent.
         with open(tempFile, 'r') as fileHandle, io.StringIO() as outFh:
             for record in fastaRead(fileHandle):
                 assert record == expected.pop()
                 name, seq = record
                 fastaWrite(outFh, name, seq)

예제 #16

0

파일 보기

파일: OrtheusTests.py 프로젝트: benedictpaten/ortheus

    def testRandom(self):
        """Makes random sequences and tests that Ortheus can align them and produce a valid output.
        """
        outputFile = getTempFile()
        self.tempFiles.append(outputFile)

        MAX_SEQS = 20

        for i in xrange(MAX_SEQS):
            self.tempFiles.append(getTempFile())

        for test in xrange(0, self.testNo):
            print "test no : %i " % test
            # seqNo
            binaryTree = randomTree()
            middleSeq = getRandomSequence(250)[1]
            seqs = []
            getTreeSeqs(binaryTree, middleSeq, seqs)

            if len(seqs) <= MAX_SEQS and len(seqs) > 2:
                seqFiles = []
                for i in xrange(0, len(seqs)):
                    seqFiles.append(self.tempFiles[1 + i])
                    fileHandle = open(seqFiles[i], "w")
                    fastaWrite(fileHandle, "%i" % i, seqs[i])
                    fileHandle.close()
                print "Have seq files ", seqFiles

                treeString = printBinaryTree(binaryTree, True)
                print "For tree ", treeString

                # align seqs and check no failure
                command = "ortheus_core -a %s -b '%s' -d %s -e" % (" ".join(seqFiles), treeString, outputFile)
                print "command to call", command
                system(command)

                # check alignment is complete
                alignment = [i[:] for i in fastaAlignmentRead(outputFile)]
                # print "alignment", alignment
                checkAlignment(alignment, seqs)

                print "test no is finished : %i " % test

예제 #17

0

파일 보기

파일: cactus_filterSmallFastaSequences.py 프로젝트: benedictpaten/cactus

def main():
    """Copy a fasta file, dropping the records that ``tooShort`` rejects.

    Expects exactly two positional command-line arguments: the input fasta
    path and the output fasta path. ``--prefix`` and ``--length`` are
    parsed and handed to ``tooShort`` together with the table built by
    ``containedSequences`` (both defined elsewhere in this file).

    Returns:
        0 on success; 1 (after printing the help text) when the number of
        positional arguments is not exactly two.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            "    <fasta file>:  fasta sequence to filter\n"
    description = "Ensure sequences have length >= length\n"

    parser = OptionParser(usage=usage, description=description)

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="only filter sequences with prefix in name",
                      default="")
    parser.add_option("--length", dest="length", type="int",
                      help="filter shorter than length [default=1000]",
                      default=1000)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName, outputName = args

    # The context managers close both handles even when a helper raises.
    with open(inputName, "r") as inputFile, \
            open(outputName, "w") as outputFile:
        contTable = containedSequences(inputFile)
        # containedSequences has consumed the handle; rewind for the
        # filtering pass.
        inputFile.seek(0)

        for header, seq in fastaRead(inputFile):
            # '== False' is kept on purpose: tooShort's return type is not
            # visible here, and 'not ...' would also accept None.
            if tooShort(header, seq, options, contTable) == False:
                fastaWrite(outputFile, header, seq)

    return 0

예제 #18

0

파일 보기

파일: test.py 프로젝트: benedictpaten/cactus

def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Return the Blanchette inputs for a region with awkward fasta headers.

    Every sequence file returned by ``getCactusInputs_blanchette`` is
    re-written into ``tempDir`` (one file per input, named by its index),
    with each record's header replaced by the next name from a fixed cycle
    of headers that might get parsed wrong. The returned sequence list
    points at the re-written files.
    """
    sequences, newickTreeString = getCactusInputs_blanchette(regionNumber=regionNumber)

    if tempDir is None:
        tempDir = getTempDir()

    # "bar foo" / "ba rfoo" style names are not included: currently
    # everything but the first token is thrown away (probably because of
    # cigar parsing).
    awkwardHeaders = ['id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar']
    headerCount = len(awkwardHeaders)
    recordsWritten = 0
    for position, originalPath in enumerate(sequences):
        rewrittenPath = os.path.join(tempDir, str(position))
        for _ignoredHeader, body in fastaRead(originalPath):
            replacement = awkwardHeaders[recordsWritten % headerCount]
            recordsWritten += 1
            # Mode 'a' so that every record of this input is kept in the
            # same output file.
            fastaWrite(rewrittenPath, replacement, body, 'a')
        sequences[position] = rewrittenPath

    return sequences, newickTreeString

예제 #19

0

파일 보기

파일: blastTest.py 프로젝트: benedictpaten/cactus

 def testBlastRandom(self):
     """Make some sequences, put them in a file, call blast with random parameters
     and check it runs okay.

     Each round writes between 0 and 9 mutated copies of one random base
     sequence (each reverse-complemented with probability 0.5) to a fasta
     file and runs ``runCactusBlast`` on it with a random chunk size and
     overlap size.
     """
     tempSeqFile = os.path.join(self.tempDir, "tempSeq.fa")
     self.tempFiles.append(tempSeqFile)
     for test in xrange(self.testNo):
         seqNo = random.choice(xrange(0, 10))
         baseSeq = getRandomSequence(8000)[1]
         # Build every mutated copy first (same order of random draws as
         # before); distinct names avoid rebinding the base sequence.
         mutatedSeqs = [mutateSequence(baseSeq, 0.3*random.random())
                        for i in xrange(seqNo)]
         # The context manager closes the file even if a helper raises.
         with open(tempSeqFile, 'w') as fileHandle:
             for i, mutatedSeq in enumerate(mutatedSeqs):
                 if random.random() > 0.5:
                     mutatedSeq = reverseComplement(mutatedSeq)
                 fastaWrite(fileHandle, str(i), mutatedSeq)
         chunkSize = random.choice(xrange(500, 9000))
         overlapSize = random.choice(xrange(2, 100))
         toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
         runCactusBlast([ tempSeqFile ], self.tempOutputFile, toilDir, chunkSize, overlapSize)
         #runToilStatusAndFailIfNotComplete(toilDir)
         if getLogLevelString() == "DEBUG":
             system("cat %s" % self.tempOutputFile)
         system("rm -rf %s " % toilDir)

예제 #20

0

파일 보기

def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """Blat each target sequence in ``chunk`` against its reference sequence.

    For every id in ``chunk`` the target and reference sequences are
    written to temp fasta files, ``blat`` is run on the pair, and a row of
    ``[id, identity, coverage]`` (or ``[id, "0", "0"]`` when no PSL lines
    remain) is collected. All rows are written, tab-separated, to a
    randomly named ``.txt`` file under ``out_path``.

    ``g`` is not used by this function.
    """
    target_seqs = Fasta(target_fasta)
    reference_seqs = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        aId = remove_augustus_alignment_number(aug_aId)
        gencode_id = remove_alignment_number(aId)
        gencode_seq = str(reference_seqs[gencode_id])
        aug_seq = str(target_seqs[aug_aId])

        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)

        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(tmp_gencode, tmp_aug)
        blat_output = popenCatch(blat_cmd)
        # The last three newline-separated pieces of the output are dropped.
        psl_lines = blat_output.split("\n")[:-3]

        if psl_lines:
            p_list = [PslRow(line) for line in psl_lines]
            rows.append(map(str, [aug_aId, identity(p_list), coverage(p_list)]))
        else:
            rows.append([aug_aId, "0", "0"])

    with open(os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt"), "w") as outf:
        for row in rows:
            outf.write("\t".join(row) + "\n")

예제 #21

0

파일 보기

파일: align_augustus.py 프로젝트: yuzhenpeng/comparativeAnnotator

def align(target, target_fasta, chunk, ref_fasta, file_tree):
    """Blat all target sequences in ``chunk`` against their references.

    Writes every target sequence and its matching reference sequence to two
    temp fasta files, runs ``blat`` once, chains the PSL output with
    ``simpleChain`` and groups the resulting rows by target name. For each
    target id one comma-separated line
    ``tgt_id,query_id,best_cov,best_ident`` is written to a temp file
    obtained from ``file_tree`` (``0,0`` when the target has no alignment).

    Raises:
        AssertionError: when none of the ids in ``chunk`` appears among the
            aligned target names.
    """
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    tmp_aug = os.path.join(target.getGlobalTempDir(), "tmp_aug")
    tmp_gencode = os.path.join(target.getGlobalTempDir(), "tmp_gencode")
    tmp_psl = os.path.join(target.getGlobalTempDir(), "tmp_psl")
    # Remember the query id that belongs to each target id so the results
    # loop below can report the right one.
    query_ids = {}
    with open(tmp_aug, "w") as tmp_aug_h, open(tmp_gencode,
                                               "w") as tmp_gencode_h:
        for tgt_id in chunk:
            query_id = remove_augustus_alignment_number(tgt_id)
            query_ids[tgt_id] = query_id
            gencode_id = remove_alignment_number(query_id)
            gencode_seq = str(r_f[gencode_id])
            aug_seq = str(g_f[tgt_id])
            fastaWrite(tmp_aug_h, tgt_id, aug_seq)
            fastaWrite(tmp_gencode_h, gencode_id, gencode_seq)
    system("blat {} {} -out=psl -noHead {}".format(tmp_aug, tmp_gencode,
                                                   tmp_psl))
    r = popenCatch("simpleChain -outPsl {} /dev/stdout".format(tmp_psl))
    # Drop the empty string that follows the final newline.
    r = r.split("\n")[:-1]
    r_d = defaultdict(list)
    for p in tokenize_stream(r):
        psl = PslRow(p)
        r_d[psl.t_name].append(psl)
    assert len(r_d.viewkeys() & set(chunk)) > 0, (r_d.viewkeys(), set(chunk))
    for tgt_id in chunk:
        # Bug fix: previously query_id was whatever the writing loop left
        # behind, i.e. the query id of the LAST target in the chunk.
        query_id = query_ids[tgt_id]
        if tgt_id not in r_d:
            results.append([tgt_id, query_id, "0", "0"])
        else:
            p_list = [[min(x.coverage, x.target_coverage), x.identity]
                      for x in r_d[tgt_id]]
            # Highest-coverage alignment; ties resolve to the last one in
            # stable sort order.
            best_cov, best_ident = sorted(p_list, key=lambda x: x[0])[-1]
            results.append(map(str, [tgt_id, query_id, best_cov, best_ident]))
    with open(file_tree.getTempFile(), "w") as outf:
        for x in results:
            outf.write("".join([",".join(x), "\n"]))

예제 #22

0

파일 보기

파일: align_cgp_cds.py 프로젝트: yuzhenpeng/comparativeAnnotator

def align_cgp(tmp_dir, gp, target_genome_fasta, tx_dict, ref_tx_fasta):
    """
    Main CGP alignment function.

    For the CGP transcript ``gp``, BLATs against every transcript listed in
    ``tx_dict`` (gene name -> transcript names). Each alignment is chained
    and the values returned by ``evaluate_blat_results`` are recorded. This
    circumvents problems with multiple self alignments in the case of
    repeats.

    Returns a list with one ``[gp.name, gene_name, tx_name, best_cov,
    best_ident]`` entry (mapped through ``str``) per transcript.
    """
    rows = []
    transcripts = Fasta(ref_tx_fasta)
    genome = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, genome)
    for gene_name, tx_names in tx_dict.iteritems():
        for tx_name in tx_names:
            tx_seq = str(transcripts[tx_name])
            fastaWrite(tmp_ref, tx_name, tx_seq)

            blat_cmd = "blat {} {} -out=psl -noHead {}".format(
                tmp_tgt, tmp_ref, tmp_psl)
            system(blat_cmd)

            chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
            chained = popenCatch(chain_cmd)
            # Drop the empty string that follows the final newline.
            psl_lines = chained.split("\n")[:-1]

            best_cov, best_ident = evaluate_blat_results(psl_lines)
            row = map(str, [gp.name, gene_name, tx_name, best_cov, best_ident])
            rows.append(row)
    return rows

예제 #23

0

파일 보기

파일: mutate_reference.py 프로젝트: zax-bioinforer/nanopore

def mutateReferenceSequences(referenceFastaFiles):
    """Create mutated copies of every reference fasta at fixed SNP rates.

    For each input path that does not contain ``"percent"``, one mutated
    fasta per rate in (1%, 5%, 10%, 20%) is generated next to the original,
    together with an index file holding each original sequence followed by
    its mutated counterpart. Files that already exist are not regenerated.

    Args:
        referenceFastaFiles: list of reference fasta paths.

    Returns:
        A new list holding the input paths followed by the paths of all
        mutated fasta files (whether newly created or already present).
    """
    updatedReferenceFastaFiles = referenceFastaFiles[:]
    mutation_rates = [0.01, 0.05, 0.10, 0.20]
    for referenceFastaFile in referenceFastaFiles:
        # Paths containing "percent" are skipped, which covers the files
        # this function generates itself.
        if "percent" in referenceFastaFile:
            continue
        prefix = referenceFastaFile.split(".fa")[0]
        for mutation_rate in mutation_rates:
            # Indels are disabled (factor 0.0); the value only appears in
            # the generated file names.
            indel_rate = 0.0 * mutation_rate
            i = mutation_rate * 100
            j = indel_rate * 100
            newReferenceFastaFile = prefix + "_" + str(i) + "_percent_SNPs_" + str(j) + "_percent_InDels.fasta"
            mutationIndexFile = newReferenceFastaFile + "_Index.txt"
            updatedReferenceFastaFiles.append(newReferenceFastaFile)
            if os.path.exists(newReferenceFastaFile):
                continue
            # The context manager closes both outputs even when reading or
            # mutating a sequence raises.
            with open(newReferenceFastaFile, 'w') as fH, \
                    open(mutationIndexFile, 'w') as fH2:
                for header, seq in fastaRead(referenceFastaFile):
                    header = header.split()[0]
                    mutatedSeq = mutateSequence(seq, mutation_rate)
                    fastaWrite(fH, header, mutatedSeq)
                    fastaWrite(fH2, header, seq)
                    fastaWrite(fH2, header + "_mutated", mutatedSeq)
    return updatedReferenceFastaFiles

예제 #24

0

파일 보기

파일: cropSequence.py 프로젝트: benedictpaten/referenceScripts

import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite
# Crop the single sequence of a fasta file to the coordinate range stored in
# an XML file.
#   sys.argv[1]: XML file whose root element carries the attributes
#                minOtherReferenceCoordinate / maxOtherReferenceCoordinate
#   sys.argv[2]: input fasta file (must contain exactly one sequence)
#   sys.argv[3]: output fasta file
node = ET.parse(sys.argv[1]).getroot()
# Loop-invariant crop bounds, read once from the XML root.
j = int(node.attrib["minOtherReferenceCoordinate"])
k = int(node.attrib["maxOtherReferenceCoordinate"])
# The context managers close both files even if parsing or writing fails.
with open(sys.argv[3], 'w') as fH:
    with open(sys.argv[2], 'r') as inputHandle:
        seqs = [i for i in fastaRead(inputHandle)]
    assert (len(seqs) == 1)
    for name, sequence in seqs:
        #>hg19.chr6.171115067.28377796.5150977.1
        i = name.split(".")
        # New header: first three fields unchanged, fourth field shifted by
        # j, then the cropped length (k - j), then the original last field.
        fastaWrite(fH,
                   ".".join(i[0:3] +
                            [str(int(i[3]) + j), str(k - j)] + i[-1:]),
                   sequence[j:k])

예제 #25

0

파일 보기

 def testCactusWorkflow_Blanchette(self): 
     """Runs the workflow on blanchette's simulated (colinear) regions.

     Skipped (returns immediately) unless the SON_TRACE_DATASETS environment
     variable is set. Each round takes a random 5000-column window of the
     true alignment, writes the ungapped sequences to fasta files, runs the
     cactus workflow on them and, when the external comparison tools are
     available, compares the resulting MAF against the true alignment.
     """
     if "SON_TRACE_DATASETS" not in os.environ:
         return
     for test in xrange(self.testNo):
         # NOTE(review): nothing in this function appends to tempFiles, so
         # the removal loop at the end of the round is a no-op.
         tempFiles = []
         tempDir = getTempDirectory(os.getcwd())
         
         trueAlignment = os.path.join(TestStatus.getPathToDataSets(), "blanchettesSimulation", "00.job", "true.mfa")
         
         #Load the true alignment.
         columnAlignment = [ i for i in  fastaAlignmentRead(trueAlignment) ]
         fastaHeaders = [ i for i in fastaReadHeaders(trueAlignment) ]
         # Number of sequences taken from each alignment column; the tree
         # below names nine species.
         sequenceNumber = 9
         
         #The tree
         newickTreeString = "((((HUMAN:0.006969, CHIMP:0.009727):0.025291, BABOON:0.044568):0.11,(RAT:0.072818, MOUSE:0.081244):0.260342):0.023260,((DOG:0.07, CAT:0.07):0.087381,(PIG:0.06, COW:0.06):0.104728):0.04);"
         
         #Get random dir
         testDir = getTempDirectory(tempDir)
         
         #random alignment
         alignmentLength = 5000
         randomStart = random.choice(xrange(len(columnAlignment)-alignmentLength))
         subAlignment = columnAlignment[randomStart:randomStart+alignmentLength]
         logger.info("Got a sub alignment, it is %i columns long" % len(subAlignment))
         
         #Get sequences
         # Each sequence is its row of the window with the gap characters
         # ('-') removed.
         sequences = [ (fastaHeaders[seqNo], "".join([ column[seqNo] for column in subAlignment if column[seqNo] != '-' ])) for seqNo in xrange(sequenceNumber) ]
         logger.info("Got the sequences")
         
         #Write sequences into temp files
         tempFastaFiles = []
         for seqNo in xrange(sequenceNumber):
             header, sequence = sequences[seqNo]
             logger.info("Making temp file for header: %s, seq: %s" % (header, sequence))
             tempFastaFile = os.path.join(testDir, "%i.fa" % seqNo)
             tempFastaFiles.append(tempFastaFile)
             fileHandle = open(tempFastaFile, "w")
             fastaWrite(fileHandle, header, sequence)
             fileHandle.close()
         logger.info("Got the temp sequence files")
         
         experiment = getCactusWorkflowExperimentForTest(tempFastaFiles, newickTreeString, testDir)
         experimentFile = os.path.join(testDir, "experiment.xml")
         experiment.writeXML(experimentFile)
         cactusDiskDatabaseString = experiment.getDiskDatabaseString()
         
         jobTree = os.path.join(testDir, "jobTree")
         
         runCactusWorkflow(experimentFile, jobTree)
         logger.info("Ran the the workflow")
         
         #Check the output alignment
         runJobTreeStatusAndFailIfNotComplete(jobTree)
         logger.info("Checked the job tree dir")
         
         #Output the 'TRUE' alignment file
         # The comparison only runs when all four external tools respond to
         # --help with exit status 0.
         if os.system("mfaToMaf --help > /dev/null 2>&1") == 0 and\
            os.system("cactus_MAFGenerator --help > /dev/null 2>&1") == 0 and\
            os.system("mafComparator --help > /dev/null 2>&1") == 0 and\
            os.system("cactus_treeStats --help > /dev/null 2>&1") == 0:
             trueMFAFile = os.path.join(testDir, "true.mfa")
             fastaAlignmentWrite(subAlignment, fastaHeaders, len(fastaHeaders), trueMFAFile)
             trueMAFFile = os.path.join(testDir, "true.maf")
             system("mfaToMaf --mfaFile %s --outputFile %s --logLevel %s" % (trueMFAFile, trueMAFFile, getLogLevelString()))
             system("cat %s" % trueMAFFile)
             
             #Now get mafs for the region.
             mAFFile = os.path.join(testDir, "flower.maf")
             system("cactus_MAFGenerator --flowerName 0 --cactusDisk '%s' --outputFile %s --logLevel %s" % (cactusDiskDatabaseString, mAFFile, getLogLevelString()))
             logger.info("Got the MAFs from the flower disk")
             system("cat %s" % mAFFile)
             
             statsFile = os.path.join(testDir, "stats.xml")
             system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s --logLevel %s" % (cactusDiskDatabaseString, statsFile, getLogLevelString()))
             system("cat %s" % statsFile)
             logger.info("Got the cactus tree stats")
             
             #Now compare the mafs to the output.
             resultsFile = os.path.join(testDir, "results.xml")
             system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s --logLevel %s" % (trueMAFFile, mAFFile, resultsFile, getLogLevelString()))
             logger.info("Ran the maf comparator")
             
             system("cat %s" % resultsFile)
             
             #Cleanup
             # NOTE(review): cleanupDb() is only reached when the tools above
             # are installed - confirm the database need not be cleaned up
             # otherwise.
             experiment.cleanupDb()
             system("rm -rf %s" % testDir)
             logger.info("Successfully ran test for the problem")
             
         for tempFile in tempFiles:
             os.remove(tempFile)
         # Removes the round's temp directory regardless of whether the
         # comparison branch ran.
         system("rm -rf %s" % tempDir)

예제 #26

0

파일 보기

파일: removeNs.py 프로젝트: benedictpaten/referenceScripts

            header.start += len( subSequence ) + lenNs
        
        sequence = sequence[m.start() + lenNs: ]
        m = re.search( pattern, sequence )
    
    i = fn2(header, searchedSeq + sequence)
    if i != None:
        yield i

#=========== MAIN ====================
# Command line:
#   sys.argv[1]: input fasta file
#   sys.argv[2]: output fasta file
#   sys.argv[3]: length of Ns, passed to fn for every sequence
#   sys.argv[4]: length of fragment (parsed for validation; not used in the
#                visible part of this script)
#   sys.argv[5]: optional log level
# Arguments are parsed before any file is opened so that a bad argument does
# not leave a truncated output file behind.
lengthOfNs = int(sys.argv[3])
lengthOfFragment = int(sys.argv[4])
if len(sys.argv) == 6:
    setLogLevel(sys.argv[5])

headers = set()
# The context managers close both files even if processing raises.
with open(sys.argv[1], 'r') as fH, open(sys.argv[2], 'w') as fH2:
    for name, sequence in fastaRead(fH):
        header = Header( name.split()[0], len(sequence) )
        logger.info("Got a sequence of length %i with header %s for processing" % (len(sequence), name.split()[0]))
        for newheader, subsequence in fn( header, sequence, lengthOfNs ):
            if len( subsequence ) > 0:
                logger.info("Writing out a sequence of length %i with header %s" % (len(subsequence), newheader))
                # Every header written to the output must be unique.
                assert newheader not in headers
                headers.add(newheader)
                fastaWrite(fH2, newheader, subsequence)

예제 #27

0

파일 보기

파일: cropSequence.py 프로젝트: benedictpaten/referenceScripts

import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite
# Root element of the XML file; its attributes supply the crop interval.
node = ET.parse(sys.argv[1]).getroot()
# Read the input inside a context manager so its handle is closed (the
# previous version left it open for the lifetime of the process).
with open(sys.argv[2], 'r') as inputFile:
    seqs = list(fastaRead(inputFile))
# The script only handles a FASTA file holding exactly one sequence.
assert(len(seqs) == 1)
# The interval is the same for every record, so parse it once up front.
start = int(node.attrib["minOtherReferenceCoordinate"])
end = int(node.attrib["maxOtherReferenceCoordinate"])
# Open the output only after the input has been read and validated, so a
# failure above does not leave behind a truncated output file.
with open(sys.argv[3], 'w') as fH:
    for name, sequence in seqs:
        # Example header: >hg19.chr6.171115067.28377796.5150977.1
        fields = name.split(".")
        # Keep the first three fields and the last one; the fourth field is
        # shifted by `start`, and whatever sat between the fourth and the last
        # field is replaced by the value end - start.
        croppedName = ".".join(fields[0:3] + [ str(int(fields[3]) + start), str(end - start)] + fields[-1:])
        fastaWrite(fH, croppedName, sequence[start:end])

예제 #28

0

파일 보기

파일: makeHaploid.py 프로젝트: benedictpaten/referenceScripts

from sonLib.bioio import fastaRead, fastaWrite
import sys
import random
# Output FASTA handle; the loop at the bottom of the script writes every
# record to it and closes it when done.
fH = open(sys.argv[2], "w")
def fn(k, i, j):
    """Resolve a single character against a single ambiguity code.

    k is the character being examined, i is the code to look for and j is
    the collection of candidate replacements. When k matches i (ignoring
    case) a random element of j is returned, upper-cased if k is upper-case
    and lower-cased otherwise. Any other character is returned unchanged.
    """
    if k.upper() != i.upper():
        return k
    replacement = random.choice(j)
    return replacement.upper() if k == k.upper() else replacement.lower()
# IUPAC ambiguity codes paired with the bases each one may be resolved to.
# Built once here rather than once per sequence.
ambiguityCodes = [ ("W", ("A", "T")),
                   ("S", ("C", "G")),
                   ("M", ("A", "C")),
                   ("K", ("G", "T")),
                   ("R", ("A", "G")),
                   ("Y", ("C", "T")),
                   ("B", ("C", "G", "T")),
                   ("D", ("A", "G", "T")),
                   ("H", ("A", "C", "T")),
                   ("V", ("A", "C", "G")) ]
try:
    # The context manager closes the input handle, which was previously
    # left open for the lifetime of the process.
    with open(sys.argv[1], "r") as inputFile:
        for name, seq in fastaRead(inputFile):
            # One pass over the sequence per code: fn() swaps each occurrence
            # of the code for a randomly chosen base (keeping its case) and
            # leaves every other character untouched.
            for i, j in ambiguityCodes:
                seq = "".join([ fn(k, i, j) for k in seq ])
            fastaWrite(fH, name, seq)
finally:
    # Close the output even if reading or writing fails part-way.
    fH.close()

예제 #29

0

파일 보기

파일: test.py 프로젝트: zoumingr/cactus

def getCactusInputs_random(regionNumber=0,
                           tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Build a random set of FASTA files plus a random species tree.

    One temporary directory is obtained per leaf of a random binary tree and
    randomly generated sequences are written to FASTA files spread over those
    directories. Writing continues until at least ``sequenceNumber``
    sequences exist and every directory has received at least one file, so
    more than ``sequenceNumber`` sequences may be produced.

    Args:
        regionNumber: Not used by this function.
        tempDir: Root directory passed to getTempDirectory for the per-leaf
            directories.
        sequenceNumber: Minimum number of sequences to write; drawn from
            range(30) when None.
        avgSequenceLength: Parent sequences get a length drawn from
            range(1, 2 * avgSequenceLength); drawn from range(1, 3000) when
            None.
        treeLeafNumber: Value passed to makeRandomBinaryTree; drawn from
            range(2, 4) when None.

    Returns:
        A tuple (sequenceDirs, newickTreeString): the list of directories
        and the string produced by printBinaryTree for the random tree.
    """
    #random.choice can index a range directly, so no list copy is needed
    if sequenceNumber is None:
        sequenceNumber = random.choice(range(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(range(1, 3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(range(2, 4))

    #Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)
    newickTreeLeafNames = []

    def collectLeafNames(tree):
        #Depth-first walk, left subtree before right, recording leaf ids
        if tree.internal:
            collectLeafNames(tree.left)
            collectLeafNames(tree.right)
        else:
            newickTreeLeafNames.append(tree.iD)

    collectLeafNames(binaryTree)
    logger.info("Made random binary tree: %s" % newickTreeString)

    #One temporary directory per leaf of the tree
    sequenceDirs = [
        getTempDirectory(rootDir=tempDir) for _ in newickTreeLeafNames
    ]

    logger.info("Made a set of random directories: %s" %
                " ".join(sequenceDirs))

    def newParentSequence():
        #Element [1] of the getRandomSequence result is used as the sequence
        return getRandomSequence(
            length=random.choice(range(1, 2 * avgSequenceLength)))[1]

    #Random sequences and species labelling
    fileHandle = None
    parentSequence = newParentSequence()
    emptySequenceDirs = set(sequenceDirs)
    sequencesWritten = 0
    try:
        while sequencesWritten < sequenceNumber or emptySequenceDirs:
            if fileHandle is None:
                #Randomly choose the files to be attached or not
                suffix = ".fa.complete" if random.random() > 0.5 else ".fa"
                sequenceDir = random.choice(sequenceDirs)
                emptySequenceDirs.discard(sequenceDir)
                fileHandle = open(
                    getTempFile(rootDir=sequenceDir, suffix=suffix), 'w')
            if random.random() > 0.8:  #Get a new root sequence
                parentSequence = newParentSequence()
            sequence = mutateSequence(parentSequence,
                                      distance=random.random() * 0.25)
            name = getRandomAlphaNumericString(15)
            if random.random() > 0.5:
                sequence = reverseComplement(sequence)
            fastaWrite(fileHandle, name, sequence)
            sequencesWritten += 1
            if random.random() > 0.5:
                #Close this file; the next iteration starts a new one
                fileHandle.close()
                fileHandle = None
    finally:
        #Do not leak the last open file, even if a helper raised
        if fileHandle is not None:
            fileHandle.close()

    #Report the number actually written, which can exceed sequenceNumber
    logger.info("Made %s sequences in %s directories" %
                (sequencesWritten, len(sequenceDirs)))

    return sequenceDirs, newickTreeString

예제 #30

0

파일 보기

파일: makeHaploid.py 프로젝트: benedictpaten/referenceScripts

from sonLib.bioio import fastaRead, fastaWrite
import sys
import random
# Output FASTA handle; the loop at the bottom of the script writes every
# record to it and closes it when done.
fH = open(sys.argv[2], "w")


def fn(k, i, j):
    """Resolve a single character against a single ambiguity code.

    k is the character being examined, i is the code to look for and j is
    the collection of candidate replacements. When k matches i (ignoring
    case) a random element of j is returned, upper-cased if k is upper-case
    and lower-cased otherwise. Any other character is returned unchanged.
    """
    if k.upper() != i.upper():
        return k
    replacement = random.choice(j)
    return replacement.upper() if k == k.upper() else replacement.lower()


# IUPAC ambiguity codes paired with the bases each one may be resolved to.
# Built once here rather than once per sequence.
ambiguityCodes = [("W", ("A", "T")), ("S", ("C", "G")), ("M", ("A", "C")),
                  ("K", ("G", "T")), ("R", ("A", "G")), ("Y", ("C", "T")),
                  ("B", ("C", "G", "T")), ("D", ("A", "G", "T")),
                  ("H", ("A", "C", "T")), ("V", ("A", "C", "G"))]
try:
    # The context manager closes the input handle, which was previously
    # left open for the lifetime of the process.
    with open(sys.argv[1], "r") as inputFile:
        for name, seq in fastaRead(inputFile):
            # One pass over the sequence per code: fn() swaps each occurrence
            # of the code for a randomly chosen base (keeping its case) and
            # leaves every other character untouched.
            for i, j in ambiguityCodes:
                seq = "".join([fn(k, i, j) for k in seq])
            fastaWrite(fH, name, seq)
finally:
    # Close the output even if reading or writing fails part-way.
    fH.close()

예제 #31

0

파일 보기

파일: pastaIdsToOriginalNames.py 프로젝트: joelarmstrong/treeBuildingEvaluation

#!/usr/bin/env python
# Usage: pastaIdsToOriginalNames.py fastaFile renameFile
import sys
from sonLib.bioio import system, fastaRead, fastaWrite

fastaFile = sys.argv[1]
renameFile = sys.argv[2]

# The rename file is consumed as three-line records: the PASTA id, the
# original name, and a third line whose content is ignored.
curRealName = None
curPastaID = None
translate = {}
with open(renameFile) as renameHandle:
    for i, line in enumerate(renameHandle):
        line = line.strip()
        if i % 3 == 0:
            curPastaID = line
        elif i % 3 == 1:
            curRealName = line
            # Store the pair as soon as both halves are known. Waiting for
            # the third line of the record would silently drop the final
            # entry of a file that lacks its trailing line.
            translate[curPastaID] = curRealName

# The context manager closes the FASTA handle once every record is written.
with open(fastaFile) as fastaHandle:
    for header, seq in fastaRead(fastaHandle):
        # hacks for if we are using the badly-named original fasta.
        header = translate[header].replace("...", ".-.").replace(".", "_").replace("__", "_")
        fastaWrite(sys.stdout, header, seq)

예제 #32

0

파일 보기

파일: test.py 프로젝트: benedictpaten/cactus

def getCactusInputs_random(regionNumber=0, tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Build a random set of FASTA files plus a random species tree.

    One temporary directory is obtained per leaf of a random binary tree and
    randomly generated sequences are written to FASTA files spread over those
    directories. Writing continues until at least ``sequenceNumber``
    sequences exist and every directory has received at least one file, so
    more than ``sequenceNumber`` sequences may be produced.

    Args:
        regionNumber: Not used by this function.
        tempDir: Root directory passed to getTempDirectory for the per-leaf
            directories.
        sequenceNumber: Minimum number of sequences to write; drawn from
            xrange(30) when None.
        avgSequenceLength: Parent sequences get a length drawn from
            xrange(1, 2*avgSequenceLength); drawn from xrange(1, 3000) when
            None.
        treeLeafNumber: Value passed to makeRandomBinaryTree; drawn from
            xrange(2, 4) when None.

    Returns:
        A tuple (sequenceDirs, newickTreeString): the list of directories
        and the string produced by printBinaryTree for the random tree.
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(xrange(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(xrange(1,3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(xrange(2, 4))
    #Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)
    newickTreeLeafNames = []
    def collectLeafNames(tree):
        #Depth-first walk, left subtree before right, recording leaf ids
        if tree.internal:
            collectLeafNames(tree.left)
            collectLeafNames(tree.right)
        else:
            newickTreeLeafNames.append(tree.iD)
    collectLeafNames(binaryTree)
    logger.info("Made random binary tree: %s" % newickTreeString)

    #One temporary directory per leaf of the tree
    sequenceDirs = [getTempDirectory(rootDir=tempDir) for _ in newickTreeLeafNames]

    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    def newParentSequence():
        #Element [1] of the getRandomSequence result is used as the sequence
        return getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]

    #Random sequences and species labelling
    fileHandle = None
    parentSequence = newParentSequence()
    emptySequenceDirs = set(sequenceDirs)
    sequencesWritten = 0
    try:
        while sequencesWritten < sequenceNumber or emptySequenceDirs:
            if fileHandle is None:
                #Randomly choose the files to be attached or not
                suffix = ".fa.complete" if random.random() > 0.5 else ".fa"
                sequenceDir = random.choice(sequenceDirs)
                emptySequenceDirs.discard(sequenceDir)
                fileHandle = open(getTempFile(rootDir=sequenceDir, suffix=suffix), 'w')
            if random.random() > 0.8: #Get a new root sequence
                parentSequence = newParentSequence()
            sequence = mutateSequence(parentSequence, distance=random.random()*0.5)
            name = getRandomAlphaNumericString(15)
            if random.random() > 0.5:
                sequence = reverseComplement(sequence)
            fastaWrite(fileHandle, name, sequence)
            sequencesWritten += 1
            if random.random() > 0.5:
                #Close this file; the next iteration starts a new one
                fileHandle.close()
                fileHandle = None
    finally:
        #Do not leak the last open file, even if a helper raised
        if fileHandle is not None:
            fileHandle.close()

    #Report the number actually written, which can exceed sequenceNumber
    logger.info("Made %s sequences in %s directories" % (sequencesWritten, len(sequenceDirs)))

    return sequenceDirs, newickTreeString

예제 #33

0

파일 보기

import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite
# Whitespace-separated names taken from the text of the XML root element;
# sequences carrying one of these names are left out of the output.
excludedNames = set(ET.parse(sys.argv[1]).getroot().text.split())
# Context managers close both handles; the input handle was previously
# never closed and the output was left open if an error occurred.
with open(sys.argv[2], 'r') as inputFile, open(sys.argv[3], 'w') as fH:
    for name, sequence in fastaRead(inputFile):
        if name not in excludedNames:
            fastaWrite(fH, name, sequence)