def main():
    """Copy a FASTA file, passing every header through ``fixHeader``.

    Reads two positional command-line arguments: the input FASTA path and
    the output FASTA path.

    Returns:
        1 (after printing the help text) when the number of positional
        arguments is not two, otherwise 0.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            " <fasta file>: fasta sequence to annotate\n"
    description = "Ensure sequence names contain only alphanumeric characters\n"
    parser = OptionParser(usage=usage, description=description)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName = args[0]
    outputName = args[1]

    # Context managers close both handles even when reading, header fixing
    # or writing raises part-way through.
    with open(inputName, "r") as inputFile:
        with open(outputName, "w") as outputFile:
            for header, seq in fastaRead(inputFile):
                fastaWrite(outputFile, fixHeader(header), seq)
    return 0
def testFastaReadWriteC(self):
    """Tests consistency with C version of this function.

    For each round a random number (0-9) of random sequences is written
    with ``fastaWrite``, the external command
    ``sonLib_fastaCTest <tempFile> <tempFile2>`` is run, and the records
    read back from the second file with ``fastaRead`` are asserted to equal
    the written ones, in order.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    tempFile2 = getTempFile()
    self.tempFiles.append(tempFile2)
    for test in range(0, self.testNo):
        fastaNumber = random.choice(range(10))
        l = [getRandomSequence() for i in range(fastaNumber)]
        # "with" closes the handle even if fastaWrite raises.
        with open(tempFile, 'w') as fileHandle:
            for name, seq in l:
                fastaWrite(fileHandle, name, seq)

        command = "sonLib_fastaCTest %s %s" % (tempFile, tempFile2)
        print(command)
        system(command)

        # Reversed so that pop() hands the records back in written order.
        l.reverse()
        # Both handles are released even when the assertion below fails.
        with open(tempFile2, 'r') as fileHandle, io.StringIO() as outFh:
            for i in fastaRead(fileHandle):
                name, seq = i
                assert i == l.pop()
                fastaWrite(outFh, name, seq)
def testBlastRandom(self):
    """Write randomly mutated sequences to a file and run cactus blast on them.

    Each round picks 0-9 mutated copies of one random 8000 base sequence
    (roughly half of them reverse complemented), then calls
    ``runCactusBlast`` with a random chunk size and overlap size.
    """
    seqPath = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(seqPath)
    for _ in xrange(self.testNo):
        copies = random.choice(xrange(0, 10))
        rootSeq = getRandomSequence(8000)[1]
        out = open(seqPath, 'w')
        # All mutants are built eagerly before any of them is written, so
        # the random draws happen in the same order as before.
        mutants = [(str(idx), mutateSequence(rootSeq, 0.3 * random.random()))
                   for idx in xrange(copies)]
        for label, mutant in mutants:
            if random.random() > 0.5:
                mutant = reverseComplement(mutant)
            fastaWrite(out, label, mutant)
        out.close()

        chunkSize = random.choice(xrange(500, 9000))
        overlapSize = random.choice(xrange(2, 100))
        toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
        runCactusBlast([seqPath], self.tempOutputFile, toilDir, chunkSize, overlapSize)
        #runToilStatusAndFailIfNotComplete(toilDir)
        debugging = getLogLevelString() == "DEBUG"
        if debugging:
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s " % toilDir)
def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Return Blanchette-region inputs whose FASTA headers are awkward to parse.

    Every sequence file returned by ``getCactusInputs_blanchette`` is
    rewritten into ``tempDir`` (named by its index) with headers taken in
    rotation from a fixed list of odd names.

    Returns:
        (sequences, newickTreeString), where ``sequences`` now holds the
        rewritten paths.
    """
    sequences, newickTreeString = getCactusInputs_blanchette(
        regionNumber=regionNumber)

    if tempDir is None:
        tempDir = getTempDir()

    # Should also consider "bar foo", "ba rfoo", but we currently
    # throw away everything but the first token (probably because of
    # cigar parsing).
    weirdNames = ('id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar')
    written = 0  # Running count across all files, drives the rotation.
    for i, oldPath in enumerate(sequences):
        renamedPath = os.path.join(tempDir, str(i))
        for _oldHeader, body in fastaRead(oldPath):
            chosen = weirdNames[written % len(weirdNames)]
            written += 1
            fastaWrite(renamedPath, chosen, body, 'a')
        sequences[i] = renamedPath
    return sequences, newickTreeString
def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
    """Align ``self.readFastqFile`` to ``self.referenceFastaFile`` and write SAM.

    Writes one ``@SQ`` header line per reference sequence to
    ``self.outputSamFile``, converts the reads to FASTA, then runs the
    external commands ``lastdb``, ``lastal`` and ``maf-convert.py sam``,
    appending the converted alignments to the same SAM file.

    Args:
        params: option string inserted verbatim into the ``lastal`` command.
    """
    localReferenceFastaFile = os.path.join(self.getLocalTempDir(), "ref.fa") #Because we don't want to have any crufty files created in the local temp dir.
    indexFile = os.path.join(self.getLocalTempDir(), "my-index") #Index file
    mafFile = os.path.join(self.getLocalTempDir(), "out.maf") #MAF file

    #Hack to make last work, creating SQ line
    # "with" closes both handles, including the reference handle that used
    # to be left open.
    with open(self.outputSamFile, 'w') as fH:
        with open(self.referenceFastaFile, 'r') as referenceHandle:
            for name, seq in fastaRead(referenceHandle):
                fH.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))

    #Make fasta file, as last fastq seems broken
    localReadFile = os.path.join(self.getLocalTempDir(), "reads.fa") #Read file
    with open(localReadFile, 'w') as fH:
        for name, seq, _quals in fastqRead(self.readFastqFile):
            fastaWrite(fH, name, seq)

    system("cp %s %s" % (self.referenceFastaFile, localReferenceFastaFile)) #Copy across the ref file
    system("lastdb %s %s" % (indexFile, localReferenceFastaFile)) #Build the index
    system("lastal %s %s %s > %s" % (params, indexFile, localReadFile, mafFile)) #Build the alignment
    system("maf-convert.py sam %s >> %s" % (mafFile, self.outputSamFile)) #Now convert sam file
def main():
    """Mutate the sequences of a FASTA file and record the mutations made.

    Positional arguments: inputFastaFile outputFastaFile outputMutationsFile.
    The mutated sequences go to the second file as FASTA; every entry of
    the mutation list returned by ``mutateSequences`` goes to the third
    file as one tab-separated line.

    Raises:
        SystemExit: (code 0, after printing help) when run without arguments.
        RuntimeError: when the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(usage="usage: inputFastaFile outputFastaFile outputMutationsFile [options]", version="%prog 0.1")

    parser.add_option("--snpRate", dest="snpRate", help="The probability of introducing a random different base at each position", default=0.2, type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" % " ".join(args))

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(getFastaDictionary(args[0]), options.snpRate)

    #Write out the mutated sequences into the given file
    # ("with" closes the handle even if a write fails)
    with open(args[1], 'w') as fH:
        for name in mutatedSequences:
            fastaWrite(fH, name, mutatedSequences[name])

    #Write out mutations
    with open(args[2], 'w') as fH:
        for mutation in allMutations:
            fH.write("\t".join(map(str, mutation)) + "\n")
def run(self, params="-s 2 -T 0 -Q 0 -a 1"):
    """Align ``self.readFastqFile`` to ``self.referenceFastaFile`` and write SAM.

    Writes one ``@SQ`` header line per reference sequence to
    ``self.outputSamFile``, converts the reads to FASTA, then runs the
    external commands ``lastdb``, ``lastal`` and ``maf-convert.py sam``,
    appending the converted alignments to the same SAM file.

    Args:
        params: option string inserted verbatim into the ``lastal`` command.
    """
    localReferenceFastaFile = os.path.join(
        self.getLocalTempDir(), "ref.fa"
    )  #Because we don't want to have any crufty files created in the local temp dir.
    indexFile = os.path.join(self.getLocalTempDir(), "my-index")  #Index file
    mafFile = os.path.join(self.getLocalTempDir(), "out.maf")  #MAF file

    #Hack to make last work, creating SQ line
    # The context managers close the SAM handle and the reference handle
    # (previously never closed) even when a write fails.
    with open(self.outputSamFile, 'w') as fH:
        with open(self.referenceFastaFile, 'r') as referenceHandle:
            for name, seq in fastaRead(referenceHandle):
                fH.write("@SQ\tSN:%s\tLN:%s\n" % (name.split()[0], len(seq)))

    #Make fasta file, as last fastq seems broken
    localReadFile = os.path.join(self.getLocalTempDir(), "reads.fa")  #Read file
    with open(localReadFile, 'w') as fH:
        for name, seq, _quals in fastqRead(self.readFastqFile):
            fastaWrite(fH, name, seq)

    system("cp %s %s" %
           (self.referenceFastaFile, localReferenceFastaFile))  #Copy across the ref file
    system("lastdb %s %s" % (indexFile, localReferenceFastaFile))  #Build the index
    system(
        "lastal %s %s %s > %s" %
        (params, indexFile, localReadFile, mafFile))  #Build the alignment
    system("maf-convert.py sam %s >> %s" %
           (mafFile, self.outputSamFile))  #Now convert sam file
def prepare_oned(self, nanopore_read, oned_read_path):
    """Write the template read of ``nanopore_read`` as FASTA to ``oned_read_path``.

    The record is named ``nanopore_read.read_label`` and holds
    ``nanopore_read.template_read``. On success the read is closed.

    Returns:
        (True, version, False) on success, with ``version`` taken from
        ``nanopore_read.version``; (False, None, False) if anything raises.
    """
    try:
        # "with" ensures the FASTA handle is closed even when the write or
        # an attribute lookup fails, which the catch-all below would
        # otherwise hide while leaking the descriptor.
        with open(oned_read_path, "w") as read_file:
            fastaWrite(fileHandleOrFile=read_file,
                       name=nanopore_read.read_label,
                       seq=nanopore_read.template_read)
            version = nanopore_read.version
        nanopore_read.close()
        return True, version, False
    except Exception:
        # Deliberately best-effort: any failure is reported to the caller
        # through the first element of the tuple rather than raised.
        return False, None, False
def align_consensus(tmp_dir, gp, target_genome_fasta, ref_tx_fasta):
    """
    Main consensus alignment function.

    Writes the reference transcript named ``gp.name`` to a temporary FASTA,
    runs ``blat`` and ``simpleChain`` on it and returns the stringified
    ``gp.id``, ``gp.name`` and the best coverage/identity reported by
    ``evaluate_blat_results``.
    """
    reference = Fasta(ref_tx_fasta)
    genome = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, genome)

    fastaWrite(tmp_ref, gp.name, str(reference[gp.name]))

    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref, tmp_psl)
    system(blat_cmd)
    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    chained = popenCatch(chain_cmd)
    # Drop the last element of the split (the piece after the final newline).
    psl_lines = chained.split("\n")[:-1]

    best_cov, best_ident = evaluate_blat_results(psl_lines)
    fields = [gp.id, gp.name, best_cov, best_ident]
    return map(str, fields)
def prepare_twod(self, nanopore_read, twod_read_path):
    """Write the 2D alignment-table sequence of ``nanopore_read`` as FASTA.

    Returns:
        (False, None, False) when ``nanopore_read.has2D_alignment_table`` is
        ``False`` (the read is closed and nothing is written); otherwise
        (True, version, pop1_complement), where ``pop1_complement`` tells
        whether ``complement_model_id`` equals
        ``"complement_median68pA_pop1.model"``.
    """
    # check for table to make 'assembled' 2D alignment table fasta with
    if nanopore_read.has2D_alignment_table is False:
        nanopore_read.close()
        return False, None, False

    # "with" closes the FASTA handle even if the write raises.
    with open(twod_read_path, "w") as fasta_handle:
        fastaWrite(fileHandleOrFile=fasta_handle,
                   name=nanopore_read.read_label,
                   seq=nanopore_read.alignment_table_sequence)

    pop1_complement = (nanopore_read.complement_model_id ==
                       "complement_median68pA_pop1.model")
    version = nanopore_read.version

    nanopore_read.close()
    return True, version, pop1_complement
def main():
    """Mutate the sequences of a FASTA file and record the mutations made.

    Positional arguments: inputFastaFile outputFastaFile outputMutationsFile.
    The mutated sequences go to the second file as FASTA; every entry of
    the mutation list returned by ``mutateSequences`` goes to the third
    file as one tab-separated line.

    Raises:
        SystemExit: (code 0, after printing help) when run without arguments.
        RuntimeError: when the number of positional arguments is not three.
    """
    #Parse the inputs args/options
    parser = OptionParser(
        usage=
        "usage: inputFastaFile outputFastaFile outputMutationsFile [options]",
        version="%prog 0.1")

    parser.add_option(
        "--snpRate",
        dest="snpRate",
        help=
        "The probability of introducing a random different base at each position",
        default=0.2,
        type=float)

    #Parse the options/arguments
    options, args = parser.parse_args()

    #Print help message if no input
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(0)

    #Exit if the arguments are not what we expect
    if len(args) != 3:
        raise RuntimeError("Expected three arguments, got: %s" %
                           " ".join(args))

    #This call gets the mutated sequences and a list of mutations
    mutatedSequences, allMutations = mutateSequences(
        getFastaDictionary(args[0]), options.snpRate)

    #Write out the mutated sequences into the given file
    # The handle is closed by the context manager even if a write fails.
    with open(args[1], 'w') as fH:
        for name in mutatedSequences:
            fastaWrite(fH, name, mutatedSequences[name])

    #Write out mutations
    with open(args[2], 'w') as fH:
        for mutation in allMutations:
            fH.write("\t".join(map(str, mutation)) + "\n")
def testRandom(self):
    """Align random sequence sets with Ortheus and validate the output.

    Each round derives leaf sequences from a random tree and a random
    250 base sequence; when there are between 3 and MAX_SEQS of them they
    are written to temp files, ``ortheus_core`` is run and the resulting
    alignment is passed to ``checkAlignment``.
    """
    alignedOutput = getTempFile()
    self.tempFiles.append(alignedOutput)
    MAX_SEQS = 20
    for _ in xrange(MAX_SEQS):
        self.tempFiles.append(getTempFile())
    for test in xrange(0, self.testNo):
        print("test no : %i " % test)
        #seqNo
        tree = randomTree()
        rootSeq = getRandomSequence(250)[1]
        leafSeqs = []
        getTreeSeqs(tree, rootSeq, leafSeqs)
        if 2 < len(leafSeqs) <= MAX_SEQS:
            seqFiles = []
            for idx, leafSeq in enumerate(leafSeqs):
                # NOTE(review): the +1 offset presumably skips the output
                # file appended first - confirm self.tempFiles starts empty.
                path = self.tempFiles[1 + idx]
                seqFiles.append(path)
                handle = open(path, 'w')
                fastaWrite(handle, "%i" % idx, leafSeq)
                handle.close()
            # Label and value joined by one space, as a two-item print
            # statement would emit them.
            print("%s %s" % ("Have seq files ", seqFiles))
            treeString = printBinaryTree(tree, True)
            print("%s %s" % ("For tree ", treeString))
            #align seqs and check no failure
            command = "ortheus_core -a %s -b '%s' -d %s -e" % (
                " ".join(seqFiles), treeString, alignedOutput)
            print("%s %s" % ("command to call", command))
            system(command)
            #check alignment is complete
            alignment = [column[:] for column in fastaAlignmentRead(alignedOutput)]
            #print "alignment", alignment
            checkAlignment(alignment, leafSeqs)
        print("test no is finished : %i " % test)
def main():
    """Copy a FASTA file, dropping the records that ``tooShort`` rejects.

    Reads two positional command-line arguments (input path, output path)
    and the options ``--prefix`` and ``--length``. The input is scanned once
    by ``containedSequences`` and then rewound for the filtering pass.

    Returns:
        1 (after printing the help text) when the number of positional
        arguments is not two, otherwise 0.
    """
    ##########################################
    #Construct the arguments.
    ##########################################

    usage = "usage: %prog [options] <fasta input file> <fasta output file>\n\n" + \
            " <fasta file>: fasta sequence to filter\n"
    description = "Ensure sequences have length >= length\n"
    parser = OptionParser(usage=usage, description=description)
    parser.add_option("--prefix", dest="prefix", type="string",
                      help="only filter sequences with prefix in name",
                      default="")
    parser.add_option("--length", dest="length", type="int",
                      help="filter shorter than length [default=1000]",
                      default=1000)

    options, args = parser.parse_args()

    if len(args) != 2:
        parser.print_help()
        return 1

    inputName = args[0]
    outputName = args[1]

    # Context managers close both handles even if filtering raises.
    with open(inputName, "r") as inputFile:
        with open(outputName, "w") as outputFile:
            contTable = containedSequences(inputFile)
            # Rewind: the first pass above consumed the handle.
            inputFile.seek(0)

            for header, seq in fastaRead(inputFile):
                # "== False" kept as-is: tooShort's return contract is not
                # visible here, and "not" would treat None differently.
                if tooShort(header, seq, options, contTable) == False:
                    fastaWrite(outputFile, header, seq)
    return 0
def testFastaReadWrite(self):
    """Round-trip random sequences through ``fastaWrite`` and ``fastaRead``.

    Each round writes a random number (0-9) of random sequences to a temp
    file and asserts that the records read back equal the written ones, in
    order; every record is also written to an in-memory buffer.
    """
    tempFile = getTempFile()
    self.tempFiles.append(tempFile)
    for test in range(0, self.testNo):
        fastaNumber = random.choice(range(10))
        l = [getRandomSequence() for i in range(fastaNumber)]
        # "with" closes the handle even if fastaWrite raises.
        with open(tempFile, 'w') as fileHandle:
            for name, seq in l:
                fastaWrite(fileHandle, name, seq)

        # Reversed so that pop() hands the records back in written order.
        l.reverse()
        # Both handles are released even when the assertion below fails.
        with open(tempFile, 'r') as fileHandle, io.StringIO() as outFh:
            for i in fastaRead(fileHandle):
                assert i == l.pop()
                name, seq = i
                fastaWrite(outFh, name, seq)
def testRandom(self):
    """Run Ortheus on randomly generated sequence sets and check the result.

    For every round a random tree and a random 250 base sequence yield a
    list of leaf sequences. Sets holding 3 to MAX_SEQS sequences are
    written out, aligned with ``ortheus_core`` and verified with
    ``checkAlignment``; other sets are skipped.
    """
    resultFile = getTempFile()
    self.tempFiles.append(resultFile)
    MAX_SEQS = 20
    for _unused in xrange(MAX_SEQS):
        self.tempFiles.append(getTempFile())
    for test in xrange(0, self.testNo):
        print("test no : %i " % test)
        # seqNo
        randomBinaryTree = randomTree()
        ancestor = getRandomSequence(250)[1]
        seqs = []
        getTreeSeqs(randomBinaryTree, ancestor, seqs)
        seqCount = len(seqs)
        if seqCount > 2 and seqCount <= MAX_SEQS:
            seqFiles = []
            for number, seq in enumerate(seqs):
                # NOTE(review): slot 0 is presumably the result file
                # appended above - confirm self.tempFiles starts empty.
                seqFile = self.tempFiles[1 + number]
                seqFiles.append(seqFile)
                out = open(seqFile, "w")
                fastaWrite(out, "%i" % number, seq)
                out.close()
            # One space between label and value, matching a two-item
            # print statement.
            print("%s %s" % ("Have seq files ", seqFiles))
            treeString = printBinaryTree(randomBinaryTree, True)
            print("%s %s" % ("For tree ", treeString))
            # align seqs and check no failure
            joinedFiles = " ".join(seqFiles)
            command = "ortheus_core -a %s -b '%s' -d %s -e" % (joinedFiles, treeString, resultFile)
            print("%s %s" % ("command to call", command))
            system(command)
            # check alignment is complete
            alignment = [column[:] for column in fastaAlignmentRead(resultFile)]
            # print "alignment", alignment
            checkAlignment(alignment, seqs)
        print("test no is finished : %i " % test)
def getCactusInputs_funkyHeaderNames(regionNumber=0, tempDir=None):
    """Produce Blanchette-region inputs with hard-to-parse FASTA headers.

    The sequence files from ``getCactusInputs_blanchette`` are copied into
    ``tempDir`` (one file per input, named by its index) while each record
    gets the next header from a small rotating list of odd names.

    Returns:
        (sequences, newickTreeString); ``sequences`` holds the new paths.
    """
    sequences, newickTreeString = getCactusInputs_blanchette(regionNumber=regionNumber)

    # Assign weird header names
    if tempDir is None:
        tempDir = getTempDir()
    # Should also consider "bar foo", "ba rfoo", but we currently
    # throw away everything but the first token (probably because of
    # cigar parsing).
    oddHeaders = ('id=1|foo', 'test1|1600', 'test2|', '|test3', 'id=1|bar')
    headerCount = len(oddHeaders)
    recordsSeen = 0  # Shared across files so the rotation never restarts.
    for index, sourcePath in enumerate(sequences):
        targetPath = os.path.join(tempDir, str(index))
        for _ignored, sequence in fastaRead(sourcePath):
            newHeader = oddHeaders[recordsSeen % headerCount]
            recordsSeen += 1
            fastaWrite(targetPath, newHeader, sequence, 'a')
        sequences[index] = targetPath
    return sequences, newickTreeString
def testBlastRandom(self):
    """Run cactus blast with random parameters on random mutated sequences.

    Per round: 0-9 mutated copies of a random 8000 base sequence are
    written to one FASTA file (each reverse complemented about half the
    time) and handed to ``runCactusBlast`` with a random chunk and overlap
    size. The toil directory is removed afterwards.
    """
    fastaPath = os.path.join(self.tempDir, "tempSeq.fa")
    self.tempFiles.append(fastaPath)
    for _round in xrange(self.testNo):
        howMany = random.choice(xrange(0, 10))
        template = getRandomSequence(8000)[1]
        handle = open(fastaPath, 'w')
        # Built as a full list first: every mutation is drawn before the
        # first strand flip, keeping the order of random draws unchanged.
        records = [(str(n), mutateSequence(template, 0.3*random.random()))
                   for n in xrange(howMany)]
        for header, mutated in records:
            flip = random.random() > 0.5
            if flip:
                mutated = reverseComplement(mutated)
            fastaWrite(handle, header, mutated)
        handle.close()
        chunkSize = random.choice(xrange(500, 9000))
        overlapSize = random.choice(xrange(2, 100))
        toilDir = os.path.join(getTempDirectory(self.tempDir), "toil")
        runCactusBlast([fastaPath], self.tempOutputFile, toilDir, chunkSize, overlapSize)
        #runToilStatusAndFailIfNotComplete(toilDir)
        if getLogLevelString() == "DEBUG":
            system("cat %s" % self.tempOutputFile)
        system("rm -rf %s " % toilDir)
def align(target, g, target_fasta, chunk, ref_fasta, out_path):
    """BLAT every sequence of ``chunk`` against its reference transcript.

    For each id in ``chunk`` the target sequence and the reference sequence
    (looked up under the id with its alignment numbers removed) are written
    to temp FASTA files and aligned with ``blat``. One row per id, holding
    the id plus the identity and coverage computed from the PSL rows (or
    "0", "0" when there are none), is written tab-separated to a randomly
    named ``.txt`` file in ``out_path``. ``g`` is not used.
    """
    augustus_fasta = Fasta(target_fasta)
    reference_fasta = Fasta(ref_fasta)
    rows = []
    for aug_aId in chunk:
        gencode_id = remove_alignment_number(remove_augustus_alignment_number(aug_aId))
        gencode_seq = str(reference_fasta[gencode_id])
        aug_seq = str(augustus_fasta[aug_aId])
        tmp_aug = os.path.join(target.getLocalTempDir(), "tmp_aug")
        tmp_gencode = os.path.join(target.getLocalTempDir(), "tmp_gencode")
        fastaWrite(tmp_aug, aug_aId, aug_seq)
        fastaWrite(tmp_gencode, gencode_id, gencode_seq)
        blat_cmd = "blat {} {} -out=psl -noHead /dev/stdout".format(tmp_gencode, tmp_aug)
        blat_output = popenCatch(blat_cmd)
        # The last three elements of the split output are discarded.
        psl_lines = blat_output.split("\n")[:-3]
        if psl_lines:
            psl_rows = [PslRow(line) for line in psl_lines]
            rows.append(map(str, [aug_aId, identity(psl_rows), coverage(psl_rows)]))
        else:
            rows.append([aug_aId, "0", "0"])
    out_file = os.path.join(out_path, getRandomAlphaNumericString(10) + ".txt")
    with open(out_file, "w") as outf:
        for row in rows:
            outf.write("\t".join(row) + "\n")
def align(target, target_fasta, chunk, ref_fasta, file_tree):
    """BLAT the reference transcripts of ``chunk`` against their targets.

    All target sequences and their reference sequences are written to two
    temp FASTA files, aligned with one ``blat`` call and chained with
    ``simpleChain``. For every id in ``chunk`` one comma-separated row
    ``tgt_id,query_id,coverage,identity`` is written to a temp file obtained
    from ``file_tree``; coverage and identity are "0" when the id has no
    chained alignment, otherwise they come from the alignment with the
    highest ``min(coverage, target_coverage)``.

    Raises:
        AssertionError: if no chained alignment targets any id in ``chunk``.
    """
    g_f = Fasta(target_fasta)
    r_f = Fasta(ref_fasta)
    results = []
    tmp_aug = os.path.join(target.getGlobalTempDir(), "tmp_aug")
    tmp_gencode = os.path.join(target.getGlobalTempDir(), "tmp_gencode")
    tmp_psl = os.path.join(target.getGlobalTempDir(), "tmp_psl")
    # Remember each target's query id. The reporting loop below used to
    # reuse the variable left over from the last iteration of this loop,
    # which stamped the final query id onto every row.
    query_ids = {}
    with open(tmp_aug, "w") as tmp_aug_h, open(tmp_gencode, "w") as tmp_gencode_h:
        for tgt_id in chunk:
            query_id = remove_augustus_alignment_number(tgt_id)
            query_ids[tgt_id] = query_id
            gencode_id = remove_alignment_number(query_id)
            gencode_seq = str(r_f[gencode_id])
            aug_seq = str(g_f[tgt_id])
            fastaWrite(tmp_aug_h, tgt_id, aug_seq)
            fastaWrite(tmp_gencode_h, gencode_id, gencode_seq)
    system("blat {} {} -out=psl -noHead {}".format(tmp_aug, tmp_gencode, tmp_psl))
    r = popenCatch("simpleChain -outPsl {} /dev/stdout".format(tmp_psl))
    r = r.split("\n")[:-1]
    # Group the chained alignments by target name.
    r_d = defaultdict(list)
    for p in tokenize_stream(r):
        psl = PslRow(p)
        r_d[psl.t_name].append(psl)
    assert len(r_d.viewkeys() & set(chunk)) > 0, (r_d.viewkeys(), set(chunk))
    for tgt_id in chunk:
        query_id = query_ids[tgt_id]
        if tgt_id not in r_d:
            results.append([tgt_id, query_id, "0", "0"])
        else:
            p_list = [[min(x.coverage, x.target_coverage), x.identity] for x in r_d[tgt_id]]
            # sorted(...)[-1] rather than max(): on ties it keeps the last
            # entry, as before.
            best_cov, best_ident = sorted(p_list, key=lambda x: x[0])[-1]
            results.append(map(str, [tgt_id, query_id, best_cov, best_ident]))
    with open(file_tree.getTempFile(), "w") as outf:
        for x in results:
            outf.write("".join([",".join(x), "\n"]))
def align_cgp(tmp_dir, gp, target_genome_fasta, tx_dict, ref_tx_fasta):
    """
    Main CGP alignment function. Every transcript listed in tx_dict is BLATed
    against the CGP transcript one at a time; each result is chained and
    summarised by evaluate_blat_results. This circumvents problems with
    multiple self alignments in the case of repeats.

    Returns one row per transcript: the stringified gp.name, gene name,
    transcript name, best coverage and best identity.
    """
    rows = []
    transcripts = Fasta(ref_tx_fasta)
    genome = Fasta(target_genome_fasta)
    tmp_tgt, tmp_ref, tmp_psl = prepare_tmp_files(tmp_dir, gp, genome)
    # Both command lines depend only on the temp paths, so build them once.
    blat_cmd = "blat {} {} -out=psl -noHead {}".format(tmp_tgt, tmp_ref, tmp_psl)
    chain_cmd = "simpleChain -outPsl {} /dev/stdout".format(tmp_psl)
    for gene_name, tx_names in tx_dict.iteritems():
        for tx_name in tx_names:
            fastaWrite(tmp_ref, tx_name, str(transcripts[tx_name]))
            system(blat_cmd)
            chained = popenCatch(chain_cmd)
            psl_lines = chained.split("\n")[:-1]
            best_cov, best_ident = evaluate_blat_results(psl_lines)
            row = [gp.name, gene_name, tx_name, best_cov, best_ident]
            rows.append(map(str, row))
    return rows
def mutateReferenceSequences(referenceFastaFiles):
    """Create SNP-mutated copies of each reference FASTA at fixed rates.

    For every input path that does not contain "percent", one mutated FASTA
    per rate (1%, 5%, 10%, 20%) is written next to it, together with an
    index file holding each original sequence followed by its mutated
    version. Files that already exist are not regenerated.

    Args:
        referenceFastaFiles: list of FASTA paths; not modified.

    Returns:
        A new list: the input paths followed by the paths of all mutated
        FASTA files (whether newly written or already present).
    """
    updatedReferenceFastaFiles = referenceFastaFiles[:]
    mutation_rates = [0.01, 0.05, 0.10, 0.20]
    for referenceFastaFile in referenceFastaFiles:
        # Paths containing "percent" are skipped.
        if "percent" in referenceFastaFile:
            continue
        prefix = referenceFastaFile.split(".fa")[0]
        for mutation_rate in mutation_rates:
            # Indels are disabled: the multiplier is 0.0, so the rate only
            # shows up (as "0.0") in the generated file names.
            indel_rate = 0.0 * mutation_rate
            i = mutation_rate * 100
            j = indel_rate * 100
            newReferenceFastaFile = prefix + "_" + str(i) + "_percent_SNPs_" + str(j) + "_percent_InDels.fasta"
            mutationIndexFile = newReferenceFastaFile + "_Index.txt"
            updatedReferenceFastaFiles.append(newReferenceFastaFile)
            if os.path.exists(newReferenceFastaFile):
                continue
            # "with" closes both outputs even if reading or mutating fails.
            with open(newReferenceFastaFile, 'w') as fH:
                with open(mutationIndexFile, 'w') as fH2:
                    for header, seq in fastaRead(referenceFastaFile):
                        header = header.split()[0]
                        mutatedSeq = mutateSequence(seq, mutation_rate)
                        fastaWrite(fH, header, mutatedSeq)
                        fastaWrite(fH2, header, seq)
                        fastaWrite(fH2, header + "_mutated", mutatedSeq)
    return updatedReferenceFastaFiles
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite

# Usage: <script> statsXmlFile inputFasta outputFasta
# Cuts the single sequence of inputFasta down to the coordinate range given
# by the root element of statsXmlFile and renames it accordingly.

node = ET.parse(sys.argv[1]).getroot()
# Both coordinates are the same for every record, so read them once.
j = int(node.attrib["minOtherReferenceCoordinate"])
k = int(node.attrib["maxOtherReferenceCoordinate"])

# Context managers close both files even when the assertion or a write fails.
with open(sys.argv[3], 'w') as fH:
    with open(sys.argv[2], 'r') as inputHandle:
        seqs = list(fastaRead(inputHandle))
    assert (len(seqs) == 1)
    for name, sequence in seqs:
        #>hg19.chr6.171115067.28377796.5150977.1
        i = name.split(".")
        # The fourth field is shifted by j, the fifth becomes the slice
        # length k - j, and the last field is kept.
        fastaWrite(fH, ".".join(i[0:3] + [str(int(i[3]) + j), str(k - j)] + i[-1:]), sequence[j:k])
def testCactusWorkflow_Blanchette(self):
    """Runs the workflow on blanchette's simulated (colinear) regions.

    Does nothing unless SON_TRACE_DATASETS is set in the environment. Each
    round cuts a random 5000 column window out of the true alignment,
    writes the nine ungapped sequences to FASTA files, runs the cactus
    workflow on them and, when the comparison tools are installed, compares
    the resulting MAF with the true one.
    """
    if "SON_TRACE_DATASETS" not in os.environ:
        return

    sequenceNumber = 9
    #The tree
    newickTreeString = "((((HUMAN:0.006969, CHIMP:0.009727):0.025291, BABOON:0.044568):0.11,(RAT:0.072818, MOUSE:0.081244):0.260342):0.023260,((DOG:0.07, CAT:0.07):0.087381,(PIG:0.06, COW:0.06):0.104728):0.04);"
    alignmentLength = 5000
    # Probes run in this order; the comparison is skipped as soon as one of
    # them exits non-zero.
    toolProbes = (
        "mfaToMaf --help > /dev/null 2>&1",
        "cactus_MAFGenerator --help > /dev/null 2>&1",
        "mafComparator --help > /dev/null 2>&1",
        "cactus_treeStats --help > /dev/null 2>&1",
    )

    for test in xrange(self.testNo):
        tempFiles = []
        tempDir = getTempDirectory(os.getcwd())

        #Load the true alignment.
        trueAlignment = os.path.join(TestStatus.getPathToDataSets(), "blanchettesSimulation", "00.job", "true.mfa")
        columnAlignment = list(fastaAlignmentRead(trueAlignment))
        fastaHeaders = list(fastaReadHeaders(trueAlignment))

        #Get random dir
        testDir = getTempDirectory(tempDir)

        #random alignment
        windowStart = random.choice(xrange(len(columnAlignment)-alignmentLength))
        subAlignment = columnAlignment[windowStart:windowStart+alignmentLength]
        logger.info("Got a sub alignment, it is %i columns long" % len(subAlignment))

        #Get sequences (gap characters removed)
        sequences = []
        for seqNo in xrange(sequenceNumber):
            residues = [column[seqNo] for column in subAlignment if column[seqNo] != '-']
            sequences.append((fastaHeaders[seqNo], "".join(residues)))
        logger.info("Got the sequences")

        #Write sequences into temp files
        tempFastaFiles = []
        for seqNo, (header, sequence) in enumerate(sequences):
            logger.info("Making temp file for header: %s, seq: %s" % (header, sequence))
            tempFastaFile = os.path.join(testDir, "%i.fa" % seqNo)
            tempFastaFiles.append(tempFastaFile)
            fileHandle = open(tempFastaFile, "w")
            fastaWrite(fileHandle, header, sequence)
            fileHandle.close()
        logger.info("Got the temp sequence files")

        experiment = getCactusWorkflowExperimentForTest(tempFastaFiles, newickTreeString, testDir)
        experimentFile = os.path.join(testDir, "experiment.xml")
        experiment.writeXML(experimentFile)
        cactusDiskDatabaseString = experiment.getDiskDatabaseString()

        jobTree = os.path.join(testDir, "jobTree")
        runCactusWorkflow(experimentFile, jobTree)
        logger.info("Ran the the workflow")

        #Check the output alignment
        runJobTreeStatusAndFailIfNotComplete(jobTree)
        logger.info("Checked the job tree dir")

        #Output the 'TRUE' alignment file
        if all(os.system(probe) == 0 for probe in toolProbes):
            logLevel = getLogLevelString()

            trueMFAFile = os.path.join(testDir, "true.mfa")
            fastaAlignmentWrite(subAlignment, fastaHeaders, len(fastaHeaders), trueMFAFile)
            trueMAFFile = os.path.join(testDir, "true.maf")
            system("mfaToMaf --mfaFile %s --outputFile %s --logLevel %s" % (trueMFAFile, trueMAFFile, logLevel))
            system("cat %s" % trueMAFFile)

            #Now get mafs for the region.
            mAFFile = os.path.join(testDir, "flower.maf")
            system("cactus_MAFGenerator --flowerName 0 --cactusDisk '%s' --outputFile %s --logLevel %s" % (cactusDiskDatabaseString, mAFFile, getLogLevelString()))
            logger.info("Got the MAFs from the flower disk")
            system("cat %s" % mAFFile)

            statsFile = os.path.join(testDir, "stats.xml")
            system("cactus_treeStats --cactusDisk '%s' --flowerName 0 --outputFile %s --logLevel %s" % (cactusDiskDatabaseString, statsFile, getLogLevelString()))
            system("cat %s" % statsFile)
            logger.info("Got the cactus tree stats")

            #Now compare the mafs to the output.
            resultsFile = os.path.join(testDir, "results.xml")
            system("mafComparator --mafFile1 %s --mafFile2 %s --outputFile %s --logLevel %s" % (trueMAFFile, mAFFile, resultsFile, getLogLevelString()))
            logger.info("Ran the maf comparator")
            system("cat %s" % resultsFile)

        #Cleanup
        experiment.cleanupDb()
        system("rm -rf %s" % testDir)
        logger.info("Successfully ran test for the problem")

        for leftover in tempFiles:
            os.remove(leftover)
        system("rm -rf %s" % tempDir)
header.start += len( subSequence ) + lenNs sequence = sequence[m.start() + lenNs: ] m = re.search( pattern, sequence ) i = fn2(header, searchedSeq + sequence) if i != None: yield i #=========== MAIN ==================== fH = open(sys.argv[1], 'r') fH2 = open(sys.argv[2], 'w') lengthOfNs = int(sys.argv[3]) lengthOfFragment = int(sys.argv[4]) if len(sys.argv) == 6: setLogLevel(sys.argv[5]) headers = set() for name, sequence in fastaRead(fH): header = Header( name.split()[0], len(sequence) ) logger.info("Got a sequence of length %i with header %s for processing" % (len(sequence), name.split()[0])) for newheader, subsequence in fn( header, sequence, lengthOfNs ): if len( subsequence ) > 0: logger.info("Writing out a sequence of length %i with header %s" % (len(subsequence), newheader)) assert newheader not in headers headers.add(newheader) fastaWrite(fH2, newheader, subsequence) fH.close() fH2.close()
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite

# Usage: <script> statsXmlFile inputFasta outputFasta
# Trims the single sequence of inputFasta to the coordinate range stored on
# the root element of statsXmlFile and rewrites its dotted header to match.

node = ET.parse(sys.argv[1]).getroot()
# Loop-invariant: both coordinates come from the XML root.
j = int(node.attrib["minOtherReferenceCoordinate"])
k = int(node.attrib["maxOtherReferenceCoordinate"])

# The input handle used to be left open; "with" now closes both files even
# when the assertion or a write fails.
with open(sys.argv[3], 'w') as fH:
    with open(sys.argv[2], 'r') as inputHandle:
        seqs = [ i for i in fastaRead(inputHandle) ]
    assert(len(seqs) == 1)
    for name, sequence in seqs:
        #>hg19.chr6.171115067.28377796.5150977.1
        i = name.split(".")
        # Fourth field shifted by j, fifth replaced by the slice length
        # k - j, last field kept.
        fastaWrite(fH, ".".join(i[0:3] + [ str(int(i[3]) + j), str(k - j)] + i[-1:]), sequence[j:k])
from sonLib.bioio import fastaRead, fastaWrite
import sys
import random

# Usage: <script> inputFasta outputFasta
# Replaces every IUPAC ambiguity code listed below with a random base from
# the set it stands for, keeping the case of the original character.


def fn(k, i, j):
    """Return a random member of ``j`` if character ``k`` is the code ``i``.

    The comparison ignores case and the replacement takes the case of ``k``.
    Any other character is returned unchanged.
    """
    if k.upper() == i.upper():
        l = random.choice(j)
        if k == k.upper():
            return l.upper()
        return l.lower()
    else:
        return k


# "with" closes both files (the input handle used to stay open) even if a
# read or write fails.
with open(sys.argv[2], "w") as fH:
    with open(sys.argv[1], "r") as inputHandle:
        for name, seq in fastaRead(inputHandle):
            for i, j in [ ("W", ("A", "T")), ("S", ("C", "G")), ("M", ("A", "C")), ("K", ("G", "T")), ("R", ("A", "G")), ("Y", ("C", "T")), ("B", ("C", "G", "T")), ("D", ("A", "G", "T")), ("H", ("A", "C", "T")), ("V", ("A", "C", "G")) ]:
                seq = "".join([ fn(k, i, j) for k in seq ])
            fastaWrite(fH, name, seq)
def getCactusInputs_random(regionNumber=0,
                           tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None,
                           treeLeafNumber=None):
    """Build random FASTA inputs and a random species tree relating them.

    One temp directory is created per leaf of the tree. Sequences are
    written until at least ``sequenceNumber`` exist and every directory has
    received a file. Unset size arguments are drawn at random;
    ``regionNumber`` is not used.

    Returns:
        (sequenceDirs, newickTreeString)
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(list(range(30)))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(list(range(1, 3000)))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(list(range(2, 4)))

    #Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)

    # Collect the leaf ids left to right using an explicit stack.
    leafNames = []
    pending = [binaryTree]
    while pending:
        node = pending.pop()
        if node.internal:
            pending.append(node.right)
            pending.append(node.left)
        else:
            leafNames.append(node.iD)
    logger.info("Made random binary tree: %s" % newickTreeString)

    sequenceDirs = [getTempDirectory(rootDir=tempDir) for _ in leafNames]
    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    def newRootSequence():
        # Fresh random sequence with a random length below twice the average.
        return getRandomSequence(
            length=random.choice(list(range(1, 2 * avgSequenceLength))))[1]

    #Random sequences and species labelling
    currentFile = None
    handle = None
    parentSequence = newRootSequence()
    unusedDirs = set(sequenceDirs)
    made = 0
    while made < sequenceNumber or len(unusedDirs) > 0:
        if currentFile is None:
            #Randomly choose the files to be attached or not
            suffix = ".fa.complete" if random.random() > 0.5 else ".fa"
            chosenDir = random.choice(sequenceDirs)
            unusedDirs.discard(chosenDir)
            currentFile = getTempFile(rootDir=chosenDir, suffix=suffix)
            handle = open(currentFile, 'w')
        if random.random() > 0.8:  #Get a new root sequence
            parentSequence = newRootSequence()
        sequence = mutateSequence(parentSequence, distance=random.random() * 0.25)
        name = getRandomAlphaNumericString(15)
        if random.random() > 0.5:
            sequence = reverseComplement(sequence)
        fastaWrite(handle, name, sequence)
        # Randomly finish the current file so the next sequence opens a new one.
        if random.random() > 0.5:
            handle.close()
            handle = None
            currentFile = None
        made += 1
    if handle is not None:
        handle.close()

    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))

    return sequenceDirs, newickTreeString
from sonLib.bioio import fastaRead, fastaWrite
import sys
import random

# Usage: <script> inputFasta outputFasta
# Resolves the IUPAC ambiguity codes listed below: each occurrence becomes a
# random base from the set the code stands for, in the original's case.


def fn(k, i, j):
    """Resolve character ``k`` if it is the ambiguity code ``i``.

    When ``k`` equals ``i`` ignoring case, a random element of ``j`` is
    returned in the case of ``k``; otherwise ``k`` is returned unchanged.
    """
    if k.upper() == i.upper():
        l = random.choice(j)
        if k == k.upper():
            return l.upper()
        return l.lower()
    else:
        return k


# The context managers close the output and the input (previously never
# closed) even when reading or writing raises.
with open(sys.argv[2], "w") as fH:
    with open(sys.argv[1], "r") as inputHandle:
        for name, seq in fastaRead(inputHandle):
            for i, j in [("W", ("A", "T")), ("S", ("C", "G")), ("M", ("A", "C")), ("K", ("G", "T")),
                         ("R", ("A", "G")), ("Y", ("C", "T")), ("B", ("C", "G", "T")),
                         ("D", ("A", "G", "T")), ("H", ("A", "C", "T")), ("V", ("A", "C", "G"))]:
                seq = "".join([fn(k, i, j) for k in seq])
            fastaWrite(fH, name, seq)
#!/usr/bin/env python
# Usage: pastaIdsToOriginalNames.py fastaFile renameFile
#
# Prints fastaFile to stdout with every header replaced by the name that
# renameFile assigns to it. renameFile is read in groups of three lines:
# the id, the real name, and a third line that is ignored.
import sys
from sonLib.bioio import system, fastaRead, fastaWrite

fastaFile = sys.argv[1]
renameFile = sys.argv[2]

curRealName = None
curPastaID = None
translate = {}
with open(renameFile) as renameHandle:
    for i, line in enumerate(renameHandle):
        line = line.strip()
        if i % 3 == 0:
            curPastaID = line
        elif i % 3 == 1:
            curRealName = line
            # Store the pair as soon as it is complete. Waiting for the
            # third line lost the final mapping whenever the file ended
            # right after the last name.
            translate[curPastaID] = curRealName

with open(fastaFile) as fastaHandle:
    for header, seq in fastaRead(fastaHandle):
        # hacks for if we are using the badly-named original fasta.
        header = translate[header].replace("...", ".-.").replace(".", "_").replace("__", "_")
        fastaWrite(sys.stdout, header, seq)
def getCactusInputs_random(regionNumber=0, tempDir=None,
                           sequenceNumber=None,
                           avgSequenceLength=None, treeLeafNumber=None):
    """Generate random sequence files and a random species tree for them.

    A temp directory is made for each leaf of the tree. Sequences keep
    being written until ``sequenceNumber`` have been produced and no
    directory is left without a file. Size arguments left as None are
    chosen at random; ``regionNumber`` is not used.

    Returns:
        (sequenceDirs, newickTreeString)
    """
    if sequenceNumber is None:
        sequenceNumber = random.choice(xrange(30))
    if avgSequenceLength is None:
        avgSequenceLength = random.choice(xrange(1,3000))
    if treeLeafNumber is None:
        treeLeafNumber = random.choice(xrange(2, 4))

    #Make tree
    binaryTree = makeRandomBinaryTree(treeLeafNumber)
    newickTreeString = printBinaryTree(binaryTree, includeDistances=True)

    # Walk the tree iteratively, visiting left subtrees first, and record
    # the id of every leaf.
    leafIds = []
    toVisit = [binaryTree]
    while toVisit:
        subtree = toVisit.pop()
        if not subtree.internal:
            leafIds.append(subtree.iD)
            continue
        toVisit.append(subtree.right)
        toVisit.append(subtree.left)
    logger.info("Made random binary tree: %s" % newickTreeString)

    sequenceDirs = []
    for _leaf in leafIds:
        sequenceDirs.append(getTempDirectory(rootDir=tempDir))
    logger.info("Made a set of random directories: %s" % " ".join(sequenceDirs))

    def drawParent():
        # New random root sequence, length drawn below twice the average.
        return getRandomSequence(length=random.choice(xrange(1, 2*avgSequenceLength)))[1]

    #Random sequences and species labelling
    openPath = None
    openHandle = None
    parentSequence = drawParent()
    dirsWithoutFile = set(sequenceDirs)
    count = 0
    while count < sequenceNumber or len(dirsWithoutFile) > 0:
        #for i in xrange(sequenceNumber):
        if openPath is None:
            #Randomly choose the files to be attached or not
            if random.random() > 0.5:
                suffix = ".fa.complete"
            else:
                suffix = ".fa"
            targetDir = random.choice(sequenceDirs)
            dirsWithoutFile.discard(targetDir)
            openPath = getTempFile(rootDir=targetDir, suffix=suffix)
            openHandle = open(openPath, 'w')
        if random.random() > 0.8: #Get a new root sequence
            parentSequence = drawParent()
        sequence = mutateSequence(parentSequence, distance=random.random()*0.5)
        name = getRandomAlphaNumericString(15)
        if random.random() > 0.5:
            sequence = reverseComplement(sequence)
        fastaWrite(openHandle, name, sequence)
        # Randomly end the current file; the next sequence starts a new one.
        if random.random() > 0.5:
            openHandle.close()
            openHandle = None
            openPath = None
        count += 1
    if openHandle is not None:
        openHandle.close()

    logger.info("Made %s sequences in %s directories" % (sequenceNumber, len(sequenceDirs)))

    return sequenceDirs, newickTreeString
import sys
import xml.etree.ElementTree as ET
from sonLib.bioio import fastaRead, fastaWrite

# Usage: <script> namesXmlFile inputFasta outputFasta
# Copies inputFasta to outputFasta, leaving out every record whose name is
# among the whitespace-separated names in the XML root element's text.

excludedNames = set(ET.parse(sys.argv[1]).getroot().text.split())

# "with" closes the output and the input (previously never closed) even if
# reading or writing fails.
with open(sys.argv[3], 'w') as fH:
    with open(sys.argv[2], 'r') as inputHandle:
        for name, sequence in fastaRead(inputHandle):
            if name not in excludedNames:
                fastaWrite(fH, name, sequence)