# Minimum template length a gene must exceed: two reads plus the insert.
totalReadLength = insertSize + 2 * readLength
numGenes = len(genes)

# Truncated gene counts for 90% down to 10% of the gene set.
r90, r80, r70, r60, r50, r40, r30, r20, r10 = [
    int(numGenes * fraction)
    for fraction in (.9, .8, .7, .6, .5, .4, .3, .2, .1)
]

# Output paths shared by every gene in the loop below.
fastaOut = filename + 'E1R100G100.fa'
pair1Out = filename + 'E1R100G100RandomReadsPair1.fa'
pair2Out = filename + 'E1R100G100RandomReadsPair2.fa'
readsTxtOut = filename + 'E1R100G100RandomReads.txt'

# Maps the random draw to the level passed to the read generator.
levelByKey = {0: 10, 1: 100, 2: 1000}

for key in genes:
    thisGeneName = key
    geneSeq = genes[thisGeneName]
    # Genes no longer than one full read pair are skipped entirely.
    if len(geneSeq) <= totalReadLength:
        continue
    complete = makeRandomTestData.outputFastaSeq(
        fastaOut, thisGeneName, geneSeq)
    levelKey = random.randint(0, 2)
    # Any other draw falls back to level 0; randint(0, 2) is inclusive,
    # so that fallback is never actually selected.
    level = levelByKey.get(levelKey, 0)
    complete = makeRandomTestData.makeRandomlyPlacedPairedEndReads(
        geneSeq, thisGeneName, readLength, insertSize, level,
        pair1Out, pair2Out, readsTxtOut, 1, 0)
    count += 1

# Re-read the input FASTA into a second dictionary.
genes2 = makeRandomTestData.inputFastaSeq(infileName)
x = os.remove(randomTrans + '.fa') for i in range(0, numTranscripts): isoform = random.randint(1, 100) if isoform <= isoformProb and len(genes) > 2: geneSelected = random.choice(genes.keys()) geneSelectedParts = geneSelected.split(' ') geneSelectedId = '' for part in geneSelectedParts: geneSelectedId += part identifier = 'testgene_' + str(i) + ' Isoform_' + geneSelectedId thisGeneName, seq = makeRandomTestData.generateIsoforms(identifier, genes[geneSelected], 'random', 0, 0, 0, min, max) genes[thisGeneName] = seq else: thisGeneName = 'testgene_' + str(i) + ' ' + randomTrans genes[thisGeneName] = makeRandomTestData.makeKmerCountData(random.randint(101, 4999), 'default') complete = makeRandomTestData.outputFastaSeq(randomTrans + '.fa', thisGeneName, genes[thisGeneName]) infileName = randomTrans + '.fa' filename = 'DataSet' + name genes = makeRandomTestData.inputFastaSeq(infileName) errors = 1 if makeRandomTestData.checkForFasta(filename + 'E1R100G100.fa'): x = os.remove(filename + 'E1R100G100.fa') if makeRandomTestData.checkForFasta(filename + 'E1R100G100RandomReads.fa'): x = os.remove(filename + 'E1R100G100RandomReads.fa') if makeRandomTestData.checkForFasta(filename + 'E1R100G100RandomReads.txt'): x = os.remove(filename + 'E1R100G100RandomReads.txt') if makeRandomTestData.checkForFasta(filename + 'E1R90G100.fa'): x = os.remove(filename + 'E1R90G100.fa') if makeRandomTestData.checkForFasta(filename + 'E1R80G100.fa'):
filename = 'DataSet' + name
genes = makeRandomTestData.inputFastaSeq(infileName)
errors = 0

# Remove every output file that checkForFasta reports, in a fixed order,
# before any new data is written.
# NOTE(review): presumably the writers append to these files - confirm.
for staleSuffix in ('E1R100G100.fa',
                    'E1R100G100RandomReadsPair1.fa',
                    'E1R100G100RandomReadsPair2.fa',
                    'E1R100G100RandomReads.txt'):
    stalePath = filename + staleSuffix
    if makeRandomTestData.checkForFasta(stalePath):
        x = os.remove(stalePath)

count = 0
readLength = 100
insertSize = 400
# Minimum template length a gene must exceed: two reads plus the insert.
totalReadLength = insertSize + 2 * readLength
numGenes = len(genes)

# Output paths shared by every gene in the loop below.
fastaOut = filename + 'E1R100G100.fa'
pair1Out = filename + 'E1R100G100RandomReadsPair1.fa'
pair2Out = filename + 'E1R100G100RandomReadsPair2.fa'
readsTxtOut = filename + 'E1R100G100RandomReads.txt'

for key in genes:
    thisGeneName = key
    geneSeq = genes[thisGeneName]
    # Genes no longer than one full read pair are skipped entirely.
    if len(geneSeq) <= totalReadLength:
        continue
    complete = makeRandomTestData.outputFastaSeq(
        fastaOut, thisGeneName, geneSeq)
    # Every qualifying gene is generated at the same fixed level.
    level = 20
    complete = makeRandomTestData.makeRandomlyPlacedPairedEndReads(
        geneSeq, thisGeneName, readLength, insertSize, level,
        pair1Out, pair2Out, readsTxtOut, errors, 0)
    count += 1

print('Finished')
x = os.remove(filename + 'E1R100G100.fa') if makeRandomTestData.checkForFasta(filename + 'E1R100G100RandomReads.fa'): x = os.remove(filename + 'E1R100G100RandomReads.fa') if makeRandomTestData.checkForFasta(filename + 'E1R100G100RandomReads.txt'): x = os.remove(filename + 'E1R100G100RandomReads.txt') count = 0 count2 = 0 readLength = 100 numGenes = len(genes) for key in genes: thisGeneName = key if len(genes[thisGeneName]) > readLength: count2 += 1 complete = makeRandomTestData.outputFastaSeq(filename + 'E1R100G100.fa', thisGeneName, genes[thisGeneName]) # levelKey = random.randint(0, 5) # if levelKey == 0: # level = 10 # elif levelKey == 1: # level = 100 # elif levelKey == 2: # level = 1000 # else: # level = 0 level = 20 coverageReq = float(len(genes[thisGeneName])) / float(readLength) * float(level) for i in range(0, int(coverageReq)): position = random.randint(0,len(genes[thisGeneName])-readLength) read = genes[thisGeneName][position:position + readLength] identifier1 = key + 'Count' + str(i) + 'Begin' + str(position) + 'End' + str(position + readLength - 1)
# A script to take ensembl gene ids and get those seqs from a fasta file.
# Original: 2011.11.10
# Original: A. Black P.
# Last modified: 2011.11.10
# Last modified: A. Black P.
# Usage: python extractSeqsFastaById.py input.fasta input.ids output.fasta
#
# Every FASTA header that contains one of the ids as a substring is passed,
# with its sequence, to makeRandomTestData.outputFastaSeq for the output
# file. The number of header/id matches is printed at the end.

import sys, makeRandomTestData

# Fail with the usage line instead of an IndexError on a missing argument.
if len(sys.argv) < 4:
    sys.exit('Usage: python extractSeqsFastaById.py '
             'input.fasta input.ids output.fasta')

print('Loading/counting data\n')
count = 0
genes = makeRandomTestData.inputFastaSeq(sys.argv[1])

# The with-block closes the id file once all ids have been processed.
with open(sys.argv[2], 'r') as idFile:
    for newString in idFile:
        name = newString.rstrip()
        # An empty id is a substring of every header, so a blank line
        # would otherwise copy the whole input file to the output.
        if not name:
            continue
        for key in genes:
            # Substring match: the id may appear anywhere in the header.
            if name in key:
                complete = makeRandomTestData.outputFastaSeq(
                    sys.argv[3], key, genes[key])
                count += 1

print(count)
print('Finished!')
# A script to take ensembl gene ids and get those seqs from a fasta file.
# Original: 2011.11.10
# Original: A. Black P.
# Last modified: 2011.11.10
# Last modified: A. Black P.
# Usage: python extractSeqsFastaById.py input.fasta input.ids output.fasta
#
# Every FASTA header that contains one of the ids as a substring is passed,
# with its sequence, to makeRandomTestData.outputFastaSeq for the output
# file. The number of header/id matches is printed at the end.

import sys, makeRandomTestData

# Fail with the usage line instead of an IndexError on a missing argument.
if len(sys.argv) < 4:
    sys.exit('Usage: python extractSeqsFastaById.py '
             'input.fasta input.ids output.fasta')

print('Loading/counting data\n')
count = 0
genes = makeRandomTestData.inputFastaSeq(sys.argv[1])

# The with-block closes the id file once all ids have been processed.
with open(sys.argv[2], 'r') as idFile:
    for newString in idFile:
        name = newString.rstrip()
        # An empty id is a substring of every header, so a blank line
        # would otherwise copy the whole input file to the output.
        if not name:
            continue
        for key in genes:
            # Substring match: the id may appear anywhere in the header.
            if name in key:
                complete = makeRandomTestData.outputFastaSeq(
                    sys.argv[3], key, genes[key])
                count += 1

print(count)
print('Finished!')