예제 #1
0
파일: SNP_tiles.py 프로젝트: mrG7/tileIt
def test():
	"""Smoke test: tile the first record of test/Firre.fasta.

	Prints the number of tiles returned by scanSequence() for the first
	FASTA record, then the number remaining after findUnique() has been
	applied to them.
	"""
	fname = "test/Firre.fasta"
	# Use a context manager so the handle is closed even if parsing fails;
	# only the first record is needed, so it is read inside the block.
	with open(fname,'r') as handle:
		fastaIter = sequencelib.FastaIterator(handle)
		mySeq = next(fastaIter)

	tiles = scanSequence(mySeq['sequence'],mySeq['name'])
	print(len(tiles))

	# Remove duplicate tile sequences
	tiles = findUnique(tiles)
	print(len(tiles))
예제 #2
0
파일: SNP_tiles.py 프로젝트: mrG7/tileIt
def main():
	"""Command-line entry point: build tagged tiles from a FASTA file.

	Steps, in order:
	  1. Parse options from sys.argv (see the getopt spec below); the first
	     positional argument is the input FASTA file name.
	  2. Derive the tile window size as
	     maxTileSize - prefixLength - suffixLength, where both affix lengths
	     come from estimateAffixLength().
	  3. Collect tiles from every record via scanSequence(), then pass them
	     through findUnique().
	  4. If restriction sites are configured, drop every tile for which
	     hasRestrictionSites() is true.
	  5. Build numTagsPerTile tags per tile with buildTags() and emit one
	     copy of each tile per tag, with prefix/suffix/tag attached.
	  6. Sort the result, print it with outputTable(), and report the total
	     count on stderr.

	Raises:
	  TileError: if the --prefix or --suffix argument fails onlyNucleic().

	Exits with status 2 on a bad option or when the input file is missing
	or cannot be opened.
	"""
	#######################
	# Variables
	#######################
	# NOTE(review): universalPrimers is not referenced anywhere in this
	# function; kept as-is in case it is intended as reference data.
	universalPrimers = {
		'M13 forward sequencing primer (-20)': 'GTAAAACGACGGCCAGT',
		'M13 forward sequencing primer (-40)': 'GTTTTCCCAGTCACGAC',
		'M13 forward sequencing primer (-47)': 'CGCCAGGGTTTTCCCAGTCACGAC',
		'M13 reverse sequencing primer (-24)': 'AACAGCTATGACCATG',
		'M13 reverse sequencing primer (-48)': 'AGCGGATAACAATTTCACACAGGA',
		'T3 promoter': 'ATTAACCCTCACTAAAGGGA',
		'T7 universal primer': 'TAATACGACTCACTATAGGG',
		'T7 Terminator': 'GCTAGTTATTGCTCAGCGG',
		'SP6 promoter': 'CATACGATTTAGGTGACACTATAG',
		'SP6 universal primer': 'ATTTAGGTGACACTATAG',
		'VF2': 'tgccacctgacgtctaagaa',
		'VR': 'attaccgcctttgagtgagc'
	}

	# NOTE(review): the '@' in the suffix presumably marks where the tag is
	# inserted -- confirm against estimateAffixLength()/the tile class.
	MPRATags = {
		'prefix': 'ACTGGCCGCTTCACTG',
		'suffix': 'GGTACCTCTAGA@AGATCGGAAGAGCGTCG'
	}

	# Defaults, overridable from the command line
	maxTileSize = 150
	tileStep = 1
	tagLength = 10
	numTagsPerTile = 5
	prefix = MPRATags['prefix']
	suffix = MPRATags['suffix']
	sites = "KpnI,XbaI,SfiI"

	#Argument handling
	try:
		opts,args = getopt.getopt(sys.argv[1:],"ho:vp:s:l:w:te:n:r:",["help","output=","verbose","prefix=","suffix=","max-tile-length=","tile-step=","tag","tag-length=","num-tags-per-tile=","restriction-sites="])
	except getopt.GetoptError as err:
		print(err)
		usage()
		sys.exit(2)
	# NOTE(review): output, verbose and addTag are parsed but not used
	# further down in this function.
	output = None
	verbose = False
	addTag = False
	for o,a in opts:
		# "--verbose" is declared in the long-option list above, so it must
		# be accepted here or it would hit the "Unhandled option" assert.
		if o in ("-v","--verbose"):
			verbose = True
		elif o in ("-h","--help"):
			usage()
			sys.exit()
		elif o in ("-o","--output"):
			output = a
		elif o in ("-p","--prefix"):
			if not onlyNucleic(a):
				# Instantiate the exception: raising a (class, message)
				# tuple silently discards the message.
				raise TileError("Prefix is not a valid nucleic acid sequence.")
			prefix = a
		elif o in ("-s","--suffix"):
			if not onlyNucleic(a):
				raise TileError("Suffix is not a valid nucleic acid sequence.")
			suffix = a
		elif o in ("-l","--max-tile-length"):
			maxTileSize = int(a)
		elif o in ("-w","--tile-step"):
			tileStep = int(a)
		elif o in ("-t","--tag"):
			addTag = True
		elif o in ("-e","--tag-length"):
			tagLength = int(a)
		elif o in ("-n","--num-tags-per-tile"):
			numTagsPerTile = int(a)
		elif o in ("-r","--restriction-sites"):
			# An empty value disables all restriction-site handling
			sites = a.strip()
			if sites == "":
				sites = None
		else:
			assert False, "Unhandled option"

	# Grab fname as remainder argument
	try:
		fname = str(args[0])
		handle = open(fname,'r')
	except (IndexError, IOError):
		# Missing positional argument or unreadable file. Catch only these
		# so KeyboardInterrupt/SystemExit are not swallowed.
		usage()
		sys.exit(2)

	#Estimate prefix and suffix length
	prefixLength = estimateAffixLength(prefix,tagLength)
	suffixLength = estimateAffixLength(suffix,tagLength)

	#Find window size
	tileSize = maxTileSize-prefixLength-suffixLength

	#Fetch all tile sequences; the handle is closed once every record is read
	tiles = []
	with handle:
		fastaIter = sequencelib.FastaIterator(handle)
		for mySeq in fastaIter:
			#Warn about restriction sites found in the input sequence
			if sites is not None:
				warnRestrictionSites(mySeq['sequence'],mySeq['name'],sites)

			#Get tiles from sequence
			tiles += scanSequence(mySeq['sequence'],mySeq['name'],tileStep=tileStep,tileSize=tileSize)

	# Remove duplicate tile sequences
	tiles = findUnique(tiles)

	# Drop tiles that contain any of the restriction sites
	if sites is not None:
		cleanTiles = set()
		for tile in tiles:
			if not hasRestrictionSites(tile.sequence, sites):
				cleanTiles.add(tile)
		tiles = list(cleanTiles)

	#determine number of tags needed
	numTagsReq = len(tiles) * numTagsPerTile

	tags = buildTags(numTagsReq,tagLength,sites=sites)

	# Sanity check: exactly one tag must be available per output tile
	assert len(tags) == len(tiles) * numTagsPerTile

	#Create numTagsPerTile tiles for each sequence,
	#adding prefix, suffix, and tag to each copy
	tmpTiles = set()
	for tile in tiles:
		for j in range(numTagsPerTile):
			tmpTile = copy.copy(tile)
			tmpTile.name = "%s:%d" % (tmpTile.name,j)
			tmpTile.prefix = prefix
			tmpTile.suffix = suffix
			tmpTile.tag = tags.pop()
			tmpTiles.add(tmpTile)

	tiles = list(tmpTiles)
	tiles.sort()

	outputTable(tiles)

	sys.stderr.write("There are a total of %d unique tiles\n" % len(tiles))