Exemplo n.º 1
0
def mapFastQ(fName, organism):
    '''Map a FASTQ file with bowtie against the index of the given organism.

    fName    -- path of the FASTQ file; alignments are written to
                fName + '.mapped'.
    organism -- 'human', 'mouse' or 'zebrafish'.

    bowtie's stdout and stderr are redirected to files whose paths are
    mainConf.conf['outLog'] / mainConf.conf['errorLog'] concatenated with the
    base name of fName.

    Raises ValueError when no index is configured for organism.
    '''
    indexByOrganism = {
        'human': '/home/chrisgre/indexes/bowtie/hg19',
        'mouse': '/home/chrisgre/indexes/bowtie/mm9',
        'zebrafish': '/home/chrisgre/indexes/bowtie/danRer6',
    }

    # Fail before any log file is created: an unknown organism used to
    # surface only later, as an unbound indexName.
    if organism not in indexByOrganism:
        raise ValueError('no bowtie index configured for organism %r' % (organism,))
    indexName = indexByOrganism[organism]

    outName = fName + '.mapped'
    logBase = cg.getBaseFileName(fName)

    # try/finally so both log files are closed even if bowtie cannot start.
    logFile = open(mainConf.conf['outLog'] + logBase, 'w')
    try:
        errorFile = open(mainConf.conf['errorLog'] + logBase, 'w')
        try:
            # 'Sa' selects the phred+33 quality flag; anything else phred+64.
            if fastQTypes.getFastQType(fName, quick=True) == 'Sa':
                print('Mapping with 33 phred offset')
                qualityFlag = '--phred33-quals'
            else:
                print('Mapping with 64 phred offset')
                qualityFlag = '--phred64-quals'

            command = ['bowtie', qualityFlag, '-k', '20', '-m', '20', '-p', '1',
                       indexName, fName, outName]
            subprocess.Popen(command, stdout=logFile, stderr=errorFile).wait()
        finally:
            errorFile.close()
    finally:
        logFile.close()
Exemplo n.º 2
0
def testmerge(masterDir, parDir):
    '''Fold the objects of every run directory into the master directory.

    The objects of masterDir are loaded, then merged one by one with the
    objects loaded from each path that bioLibCG.recursePaths yields for
    parDir (filtered with end = base name of masterDir). The merged result
    is committed back through the master data controller.

    Expected layout:
        oRNA (master)
        aDir (master)
        pRuns
        --run.00
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01
    '''
    masterController = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    merged = masterController.load()

    # Run directories are recognised by sharing the master's base name.
    runDirName = bioLibCG.getBaseFileName(masterDir)

    for runDir in bioLibCG.recursePaths(parDir, end=runDirName):
        runController = cgDB.dataController(runDir, cgAlignment.cgAlignment)
        runObjects = runController.load()
        # NOTE(review): objects are loaded as cgAlignment but merged as
        # cgOriginRNA.OriginRNA -- confirm this pairing is intended.
        merged = cgDB.mergeTwoObjects(merged, runObjects, cgOriginRNA.OriginRNA)

    masterController.commit(merged)
Exemplo n.º 3
0
def makeWig(fN, assembly, format=None, name=None):
    '''Write wig files for a mapped-read file, one chromosome/strand at a time.

    format assumes bowtie.
    Suitable for medium mapped files: the input is re-read for every
    (chromosome, strand) pair so only one coverage map is held in memory at
    a time, which takes longer.

    fN       -- path of the mapped-read file.
    assembly -- assembly name passed to cg.returnChromLengthDict.
    format   -- parser name for returnParserFunction (default 'Bowtie').
    name     -- track/file name prefix (default: naked base name of fN).
    '''
    #assume bowtie
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name: name = cg.getBaseFileName(fN, naked=True)
    lDict = cg.returnChromLengthDict(assembly)

    for chrom in lDict:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            #create hitmap of chrom and strand
            print('%s %s %s' % (chrom, strand, 'hitmap'))
            hitDict = {}  # coord -> number of reads covering it
            with open(fN, 'r') as f:  # closed after every pass
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    lStrand = str(lStrand)
                    start = int(start)
                    end = int(end)
                    if chrom != lChrom or strand != lStrand:
                        continue
                    # NOTE(review): end is treated as inclusive here.
                    for i in range(start, end + 1):
                        hitDict[i] = hitDict.get(i, 0) + 1

            #write results to wig file
            # writeWigFromHitDict takes {chrom: {strand: {coord: value}}} plus
            # a name; the flat dict without a name passed before did not
            # match that signature.
            writeWigFromHitDict({chrom: {strand: hitDict}}, assembly, name)
Exemplo n.º 4
0
def testmerge(masterDir, parDir):
    '''Merge the objects of all run directories into the master directory.

    Loads the master objects from masterDir, merges in the objects of every
    path yielded by bioLibCG.recursePaths(parDir, end=<base name of
    masterDir>), and commits the accumulated result to the master.

    Expected layout:
        oRNA (master)
        aDir (master)
        pRuns
        --run.00
        ----oRNA (slave: pRuns/run.00/oRNA)
        ----aDir
        --run.01
    '''
    masterDC = cgDB.dataController(masterDir, cgAlignment.cgAlignment)
    accumulated = masterDC.load()

    #each run holds a directory named like the master one
    targetName = bioLibCG.getBaseFileName(masterDir)

    for runPath in bioLibCG.recursePaths(parDir, end=targetName):
        slaveObjects = cgDB.dataController(runPath,
                                           cgAlignment.cgAlignment).load()
        accumulated = cgDB.mergeTwoObjects(accumulated, slaveObjects,
                                           cgOriginRNA.OriginRNA)

    masterDC.commit(accumulated)
Exemplo n.º 5
0
def makeWig(fN, assembly, format = None, name = None):
    '''Write wig files for a mapped-read file, one chromosome/strand at a time.

    format assumes bowtie.
    Suitable for medium mapped files: the input is re-read for every
    (chromosome, strand) pair so only one coverage map is held in memory at
    a time, which takes longer.

    fN       -- path of the mapped-read file.
    assembly -- assembly name passed to cg.returnChromLengthDict.
    format   -- parser name for returnParserFunction (default 'Bowtie').
    name     -- track/file name prefix (default: naked base name of fN).
    '''
    #assume bowtie
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)
    if not name: name = cg.getBaseFileName(fN, naked = True)
    lDict = cg.returnChromLengthDict(assembly)

    for chrom in lDict:
        if chrom not in cg.acceptableChroms:
            continue
        for strand in ['1', '-1']:
            #create hitmap of chrom and strand
            print('%s %s %s' % (chrom, strand, 'hitmap'))
            hitDict = {}  # coord -> number of reads covering it
            with open(fN, 'r') as f:  # closed after every pass
                for line in f:
                    lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
                    lStrand = str(lStrand)
                    start = int(start)
                    end = int(end)
                    if chrom != lChrom or strand != lStrand:
                        continue
                    # NOTE(review): end is treated as inclusive here.
                    for i in range(start, end + 1):
                        hitDict[i] = hitDict.get(i, 0) + 1

            #write results to wig file
            # writeWigFromHitDict takes {chrom: {strand: {coord: value}}} plus
            # a name; the flat dict without a name passed before did not
            # match that signature.
            writeWigFromHitDict({chrom: {strand: hitDict}}, assembly, name)
Exemplo n.º 6
0
def writeWigFromHitDict(hitDict, assembly, name, directory=None):
    '''Write one bedGraph-style .wig file per chromosome and strand.

    hitDict   -- {chrom: {strand: {coord: value}}}
    assembly  -- passed to cg.returnChromLengthDict to get chromosome lengths.
    name      -- prefix of the track name and of the output files, which are
                 written as <directory>/<name>.<chrom>.<strand>.wig.
    directory -- output directory; defaults to the 'wigs' entry of Main.conf.

    Runs of consecutive coordinates with equal values are merged into one
    block and gaps between blocks are written as explicit zero blocks.
    A zero-valued sentinel is stored in hitDict at the chromosome length so
    that the last real block and the trailing zero block are flushed; this
    mutates the caller's dictionary. The sentinel's own block is not written.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory: directory = mConf.conf['wigs']
    # NOTE(review): this fallback feeds the empty name back into
    # getBaseFileName, so it cannot produce a useful name -- confirm intent.
    if not name: name = cg.getBaseFileName(name, naked=True)
    lDict = cg.returnChromLengthDict(assembly)

    cg.clearDirectory(directory, overwrite=False)
    #write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            coordVals = hitDict[chrom][strand]

            # 'with' closes the file even if a lookup or write below raises.
            with open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w') as oF:
                oF.write('track type=bedGraph name=%s.%s.%s\n' %
                         (name, chrom, strand))

                #sentinel at the chromosome end (see docstring)
                chromEnd = lDict[chrom]
                coordVals[chromEnd] = 0
                keys = sorted(coordVals)

                # NOTE(review): a hit at coordinate 0 produces a zero block
                # with start > end -- confirm coordinates start at 1.
                prevVal = 0
                prevCoord = 0
                blockStart = 0
                blockEnd = 1
                for key in keys:
                    val = coordVals[key]
                    adjacent = (prevCoord == key - 1)

                    if adjacent and val == prevVal:
                        #same value right next to the block: extend it
                        blockEnd = key + 1
                    else:
                        #write old block
                        oF.write('%s\t%s\t%s\t%s\n' %
                                 (chrom, blockStart, blockEnd,
                                  prevVal))  #!make it a float value?
                        if not adjacent:
                            #write zero block covering the gap
                            oF.write('%s\t%s\t%s\t%s\n' %
                                     (chrom, blockEnd, key, 0))
                        #start new block
                        blockStart = key
                        blockEnd = key + 1

                    prevVal = val
                    prevCoord = key
Exemplo n.º 7
0
def mixWig(directory, assembly, name=None):
    '''Sum all .wig files found under directory into merged wig files.

    Does it by chromosome --> faster, less memory: only the values of one
    chromosome are held at a time.

    directory -- searched with cg.recurseDir for files ending in '.wig';
                 the merged files are written to the same directory.
    assembly  -- passed through to writeWigFromHitDict.
    name      -- name prefix of the merged files (default 'Merge').

    Chromosome and strand are taken from the 3rd-last and 2nd-last
    dot-separated fields of each file's base name.
    NOTE(review): merged output lands in the directory that is scanned, so a
    second run would presumably pick up earlier merge files -- confirm.
    '''
    if not name: name = 'Merge'

    #gather all chromosomes, keeping first-seen order
    chromList = []
    for fN in cg.recurseDir(directory, end='.wig'):
        chrom = cg.getBaseFileName(fN).strip().split('.')[-3]
        if chrom not in chromList:
            chromList.append(chrom)

    print(chromList)

    for chrom in chromList:

        print(chrom)
        #Gather all the values from all the files
        hitDict = {}  # chrom : { strand : { coord : value
        for fN in cg.recurseDir(directory, end='.wig'):
            nameFields = cg.getBaseFileName(fN).strip().split('.')
            fChrom = nameFields[-3]
            if fChrom != chrom: continue
            print('   %s %s' % (fN, fChrom))
            strand = nameFields[-2]

            # 'with' closes every input; they used to stay open until GC.
            with open(fN, 'r') as f:
                f.readline()  #header
                for line in f:
                    lChrom, start, end, val = line.strip().split('\t')
                    start, end, val = int(start), int(end), int(val)
                    # zero blocks and empty intervals contribute nothing
                    if val < 1 or start >= end: continue
                    strandHits = hitDict.setdefault(lChrom, {}).setdefault(strand, {})
                    for i in range(start, end):
                        strandHits[i] = strandHits.get(i, 0) + val

        #write results to wig file
        writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 8
0
def mixWig(directory, assembly, name = None):
    '''Sum all .wig files found under directory into merged wig files.

    Does it by chromosome --> faster, less memory: only the values of one
    chromosome are held at a time.

    directory -- searched with cg.recurseDir for files ending in '.wig';
                 the merged files are written to the same directory.
    assembly  -- passed through to writeWigFromHitDict.
    name      -- name prefix of the merged files (default 'Merge').

    Chromosome and strand are taken from the 3rd-last and 2nd-last
    dot-separated fields of each file's base name.
    NOTE(review): merged output lands in the directory that is scanned, so a
    second run would presumably pick up earlier merge files -- confirm.
    '''
    if not name: name = 'Merge'

    #gather all chromosomes, keeping first-seen order
    chromList = []
    for fN in cg.recurseDir(directory, end = '.wig'):
        chrom = cg.getBaseFileName(fN).strip().split('.')[-3]
        if chrom not in chromList:
            chromList.append(chrom)

    print(chromList)

    for chrom in chromList:

        print(chrom)
        #Gather all the values from all the files
        hitDict = {}  # chrom : { strand : { coord : value
        for fN in cg.recurseDir(directory, end = '.wig'):
            nameFields = cg.getBaseFileName(fN).strip().split('.')
            fChrom = nameFields[-3]
            if fChrom != chrom: continue
            print('   %s %s' % (fN, fChrom))
            strand = nameFields[-2]

            # 'with' closes every input; they used to stay open until GC.
            with open(fN, 'r') as f:
                f.readline()  #header
                for line in f:
                    lChrom, start, end, val = line.strip().split('\t')
                    start, end, val = int(start), int(end), int(val)
                    # zero blocks and empty intervals contribute nothing
                    if val < 1 or start >= end: continue
                    strandHits = hitDict.setdefault(lChrom, {}).setdefault(strand, {})
                    for i in range(start, end):
                        strandHits[i] = strandHits.get(i, 0) + val

        #write results to wig file
        writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 9
0
def makeWigMem(fN, assembly, format = None, name = None, directory = None):
    '''Build an in-memory hit map from a mapped-read file and write wig files.

    format assumes bowtie.
    Suitable for small mapped files (everything is held in memory).

    fN        -- path of the mapped-read file; its first line is always
                 skipped as a header.
    assembly  -- passed through to writeWigFromHitDict.
    format    -- parser name for returnParserFunction (default 'Bowtie').
    name      -- track/file name prefix (default: naked base name of fN).
    directory -- output directory, passed through to writeWigFromHitDict.

    Each read adds one hit at a single coordinate ("wig for degradome"):
    start + 20 on strand '1', start otherwise.
    '''
    if not name: name = cg.getBaseFileName(fN, naked = True)
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #create hitmap of chrom and strand
    hitDict = {}  #format = chr: { strand : { coord : value
    with open(fN, 'r') as f:  # closed even if a line fails to parse
        f.readline()  #header...file might not have one but its one read...
        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # lines for which the parser raises AttributeError are skipped
                continue
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)  # unused below, but still rejects non-numeric ends
            if lChrom not in cg.acceptableChroms:
                continue

            #wig for degradome
            # NOTE(review): 20 is a hard-coded offset, presumably the read
            # length -- confirm.
            if lStrand == '1':
                i = start + 20
            else:
                i = start

            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 10
0
def makeWigMem(fN, assembly, format=None, name=None, directory=None):
    '''Build an in-memory hit map from a mapped-read file and write wig files.

    format assumes bowtie.
    Suitable for small mapped files (everything is held in memory).

    fN        -- path of the mapped-read file; its first line is always
                 skipped as a header.
    assembly  -- passed through to writeWigFromHitDict.
    format    -- parser name for returnParserFunction (default 'Bowtie').
    name      -- track/file name prefix (default: naked base name of fN).
    directory -- output directory, passed through to writeWigFromHitDict.

    Each read adds one hit at a single coordinate ("wig for degradome"):
    start + 20 on strand '1', start otherwise.
    '''
    if not name: name = cg.getBaseFileName(fN, naked=True)
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #create hitmap of chrom and strand
    hitDict = {}  #format = chr: { strand : { coord : value
    with open(fN, 'r') as f:  # closed even if a line fails to parse
        f.readline()  #header...file might not have one but its one read...
        for line in f:
            try:
                lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            except AttributeError:
                # lines for which the parser raises AttributeError are skipped
                continue
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)  # unused below, but still rejects non-numeric ends
            if lChrom not in cg.acceptableChroms:
                continue

            #wig for degradome
            # NOTE(review): 20 is a hard-coded offset, presumably the read
            # length -- confirm.
            if lStrand == '1':
                i = start + 20
            else:
                i = start

            strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
            strandHits[i] = strandHits.get(i, 0) + 1

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 11
0
def writeWigFromHitDict(hitDict, assembly, name, directory = None):
    '''Write one bedGraph-style .wig file per chromosome and strand.

    hitDict   -- {chrom: {strand: {coord: value}}}
    assembly  -- passed to cg.returnChromLengthDict to get chromosome lengths.
    name      -- prefix of the track name and of the output files, which are
                 written as <directory>/<name>.<chrom>.<strand>.wig.
    directory -- output directory; defaults to the 'wigs' entry of Main.conf.

    Runs of consecutive coordinates with equal values are merged into one
    block and gaps between blocks are written as explicit zero blocks.
    A zero-valued sentinel is stored in hitDict at the chromosome length so
    that the last real block and the trailing zero block are flushed; this
    mutates the caller's dictionary. The sentinel's own block is not written.
    '''
    mConf = c.getConfig('Main.conf')
    if not directory: directory = mConf.conf['wigs']
    # NOTE(review): this fallback feeds the empty name back into
    # getBaseFileName, so it cannot produce a useful name -- confirm intent.
    if not name: name = cg.getBaseFileName(name, naked = True)
    lDict = cg.returnChromLengthDict(assembly)

    cg.clearDirectory(directory, overwrite = False)
    #write results to wig file
    for chrom in hitDict:
        for strand in hitDict[chrom]:
            coordVals = hitDict[chrom][strand]

            # 'with' closes the file even if a lookup or write below raises.
            with open(directory + '/%s.%s.%s.wig' % (name, chrom, strand), 'w') as oF:
                oF.write('track type=bedGraph name=%s.%s.%s\n' % (name, chrom, strand))

                #sentinel at the chromosome end (see docstring)
                chromEnd = lDict[chrom]
                coordVals[chromEnd] = 0
                keys = sorted(coordVals)

                # NOTE(review): a hit at coordinate 0 produces a zero block
                # with start > end -- confirm coordinates start at 1.
                prevVal = 0
                prevCoord = 0
                blockStart = 0
                blockEnd = 1
                for key in keys:
                    val = coordVals[key]
                    adjacent = (prevCoord == key - 1)

                    if adjacent and val == prevVal:
                        #same value right next to the block: extend it
                        blockEnd = key + 1
                    else:
                        #write old block
                        oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockStart, blockEnd, prevVal)) #!make it a float value?
                        if not adjacent:
                            #write zero block covering the gap
                            oF.write('%s\t%s\t%s\t%s\n' % (chrom, blockEnd, key, 0))
                        #start new block
                        blockStart = key
                        blockEnd = key + 1

                    prevVal = val
                    prevCoord = key
Exemplo n.º 12
0
def clipAdapter(fName, adapter = None, validate = False, oName = None, overwrite = True):
    '''Clip the adapter sequence from a FASTQ file with fastx_clipper.

    fName     -- input FASTQ path.
    adapter   -- adapter sequence; when None it is looked up in the files
                 listed in metaFileNames (column 4 of the row whose first
                 column equals <base name of fName>.counts).
    validate  -- accepted but currently has no effect.
    oName     -- output path; defaults to fName with '.fastq' replaced by
                 '.clipped.fastq'.
    overwrite -- when False an existing output file is kept and the
                 function returns without clipping.

    Returns 1 when clipping was skipped (existing output kept, or no adapter
    known); returns None after running fastx_clipper.
    '''
    # Resolve the real output path first so the overwrite check looks at the
    # file fastx_clipper will write. It used to check (and delete) the
    # default path even when oName pointed elsewhere.
    if oName is None:
        oName = fName.replace('.fastq','.clipped.fastq')

    #Check to see if the file exists:
    if os.path.isfile(oName):
        if overwrite:
            print('  Overwriting file %s' % (oName,))
            os.remove(oName)
        else:
            print('  \nNOT OVERWRITING FILE %s' % (oName,))
            return 1

    #If the adapter is none, try to find it in the small.meta file
    if adapter is None:
        baseFName = cg.getBaseFileName(fName) + '.counts'
        for metaFileName in metaFileNames:
            mFile = open(metaFileName, 'r')
            try:  # the early return below must not leak the handle
                for line in mFile:
                    fields = line.strip().split('\t')
                    if baseFName != fields[0]:
                        continue
                    if fields[3] == 'NONE':
                        print('  NO ADAPTER KNOWN FOR %s' % (fName,))
                        return 1
                    adapter = fields[3]
                    print('  Using adapter %s %s' % (adapter, fName))
            finally:
                mFile.close()

        # Not listed in any meta file: stop instead of handing the literal
        # string 'None' to fastx_clipper as the adapter.
        if adapter is None:
            print('  NO ADAPTER KNOWN FOR %s' % (fName,))
            return 1

    #Is it a valid fastq file?
    if validate:
        pass  # validation is not implemented

    #check the type of fastq file
    fType = fastQTypes.getFastQType(fName, quick = True)
    print('  Detected format: %s %s' % (fType, fName))

    #Run it through clipper
    print('Clipping file %s' % (fName,))
    command = ['fastx_clipper', '-n', '-v']
    if fType == 'Sa':
        command += ['-Q', '33']  # quality offset 33 for 'Sa' files
    command += ['-i', str(fName), '-a', str(adapter), '-o', str(oName)]
    subprocess.Popen(command).wait()
    print('  DONE %s' % (fName,))
Exemplo n.º 13
0
def clipAdapter(fName, adapter=None, validate=False, oName=None):
    '''Clip the adapter sequence from a FASTQ file with fastx_clipper.

    fName    -- input FASTQ path.
    adapter  -- adapter sequence; when None it is looked up in the files
                listed in metaFileNames (column 4 of the row whose first
                column equals <base name of fName>.counts).
    validate -- accepted but currently has no effect.
    oName    -- output path; defaults to fName with '.fastq' replaced by
                '.clipped.fastq'.

    Returns 1 when no adapter is known; returns None after running
    fastx_clipper.
    '''
    #If the adapter is none, try to find it in the small.meta file
    if adapter is None:
        baseFName = cg.getBaseFileName(fName) + '.counts'
        for metaFileName in metaFileNames:
            mFile = open(metaFileName, 'r')
            try:  # the early return below must not leak the handle
                for line in mFile:
                    fields = line.strip().split('\t')
                    if baseFName != fields[0]:
                        continue
                    if fields[3] == 'NONE':
                        print('NO ADAPTER KNOWN FOR %s' % (fName,))
                        return 1
                    adapter = fields[3]
                    print('Using adapter %s' % (adapter,))
            finally:
                mFile.close()

        # Not listed in any meta file: stop instead of handing the literal
        # string 'None' to fastx_clipper as the adapter.
        if adapter is None:
            print('NO ADAPTER KNOWN FOR %s' % (fName,))
            return 1

    #Is it a valid fastq file?
    if validate:
        pass  # validation is not implemented

    #check the type of fastq file
    fType = fastQTypes.getFastQType(fName, quick=True)
    print('Detected format: %s' % (fType,))

    #Run it through clipper
    print('Clipping file')
    if oName is None:
        oName = fName.replace('.fastq', '.clipped.fastq')

    command = ['fastx_clipper', '-v']
    if fType == 'Sa':
        command += ['-Q', '33']  # quality offset 33 for 'Sa' files
    command += ['-i', str(fName), '-a', str(adapter), '-o', str(oName)]
    subprocess.Popen(command).wait()
    print('DONE')
Exemplo n.º 14
0
def plotResults(fN, cName=None):
    '''Plot the profile of every hairpin of fN into a directory named after it.

    fN    -- input file handed to getHairpins.getHairpins, which returns a
             mapping CID -> hairpin.
    cName -- passed unchanged to cgPlot.plotASProfile.

    The output directory is the base name of fN; it is prepared with
    cg.clearDirectory. Plotting runs with that directory as the working
    directory, and the previous working directory is restored afterwards,
    also when plotting raises.
    '''
    cHairs = getHairpins.getHairpins(fN)  #CID: HAIRPIN

    directory = cg.getBaseFileName(fN)
    cg.clearDirectory(directory)

    #change the directory before plotting
    cwd = os.getcwd()
    os.chdir(directory)
    try:
        for CID in cHairs:
            print('plotting: %s' % (CID,))
            cgPlot.plotASProfile(cHairs[CID], cName)
    finally:
        # never leave the process stranded in the plot directory
        os.chdir(cwd)
Exemplo n.º 15
0
def plotResults(fN, cName = None):
    '''Plot the profile of every hairpin of fN into a directory named after it.

    fN    -- input file handed to getHairpins.getHairpins, which returns a
             mapping CID -> hairpin.
    cName -- passed unchanged to cgPlot.plotASProfile.

    The output directory is the base name of fN; it is prepared with
    cg.clearDirectory. Plotting runs with that directory as the working
    directory, and the previous working directory is restored afterwards,
    also when plotting raises.
    '''
    cHairs = getHairpins.getHairpins(fN)  #CID: HAIRPIN

    directory = cg.getBaseFileName(fN)
    cg.clearDirectory(directory)

    #change the directory before plotting
    cwd = os.getcwd()
    os.chdir(directory)
    try:
        for CID in cHairs:
            print('plotting: %s' % (CID,))
            cgPlot.plotASProfile(cHairs[CID], cName)
    finally:
        # never leave the process stranded in the plot directory
        os.chdir(cwd)
Exemplo n.º 16
0
def mapFastQInDirQ(dirName, overwrite = True):
	'''Every Q function has a corresponding shell script'''
	wrapperShell = '/home/chrisgre/scripts/mapping/mapFastQ.sh'
	
	
	for file in cg.recurseDir(dirName, end = 'clipped.fastq'):
		print file
		
		putativeN = file.replace('.clipped.fastq','.clipped.fastq.mapped')
		if os.path.isfile(putativeN):
			if overwrite:
				print '  Overwriting file', putativeN
				os.remove(putativeN)
			else:
				print '  \nNOT OVERWRITING FILE', putativeN
				continue
				
		#check if mouse or human
		baseFName = cg.getBaseFileName(file, naked = True)
		org = 'None'
		for metaFileName in metaFileNames:
			mFile = open(metaFileName, 'r')
			for line in mFile:
				fields = line.strip().split('\t')
				if baseFName == fields[0]:
					if fields[2] == 'NONE':
						print '  NO ORG KNOWN FOR', file
						continue
					else:
						org = fields[2]
						print '  USING ORG', org, file
			mFile.close()
			
		#check if there is an organism, must check due to files not in metaFile
		if org == 'None':
			print '  NO org', file
			continue
			
		while True:
			#submit job if there are less than ten
			if clusterCheck.queryNumJobsQ('chrisgre') < 40:
				#subprocess.Popen(['qsub', '-q', 'xiao', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				subprocess.Popen(['qsub', '-l', 'mem=4G', '-V', '-o', mainConf.conf['outLog'], '-e', mainConf.conf['errorLog'], wrapperShell, file, org ])
				time.sleep(.2) #give it time to update qstat
				break
			else:#wait 10 secs...
				time.sleep(20)
Exemplo n.º 17
0
def createTrackInDir(dirName):
    '''Submit one qsub job per mapped file under dirName to create its track.

    Every Q function has a corresponding shell script; here it is
    createTrack.sh, called with the mapped file and its organism.
    Makes wig files for all mapped files, for all organisms.

    dirName -- searched with cg.recurseDir for files ending in '.mapped'.

    The organism is field 1 of the meta-file entry keyed by the part of the
    file's base name before the first dot. Files missing from the meta file,
    or whose organism is 'NONE', are skipped. Submission waits while user
    'chrisgre' has 1000 or more jobs queued.
    '''
    wrapperShell = '/home/chrisgre/scripts/mapping/createTrack.sh'

    mainConf = c.cgConfig('Main.conf')
    metaFileName = mainConf.conf['metaFileName']
    # Parsed once: it does not depend on the file being processed and used
    # to be re-read for every mapped file.
    metaDict = cg.getMetaFileDict(metaFileName)

    for mappedFile in cg.recurseDir(dirName, end = '.mapped'):

        #check if mouse or human
        baseFName = cg.getBaseFileName(mappedFile)
        baseFName = baseFName.split('.')[0]

        org = 'None'
        if baseFName in metaDict:
            if metaDict[baseFName][1] == 'NONE':
                print('  NO ORG KNOWN FOR %s' % (mappedFile,))
                continue
            org = metaDict[baseFName][1]
            print('  USING ORG %s %s' % (org, mappedFile))

        #check if there is an organism, must check due to files not in metaFile
        if org == 'None':
            print('  NO org (not in meta file) %s' % (mappedFile,))
            continue

        #poll every 20 secs until fewer than 1000 jobs are queued
        while not (clusterCheck.queryNumJobsQ('chrisgre') < 1000):
            time.sleep(20)

        subprocess.Popen(['qsub', '-V',
                          '-o', mainConf.conf['outLog'],
                          '-e', mainConf.conf['errorLog'],
                          wrapperShell, mappedFile, org])
Exemplo n.º 18
0
def makeWigMem(fN, assembly, format = None, name = None, directory = None, degWig = False, switchStrand = True, normalized = False):
    '''Build an in-memory hit map from a mapped-read file and write wig files.

    format assumes bowtie.
    Suitable for small mapped files (everything is held in memory).

    fN           -- path of the mapped-read file; its first line is always
                    skipped as a header.
    assembly     -- passed through to writeWigFromHitDict.
    format       -- parser name for returnParserFunction (default 'Bowtie').
    name         -- track/file name prefix (default: naked base name of fN).
    directory    -- output directory, passed through to writeWigFromHitDict.
    degWig       -- when true each read adds to a single coordinate (degradome
                    wig) instead of to every coordinate in [start, end).
    switchStrand -- does not switch the strands; it selects which strand has
                    its hit recorded at end ('1' when true, '-1' when false),
                    the other strand being recorded at start + 1. Meant for
                    data that is backwards (HeLa) so the peak is put in the
                    right spot.
    normalized   -- when true each read counts 1 / (column 7 of the line + 1)
                    instead of 1.
    '''
    print('degWig Value %s' % (degWig,))
    print('switch strands? %s' % (switchStrand,))
    if not name: name = cg.getBaseFileName(fN, naked = True)
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #wig for degradome NOTE:!!! change lStrand == '1' to '-1' for Bracken!
    if switchStrand:
        endStrand = '1'
    else:
        endStrand = '-1'

    #create hitmap of chrom and strand
    hitDict = {}  #format = chr: { strand : { coord : value
    with open(fN, 'r') as f:  # closed even if a line fails to parse
        f.readline()  #header...file might not have one but its one read...
        for line in f:
            lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)

            readCount = 1
            if normalized:
                # column 7 is only needed (and only parsed) when normalizing
                numPlacesMapped = int(line.strip().split('\t')[6]) + 1
                readCount = float(readCount) / numPlacesMapped

            if lChrom not in cg.acceptableChroms:
                continue

            if degWig:
                if lStrand == endStrand:
                    i = end
                else:
                    i = start + 1
                strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                strandHits[i] = strandHits.get(i, 0) + readCount
            elif start < end:
                #wig for regular
                strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                for i in range(start, end):
                    strandHits[i] = strandHits.get(i, 0) + readCount

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 19
0
def createMultiTrack(dirName, organism):
    '''Merge all mapped tracks of one organism in dirName into two wig files.

    dirName  -- searched with cg.recurseDir for files ending in '.mapped';
                the output goes to <dirName>/Merge.<organism>.1.wig and
                <dirName>/Merge.<organism>.-1.wig.
    organism -- 'human', 'mouse' or 'zebrafish'.

    Only files whose meta-file entry (keyed by naked base name, field 1)
    equals organism are merged. The coverage is written as bedGraph, then
    passed to updateWigLength and cgSort.wigSort.

    Raises ValueError for an unsupported organism.
    '''
    assemblyByOrganism = {'human': 'hg19', 'mouse': 'mm9', 'zebrafish': 'danRer6'}
    # Fail up front: an unsupported organism used to surface only after the
    # directory scan, as unbound chroms/assembly.
    if organism not in assemblyByOrganism:
        raise ValueError('unsupported organism %r' % (organism,))
    assembly = assemblyByOrganism[organism]

    mainConf = c.cgConfig('Main.conf')
    metaFileName = mainConf.conf['metaFileName']
    # Parsed once instead of once per mapped file.
    metaDict = cg.getMetaFileDict(metaFileName)

    fileList = []
    for mappedFile in cg.recurseDir(dirName, end = '.mapped'):

        #check if mouse or human SHOULD PUT INTO A STD FUNCTION FOR META FILE
        baseFName = cg.getBaseFileName(mappedFile, naked= True)

        #must check due to files not in metaFile
        if baseFName not in metaDict:
            print('  NO org (not in meta file) %s' % (mappedFile,))
            continue

        org = metaDict[baseFName][1]
        if org == 'NONE':
            print('  NO ORG KNOWN FOR %s' % (mappedFile,))
            continue
        #only make wig file for organism asked for
        if org != organism:
            print('  NOT ORGANISM RUNNING %s' % (mappedFile,))
            continue

        #if it is right organism and has mapped file then add
        print('  USING ORG %s %s' % (org, mappedFile))
        fileList.append(mappedFile)

    #make merged wig
    if organism == 'human':
        chroms = cg.humanChromosomes
    elif organism == 'mouse':
        chroms = cg.mouseChromosomes
    else:
        chroms = cg.zebrafishChromosomes

    print('Making Bed File vectors')
    cvg = HTSeq.GenomicArray(chroms, stranded=True, typecode='i')
    for fName in fileList:
        alignment_file = HTSeq.BowtieReader(fName)
        for alngt in alignment_file:
            if alngt.aligned:
                cvg.add_value( 1, alngt.iv ) #iv is the genomic interval..

    bedNamePos = dirName + '/Merge.' + organism + '.1.wig'
    bedNameNeg = dirName + '/Merge.' + organism + '.-1.wig'

    print('Writing Bed File')
    cvg.write_bedgraph_file(bedNamePos, "+" )
    cvg.write_bedgraph_file(bedNameNeg, "-" )

    #Now extend it
    updateWigLength(bedNamePos, assembly)
    updateWigLength(bedNameNeg, assembly)

    #Now Sort it.
    cgSort.wigSort(bedNamePos)
    cgSort.wigSort(bedNameNeg)
Exemplo n.º 20
0

#make lib:hits dict
# Layout read below: a line that does not start with a tab holds an ID;
# the tab-indented lines after it hold "<lib>\t<hits>" for that ID.
tissueHits = {}  #cID: {lib: hits}
cID = 'NONE'
# 'with' closes the data file; it used to stay open for the rest of the run.
with open('/home/chrisgre/scripts/readDensity/individual.densities.data', 'r') as densityFile:
    for line in densityFile:
        if line.startswith('\t'): #lib: hit
            fields = line.strip().split('\t')
            l = fields[0]
            hits = int(fields[1])
            # setdefault: a hit line before any ID line is filed under the
            # 'NONE' placeholder instead of raising KeyError
            tissueHits.setdefault(cID, {})[l] = hits
        else:
            cID = line.strip()
            tissueHits[cID] = {}

#sum the hits per tissue over all mouse libraries of the requested IDs
tissueHist = {}  #tissue: hits
for mirID in mirIDs:
    if mirID not in tissueHits:
        continue
    for smallLib in tissueHits[mirID]:
        smallName = cg.getBaseFileName(smallLib, naked = True)
        if smallName not in metaDict:
            continue
        metaEntry = metaDict[smallName]
        #field 1 is compared with the organism, field 3 (if present) is used as tissue
        if metaEntry[1] == 'mouse' and len(metaEntry) > 3:
            t = metaEntry[3]
            tissueHist[t] = tissueHist.get(t, 0) + tissueHits[mirID][smallLib]

print(tissueHist)
Exemplo n.º 21
0
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

#collect what is already in the meta file so no duplicates are added
#(the meta file is only opened here, nothing is read from it yet)
fileDict = {}  #filename...
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end='.fastq')

for file in countFiles:
    pathParts = file.strip().split('/')
    fileName = pathParts[-1]
    if fileName.count('.') > 1:  #has to specifically end in fastq...
        continue
    fileName = cg.getBaseFileName(file, naked=True)
    dir = pathParts[-2]

    #organism is guessed from the parent directory name; when several
    #names occur in it, the last one in this tuple wins
    org = 'NONE'
    for organismName in ('human', 'mouse', 'pig', 'dog', 'rat', 'zebrafish'):
        if organismName in dir:
            org = organismName
Exemplo n.º 22
0
    if metaDict[baseFName][1] == organism:
        organismFileList.append(baseFName)

#sum, per prediction ID, the small-RNA hits of the organism's libraries
pCount = {}
smallFile = open(smallFileName, 'r')

currID = 'NONE'
for line in smallFile:
    if '\t' in line:  #library and count info --> add to the current ID
        libFields = line.strip().split('\t')
        lib = libFields[0]
        count = int(libFields[1])

        if cg.getBaseFileName(lib, naked=True) in organismFileList:
            pCount[currID] = pCount.get(currID, 0) + count
    else:  #no tab: this line holds the ID --> store another ID
        currID = line.strip()

#update the file --> any line with kmer on it give it the count
newLines = []
predFile = open(pFileName, 'r')
for line in predFile:
    #kmer is the part of the first column before the first dot
    kmer = line.strip().split('\t')[0].split('.')[0]
    numSmall = pCount.get(kmer, 0)
    newLine = line.strip().split('\t')
Exemplo n.º 23
0
#make lib:hits dict
# Layout read below: a line that does not start with a tab holds an ID;
# the tab-indented lines after it hold "<lib>\t<hits>" for that ID.
tissueHits = {}  #cID: {lib: hits}
cID = 'NONE'
# 'with' closes the data file; it used to stay open for the rest of the run.
with open('/home/chrisgre/scripts/readDensity/individual.densities.data',
          'r') as densityFile:
    for line in densityFile:
        if line.startswith('\t'):  #lib: hit
            fields = line.strip().split('\t')
            l = fields[0]
            hits = int(fields[1])
            # setdefault: a hit line before any ID line is filed under the
            # 'NONE' placeholder instead of raising KeyError
            tissueHits.setdefault(cID, {})[l] = hits
        else:
            cID = line.strip()
            tissueHits[cID] = {}

#sum the hits per tissue over all mouse libraries of the requested IDs
tissueHist = {}  #tissue: hits
for mirID in mirIDs:
    if mirID not in tissueHits:
        continue
    for smallLib in tissueHits[mirID]:
        smallName = cg.getBaseFileName(smallLib, naked=True)
        if smallName not in metaDict:
            continue
        metaEntry = metaDict[smallName]
        #field 1 is compared with the organism, field 3 (if present) is used as tissue
        if metaEntry[1] == 'mouse' and len(metaEntry) > 3:
            t = metaEntry[3]
            tissueHist[t] = tissueHist.get(t, 0) + tissueHits[mirID][smallLib]

print(tissueHist)
Exemplo n.º 24
0
def makeWigMem(fN,
               assembly,
               format=None,
               name=None,
               directory=None,
               degWig=False,
               switchStrand=True,
               normalized=False):
    '''Build an in-memory hit map from a mapped-read file and write wig files.

    format assumes bowtie.
    Suitable for small mapped files (everything is held in memory).

    fN           -- path of the mapped-read file; its first line is always
                    skipped as a header.
    assembly     -- passed through to writeWigFromHitDict.
    format       -- parser name for returnParserFunction (default 'Bowtie').
    name         -- track/file name prefix (default: naked base name of fN).
    directory    -- output directory, passed through to writeWigFromHitDict.
    degWig       -- when true each read adds to a single coordinate (degradome
                    wig) instead of to every coordinate in [start, end).
    switchStrand -- does not switch the strands; it selects which strand has
                    its hit recorded at end ('1' when true, '-1' when false),
                    the other strand being recorded at start + 1. Meant for
                    data that is backwards (HeLa) so the peak is put in the
                    right spot.
    normalized   -- when true each read counts 1 / (column 7 of the line + 1)
                    instead of 1.
    '''
    print('degWig Value %s' % (degWig,))
    print('switch strands? %s' % (switchStrand,))
    if not name: name = cg.getBaseFileName(fN, naked=True)
    if not format: format = 'Bowtie'
    parserFunction = returnParserFunction(format)

    #wig for degradome NOTE:!!! change lStrand == '1' to '-1' for Bracken!
    if switchStrand:
        endStrand = '1'
    else:
        endStrand = '-1'

    #create hitmap of chrom and strand
    hitDict = {}  #format = chr: { strand : { coord : value
    with open(fN, 'r') as f:  # closed even if a line fails to parse
        f.readline()  #header...file might not have one but its one read...
        for line in f:
            lChrom, lStrand, start, end = cg.tccSplit(parserFunction(line))
            lStrand = str(lStrand)
            start = int(start)
            end = int(end)

            readCount = 1
            if normalized:
                # column 7 is only needed (and only parsed) when normalizing
                numPlacesMapped = int(line.strip().split('\t')[6]) + 1
                readCount = float(readCount) / numPlacesMapped

            if lChrom not in cg.acceptableChroms:
                continue

            if degWig:
                if lStrand == endStrand:
                    i = end
                else:
                    i = start + 1
                strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                strandHits[i] = strandHits.get(i, 0) + readCount
            elif start < end:
                #wig for regular
                strandHits = hitDict.setdefault(lChrom, {}).setdefault(lStrand, {})
                for i in range(start, end):
                    strandHits[i] = strandHits.get(i, 0) + readCount

    #write results to wig file
    writeWigFromHitDict(hitDict, assembly, name, directory)
Exemplo n.º 25
0
direc = '/home/chrisgre/apps/projects/small.rna.libs'
metaFileName = direc + '/' + 'small.meta'

#collect what is already in the meta file so no duplicates are added
#(the meta file is only opened here, nothing is read from it yet)
fileDict = {} #filename...
metaFile = open(metaFileName, 'r')

#add new entries
countFiles = cg.recurseDir(direc, end = '.fastq')

for file in countFiles:
	pathParts = file.strip().split('/')
	fileName = pathParts[-1]
	if fileName.count('.') > 1: #has to specifically end in fastq...
		continue
	fileName = cg.getBaseFileName(file, naked = True)
	dir = pathParts[-2]

	#organism is guessed from the parent directory name; when several
	#names occur in it, the last one in this tuple wins
	org = 'NONE'
	for organismName in ('human', 'mouse', 'pig', 'dog', 'rat', 'zebrafish'):
		if organismName in dir:
			org = organismName
Exemplo n.º 26
0
	if metaDict[baseFName][1] == organism:
		organismFileList.append(baseFName)

#sum, per prediction ID, the small-RNA hits of the organism's libraries
pCount = {}
smallFile = open(smallFileName, 'r')

currID = 'NONE'
for line in smallFile:
	if '\t' in line: #library and count info --> add to the current ID
		libFields = line.strip().split('\t')
		lib = libFields[0]
		count = int(libFields[1])

		if cg.getBaseFileName(lib, naked = True) in organismFileList:
			pCount[currID] = pCount.get(currID, 0) + count
	else: #no tab: this line holds the ID --> store another ID
		currID = line.strip()

#update the file --> any line with kmer on it give it the count
newLines = []
predFile = open(pFileName, 'r')
for line in predFile:
	#kmer is the part of the first column before the first dot
	kmer = line.strip().split('\t')[0].split('.')[0]
	numSmall = pCount.get(kmer, 0)
	newLine = line.strip().split('\t')