Exemplo n.º 1
0
def scanSequence(seqList, dirName):
	'''Collect every read line that matches the given sequence.

	seqList -- despite the name, it is used as ONE search key: it is handed
	           unchanged to the index's binary search, extendUp and check
	           function (the loop over several sequences is disabled).
	dirName -- directory searched with cg.recurseDir for files ending in
	           '.sequence'; exactly one such file is expected.

	Returns the list of matching lines (stripped).  Returns 1 after printing
	a message when more than one '.sequence' file is found.  Raises
	IndexError when no '.sequence' file is found (unchanged behaviour).
	'''
	fileNames = cg.recurseDir(dirName, end = '.sequence')
	if len(fileNames) > 1:
		print(fileNames)
		print('there is more than one sequence file in this directory')
		return 1
	fN = fileNames[0]

	seq = seqList
	fIndex = cgIndex.lineIndex(fN, header = False)
	try:
		fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
		# Places the file pointer at the beginning of a matching sequence line.
		fIndex.binarySearch(seq)

		# NOTE(review): presumably moves back to the first matching line so the
		# forward scan below sees every match -- confirm against cgIndex.
		fIndex.extendUp(seq)

		finalReads = []
		for line in fIndex.file:
			if fIndex.checkFunction(seq, line) != 0:
				break
			finalReads.append(line.strip())
		# Returned here as well when the matches run up to end of file; the
		# original fell off the loop in that case and returned None.
		return finalReads
	finally:
		# Release the handle opened by the index on every exit path.
		fIndex.file.close()
Exemplo n.º 2
0
def scanVectorsFile(fN, tccList):
	'''Scan one wig file and map every covered coordinate to its value.

	fN      -- path of the wig file; it is indexed with header = True.
	tccList -- iterable of tcc strings; each is split with cg.tccSplit into
	           chrom, strand, start and end.

	Returns a dict {coordinate: value}.  For every file line, columns 1 and 2
	are read as the span start/end and column 3 as the value (truncated at
	the decimal point); the span is clipped to the tcc and each coordinate in
	range(start, end) is assigned the value.
	'''
	coordDict = {} # coordinate: value
	for tcc in tccList:
		chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

		# NOTE (from the original author): the file really has a header and
		# this still has to be dealt with properly.
		fIndex = cgIndex.lineIndex(fN, header = True)
		try:
			fIndex.passCheckFunction(cgIndex.wigCheckFunction)
			# Places the file pointer at the line where the tcc begins.
			fIndex.binarySearch(tcc)

			for line in fIndex.file:
				fields = cg.ss(line) # parse the line once instead of three times
				lBeg = int(fields[1])
				lEnd = int(fields[2])
				lValue = int(fields[3].split('.')[0])

				if tccStart > lBeg:
					lBeg = tccStart
				# A span reaching past the tcc end is the last one of interest.
				stop = tccEnd < lEnd
				if stop:
					lEnd = tccEnd

				for i in range(lBeg, lEnd):
					coordDict[i] = lValue

				if stop:
					break
		finally:
			# One index (and file handle) is opened per tcc; close it so
			# handles do not pile up over a long tccList.
			fIndex.file.close()
	return coordDict
Exemplo n.º 3
0
def scanSequence(seqList, dirName):
    '''Collect every read line that matches the given sequence.

    seqList -- despite the name, it is used as ONE search key: it is handed
               unchanged to the index's binary search, extendUp and check
               function (the loop over several sequences is disabled).
    dirName -- directory searched with cg.recurseDir for files ending in
               '.sequence'; exactly one such file is expected.

    Returns the list of matching lines (stripped).  Returns 1 after printing
    a message when more than one '.sequence' file is found.  Raises
    IndexError when no '.sequence' file is found (unchanged behaviour).
    '''
    fileNames = cg.recurseDir(dirName, end='.sequence')
    if len(fileNames) > 1:
        print(fileNames)
        print('there is more than one sequence file in this directory')
        return 1
    fN = fileNames[0]

    seq = seqList
    fIndex = cgIndex.lineIndex(fN, header=False)
    try:
        fIndex.passCheckFunction(cgIndex.mapSequenceCheckFunction)
        # Places the file pointer at the beginning of a matching sequence line.
        fIndex.binarySearch(seq)

        # NOTE(review): presumably moves back to the first matching line so
        # the forward scan below sees every match -- confirm against cgIndex.
        fIndex.extendUp(seq)

        finalReads = []
        for line in fIndex.file:
            if fIndex.checkFunction(seq, line) != 0:
                break
            finalReads.append(line.strip())
        # Returned here as well when the matches run up to end of file; the
        # original fell off the loop in that case and returned None.
        return finalReads
    finally:
        # Release the handle opened by the index on every exit path.
        fIndex.file.close()
Exemplo n.º 4
0
def scanVectorsFile(fN, tccList):
	'''Scan one wig file and map every covered coordinate to its value.

	fN      -- path of the wig file; it is indexed with header = True.
	tccList -- iterable of tcc strings; each is split with cg.tccSplit into
	           chrom, strand, start and end.

	Returns a dict {coordinate: value}.  For every file line, columns 1 and 2
	are read as the span start/end and column 3 as the value (truncated at
	the decimal point); the span is clipped to the tcc and each coordinate in
	range(start, end) is assigned the value.
	'''
	coordDict = {} # coordinate: value
	for tcc in tccList:
		chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

		# NOTE (from the original author): the file really has a header and
		# this still has to be dealt with properly.
		fIndex = cgIndex.lineIndex(fN, header = True)
		try:
			fIndex.passCheckFunction(cgIndex.wigCheckFunction)
			# Places the file pointer at the line where the tcc begins.
			fIndex.binarySearch(tcc)

			for line in fIndex.file:
				fields = cg.ss(line) # parse the line once instead of three times
				lBeg = int(fields[1])
				lEnd = int(fields[2])
				lValue = int(fields[3].split('.')[0])

				if tccStart > lBeg:
					lBeg = tccStart
				# A span reaching past the tcc end is the last one of interest.
				stop = tccEnd < lEnd
				if stop:
					lEnd = tccEnd

				for i in range(lBeg, lEnd):
					coordDict[i] = lValue

				if stop:
					break
		finally:
			# One index (and file handle) is opened per tcc; close it so
			# handles do not pile up over a long tccList.
			fIndex.file.close()
	return coordDict
Exemplo n.º 5
0
def svCoord(tccList, config = None):
	'''Look up wig values for every coordinate covered by the given tccs.

	tccList -- iterable of tcc strings, each split with cg.tccSplit.
	config  -- passed through c.getConfig; its conf mapping supplies
	           'organism', 'wigSetDir', 'wigSetName' and 'wigChromSplit'
	           (the string 'True' selects one wig file per chromosome).

	Returns a dict {coordinate: value}.  The start column of each line is
	shifted by +1 and the end coordinate is included in the assigned range.
	'''
	config = c.getConfig(config)
	settings = config.conf
	org = settings['organism']
	wigDir = settings['wigSetDir']
	wigSetName = settings['wigSetName']
	# The setting is stored as text; only the exact string 'True' enables it.
	splitIntoChroms = settings['wigChromSplit'] == 'True'

	coordDict = {} # coordinate: value
	for tcc in tccList:
		chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)

		if splitIntoChroms:
			baseName = '/%s.%s.%s.wig' % (wigSetName, chrom, strand)
		else:
			baseName = '/Merge.%s.%s.wig' % (org.lower(), strand)
		fN = wigDir + baseName

		fIndex = cgIndex.lineIndex(fN, header = True)
		fIndex.passCheckFunction(cgIndex.wigCheckFunction)
		# Places the file pointer at the line where the tcc begins.
		fIndex.binarySearch(tcc)

		for line in fIndex.file:
			columns = cg.ss(line)
			spanFirst = int(columns[1]) + 1
			spanLast = int(columns[2])
			spanValue = int(columns[3].split('.')[0])

			# A span reaching past the tcc end is the last one of interest.
			reachedEnd = tccEnd < spanLast
			if reachedEnd:
				spanLast = tccEnd
			if spanFirst < tccStart:
				spanFirst = tccStart

			for pos in range(spanFirst, spanLast + 1):
				coordDict[pos] = spanValue

			if reachedEnd:
				break
		# Close the file and the index after use.
		fIndex.close()

	return coordDict
Exemplo n.º 6
0
    def save(self, outFN=None):
        '''Write the data file back out with the selected attributes updated.

        outFN -- output path; defaults to self._dataFileName.  When a range
                 is specified, '.range.<first>.<last>' is appended to it.

        With a range specified, only rows from the range start (located via
        the line index) up to and including the range end are written, and a
        '<outFN>.exitSignal' file containing 'DONE' is created afterwards as
        the exit signal for parallel processes.  Without a range every row
        is written.

        For each row, every attribute in self._selectedAttNames that has an
        in-memory value for the row id is cast with its casteTo function and
        placed in its column via lineUpdate.  Rows whose id has no in-memory
        value (e.g. removed by load() conditions) are written unchanged
        instead of raising KeyError.
        '''
        if outFN is None:
            outFN = self._dataFileName

        if self._rangeSpecified:
            outFN += '.range.%s.%s' % (self._rangeSpecified[0],
                                       self._rangeSpecified[1])
            # Skip to the start of the specified range.
            fIndex = cgIndex.lineIndex(self._dataFileName)
            fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
            fIndex.binarySearch(self._rangeSpecified[0])
            f = fIndex.file
        else:
            f = open(self._dataFileName, 'r')

        # Build the complete new contents in memory first: outFN may be the
        # very file being read.
        newLines = []
        try:
            for line in f:
                ls = line.strip().split('\t')
                rowID = int(ls[0])

                # Stop once past the end of the specified range.
                if self._rangeSpecified and rowID > self._rangeSpecified[1]:
                    break

                for attName in self._selectedAttNames:
                    loaded = self._attName_id_value[attName]
                    if rowID not in loaded:
                        # Nothing loaded for this row: keep its column as is.
                        continue
                    newVal = self._attName_casteToFxn[attName](loaded[rowID])
                    ls = lineUpdate(ls, newVal,
                                    self._attName_columnPosition[attName])

                newLines.append('%s\n' % '\t'.join(ls))
        finally:
            f.close()

        out = open(outFN, 'w')
        try:
            out.writelines(newLines)
        finally:
            out.close()

        # Exit signal for parallel processes.
        if self._rangeSpecified:
            signal = open(outFN + '.exitSignal', 'w')
            try:
                signal.write('DONE')
            finally:
                signal.close()
Exemplo n.º 7
0
	def save(self, outFN = None):
		'''Write the data file back out with the selected attributes updated.

		outFN -- output path; defaults to self._dataFileName.  When a range
		         is specified, '.range.<first>.<last>' is appended to it.

		With a range specified, only rows from the range start (located via
		the line index) up to and including the range end are written, and a
		'<outFN>.exitSignal' file containing 'DONE' is created afterwards as
		the exit signal for parallel processes.  Without a range every row
		is written.

		For each row, every attribute in self._selectedAttNames that has an
		in-memory value for the row id is cast with its casteTo function and
		placed in its column via lineUpdate.  Rows whose id has no in-memory
		value (e.g. removed by load() conditions) are written unchanged
		instead of raising KeyError.
		'''
		if outFN is None:
			outFN = self._dataFileName

		if self._rangeSpecified:
			outFN += '.range.%s.%s' % (self._rangeSpecified[0], self._rangeSpecified[1])
			# Skip to the start of the specified range.
			fIndex = cgIndex.lineIndex(self._dataFileName)
			fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
			fIndex.binarySearch(self._rangeSpecified[0])
			f = fIndex.file
		else:
			f = open(self._dataFileName, 'r')

		# Build the complete new contents in memory first: outFN may be the
		# very file being read.
		newLines = []
		try:
			for line in f:
				ls = line.strip().split('\t')
				rowID = int(ls[0])

				# Stop once past the end of the specified range.
				if self._rangeSpecified and rowID > self._rangeSpecified[1]:
					break

				for attName in self._selectedAttNames:
					loaded = self._attName_id_value[attName]
					if rowID not in loaded:
						# Nothing loaded for this row: keep its column as is.
						continue
					newVal = self._attName_casteToFxn[attName](loaded[rowID])
					ls = lineUpdate(ls, newVal, self._attName_columnPosition[attName])

				newLines.append('%s\n' % '\t'.join(ls))
		finally:
			f.close()

		out = open(outFN, 'w')
		try:
			out.writelines(newLines)
		finally:
			out.close()

		# Exit signal for parallel processes.
		if self._rangeSpecified:
			signal = open(outFN + '.exitSignal', 'w')
			try:
				signal.write('DONE')
			finally:
				signal.close()
Exemplo n.º 8
0
def scanVectorsOrganism(tccList, config=None):
    '''Scan the organism's merged wig files and map coordinates to values.

    tccList -- iterable of tcc strings, each split with cg.tccSplit.
    config  -- passed through c.getConfig; its conf supplies 'organism'.
               The wig directory is read from 'Main.conf' under the key
               'wig<organism>'.

    Returns a dict {coordinate: value}.  For every file line, columns 1 and
    2 are read as the span start/end and column 3 as the value (truncated at
    the decimal point); the span is clipped to the tcc and each coordinate
    in range(start, end) is assigned the value.
    '''
    config = c.getConfig(config)

    # None of this depends on the tcc, so resolve it once rather than
    # re-reading Main.conf for every entry of tccList.
    org = config.conf['organism']
    mConf = c.getConfig('Main.conf')
    wigDir = mConf.conf['wig%s' % org]

    coordDict = {}  # coordinate: value
    for tcc in tccList:
        chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
        fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

        # NOTE (from the original author): the file really has a header and
        # this still has to be dealt with properly.
        fIndex = cgIndex.lineIndex(fN, header=True)
        try:
            fIndex.passCheckFunction(cgIndex.wigCheckFunction)
            # Places the file pointer at the line where the tcc begins.
            fIndex.binarySearch(tcc)

            for line in fIndex.file:
                fields = cg.ss(line)  # parse the line once, not three times
                lBeg = int(fields[1])
                lEnd = int(fields[2])
                lValue = int(fields[3].split('.')[0])

                if tccStart > lBeg:
                    lBeg = tccStart
                # A span reaching past the tcc end is the last one needed.
                stop = tccEnd < lEnd
                if stop:
                    lEnd = tccEnd

                for i in range(lBeg, lEnd):
                    coordDict[i] = lValue

                if stop:
                    break
        finally:
            # One index (and file handle) is opened per tcc; close it so
            # handles do not pile up over a long tccList.
            fIndex.file.close()
    return coordDict
Exemplo n.º 9
0
def scanVectorsOrganism(tccList, config = None):
	'''Scan the organism's merged wig files and map coordinates to values.

	tccList -- iterable of tcc strings, each split with cg.tccSplit.
	config  -- passed through c.getConfig; its conf supplies 'organism'.
	           The wig directory is read from 'Main.conf' under the key
	           'wig<organism>'.

	Returns a dict {coordinate: value}.  For every file line, columns 1 and
	2 are read as the span start/end and column 3 as the value (truncated at
	the decimal point); the span is clipped to the tcc and each coordinate
	in range(start, end) is assigned the value.
	'''
	config = c.getConfig(config)

	# None of this depends on the tcc, so resolve it once rather than
	# re-reading Main.conf for every entry of tccList.
	org = config.conf['organism']
	mConf = c.getConfig('Main.conf')
	wigDir = mConf.conf['wig%s' % org]

	coordDict = {} # coordinate: value
	for tcc in tccList:
		chrom, strand, tccStart, tccEnd = cg.tccSplit(tcc)
		fN = wigDir + '/Merge.%s.%s.wig.%s.wig' % (org.lower(), strand, chrom)

		# NOTE (from the original author): the file really has a header and
		# this still has to be dealt with properly.
		fIndex = cgIndex.lineIndex(fN, header = True)
		try:
			fIndex.passCheckFunction(cgIndex.wigCheckFunction)
			# Places the file pointer at the line where the tcc begins.
			fIndex.binarySearch(tcc)

			for line in fIndex.file:
				fields = cg.ss(line) # parse the line once, not three times
				lBeg = int(fields[1])
				lEnd = int(fields[2])
				lValue = int(fields[3].split('.')[0])

				if tccStart > lBeg:
					lBeg = tccStart
				# A span reaching past the tcc end is the last one needed.
				stop = tccEnd < lEnd
				if stop:
					lEnd = tccEnd

				for i in range(lBeg, lEnd):
					coordDict[i] = lValue

				if stop:
					break
		finally:
			# One index (and file handle) is opened per tcc; close it so
			# handles do not pile up over a long tccList.
			fIndex.file.close()
	return coordDict
Exemplo n.º 10
0
def scanCoord(tcc, dirName):
    '''Collect every read line that falls inside the given tcc.

    tcc     -- coordinate string, split with cg.tccSplit into chrom, strand,
               start and end.
    dirName -- directory searched with cg.recurseDir for '.starts' files;
               the one whose name contains '<chrom>.<strand>' is used (the
               last such file if several match).

    Returns the list of matching lines (stripped), or 0 after printing a
    message when no index file matches the chromosome/strand.
    '''
    fileNames = cg.recurseDir(dirName, end='.starts')

    # Pick the index file for this chromosome/strand.
    chrom, strand, start, end = cg.tccSplit(tcc)
    nameCheck = '%s.%s' % (chrom, strand)
    fN = None
    for fileName in fileNames:
        if nameCheck in fileName:
            fN = fileName
    if fN is None:
        print('No Index file for %s' % nameCheck)
        return 0

    fIndex = cgIndex.lineIndex(fN, header=False)
    try:
        fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
        # Places the file pointer at the beginning of a line.
        fIndex.binarySearch(tcc, skipEnd=True)

        # Check whether the search landed one line early.  The range check
        # function installed here is also the one used while extending and
        # scanning below.
        checkLine = fIndex.getLineFromByte(fIndex.currentByte)
        fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction)
        if fIndex.checkFunction(tcc, checkLine) != 0:
            fIndex.file.readline()
        fIndex.currentByte = fIndex.file.tell()

        # Extend up until out of range, then read down while in range.
        fIndex.extendUp(tcc)

        finalReads = []
        for line in fIndex.file:
            if fIndex.checkFunction(tcc, line) != 0:
                break
            finalReads.append(line.strip())
        # Returned here as well when the matches run up to end of file; the
        # original fell off the loop in that case and returned None.
        return finalReads
    finally:
        # Release the handle opened by the index on every exit path.
        fIndex.file.close()
Exemplo n.º 11
0
def scanCoord(tcc, dirName):
	'''Collect every read line that falls inside the given tcc.

	tcc     -- coordinate string, split with cg.tccSplit into chrom, strand,
	           start and end.
	dirName -- directory searched with cg.recurseDir for '.starts' files;
	           the one whose name contains '<chrom>.<strand>' is used (the
	           last such file if several match).

	Returns the list of matching lines (stripped), or 0 after printing a
	message when no index file matches the chromosome/strand.
	'''
	fileNames = cg.recurseDir(dirName, end = '.starts')

	# Pick the index file for this chromosome/strand.
	chrom, strand, start, end = cg.tccSplit(tcc)
	nameCheck = '%s.%s' % (chrom, strand)
	fN = None
	for fileName in fileNames:
		if nameCheck in fileName:
			fN = fileName
	if fN is None:
		print('No Index file for %s' % nameCheck)
		return 0

	fIndex = cgIndex.lineIndex(fN, header = False)
	try:
		fIndex.passCheckFunction(cgIndex.mapStartCheckFunction)
		# Places the file pointer at the beginning of a line.
		fIndex.binarySearch(tcc, skipEnd = True)

		# Check whether the search landed one line early.  The range check
		# function installed here is also the one used while extending and
		# scanning below.
		checkLine = fIndex.getLineFromByte(fIndex.currentByte)
		fIndex.passCheckFunction(cgIndex.mapStartRangeCheckFunction)
		if fIndex.checkFunction(tcc, checkLine) != 0:
			fIndex.file.readline()
		fIndex.currentByte = fIndex.file.tell()

		# Extend up until out of range, then read down while in range.
		fIndex.extendUp(tcc)

		finalReads = []
		for line in fIndex.file:
			if fIndex.checkFunction(tcc, line) != 0:
				break
			finalReads.append(line.strip())
		# Returned here as well when the matches run up to end of file; the
		# original fell off the loop in that case and returned None.
		return finalReads
	finally:
		# Release the handle opened by the index on every exit path.
		fIndex.file.close()
Exemplo n.º 12
0
	def load(self, attNames, paraInfo = [None, None], idRange = [], conditions = {}):
		'''Read the requested attribute columns of the data file into memory.

		attNames   -- attribute names to load; each gets a fresh {id: value}
		              dict in self._attName_id_value and is bound with
		              self.bindAttribute at the end.
		paraInfo   -- [runNumber, numberOfRuns].  Checked first: when it is
		              not [None, None], idRange is replaced by
		              getIDRange(paraInfo, self._dataFileName).
		idRange    -- ids to load; its first and last elements are stored in
		              self._rangeSpecified and reading stops at the first row
		              whose id exceeds the last element.
		conditions -- {attName or 'ID': predicate}.  A row is kept, and its id
		              added to self._selectedIDs, only when every predicate
		              accepts it; otherwise the values just loaded for that
		              row are removed again.

		A column holding '.', or a column the row does not have, yields a copy
		of the attribute's default value.

		The list/dict defaults are never mutated here; note that
		self._conditions keeps a reference to the dict that was passed.
		'''
		self._conditions = conditions

		# If running a parallel job, split it into the right ids.
		if paraInfo != [None, None]:
			idRange = getIDRange(paraInfo, self._dataFileName)

		# If running parallel or on a specific range, mark the range info.
		self._selectedAttNames = attNames
		if idRange:
			self._rangeSpecified = [idRange[0], idRange[-1]]

		# Get casting and column info.
		self.loadTranscriptionInfo(attNames)

		# Initialize the master dict.
		for attName in attNames:
			self._attName_id_value[attName] = {}

		# Number of slots, taken from the first line of the file.
		f = open(self._dataFileName, 'r')
		try:
			numSlots = len(f.readline().split('\t'))
		finally:
			f.close()

		# Skip to the start of the specified range.
		if self._rangeSpecified:
			fIndex = cgIndex.lineIndex(self._dataFileName)
			fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
			fIndex.binarySearch(self._rangeSpecified[0])
			f = fIndex.file
		else:
			f = open(self._dataFileName, 'r')

		try:
			for line in f:
				ls = line.strip().split('\t')
				rowID = int(ls[0]) # id is always the first slot

				# Only transcribe the selected range.  The upper bound is the
				# LAST element of idRange, matching self._rangeSpecified.
				if idRange and rowID > idRange[-1]:
					break

				# Transcribe.
				for attName in attNames:
					col = self._attName_columnPosition[attName]
					# The row itself must have the column too; the first line
					# alone is not a reliable guide for every row.
					if col < numSlots and col < len(ls) and ls[col] != '.':
						value = self._attName_casteFromFxn[attName](ls[col])
					else:
						value = copy(self._attName_defaultValue[attName])
					self._attName_id_value[attName][rowID] = value

				# Conditions: decide once per row, after checking all of them,
				# so a rejected row is removed exactly once and never ends up
				# in self._selectedIDs.
				if conditions:
					keep = True
					for attName in conditions:
						if attName == 'ID':
							subject = rowID
						else:
							subject = self._attName_id_value[attName][rowID]
						if not conditions[attName](subject):
							keep = False
							break
					if keep:
						self._selectedIDs.add(rowID)
					else:
						for aName in attNames:
							self._attName_id_value[aName].pop(rowID, None)
		finally:
			f.close()

		# Bind attribute names to dictionaries.
		for attName in attNames:
			self.bindAttribute(attName)
Exemplo n.º 13
0
    def load(self, attNames, paraInfo=[None, None], idRange=[], conditions={}):
        '''Read the requested attribute columns of the data file into memory.

        attNames   -- attribute names to load; each gets a fresh {id: value}
                      dict in self._attName_id_value and is bound with
                      self.bindAttribute at the end.
        paraInfo   -- [runNumber, numberOfRuns].  Checked first: when it is
                      not [None, None], idRange is replaced by
                      getIDRange(paraInfo, self._dataFileName).
        idRange    -- ids to load; its first and last elements are stored in
                      self._rangeSpecified and reading stops at the first row
                      whose id exceeds the last element.
        conditions -- {attName or 'ID': predicate}.  A row is kept, and its
                      id added to self._selectedIDs, only when every
                      predicate accepts it; otherwise the values just loaded
                      for that row are removed again.

        A column holding '.', or a column the row does not have, yields a
        copy of the attribute's default value.

        The list/dict defaults are never mutated here; note that
        self._conditions keeps a reference to the dict that was passed.
        '''
        self._conditions = conditions

        # If running a parallel job, split it into the right ids.
        if paraInfo != [None, None]:
            idRange = getIDRange(paraInfo, self._dataFileName)

        # If running parallel or on a specific range, mark the range info.
        self._selectedAttNames = attNames
        if idRange:
            self._rangeSpecified = [idRange[0], idRange[-1]]

        # Get casting and column info.
        self.loadTranscriptionInfo(attNames)

        # Initialize the master dict.
        for attName in attNames:
            self._attName_id_value[attName] = {}

        # Number of slots, taken from the first line of the file.
        f = open(self._dataFileName, 'r')
        try:
            numSlots = len(f.readline().split('\t'))
        finally:
            f.close()

        # Skip to the start of the specified range.
        if self._rangeSpecified:
            fIndex = cgIndex.lineIndex(self._dataFileName)
            fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
            fIndex.binarySearch(self._rangeSpecified[0])
            f = fIndex.file
        else:
            f = open(self._dataFileName, 'r')

        try:
            for line in f:
                ls = line.strip().split('\t')
                rowID = int(ls[0])  # id is always the first slot

                # Only transcribe the selected range.  The upper bound is the
                # LAST element of idRange, matching self._rangeSpecified.
                if idRange and rowID > idRange[-1]:
                    break

                # Transcribe.
                for attName in attNames:
                    col = self._attName_columnPosition[attName]
                    # The row itself must have the column too; the first line
                    # alone is not a reliable guide for every row.
                    if col < numSlots and col < len(ls) and ls[col] != '.':
                        value = self._attName_casteFromFxn[attName](ls[col])
                    else:
                        value = copy(self._attName_defaultValue[attName])
                    self._attName_id_value[attName][rowID] = value

                # Conditions: decide once per row, after checking all of
                # them, so a rejected row is removed exactly once and never
                # ends up in self._selectedIDs.
                if conditions:
                    keep = True
                    for attName in conditions:
                        if attName == 'ID':
                            subject = rowID
                        else:
                            subject = self._attName_id_value[attName][rowID]
                        if not conditions[attName](subject):
                            keep = False
                            break
                    if keep:
                        self._selectedIDs.add(rowID)
                    else:
                        for aName in attNames:
                            self._attName_id_value[aName].pop(rowID, None)
        finally:
            f.close()

        # Bind attribute names to dictionaries.
        for attName in attNames:
            self.bindAttribute(attName)
Exemplo n.º 14
0
import sys
import cgIndex

# Index the file named by the first command-line argument and seek to the
# line for primary id 4000000.
fIndex = cgIndex.lineIndex(sys.argv[1])
fIndex.passCheckFunction(cgIndex.primaryIDCheckFunction)
fIndex.binarySearch(4000000)

# Show the line the search landed on.  A single parenthesized argument prints
# the same text under the Python 2 print statement.
print(fIndex.file.readline())