예제 #1
0
    def __iter__(self):
        """Yield the weighted sum of two tracks, one position at a time.

        Both tracks (self.trackName1 and self.trackName2) are loaded for the
        whole chromosome self.chr of self.genome. For every index i of the
        first track's value array the generator yields
        self.w1 * vals1[i] + self.w2 * vals2[i].
        """
        chrom = self.chr
        firstName, secondName = self.trackName1, self.trackName2
        firstWeight, secondWeight = self.w1, self.w2
        genome = self.genome

        # A single region covering the full chromosome is shared by both tracks.
        chromLen = GenomeInfo.getChrLen(genome, chrom)
        wholeChrom = GenomeRegion(genome, chrom, 0, chromLen)

        firstVals = PlainTrack(firstName).getTrackView(wholeChrom).valsAsNumpyArray()
        secondVals = PlainTrack(secondName).getTrackView(wholeChrom).valsAsNumpyArray()

        # The length of the first track drives the iteration.
        for idx in xrange(len(firstVals)):
            yield firstWeight * firstVals[idx] + secondWeight * secondVals[idx]
예제 #2
0
def createAssemblyGapsFile(genome, assemblyChars='ACGTacgt'):
    """Write the assembly gaps of a genome to 'assemblyGaps.bed'.

    A gap is a maximal run of consecutive positions whose sequence character
    is not one of assemblyChars. Each gap is written as one tab-separated
    line "chr, start, end", where end is the index just past the last gap
    position.

    genome        -- genome whose sequence track is scanned, chromosome by
                     chromosome (GenomeInfo.getExtendedChrList).
    assemblyChars -- characters counted as assembled sequence; every other
                     character is treated as a gap.

    The file is created under the original-data path of the genome's 'gaps'
    property track. If no gap is found in any chromosome, a single
    placeholder line (first chromosome, start 1, end 1, no trailing newline)
    is written.
    """
    basePath = gcf.createOrigPath(genome, GenomeInfo.getPropertyTrackName(genome, 'gaps'), '')
    outFn = basePath + 'assemblyGaps.bed'
    qcf.ensurePathExists(outFn)

    # The with-block guarantees the file is closed even if reading the
    # sequence track or the gap computation raises.
    with open(outFn, 'w') as outFile:
        seqTrack = PlainTrack(GenomeInfo.getSequenceTrackName(genome))

        anyGaps = False
        for chrom in GenomeInfo.getExtendedChrList(genome):
            chrRegion = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))
            seq = seqTrack.getTrackView(chrRegion).valsAsNumpyArray()

            # Positions holding any of the assembly characters ...
            isAssembled = numpy.logical_or.reduce([seq == x for x in assemblyChars])
            # ... and the indexes of all remaining (gap) positions.
            gapIndexes = numpy.arange(len(seq))[numpy.logical_not(isAssembled)]

            # A difference of 1 between neighbouring gap indexes means both
            # belong to the same run of gap positions.
            gapIndexDiff = gapIndexes[1:] - gapIndexes[:-1]
            continuesRun = gapIndexDiff == 1
            # A run begins at every gap index that does not follow its
            # predecessor directly ...
            gapBeginIndexes = numpy.delete(
                gapIndexes, (numpy.arange(len(gapIndexDiff)) + 1)[continuesRun])
            # ... and ends (exclusive) after every gap index that is not
            # directly followed by another one.
            gapEndIndexes = numpy.delete(
                gapIndexes + 1, numpy.arange(len(gapIndexDiff))[continuesRun])

            assert len(gapBeginIndexes) == len(gapEndIndexes)

            for begin, end in zip(gapBeginIndexes, gapEndIndexes):
                anyGaps = True
                # '\n' rather than os.linesep: the file is opened in text
                # mode, which already translates line endings.
                outFile.write('\t'.join([chrom, str(begin), str(end)]) + '\n')

        if not anyGaps:
            outFile.write('\t'.join([GenomeInfo.getExtendedChrList(genome)[0], '1', '1']))
    def __iter__(self):
        """Iterate over the position-wise weighted sum of two tracks.

        The values of self.trackName1 and self.trackName2 are read for the
        complete chromosome self.chr of self.genome; index i yields
        self.w1 * (first value at i) + self.w2 * (second value at i).
        """
        chrom = self.chr
        nameA, nameB, weightA, weightB, genome = (
            self.trackName1, self.trackName2, self.w1, self.w2, self.genome)

        fullChromRegion = GenomeRegion(
            genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))

        def loadValues(trackName):
            # Values of one track over the full chromosome, as a numpy array.
            view = PlainTrack(trackName).getTrackView(fullChromRegion)
            return view.valsAsNumpyArray()

        valuesA = loadValues(nameA)
        valuesB = loadValues(nameB)

        # Iteration length follows the first track.
        for position in xrange(len(valuesA)):
            yield weightA * valuesA[position] + weightB * valuesB[position]
 def getAnchor(genome, trackName):
     """Return str(genomeAnchor) of the track view of every chromosome.

     One track view spanning the whole chromosome is requested for each
     chromosome in GenomeInfo.getChrList(genome); the result list follows
     that chromosome order.
     """
     track = PlainTrack(trackName)

     def anchorOf(chrom):
         # Track view covering the chromosome from position 0 to its length.
         wholeChrom = GenomeRegion(genome, chrom, 0,
                                   GenomeInfo.getChrLen(genome, chrom))
         return str(track.getTrackView(wholeChrom).genomeAnchor)

     return [anchorOf(chrom) for chrom in GenomeInfo.getChrList(genome)]
 def getNumberElements(genome, trackName):
     """Return the number of track elements per chromosome.

     The count for a chromosome is the length of the start-position array of
     the track view spanning that whole chromosome. The result list follows
     the order of GenomeInfo.getChrList(genome).
     """
     track = PlainTrack(trackName)
     counts = []
     for chrom in GenomeInfo.getChrList(genome):
         wholeChrom = GenomeRegion(genome, chrom, 0,
                                   GenomeInfo.getChrLen(genome, chrom))
         view = track.getTrackView(wholeChrom)
         counts.append(len(view.startsAsNumpyArray()))

     return counts
예제 #6
0
    def execute(cls, choices, galaxyFn=None, username=''):
        """Write the expanded elements of a history item or a track to galaxyFn.

        choices[0] is the genome. If choices[1] is 'history', choices[2] is a
        colon-separated Galaxy track name whose second field is the file
        type: the file is copied to a temporary path, BED-like types are read
        through BedGenomeElementSource wrapped in GenomeElementSorter, and
        the temporary copy is removed afterwards. Otherwise choices[3] is a
        colon-separated track name that is read chromosome by chromosome,
        with the header flag set only for the first chromosome.

        All output goes through cls.WriteExpandedElementsToFile. The value of
        the username argument is not used.
        """
        from gold.util.RandomUtil import random

        outputFile = open(galaxyFn, 'w')
        # try/finally: the output file is closed even when reading or
        # writing elements fails.
        try:
            genome = choices[0]
            trackItem = choices[3]
            # NOTE(review): the returned path is not used below; the call is
            # kept in case it has side effects - confirm and remove.
            chromRegsPath = GenomeInfo.getChrRegsFn(genome)

            chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                               for chrom in GenomeInfo.getChrList(genome))
            geSource = headLinesStr = None
            if choices[1] == 'history':
                galaxyTN = choices[2].split(':')
                trackType = galaxyTN[1]
                # Temporary file name: six random lowercase letters followed
                # by the current local time fields.
                randomPrefix = ''.join(
                    [chr(random.randint(97, 122)) for i in range(6)])
                tempFn = createCollectedPath(
                    genome, [],
                    randomPrefix +
                    '_'.join([str(v) for v in time.localtime()[:6]]) +
                    '.' + trackType)
                fnSource = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)

                # Copy the history file; both handles are closed explicitly.
                with open(fnSource, 'r') as sourceFile:
                    contents = sourceFile.read()
                with open(tempFn, 'w') as tempFile:
                    tempFile.write(contents)

                # The temporary copy is removed even if processing fails.
                try:
                    if trackType in ['valued.bed', 'category.bed', 'bed']:
                        geSource = GenomeElementSorter(
                            BedGenomeElementSource(tempFn,
                                                   genome=genome)).__iter__()
                    # NOTE(review): for any other track type geSource stays
                    # None and is passed on as is - confirm that
                    # WriteExpandedElementsToFile accepts that.

                    cls.WriteExpandedElementsToFile(geSource,
                                                    chrSizeDict,
                                                    outputFile,
                                                    headLinesStr,
                                                    writeHeaderFlag=True)
                finally:
                    os.remove(tempFn)

            else:
                # The track does not depend on the chromosome, so it is
                # created once instead of once per loop pass.
                plTrack = PlainTrack(trackItem.split(':'))
                writeHeaderFlag = True
                for chrom in GenomeInfo.getChrList(genome):
                    gRegion = GenomeRegion(genome, chrom, 0,
                                           chrSizeDict[chrom])
                    geSource = GenomeElementTvWrapper(
                        plTrack.getTrackView(gRegion)).__iter__()
                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict,
                                                    outputFile, headLinesStr,
                                                    writeHeaderFlag)
                    # Only the first chromosome writes the header.
                    writeHeaderFlag = False
        finally:
            outputFile.close()
 def getSegmentSizes(genome, trackName):
     """Return the summed segment length of the track per chromosome.

     For every chromosome in GenomeInfo.getChrList(genome) the segment
     lengths (end minus start) of the whole-chromosome track view are added
     up. The result is a list with one total per chromosome, in that order.
     Despite the function name, the individual segment sizes are not
     returned.
     """
     track = PlainTrack(trackName)
     sumSegmentSize = []
     for chrom in GenomeInfo.getChrList(genome):
         chromLen = GenomeInfo.getChrLen(genome, chrom)
         region = GenomeRegion(genome, chrom, 0, chromLen)
         tv = track.getTrackView(region)
         sizeSegments = tv.endsAsNumpyArray() - tv.startsAsNumpyArray()
         # tolist() turns the numpy sum into a plain Python number.
         sumSegmentSize.append(sizeSegments.sum().tolist())

     return sumSegmentSize
    def _addPeaks(self):
        """Append a Peak to self.peaks for each track element of the first
        three regions.

        The track self.trackName is read region by region from a
        GlobalBinSource; processing stops after the third region. Every
        (start, end) pair of a region becomes Peak(self, region.chr, start,
        end).
        """
        peakTrack = PlainTrack(self.trackName)
        # NOTE(review): `genome` is not defined inside this method;
        # presumably it is a module-level name - confirm.
        regionSource = GlobalBinSource(genome)

        for regionIndex, region in enumerate(regionSource):
            # Only the regions with index 0, 1 and 2 are processed.
            if regionIndex > 2:
                break

            view = peakTrack.getTrackView(region)
            peakStarts = view.startsAsNumpyArray()
            peakEnds = view.endsAsNumpyArray()

            for peakStart, peakEnd in zip(peakStarts, peakEnds):
                self.peaks.append(Peak(self, region.chr, peakStart, peakEnd))
예제 #9
0
 def execute(choices, galaxyFn=None, username=''):
     '''
     Test tool: prints the DNA sequence of hg19 chr1:1000000-1001000 500
     times, fetching the track view anew each time.

     Is called when execute-button is pushed by web-user. Should print
     output as HTML to standard out, which will be directed to a results page
     in Galaxy history. If getOutputFormat is anything else than HTML, the
     output should be written to the file with path galaxyFn. If needed,
     StaticFile can be used to get a path where additional files can be put
     (e.g. generated image files). choices is a list of selections made by
     web-user in each options box.
     '''
     # The heading is now closed with </h2>; it used to open a second <h2>.
     # A parenthesised single argument prints the same under Python 2.
     print("<h2>Test tool</h2>")
     fastaTrack = PlainTrack(['Sequence', 'DNA'])
     # The region never changes, so it is built once; the track view is
     # still fetched on every pass, as before.
     region = GenomeRegion("hg19", "chr1", 1000000, 1001000)
     for i in range(0, 500):
         seqTv = fastaTrack.getTrackView(region)
         sequence = seqTv.valsAsNumpyArray()
         print(sequence)
예제 #10
0
    def __iter__(self):
        """Yield random one-base intervals, weighted by the track value.

        The values of self.trackName1 over the whole chromosome self.chr are
        scaled linearly to the range 0..1. For every position a number is
        drawn with R's runif(1); [pos, pos + 1] is yielded when that number
        is below self.factor times the scaled value.

        Nothing is yielded for an empty value array or when all values are
        equal (the scaling is undefined in both cases).
        """
        from gold.application.RSetup import r
        chrom = self.chr
        trackName1, genome = self.trackName1, self.genome
        factor = self.factor
        region = GenomeRegion(genome, chrom, 0, GenomeInfo.getChrLen(genome, chrom))

        track1 = PlainTrack(trackName1)
        tv1 = track1.getTrackView(region)
        vals1 = tv1.valsAsNumpyArray()

        # min()/max() raise on an empty array.
        if len(vals1) == 0:
            return

        # Scale between 0 and 1.
        minVal, maxVal = vals1.min(), vals1.max()
        valueRange = maxVal - minVal
        # A constant track would mean a division by zero.
        if valueRange == 0:
            return
        # 1.0 forces float division; under Python 2 the integer form
        # 1/valueRange truncates to 0 for integer-valued tracks.
        vals1 = (vals1 - minVal) * (1.0 / valueRange)

        for pos in xrange(len(vals1)):
            if r.runif(1) < factor * vals1[pos]:
                yield [pos, pos + 1]
 def getGenomicElements(genome, trackName):
     """Return all elements of a track as [chromosome, start, end] lists.

     The chromosomes are visited in the order of
     GenomeInfo.getChrList(genome); for each one, the elements of the
     whole-chromosome track view are added in iteration order.
     """
     track = PlainTrack(trackName)
     genElements = []
     for chrom in GenomeInfo.getChrList(genome):
         chromLen = GenomeInfo.getChrLen(genome, chrom)
         region = GenomeRegion(genome, chrom, 0, chromLen)
         tv = track.getTrackView(region)
         for el in tv:
             # append() grows the list in place; list + [item] copied the
             # whole list for every element.
             genElements.append([chrom, el.start(), el.end()])

     return genElements
 
     #print numpy.version.version # 1.7.1 !!
     #unique, counts = numpy.unique(segmentSize, return_counts=True) # This is for numpy 1.9
     #print numpy.asarray((unique, counts)).T
     
     '''track.setFormatConverter('SegmentToMidPointFormatConverter')
예제 #12
0
    def __iter__(self):
        """Yield the positions of the lower-order chain where the full n-mer
        occurs.

        For every position of self._lowerOrderChain, the sequence of
        len(self._fullNmer) bases starting there is read from the genome's
        sequence track and lower-cased. The position is yielded when that
        substring equals self._fullNmer.

        An AssertionError is raised when the first len(self._nmerPrefix)
        characters of self._fullNmer do not match the sequence at a chain
        position.
        """
        from gold.track.Track import PlainTrack
        from quick.util.GenomeInfo import GenomeInfo
        from gold.track.GenomeRegion import GenomeRegion

        # None of the following depends on the position, so it is computed
        # once (assumes the n-mer attributes do not change while iterating).
        track = PlainTrack(GenomeInfo.getSequenceTrackName(self._genome))
        nmerLen = len(self._fullNmer)
        pl = len(self._nmerPrefix)
        expectedPrefix = self._fullNmer[0:pl]

        for pos in self._lowerOrderChain:
            region = GenomeRegion(self._genome, self._chr, pos, pos + nmerLen)
            fullSubstring = (''.join(
                track.getTrackView(region).valsAsNumpyArray())).lower()
            foundPrefix = fullSubstring[0:pl]
            assert expectedPrefix == foundPrefix, \
                'The prefix of lower order does not match at the positions given by the chain. %s vs %s. Region: %s' % (
                    expectedPrefix, foundPrefix, region)
            if self._fullNmer == fullSubstring:
                yield pos
    def __iter__(self):
        """Yield random one-base intervals, weighted by the track value.

        The values of self.trackName1 over the whole chromosome self.chr are
        scaled linearly to the range 0..1. For every position a number is
        drawn with R's runif(1); [pos, pos + 1] is yielded when that number
        is below self.factor times the scaled value.

        Nothing is yielded for an empty value array or when all values are
        equal (the scaling is undefined in both cases).
        """
        from proto.RSetup import r
        chrom = self.chr
        trackName1, genome = self.trackName1, self.genome
        factor = self.factor
        region = GenomeRegion(genome, chrom, 0,
                              GenomeInfo.getChrLen(genome, chrom))

        track1 = PlainTrack(trackName1)
        tv1 = track1.getTrackView(region)
        vals1 = tv1.valsAsNumpyArray()

        # min()/max() raise on an empty array.
        if len(vals1) == 0:
            return

        # Scale between 0 and 1.
        minVal, maxVal = vals1.min(), vals1.max()
        valueRange = maxVal - minVal
        # A constant track would mean a division by zero.
        if valueRange == 0:
            return
        # 1.0 forces float division; under Python 2 the integer form
        # 1 / valueRange truncates to 0 for integer-valued tracks.
        vals1 = (vals1 - minVal) * (1.0 / valueRange)

        for pos in xrange(len(vals1)):
            if r.runif(1) < factor * vals1[pos]:
                yield [pos, pos + 1]
예제 #14
0
    def getMutatedSequence(cls, genome, regionDict, pointDict=None):
        """Return FASTA-style entries for the given regions, optionally
        mutated.

        regionDict maps a chromosome to (start, end) pairs. For every pair
        the DNA sequence is read from the ['Sequence', 'DNA'] track. If
        pointDict is given, each of its entries for the chromosome whose
        first item equals the region start supplies an (index, value) pair;
        the base at that index is replaced by the value, or by the last
        character of the value when it contains '>'.

        Returns a defaultdict(list) mapping each chromosome to strings of
        the form '>chrom start+1-end' followed by a newline and the
        sequence.
        """
        fastaByChrom = defaultdict(list)
        dnaTrack = PlainTrack(['Sequence', 'DNA'])

        for chrom in regionDict:
            for start, end in regionDict[chrom]:
                region = GenomeRegion(genome, chrom, start, end)
                bases = list(dnaTrack.getTrackView(region).valsAsNumpyArray())

                if pointDict:
                    # Keep only the mutations recorded for this region start.
                    replacements = [
                        point[1:] for point in pointDict[chrom]
                        if point[0] == start
                    ]
                    for index, val in replacements:
                        if val.find('>') >= 0:
                            # Only the last character of the value is used.
                            val = val[-1]
                        bases[index] = val

                entry = '>%s %i-%i\n%s' % (chrom, start + 1, end,
                                           ''.join(bases))
                fastaByChrom[chrom].append(entry)

        return fastaByChrom
    def execute(cls, choices, galaxyFn=None, username=''):
        """Write the expanded elements of a history item or a track to galaxyFn.

        choices[0] is the genome. If choices[1] is 'history', choices[2] is a
        colon-separated Galaxy track name whose second field is the file
        type: the file is copied to a run-specific temporary path, read
        through BedGenomeElementSource or GtrackGenomeElementSource (wrapped
        in GenomeElementSorter), and the temporary copy is removed
        afterwards. Otherwise choices[3] is a colon-separated track name that
        is read chromosome by chromosome, with the header flag set only for
        the first chromosome.

        All output goes through cls.WriteExpandedElementsToFile. The username
        argument is not used.
        """
        outputFile = open(galaxyFn, 'w')
        # try/finally: the output file is closed even when reading or
        # writing elements fails.
        try:
            genome = choices[0]
            trackItem = choices[3]
            # NOTE(review): the returned path is not used below; the call is
            # kept in case it has side effects - confirm and remove.
            chromRegsPath = GenomeInfo.getChrRegsFn(genome)

            chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                               for chrom in GenomeInfo.getChrList(genome))
            geSource = headLinesStr = None
            if choices[1] == 'history':
                galaxyTN = choices[2].split(':')
                trackType = galaxyTN[1]

                from proto.hyperbrowser.StaticFile import GalaxyRunSpecificFile
                tempFn = GalaxyRunSpecificFile(['fromHistory.'+trackType], galaxyFn).getDiskPath(True)

                fnSource = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)

                # Copy the history file; both handles are closed explicitly.
                with open(fnSource, 'r') as sourceFile:
                    contents = sourceFile.read()
                with open(tempFn, 'w') as tempFile:
                    tempFile.write(contents)

                # The temporary copy is removed even if processing fails.
                try:
                    if trackType in ['valued.bed', 'category.bed', 'bed']:
                        geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                    elif trackType == 'gtrack':
                        geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                        headLinesStr = geSource.getHeaderLines().replace('##', '\n##')

                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag=True)
                finally:
                    os.remove(tempFn)

            else:
                # The track does not depend on the chromosome, so it is
                # created once instead of once per loop pass.
                plTrack = PlainTrack(trackItem.split(':'))
                writeHeaderFlag = True
                for chrom in GenomeInfo.getChrList(genome):
                    gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                    geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                    cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag)
                    # Only the first chromosome writes the header.
                    writeHeaderFlag = False
        finally:
            outputFile.close()
 def execute(cls, choices, galaxyFn=None, username=''):
     """Write the expanded elements of a history item or a track to galaxyFn.

     choices[0] is the genome. If choices[1] is 'History', choices[2] is a
     colon-separated Galaxy track name whose second field is the file type:
     the file is copied to a temporary path, read through
     BedGenomeElementSource or GtrackGenomeElementSource (wrapped in
     GenomeElementSorter), and the temporary copy is removed afterwards.
     Otherwise choices[3] is a colon-separated track name that is read
     chromosome by chromosome, with the header flag set only for the first
     chromosome.

     All output goes through cls.WriteExpandedElementsToFile. The value of
     the username argument is not used.
     """
     outputFile = open(galaxyFn, 'w')
     # try/finally: the output file is closed even when reading or writing
     # elements fails.
     try:
         genome = choices[0]
         trackItem = choices[3]
         # NOTE(review): the returned path is not used below; the call is
         # kept in case it has side effects - confirm and remove.
         chromRegsPath = GenomeInfo.getChrRegsFn(genome)

         chrSizeDict = dict((chrom, GenomeInfo.getChrLen(genome, chrom))
                            for chrom in GenomeInfo.getChrList(genome))
         geSource = headLinesStr = None
         if choices[1] == 'History':
             galaxyTN = choices[2].split(':')
             trackType = galaxyTN[1]
             # Temporary file name: six random lowercase letters followed by
             # the current local time fields.
             randomPrefix = ''.join([chr(random.randint(97, 122)) for i in range(6)])
             tempFn = createCollectedPath(genome, [], randomPrefix+'_'.join([str(v) for v in time.localtime()[:6]])+'.'+trackType)
             fnSource = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)

             # Copy the history file; both handles are closed explicitly.
             with open(fnSource, 'r') as sourceFile:
                 contents = sourceFile.read()
             with open(tempFn, 'w') as tempFile:
                 tempFile.write(contents)

             # The temporary copy is removed even if processing fails.
             try:
                 if trackType in ['marked.bed', 'category.bed', 'bed']:
                     geSource = GenomeElementSorter(BedGenomeElementSource(tempFn, genome=genome)).__iter__()

                 elif trackType == 'gtrack':
                     geSource = GenomeElementSorter(GtrackGenomeElementSource(tempFn, genome=genome)).__iter__()
                     headLinesStr = geSource.getHeaderLines().replace('##', '\n##')

                 cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag=True)
             finally:
                 os.remove(tempFn)

         else:
             # The track does not depend on the chromosome, so it is created
             # once instead of once per loop pass.
             plTrack = PlainTrack(trackItem.split(':'))
             writeHeaderFlag = True
             for chrom in GenomeInfo.getChrList(genome):
                 gRegion = GenomeRegion(genome, chrom, 0, chrSizeDict[chrom])
                 geSource = GenomeElementTvWrapper(plTrack.getTrackView(gRegion)).__iter__()
                 cls.WriteExpandedElementsToFile(geSource, chrSizeDict, outputFile, headLinesStr, writeHeaderFlag)
                 # Only the first chromosome writes the header.
                 writeHeaderFlag = False
     finally:
         outputFile.close()
# Example script: read the elements of one track in one genome region and
# print simple statistics, each computed in two ways.

# Create a track from its name (a list of name parts).
track = PlainTrack(['Genes and gene subsets', 'Genes', 'Refseq'])
# Alternative track:
#track = PlainTrack(['DNA structure','Bendability'])

# Create the region of interest: genome 'hg18', chromosome 'chr1',
# positions 1000 to 900000.
region = GenomeRegion('hg18', 'chr1', 1000, 900000)

# The region could instead come from an iterator of regions, e.g. genome-wide:
#from quick.application.UserBinSource import UserBinSource
#regionIter = UserBinSource('*','*','hg18')
#for region in regionIter:
#    track.getTrackView(region)
#print 'Last region of iter: ', region

# Iterate through the elements of the track in this region.
trackView = track.getTrackView(region)
for element in trackView:
    # Just print the start and end of each interval for now.
    print element.start(), element.end()

# Number of elements: by iterating over every element (slow) and from the
# length of the array of start positions (fast).
tv = track.getTrackView(region)
print 'Number of elements in region, the slow way: ', len(
    [element for element in tv])
print 'Number of elements in region, the fast way: ', len(
    tv.startsAsNumpyArray())

# Base-pair coverage: by summing end - start per element (slow) and as the
# difference between the summed end and summed start arrays (fast).
print 'Bp coverage by elements in the region, the slow way: ', sum(
    element.end() - element.start() for element in tv)
print 'Bp coverage by elements in the region, the fast way: ', tv.endsAsNumpyArray(
).sum() - tv.startsAsNumpyArray().sum()
예제 #18
0
# Example script: read the elements of one track in one genome region and
# print simple statistics, each computed in two ways.

# Create a track from its name (a list of name parts).
track = PlainTrack(['Genes and gene subsets','Genes','Refseq'])
# Alternative track:
#track = PlainTrack(['DNA structure','Bendability'])

# Create the region of interest: genome 'hg18', chromosome 'chr1',
# positions 1000 to 900000.
region = GenomeRegion('hg18','chr1',1000,900000)

# The region could instead come from an iterator of regions, e.g. genome-wide:
#from quick.application.UserBinSource import UserBinSource
#regionIter = UserBinSource('*','*','hg18')
#for region in regionIter:
#    pass
#print 'Last region of iter: ', region

# Iterate through the elements of the track in this region.
for element in track.getTrackView(region):
    # Just print the start and end of each interval for now.
    print element.start(), element.end()
    
# Number of elements: by iterating over every element (slow) and from the
# length of the array of start positions (fast).
tv = track.getTrackView(region)
print 'Number of elements in region, the slow way: ', len([element for element in tv])
print 'Number of elements in region, the fast way: ', len(tv.startsAsNumpyArray())

# Base-pair coverage: by summing end - start per element (slow) and as the
# difference between the summed end and summed start arrays (fast).
print 'Bp coverage by elements in the region, the slow way: ', sum(element.end()-element.start() for element in tv)
print 'Bp coverage by elements in the region, the fast way: ', tv.endsAsNumpyArray().sum() - tv.startsAsNumpyArray().sum()
    
trackExplanation = \
'''
A Track object loads the appropriate preprocessed data based on a track name.
Calling the method getTrackView gives an object (really of class TrackView) that is used simply to iterate through all track elements of the given genome region.
A track element (of class TrackElement) has methods start,end,val,strand. Some of these will typically be None, depending on the format of the requested track (e.g. for Segments the method val will return None..)