def testAssignAndRetrieve(self):
        """Check attribute access on GenomeElement for core and extra fields.

        Covers extra values handed to the constructor as well as extra values
        assigned afterwards as plain attributes.
        """
        extraVals = {'a': 1, 'b': 2}
        el = GenomeElement('TestGenome',
                           start=5,
                           val=1.0,
                           extra=extraVals,
                           orderedExtraKeys=['a', 'b'])
        expectedAttrs = [
            ('genome', 'TestGenome'),
            ('chr', None),
            ('start', 5),
            ('end', None),
            ('val', 1.0),
            ('strand', None),
            ('a', 1),
            ('b', 2),
            ('extra', {'a': 1, 'b': 2}),
            ('orderedExtraKeys', ['a', 'b']),
        ]
        for attrName, expected in expectedAttrs:
            self.assertEqual(getattr(el, attrName), expected)

        # Extra values given as keyword argument and as later assignment.
        el = GenomeElement('TestGenome', a=1)
        el.b = 2
        expectedAttrs = [
            ('genome', 'TestGenome'),
            ('a', 1),
            ('b', 2),
            ('extra', {'a': 1, 'b': 2}),
            ('orderedExtraKeys', ['a', 'b']),
        ]
        for attrName, expected in expectedAttrs:
            self.assertEqual(getattr(el, attrName), expected)

        self.assertRaises(AttributeError, getattr, el, 'nonExisting')
Пример #2
0
    def _next(self, line):
        if line.startswith('#'):
            return

        ge = GenomeElement(self._genome)
        cols = line.split('\t')

        if self._numCols is not None:
            if len(cols) != self._numCols:
                raise InvalidFormatError(
                    'Error: BED files must have the same number of columns in each data line.'
                )
        else:
            self._numCols = len(cols)

        if self._numCols < self.MIN_NUM_COLS or self._numCols > self.MAX_NUM_COLS:
            raise InvalidFormatError('Error: BED file contains %s columns, but must contain between %s and %s columns.' \
                % (self._numCols, self.MIN_NUM_COLS, self.MAX_NUM_COLS))

        ge.chr = self._checkValidChr(cols[0])
        ge.start = self._checkValidStart(ge.chr, int(cols[1]))

        self._parseEnd(
            ge, self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start))
        self._parseName(ge, cols)
        self._parseVal(ge, cols)

        if self._numCols >= 6:
            ge.strand = self._getStrandFromString(cols[5])

        for i, extraCol in enumerate(self.BED_EXTRA_COLUMNS):
            if self._numCols >= i + 7:
                setattr(ge, extraCol, cols[i + 6])

        return ge
    def _next(self, line):
        if line.startswith('#'):
            return
    
        ge = GenomeElement(self._genome)
        cols = line.split('\t')
        
        if self._numCols is not None:
            if len(cols) != self._numCols:
                raise InvalidFormatError('Error: BED files must have the same number of columns in each data line.')
        else:
            self._numCols = len(cols)
            
        if self._numCols < self.MIN_NUM_COLS or self._numCols > self.MAX_NUM_COLS:
            raise InvalidFormatError('Error: BED file must contain between %s and %s columns.' % (self.MIN_NUM_COLS, self.MAX_NUM_COLS))

        ge.chr = self._checkValidChr(cols[0])
        ge.start = self._checkValidStart(ge.chr, int(cols[1]))
        
        self._parseEnd( ge, self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start))
        self._parseName( ge, cols )
        self._parseVal( ge, cols )
        
        if self._numCols >= 6:
            ge.strand = self._getStrandFromString(cols[5])
        
        for i,extraCol in enumerate(self.BED_EXTRA_COLUMNS):
            if self._numCols >= i+7:
                setattr(ge, extraCol, cols[i+6])
        
        return ge
Пример #4
0
    def _next(self, line):
        """Build a GenomeElement from a tab-separated line.

        Column 0 is the chromosome (validated), columns 1 and 2 are parsed as
        integer start and end, and column 3 is handed to self._parseVal.
        """
        fields = line.split('\t')

        element = GenomeElement(self._genome)
        element.chr = self._checkValidChr(fields[0])

        startPos = int(fields[1])
        element.start = startPos
        endPos = int(fields[2])
        element.end = endPos

        self._parseVal(element, fields[3])
        return element
 def _next(self, line):
     """Parse one tab-separated line into a GenomeElement.

     Uses the first four columns: chromosome, start, end and a value column
     that is passed to self._parseVal.
     """
     columns = line.split('\t')
     chromCol, startCol, endCol, valCol = 0, 1, 2, 3

     parsed = GenomeElement(self._genome)
     parsed.chr = self._checkValidChr(columns[chromCol])
     parsed.start = int(columns[startCol])
     parsed.end = int(columns[endCol])
     self._parseVal(parsed, columns[valCol])

     return parsed
Пример #6
0
    def _next(self, line):
        """Handle one FASTA line.

        A header line (starting with '>') closes the current bounding region,
        resets the element counter and stores the validated chromosome name
        taken from the first whitespace-separated token after '>'; nothing is
        returned for it. Any other line is returned as a GenomeElement whose
        val is the line split into single characters.

        Raises InvalidFormatError if a sequence line is met before any header.
        """
        if line.startswith('>'):
            self._appendBoundingRegionTuple()
            self._elCount = 0
            headerName = line[1:].split()[0]
            self._chr = self._checkValidChr(headerName)
            return

        if self._chr is None:
            raise InvalidFormatError(
                'FASTA file does not start with the ">" character.')

        self._elCount += len(line)
        seqElement = GenomeElement(self._genome, self._chr)
        seqElement.val = np.fromstring(line, dtype='S1')
        return seqElement
    def _compute(self):
        tv1 = self._children[0].getResult()
        allSortedCodedEvents = self._children[1].getResult()

        allEventCodes = (allSortedCodedEvents % 4) - 2

        allSortedDecodedEvents = allSortedCodedEvents / 4

        from numpy.ma import add
        cumulativeCoverStatus = add.accumulate(allEventCodes)
        assert len(cumulativeCoverStatus) == len(allSortedDecodedEvents), str(
            len(allSortedDecodedEvents))

        unionStartList = []
        unionEndList = []

        startedFlag = False
        for i, cumVal in enumerate(cumulativeCoverStatus):
            if cumVal == 1 and not startedFlag:
                startPos = allSortedDecodedEvents[i]
                startedFlag = True
            elif cumVal == 0:
                if startPos:
                    unionStartList.append(startPos)
                    unionEndList.append(allSortedDecodedEvents[i])
                    startPos = None
                    startedFlag = False

        return [
            GenomeElement(start=x, end=y)
            for x, y in zip(unionStartList, unionEndList)
        ]
Пример #8
0
    def _parseDeclarationLine(self, line):
        """Update the parser state from a declaration line.

        The chr, start, step and span values are extracted from the line with
        self._getDeclarationLineAttrValues and stored (after passing through
        the corresponding _handle*/_check* helpers) in self._fixedStep,
        self._span, self._isPoints and, for fixed-step declarations,
        self._step, self._isStepFunction and self._isFunction. self._chr is
        always updated last.

        Returns:
            None, except when a fixed-step step-function declaration starts a
            new bounding region: then a blank GenomeElement (val NaN,
            isBlankElement=True) ending at the new start position is returned.
        """
        returnGE = None

        chr, start, step, span = self._getDeclarationLineAttrValues(line)

        self._fixedStep = self._checkFixedStep(line, start, step)
        chr = self._handleChr(chr)
        self._span = self._handleSpan(span)

        # A span of exactly 1 is treated as point data.
        self._isPoints = self._span == 1

        if self._fixedStep:
            start = self._handleStart(chr, start)
            self._step = self._handleStep(step)

            # step == span means consecutive elements are contiguous: with
            # step > 1 that is flagged as a step function, with step == 1 as a
            # function.
            self._isStepFunction = (self._step == self._span
                                    and self._step > 1)
            self._isFunction = (self._step == self._span and self._step == 1)
            if self._isFunction:
                # The shared element instance is reused for function data, so
                # only its chromosome needs updating here.
                self._genomeElement.chr = chr

            if not self._shouldExpandBoundingRegion(chr, start):
                # A new bounding region begins: close the previous one first.
                if self._chr is not None:  # self._chr is still the chromosome of the previous declaration line
                    self._appendBoundingRegionTuple()

                self._start = start
                self._curElCountInBoundingRegion = 0

                if self._isStepFunction:
                    # Blank element ending at the start of the new region.
                    returnGE = GenomeElement(genome=self._genome, chr=chr, end=self._start, \
                                             val=numpy.nan, isBlankElement=True)

        self._chr = chr

        return returnGE
Пример #9
0
 def _wrappedTrackElsGenerator(self):
     track = self._getTrack()
     for region, tv in ((region, self._getTrackView(track, region))
                        for region in self._boundingRegions):
         for te in tv:
             yield GenomeElement.createGeFromTrackEl(
                 te, tv.trackFormat, globalCoords=self._globalCoords)
Пример #10
0
    def next(self):
        self._curPos += 1
        if self._curPos % 10e6 == 0:
            print '.',

        if self._curPos >= len(self._tv.genomeAnchor):
            raise StopIteration

        if self._exhausted:
            return None

        if self._curEl is None:
            try:
                self._curEl = self._tvIter.next()
            except StopIteration:
                self._exhausted = True
                return None

        if self._curPos == self._curEl.start():
            trackEl = self._curEl
            genome = self._tv.genomeAnchor.genome
            chr = self._tv.genomeAnchor.chr
            #print 'EL: ',GenomeElement(genome,chr, trackEl.start(), trackEl.end(), trackEl.val(), trackEl.strand())
            outEl = GenomeElement(genome, chr, trackEl.start(), trackEl.end(),
                                  trackEl.val(), trackEl.strand())
            self._curEl = None
            return outEl
        else:
            #print self._curPos,' AND ', self._curEl.start()
            #print 'None'
            return None
Пример #11
0
 def testWriteElement(self):
     """Writing an element must hand the same element to every output file."""
     prefixes = ['start', 'end', 'val', 'strand', 'id', 'edges', 'weights',
                 'cat']
     setup = SetupDir(self.path, prefixes)
     emptyEl = GenomeElement()
     setup.od.writeElement(emptyEl)
     for outFile in setup.od._files.values():
         self.assertEqual(emptyEl, outFile.ge)
    def next(self):
        """Return the single element of this iterator, then stop.

        The first call returns a GenomeElement built from self._genome, the
        chromosome of self._region and self._valSlice; every later call raises
        StopIteration.
        """
        alreadyReturned = self._returnedOneElement
        if alreadyReturned:
            raise StopIteration

        self._returnedOneElement = True
        elementArgs = dict(genome=self._genome,
                           chr=self._region.chr,
                           val=self._valSlice)
        return GenomeElement(**elementArgs)
 def _compute(self):
     """Count element starts per micro bin of the region.

     Start positions from the child's track view are divided by
     self.microBin and counted with np.bincount; the counts are padded with
     zeros up to the number of micro bins covering the region. One
     GenomeElement is returned per bin, with the count as its value and the
     last bin's end clipped to the region end.
     """
     trackView = self._children[0].getResult()
     binSize = self.microBin
     region = self._region

     binIndexes = trackView.startsAsNumpyArray() / binSize
     counts = np.bincount(binIndexes)

     totalBins = int(math.ceil(float(len(region)) / binSize))
     padding = np.zeros(totalBins - len(counts), dtype='int')
     counts = np.concatenate([counts, padding])

     binElements = []
     for i, count in enumerate(counts):
         binStart = region.start + i * binSize
         binEnd = min(region.start + (i + 1) * binSize, region.end)
         binElements.append(
             GenomeElement(region.genome, region.chr, binStart, binEnd, count))
     return binElements
Пример #14
0
 def next(self):
     """Return the next GenomeElement assembled from the parallel lists.

     Each attribute is read from its list at the current index; a list that
     is too short contributes None for that attribute.

     Raises StopIteration when the index reaches len(self).
     """
     self._index += 1
     idx = self._index
     if idx >= len(self):
         raise StopIteration

     def valueAt(values):
         # Lists may be shorter than the iterator; missing entries become None.
         return values[idx] if idx < len(values) else None

     return GenomeElement(start=valueAt(self._startList),
                          end=valueAt(self._endList),
                          strand=valueAt(self._strandList),
                          val=valueAt(self._valList),
                          id=valueAt(self._idList),
                          edges=valueAt(self._edgesList),
                          weights=valueAt(self._weightsList),
                          extra=valueAt(self._extraList))
Пример #15
0
 def testAssignAndRetrieve(self):
     """Core and extra attributes must be retrievable after assignment."""
     expectedExtra = {'a': 1, 'b': 2}
     expectedKeys = ['a', 'b']

     # Extra values supplied through the constructor.
     first = GenomeElement('hg18', start=5, val=1.0,
                           extra={'a': 1, 'b': 2},
                           orderedExtraKeys=['a', 'b'])
     self.assertEqual(first.genome, 'hg18')
     self.assertEqual(first.chr, None)
     self.assertEqual(first.start, 5)
     self.assertEqual(first.end, None)
     self.assertEqual(first.val, 1.0)
     self.assertEqual(first.strand, None)
     self.assertEqual(first.a, 1)
     self.assertEqual(first.b, 2)
     self.assertEqual(first.extra, expectedExtra)
     self.assertEqual(first.orderedExtraKeys, expectedKeys)

     # Extra values supplied as keyword argument and as later assignment.
     second = GenomeElement('hg18', a=1)
     second.b = 2
     self.assertEqual(second.genome, 'hg18')
     self.assertEqual(second.a, 1)
     self.assertEqual(second.b, 2)
     self.assertEqual(second.extra, expectedExtra)
     self.assertEqual(second.orderedExtraKeys, expectedKeys)

     self.assertRaises(AttributeError, getattr, second, 'nonExisting')
Пример #16
0
def _getIter(elList, valDataType, valDim, edgeWeightDataType, edgeWeightDim, brList=None):
    """Build a MyGeIter populated with genome elements and bounding regions.

    Parameters:
        elList: sequence of element specs, each indexable as
            (genome, chr, start, end[, attrs]). When a spec has exactly five
            items, the fifth is iterated for attribute names and indexed for
            their values, and each pair is set on the element.
        valDataType, valDim, edgeWeightDataType, edgeWeightDim: passed
            unchanged to the MyGeIter constructor.
        brList: optional sequence of bounding region specs, each indexable as
            (genome, chr, start, end, elCount-like fifth item). Defaults to no
            bounding regions.

    Returns:
        The populated MyGeIter.
    """
    # None instead of a shared mutable [] default.
    if brList is None:
        brList = []

    geIter = MyGeIter(valDataType, valDim, edgeWeightDataType, edgeWeightDim)

    for elSpec in elList:
        ge = GenomeElement(genome=elSpec[0], chr=elSpec[1], start=elSpec[2], end=elSpec[3])
        if len(elSpec) == 5:
            extraAttrs = elSpec[4]
            for prefix in extraAttrs:
                setattr(ge, prefix, extraAttrs[prefix])
        geIter.iter.append(ge)

    for brSpec in brList:
        br = GenomeRegion(genome=brSpec[0], chr=brSpec[1], start=brSpec[2], end=brSpec[3])
        geIter.boundingRegionTuples.append(BoundingRegionTuple(br, brSpec[4]))

    return geIter
 def __init__(self, fn, genome=None, trackName=None, suffix=None,
              external=False, printWarnings=True, strToUseInsteadOfFn='',
              *args, **kwArgs):  #, depth=0
     """Store the constructor arguments and initialise per-source state.

     Extra positional and keyword arguments are accepted but not used here.
     """
     # Identity of the source.
     self._fn = fn
     self._suffix = suffix
     self._external = external
     self._trackName = trackName
     self._strToUseInsteadOfFn = strToUseInsteadOfFn

     # Genome, plus an element instance bound to it.
     self._genome = genome
     self._genomeElement = GenomeElement(genome)

     # Warning handling.
     self._printWarnings = printWarnings
     self._lastWarning = None

     self._prefixList = None
Пример #18
0
    def _next(self, line):
        if self._isDeclarationLine(line):
            ge = self._parseDeclarationLine(line)
            if ge is not None:
                return ge
        else:
            if line.startswith('#'):
                return None

            cols = line.split()
            self._checkDataLineCols(cols)

            if self._fixedStep:
                self._curElCountInBoundingRegion += 1
                val = numpy.float(self._handleNan(cols[0]))

                if self._isFunction:
                    self._genomeElement.val = val
                    return self._genomeElement
                else:
                    start = self._checkValidStart(
                        self._chr, self._getFixedStepCurElStart())
            else:
                start = self._checkValidStart(self._chr, int(cols[0]) - 1)
                val = numpy.float(self._handleNan(cols[1]))

            end = None
            if not self._isPoints:
                end = self._checkValidEnd(self._chr, self._getEnd(start),
                                          start)
            if self._isStepFunction:
                start = None

            return GenomeElement(genome=self._genome,
                                 chr=self._chr,
                                 start=start,
                                 end=end,
                                 val=val)
    def _next(self, line):
        if len(line)>0 and line[0]=='#':
            return None
        
        origCols = line.split('\t')
        cols = [unquote(x) for x in origCols]
        
        if len(cols) != 9:
            raise InvalidFormatError("Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]
        ge.type = cols[2]
        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end =  self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)
        ge.val = numpy.float(self._handleNan(cols[5]))
        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]
        
        for attr in origCols[8].split(';'):
            attrSplitted = attr.split('=')
            if len(attrSplitted) == 2:
                key, val = attrSplitted
                if key.lower() == 'id':
                    ge.id = unquote(val)
                elif key.lower() == 'name':
                    ge.name = unquote(val)
                
        return ge
 def _wrappedTrackElsGenerator(self):
     track = self._getTrack()
     for region,tv in ((region, self._getTrackView(track, region)) for region in self._boundingRegions):
         for te in tv:
             yield GenomeElement.createGeFromTrackEl(te, tv.trackFormat, globalCoords=self._globalCoords)
Пример #21
0
 def next(self):
     """Convert the next track element of the view into a GenomeElement.

     A StopIteration raised by the underlying iterator is passed on.
     """
     nextTrackEl = self._tvIter.next()
     trackFormat = self._tv.trackFormat
     return GenomeElement.createGeFromTrackEl(nextTrackEl, trackFormat)
Пример #22
0
 def next(self):
     """Return the next track element wrapped as a GenomeElement.

     The element comes from self._tvIter; when that iterator is finished its
     StopIteration is passed on to the caller.
     """
     sourceEl = self._tvIter.next()
     fmt = self._tv.trackFormat
     converted = GenomeElement.createGeFromTrackEl(sourceEl, fmt)
     return converted
    def testExclude(self):
        """exclude() must return the parts of chr21:100-200 left after removal."""
        def makeEl(start, end, chr='chr21'):
            return GenomeElement('TestGenome', chr, start, end)

        # (expected remaining (start, end) pairs, excluded (chr, start, end))
        cases = [
            # No overlap: adjacent on either side, or on another chromosome.
            ([(100, 200)], ('chr21', 90, 100)),
            ([(100, 200)], ('chr21', 200, 210)),
            ([(100, 200)], ('chrM', 100, 110)),
            # Overlap at the start.
            ([(110, 200)], ('chr21', 100, 110)),
            ([(110, 200)], ('chr21', 90, 110)),
            # Overlap at the end.
            ([(100, 190)], ('chr21', 190, 200)),
            ([(100, 190)], ('chr21', 190, 210)),
            # Fully covered.
            ([], ('chr21', 90, 210)),
            # Hole in the middle.
            ([(100, 140), (160, 200)], ('chr21', 140, 160)),
        ]

        for remaining, (exChr, exStart, exEnd) in cases:
            expected = [makeEl(start, end) for start, end in remaining]
            actual = makeEl(100, 200).exclude(makeEl(exStart, exEnd, exChr))
            self.assertEqual(expected, actual)
    def testExtend(self):
        """extend() on chr21:100-200 with zero, negative and positive sizes."""
        def makeEl(start, end):
            return GenomeElement('TestGenome', 'chr21', start, end)

        # ((expected start, expected end), extend size, keyword arguments)
        cases = [
            ((100, 200), 0, {}),
            # Negative sizes move the start.
            ((0, 200), -100, {}),
            ((-100, 200), -200, {'ensureValidity': False}),
            ((0, 200), -200, {'ensureValidity': True}),
            # Positive sizes move the end.
            ((100, 300), 100, {}),
            ((100, 50000200), 50000000, {'ensureValidity': False}),
            ((100, 46944323), 50000000, {'ensureValidity': True}),
        ]

        for (expStart, expEnd), size, kwArgs in cases:
            expected = makeEl(expStart, expEnd)
            actual = makeEl(100, 200).extend(size, **kwArgs)
            self.assertEqual(expected, actual)
    def testEqual(self):
        """Elements are equal only when every compared field matches.

        A reference element is compared with an identical copy, and then with
        copies that each differ in exactly one field.
        """
        def makeEl(**overrides):
            # Fresh containers on every call so no element shares state.
            fields = dict(genome='TestGenome',
                          chr='chr21',
                          start=10,
                          end=100,
                          val=5,
                          strand=True,
                          id='id',
                          edges=['id2', 'id3'],
                          weights=[5, 6],
                          extra={'source': 'source'})
            fields.update(overrides)
            return GenomeElement(fields['genome'],
                                 fields['chr'],
                                 fields['start'],
                                 fields['end'],
                                 fields['val'],
                                 fields['strand'],
                                 fields['id'],
                                 fields['edges'],
                                 fields['weights'],
                                 extra=fields['extra'])

        self.assertEqual(makeEl(), makeEl())

        singleFieldChanges = [
            {'genome': 'hg18'},
            {'chr': 'chrM'},
            {'start': 20},
            {'end': 110},
            {'val': 6},
            {'strand': False},
            {'id': 'id4'},
            {'edges': ['id2', 'id4']},
            {'weights': [5, 7]},
            {'extra': {'source': 'source', 'other': 'value'}},
        ]
        for change in singleFieldChanges:
            self.assertNotEqual(makeEl(), makeEl(**change))
 def _compute(self):
     """Return two fixed GenomeElements covering 0-1 and 10-11."""
     from gold.origdata.GenomeElement import GenomeElement
     fixedSegments = ((0, 1), (10, 11))
     return [GenomeElement(start=segStart, end=segEnd)
             for segStart, segEnd in fixedSegments]
    def testContains(self):
        """contains() is true only for elements lying fully inside chr21:10-100."""
        def makeEl(start, end, chr='chr21'):
            return GenomeElement('TestGenome', chr, start, end)

        # (expected result, chromosome, start, end) of the other element
        cases = [
            (True, 'chr21', 10, 100),
            (True, 'chr21', 20, 80),
            (False, 'chr21', 10, 101),
            (False, 'chr21', 9, 100),
            (False, 'chr21', 9, 101),
            (False, 'chr21', 0, 10),
            (False, 'chrM', 20, 80),
        ]

        for expected, chr, start, end in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(makeEl(10, 100).contains(makeEl(start, end, chr)))
Пример #28
0
    def printGSuite(cls, choices, cols, rows, colListString, outFile):
        """Write one GTrack file per disease/trait and a GSuite listing them.

        Rows are grouped on the disease column. For every disease, in sorted
        order, a GSuite track is registered and its rows are turned into
        GenomeElements (lifted over to OUTPUT_GENOME when it differs from
        DATABASE_GENOME), which are composed to that track's file. Finally the
        GSuite itself is composed to outFile.

        Parameters:
            choices: not used by this method.
            cols: list of column names; must contain the disease, chromosome,
                start and value column names defined on the class.
            rows: iterable of rows, each indexable in parallel with cols.
            colListString: not used by this method.
            outFile: target handed to GSuiteComposer.composeToFile.
        """
        from quick.extra.ProgressViewer import ProgressViewer

        from gold.gsuite.GSuite import GSuite
        from gold.gsuite.GSuiteTrack import GSuiteTrack, GalaxyGSuiteTrack
        import gold.gsuite.GSuiteComposer as GSuiteComposer

        from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
        from gold.origdata.GtrackComposer import ExtendedGtrackComposer
        from gold.origdata.GESourceWrapper import ListGESourceWrapper
        from gold.origdata.GenomeElement import GenomeElement

        from collections import defaultdict
        from copy import copy

        from unidecode import unidecode
        from pyliftover import LiftOver

        gSuite = GSuite()

        diseaseColIndex = cols.index(cls.DISEASE_COLUMN_NAME)
        chrColIndex = cols.index(cls.CHR_COLUMN_NAME)
        startColIndex = cols.index(cls.START_COLUMN_NAME)
        valColIndex = cols.index(cls.VAL_COLUMN_NAME)

        # Every column that is not one of the four core columns becomes an
        # extra GTrack column; keys and indexes are kept in parallel.
        orderedExtraKeys = copy(cols)
        extraIndexes = list(range(len(cols)))
        for colName in [cls.DISEASE_COLUMN_NAME, cls.CHR_COLUMN_NAME,
                        cls.START_COLUMN_NAME, cls.VAL_COLUMN_NAME]:
            extraIndexes.remove(cols.index(colName))
            orderedExtraKeys.remove(colName)
        orderedExtraKeys = [cls._fixColNameForGTrack(key) for key in orderedExtraKeys]

        diseaseToRowsDict = defaultdict(list)
        for row in rows:
            disease = row[diseaseColIndex]
            if isinstance(disease, unicode):
                disease = unidecode(disease).replace('\x00', '')

            diseaseToRowsDict[disease].append(row)

        progressViewer = ProgressViewer([('Create GWAS tracks for diseases/traits', len(diseaseToRowsDict))],
                                        cls.extraGalaxyFn[cls.HISTORY_PROGRESS_TITLE] )

        # The lift-over object depends only on the two genomes, so build it
        # once instead of once per disease.
        shouldLiftOver = cls.DATABASE_GENOME != cls.OUTPUT_GENOME
        liftOver = None
        if shouldLiftOver and diseaseToRowsDict:
            liftOver = LiftOver(cls.DATABASE_GENOME, cls.OUTPUT_GENOME)

        # Chromosome names used in the rows that differ from the output names.
        chromRenames = {'chr23': 'chrX', 'chr24': 'chrY', 'chrMT': 'chrM'}

        for disease in sorted(diseaseToRowsDict):
            uri = GalaxyGSuiteTrack.generateURI(galaxyFn=cls.extraGalaxyFn[cls.HISTORY_HIDDEN_TRACK_STORAGE],
                                                extraFileName=disease.replace('/', '_') + '.gtrack')
            gSuiteTrack = GSuiteTrack(uri, title=disease, genome=cls.OUTPUT_GENOME)
            gSuite.addTrack(gSuiteTrack)

            geList = []
            for row in diseaseToRowsDict[disease]:
                extra = {}
                for col, index in zip(orderedExtraKeys, extraIndexes):
                    cell = row[index].strip()
                    if isinstance(cell, unicode):
                        cell = unidecode(cell)

                    # Empty cells are written as '.'.
                    extra[col] = cell if cell != '' else '.'

                chrom = 'chr' + row[chrColIndex]
                chrom = chromRenames.get(chrom, chrom)

                start = int(row[startColIndex])
                if shouldLiftOver:
                    newPosList = liftOver.convert_coordinate(chrom, start)
                    if newPosList is None or len(newPosList) != 1:
                        # Arguments are in the order the message names them:
                        # position first, chromosome second.
                        print('SNP with position %s on chromosome %s ' % (start, chrom) +
                              'could not be lifted over from reference genome ' +
                              '%s to %s (for disease/trait "%s")' %
                              (cls.DATABASE_GENOME, cls.OUTPUT_GENOME, disease))
                        # NOTE(review): the element is still added below with
                        # its original, un-lifted coordinates, as before.
                        # Confirm whether it should be skipped instead.
                    else:
                        chrom, start = newPosList[0][0:2]

                geList.append(GenomeElement(chr=chrom, start=start,
                                            val=row[valColIndex], orderedExtraKeys=orderedExtraKeys,
                                            extra=extra))

            geSource = GtrackGenomeElementSource(cls.GTRACK_BLUEPRINT_PATH)
            wrappedGeSource = ListGESourceWrapper(geSource, geList)
            composer = ExtendedGtrackComposer(wrappedGeSource)
            composer.composeToFile(gSuiteTrack.path)

            progressViewer.update()

        GSuiteComposer.composeToFile(gSuite, outFile)
    def testOverlaps(self):
        """overlaps() is true when the other element shares positions with chr21:10-100."""
        def makeEl(start, end, chr='chr21'):
            return GenomeElement('TestGenome', chr, start, end)

        # (expected result, chromosome, start, end) of the other element
        cases = [
            (True, 'chr21', 10, 100),
            (True, 'chr21', 20, 80),
            (True, 'chr21', 10, 101),
            (True, 'chr21', 9, 100),
            (True, 'chr21', 9, 101),
            # Merely adjacent elements do not overlap.
            (False, 'chr21', 0, 10),
            (False, 'chr21', 100, 110),
            (False, 'chrM', 20, 80),
        ]

        for expected, chr, start, end in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(makeEl(10, 100).overlaps(makeEl(start, end, chr)))
Пример #30
0
    def _next(self, line):
        if line.startswith('##FASTA'):
            raise StopIteration

        if len(line) > 0 and line[0] == '#':
            return None

        origCols = line.split('\t')
        cols = [unquote(x) for x in origCols]

        if len(cols) != 9:
            raise InvalidFormatError(
                "Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]

        self._parseThirdCol(ge, cols[2])

        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)

        self._parseSixthCol(ge, cols[5])

        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]

        for attr in origCols[8].split(';'):
            attrSplitted = attr.split('=')
            if len(attrSplitted) == 2:
                key, val = attrSplitted
                if key.lower() == 'id':
                    ge.id = unquote(val)
                elif key.lower() == 'name':
                    ge.name = unquote(val)

        return ge
    def testTouches(self):
        """touches() is true only for elements directly adjacent to chr21:10-100."""
        def makeEl(start, end, chr='chr21'):
            return GenomeElement('TestGenome', chr, start, end)

        # (expected result, chromosome, start, end) of the other element
        cases = [
            # Overlapping elements do not touch.
            (False, 'chr21', 10, 100),
            (False, 'chr21', 20, 80),
            (False, 'chr21', 10, 101),
            (False, 'chr21', 9, 100),
            (False, 'chr21', 9, 101),
            # Directly adjacent on either side.
            (True, 'chr21', 0, 10),
            (True, 'chr21', 100, 110),
            # A gap of one position, or another chromosome.
            (False, 'chr21', 0, 9),
            (False, 'chr21', 101, 110),
            (False, 'chrM', 20, 80),
        ]

        for expected, chr, start, end in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(makeEl(10, 100).touches(makeEl(start, end, chr)))