def testAssignAndRetrieve(self):
        # Fields given to the constructor (core fields and 'extra' entries)
        # must be readable back as attributes; core fields that were not
        # given must be None.
        el = GenomeElement('TestGenome', start=5, val=1.0,
                           extra={'a': 1, 'b': 2},
                           orderedExtraKeys=['a', 'b'])
        expectedAfterInit = [
            ('genome', 'TestGenome'),
            ('chr', None),
            ('start', 5),
            ('end', None),
            ('val', 1.0),
            ('strand', None),
            ('a', 1),
            ('b', 2),
            ('extra', {'a': 1, 'b': 2}),
            ('orderedExtraKeys', ['a', 'b']),
        ]
        for attrName, expected in expectedAfterInit:
            self.assertEqual(getattr(el, attrName), expected)

        # Unknown keyword arguments and later attribute assignments are
        # expected to end up in 'extra', in assignment order.
        el = GenomeElement('TestGenome', a=1)
        el.b = 2
        expectedAfterAssign = [
            ('genome', 'TestGenome'),
            ('a', 1),
            ('b', 2),
            ('extra', {'a': 1, 'b': 2}),
            ('orderedExtraKeys', ['a', 'b']),
        ]
        for attrName, expected in expectedAfterAssign:
            self.assertEqual(getattr(el, attrName), expected)

        # Reading an attribute that was never set must fail loudly.
        self.assertRaises(AttributeError, getattr, el, 'nonExisting')
示例#2
0
    def _next(self, line):
        """Turn one BED line into a GenomeElement.

        Lines starting with '#' are skipped (None is returned). The column
        count of the first data line is stored in self._numCols and every
        later data line must have the same count.

        Raises InvalidFormatError if the column count differs between data
        lines or lies outside [MIN_NUM_COLS, MAX_NUM_COLS].
        """
        if line.startswith('#'):
            return

        element = GenomeElement(self._genome)
        fields = line.split('\t')
        numFields = len(fields)

        if self._numCols is None:
            # First data line decides the column count for the whole file.
            self._numCols = numFields
        elif numFields != self._numCols:
            raise InvalidFormatError(
                'Error: BED files must have the same number of columns in each data line.'
            )

        if not (self.MIN_NUM_COLS <= self._numCols <= self.MAX_NUM_COLS):
            raise InvalidFormatError('Error: BED file contains %s columns, but must contain between %s and %s columns.' \
                % (self._numCols, self.MIN_NUM_COLS, self.MAX_NUM_COLS))

        element.chr = self._checkValidChr(fields[0])
        element.start = self._checkValidStart(element.chr, int(fields[1]))

        validatedEnd = self._checkValidEnd(element.chr, int(fields[2]),
                                           start=element.start)
        self._parseEnd(element, validatedEnd)
        self._parseName(element, fields)
        self._parseVal(element, fields)

        if self._numCols >= 6:
            element.strand = self._getStrandFromString(fields[5])

        # Columns from index 6 onwards map, in order, onto BED_EXTRA_COLUMNS.
        for colIdx, extraCol in enumerate(self.BED_EXTRA_COLUMNS, 6):
            if self._numCols > colIdx:
                setattr(element, extraCol, fields[colIdx])

        return element
    def _next(self, line):
        """Parse a single BED data line and return it as a GenomeElement.

        Returns None for lines starting with '#'. self._numCols is set from
        the first data line; InvalidFormatError is raised when a later line
        has a different number of columns or when the number of columns is
        not between MIN_NUM_COLS and MAX_NUM_COLS.
        """
        if line.startswith('#'):
            return

        ge = GenomeElement(self._genome)
        fields = line.split('\t')

        if self._numCols is None:
            self._numCols = len(fields)
        elif self._numCols != len(fields):
            raise InvalidFormatError('Error: BED files must have the same number of columns in each data line.')

        numCols = self._numCols
        if numCols < self.MIN_NUM_COLS or numCols > self.MAX_NUM_COLS:
            raise InvalidFormatError('Error: BED file must contain between %s and %s columns.' % (self.MIN_NUM_COLS, self.MAX_NUM_COLS))

        ge.chr = self._checkValidChr(fields[0])
        ge.start = self._checkValidStart(ge.chr, int(fields[1]))

        endPos = self._checkValidEnd(ge.chr, int(fields[2]), start=ge.start)
        self._parseEnd(ge, endPos)
        self._parseName(ge, fields)
        self._parseVal(ge, fields)

        if numCols >= 6:
            ge.strand = self._getStrandFromString(fields[5])

        # len(fields) equals self._numCols here, so pairing the extra column
        # names with fields[6:] assigns exactly the columns that are present.
        for extraCol, fieldValue in zip(self.BED_EXTRA_COLUMNS, fields[6:]):
            setattr(ge, extraCol, fieldValue)

        return ge
示例#4
0
    def _next(self, line):
        """Build a GenomeElement from a tab-separated line.

        Column 0 is the chromosome (validated), columns 1 and 2 are the
        integer start and end, and column 3 is handed to self._parseVal.
        """
        fields = line.split('\t')

        element = GenomeElement(self._genome)
        element.chr = self._checkValidChr(fields[0])
        element.start, element.end = int(fields[1]), int(fields[2])
        self._parseVal(element, fields[3])

        return element
 def _next(self, line):
     """Parse chromosome, start, end and value from a tab-separated line.

     The chromosome is validated with self._checkValidChr, start and end are
     converted with int(), and the fourth column is passed to
     self._parseVal together with the element being built.
     """
     parts = line.split('\t')
     chrField, startField, endField, valField = parts[0], parts[1], parts[2], parts[3]

     result = GenomeElement(self._genome)
     result.chr = self._checkValidChr(chrField)
     result.start = int(startField)
     result.end = int(endField)
     self._parseVal(result, valField)

     return result
示例#6
0
    def _next(self, line):
        """Handle one line of a FASTA file.

        A header line (starting with '>') closes the current bounding
        region, resets the element counter, stores the validated sequence
        name in self._chr and returns None. Any other line is returned as a
        GenomeElement whose val holds the line as an array of single
        characters.

        Raises InvalidFormatError if sequence data appears before any header.
        """
        if line.startswith('>'):
            self._appendBoundingRegionTuple()
            self._elCount = 0
            # The sequence name is the first whitespace-delimited token
            # after the '>' character.
            self._chr = self._checkValidChr(line[1:].split()[0])
            return None

        if self._chr is None:
            raise InvalidFormatError(
                'FASTA file does not start with the ">" character.')

        self._elCount += len(line)
        element = GenomeElement(self._genome, self._chr)
        element.val = np.fromstring(line, dtype='S1')
        return element
    def _compute(self):
        """Return the union of covered stretches as a list of GenomeElements.

        The second child result is an array of sorted coded events. Each
        coded value is decoded into a position (value // 4) and a coverage
        change ((value % 4) - 2). The running sum of the coverage changes is
        the cover status: a stretch opens when the status reaches 1 and
        closes the next time the status is 0.

        Returns:
            A list of GenomeElement(start, end), one per covered stretch.
        """
        # Result of the first child is requested as in the original code but
        # is not used further in this method.
        tv1 = self._children[0].getResult()
        allSortedCodedEvents = self._children[1].getResult()

        allEventCodes = (allSortedCodedEvents % 4) - 2

        # Floor division keeps the decoded positions integral regardless of
        # the division mode in effect for this module.
        allSortedDecodedEvents = allSortedCodedEvents // 4

        from numpy.ma import add
        cumulativeCoverStatus = add.accumulate(allEventCodes)
        assert len(cumulativeCoverStatus) == len(allSortedDecodedEvents), str(
            len(allSortedDecodedEvents))

        unionStartList = []
        unionEndList = []

        # None means "no stretch currently open". The state is tested with
        # 'is None' rather than truthiness so that a stretch starting at
        # position 0 is not mistaken for "not started", and so that a cover
        # status of 0 before the first start cannot hit an unbound variable.
        startPos = None
        for i, cumVal in enumerate(cumulativeCoverStatus):
            if cumVal == 1 and startPos is None:
                startPos = allSortedDecodedEvents[i]
            elif cumVal == 0 and startPos is not None:
                unionStartList.append(startPos)
                unionEndList.append(allSortedDecodedEvents[i])
                startPos = None

        return [
            GenomeElement(start=x, end=y)
            for x, y in zip(unionStartList, unionEndList)
        ]
示例#8
0
    def _parseDeclarationLine(self, line):
        """Update the parser state from a declaration line.

        Reads chr, start, step and span from the line, stores the derived
        settings on self (_fixedStep, _span, _isPoints and, for fixed step,
        _step, _isStepFunction, _isFunction) and finally records the
        chromosome in self._chr.

        Returns:
            None in most cases. For a fixed-step step function that starts a
            new bounding region, a blank GenomeElement (val=NaN,
            isBlankElement=True) whose end is the start of the new region.
        """
        returnGE = None

        chr, start, step, span = self._getDeclarationLineAttrValues(line)

        self._fixedStep = self._checkFixedStep(line, start, step)
        chr = self._handleChr(chr)
        self._span = self._handleSpan(span)

        # A span of exactly 1 is treated as point data.
        self._isPoints = self._span == 1

        if self._fixedStep:
            start = self._handleStart(chr, start)
            self._step = self._handleStep(step)

            # step == span means the elements tile the region without gaps:
            # width > 1 is flagged as a step function, width 1 as a function.
            self._isStepFunction = (self._step == self._span
                                    and self._step > 1)
            self._isFunction = (self._step == self._span and self._step == 1)
            if self._isFunction:
                # Function values are returned through the shared
                # self._genomeElement, so only its chr needs updating here.
                self._genomeElement.chr = chr

            if not self._shouldExpandBoundingRegion(chr, start):
                # A new bounding region starts here; close the previous one
                # first, if there was one.
                if self._chr is not None:  #self._chr is still the chromosome of the previous decl. line
                    self._appendBoundingRegionTuple()

                self._start = start
                self._curElCountInBoundingRegion = 0

                if self._isStepFunction:
                    # Blank element ending at the region start; presumably it
                    # marks the left border of the first step -- TODO confirm.
                    returnGE = GenomeElement(genome=self._genome, chr=chr, end=self._start, \
                                             val=numpy.nan, isBlankElement=True)

        # Only now overwrite the chromosome of the previous declaration line.
        self._chr = chr

        return returnGE
示例#9
0
 def _wrappedTrackElsGenerator(self):
     """Yield a GenomeElement for each track element of each bounding region.

     The track view of a region is fetched right before its elements are
     iterated, one region at a time.
     """
     track = self._getTrack()
     for region in self._boundingRegions:
         trackView = self._getTrackView(track, region)
         for trackEl in trackView:
             yield GenomeElement.createGeFromTrackEl(
                 trackEl,
                 trackView.trackFormat,
                 globalCoords=self._globalCoords)
示例#10
0
    def next(self):
        """Advance one position and return the element starting there, if any.

        Returns:
            A GenomeElement built from the pending track element when that
            element starts at the new current position, otherwise None.

        Raises:
            StopIteration: when the current position reaches the length of
                the track view's genome anchor.
        """
        self._curPos += 1
        # 10e6 equals 1e7: print a progress dot every ten million positions.
        if self._curPos % 10e6 == 0:
            print '.',

        if self._curPos >= len(self._tv.genomeAnchor):
            raise StopIteration

        # No track elements left: every remaining position yields None.
        if self._exhausted:
            return None

        # Fetch the next pending element only when the previous one has been
        # consumed (self._curEl is reset to None when an element is returned).
        if self._curEl is None:
            try:
                self._curEl = self._tvIter.next()
            except StopIteration:
                self._exhausted = True
                return None

        if self._curPos == self._curEl.start():
            trackEl = self._curEl
            genome = self._tv.genomeAnchor.genome
            chr = self._tv.genomeAnchor.chr
            #print 'EL: ',GenomeElement(genome,chr, trackEl.start(), trackEl.end(), trackEl.val(), trackEl.strand())
            outEl = GenomeElement(genome, chr, trackEl.start(), trackEl.end(),
                                  trackEl.val(), trackEl.strand())
            # Mark the element as consumed so the next call fetches a new one.
            self._curEl = None
            return outEl
        else:
            # The pending element starts at another position; keep it.
            #print self._curPos,' AND ', self._curEl.start()
            #print 'None'
            return None
示例#11
0
 def testWriteElement(self):
     # Writing an element through the output directory must hand the very
     # same element to every per-prefix file.
     prefixes = ['start', 'end', 'val', 'strand', 'id', 'edges', 'weights', 'cat']
     setup = SetupDir(self.path, prefixes)
     element = GenomeElement()
     setup.od.writeElement(element)
     for outFile in setup.od._files.values():
         self.assertEqual(element, outFile.ge)
    def next(self):
        """Return the single element of this iterator, then stop.

        The first call returns a GenomeElement carrying the genome, the
        region's chromosome and the value slice; every later call raises
        StopIteration.
        """
        if not self._returnedOneElement:
            self._returnedOneElement = True
            return GenomeElement(genome=self._genome,
                                 chr=self._region.chr,
                                 val=self._valSlice)

        raise StopIteration
 def _compute(self):
     """Count element starts per micro bin of the region.

     Returns a list with one GenomeElement per micro bin, spanning the bin
     (the last bin is clipped to the region end) and carrying the number of
     starts that fall into the bin as its value.
     """
     trackView = self._children[0].getResult()
     microBinIndexes = trackView.startsAsNumpyArray() / self.microBin
     counts = np.bincount(microBinIndexes)

     # bincount only reaches up to the highest occupied bin; pad with zeros
     # so that every micro bin of the region is represented.
     numMicroBins = int(math.ceil(float(len(self._region)) / self.microBin))
     padding = np.zeros(numMicroBins - len(counts), dtype='int')
     counts = np.concatenate([counts, padding])

     elements = []
     for binIdx, count in enumerate(counts):
         binStart = self._region.start + binIdx * self.microBin
         binEnd = min(self._region.start + (binIdx + 1) * self.microBin,
                      self._region.end)
         elements.append(GenomeElement(self._region.genome, self._region.chr,
                                       binStart, binEnd, count))
     return elements
示例#14
0
 def next(self):
     """Return the next GenomeElement assembled from the parallel lists.

     Each field is taken from its list at the current index; a list that is
     too short for the index contributes None for that field.

     Raises StopIteration once the index reaches len(self).
     """
     self._index += 1
     if self._index >= len(self):
         raise StopIteration

     idx = self._index

     def valueAt(fieldList):
         # Lists may be shorter than the iterator (or empty); missing -> None.
         if idx < len(fieldList):
             return fieldList[idx]
         return None

     return GenomeElement(start=valueAt(self._startList),
                          end=valueAt(self._endList),
                          strand=valueAt(self._strandList),
                          val=valueAt(self._valList),
                          id=valueAt(self._idList),
                          edges=valueAt(self._edgesList),
                          weights=valueAt(self._weightsList),
                          extra=valueAt(self._extraList))
 def testAssignAndRetrieve(self):
     # Constructor arguments must be retrievable as attributes; core fields
     # that were not supplied must read as None.
     element = GenomeElement('hg18', start=5, val=1.0,
                             extra={'a': 1, 'b': 2},
                             orderedExtraKeys=['a', 'b'])
     for name, expected in (('genome', 'hg18'),
                            ('chr', None),
                            ('start', 5),
                            ('end', None),
                            ('val', 1.0),
                            ('strand', None),
                            ('a', 1),
                            ('b', 2),
                            ('extra', {'a': 1, 'b': 2}),
                            ('orderedExtraKeys', ['a', 'b'])):
         self.assertEqual(getattr(element, name), expected)

     # Extra fields may also come from unknown keyword arguments and from
     # plain attribute assignment.
     element = GenomeElement('hg18', a=1)
     element.b = 2
     for name, expected in (('genome', 'hg18'),
                            ('a', 1),
                            ('b', 2),
                            ('extra', {'a': 1, 'b': 2}),
                            ('orderedExtraKeys', ['a', 'b'])):
         self.assertEqual(getattr(element, name), expected)

     # Attributes that were never set must raise AttributeError.
     self.assertRaises(AttributeError, getattr, element, 'nonExisting')
示例#16
0
def _getIter(elList, valDataType, valDim, edgeWeightDataType, edgeWeightDim, brList=None):
    """Build a MyGeIter filled with genome elements and bounding regions.

    Args:
        elList: sequence of element specs (genome, chr, start, end) with an
            optional fifth item, a mapping from attribute name to value that
            is set on the element.
        valDataType, valDim, edgeWeightDataType, edgeWeightDim: passed
            unchanged to the MyGeIter constructor.
        brList: optional sequence of bounding region specs
            (genome, chr, start, end, fifthItem); the fifth item is passed
            as second argument to BoundingRegionTuple. None (the default)
            means no bounding regions.

    Returns:
        The populated MyGeIter instance.
    """
    # None replaces the former mutable default argument ([]).
    if brList is None:
        brList = []

    geIter = MyGeIter(valDataType, valDim, edgeWeightDataType, edgeWeightDim)

    for el in elList:
        ge = GenomeElement(genome=el[0], chr=el[1], start=el[2], end=el[3])
        if len(el) == 5:
            extraAttrs = el[4]
            for prefix in extraAttrs:
                setattr(ge, prefix, extraAttrs[prefix])
        geIter.iter.append(ge)

    for brSpec in brList:
        br = GenomeRegion(genome=brSpec[0], chr=brSpec[1], start=brSpec[2], end=brSpec[3])
        geIter.boundingRegionTuples.append(BoundingRegionTuple(br, brSpec[4]))

    return geIter
 def __init__(self,
              fn,
              genome=None,
              trackName=None,
              suffix=None,
              external=False,
              printWarnings=True,
              strToUseInsteadOfFn='',
              *args,
              **kwArgs):  #, depth=0
     """Store the source description on the instance.

     All named arguments are kept as private attributes of the same name.
     A GenomeElement for the given genome is created up front and kept in
     self._genomeElement. Additional positional and keyword arguments are
     accepted but not used here.
     """
     # What is being read, and where it belongs.
     self._fn, self._genome = fn, genome
     self._genomeElement = GenomeElement(genome)
     self._trackName, self._suffix = trackName, suffix
     self._external = external

     # Filled in later.
     self._prefixList = None

     # Warning handling.
     self._printWarnings = printWarnings
     self._strToUseInsteadOfFn = strToUseInsteadOfFn
     self._lastWarning = None
示例#18
0
    def _next(self, line):
        """Process one line and return the resulting GenomeElement, if any.

        Declaration lines are delegated to self._parseDeclarationLine, whose
        result (a GenomeElement or None) is returned. Lines starting with
        '#' return None. Any other line is a data line:

        * fixed step: the first column is the value; for function data the
          shared self._genomeElement is updated and returned, otherwise the
          start comes from self._getFixedStepCurElStart().
        * otherwise: the first column is the start (converted by
          subtracting 1) and the second column is the value.

        The end is only set when the data is not point data, and the start
        is dropped (None) for step functions.
        """
        if self._isDeclarationLine(line):
            # Either a blank element for a new step-function region or None.
            return self._parseDeclarationLine(line)

        if line.startswith('#'):
            return None

        cols = line.split()
        self._checkDataLineCols(cols)

        if self._fixedStep:
            self._curElCountInBoundingRegion += 1
            # Builtin float replaces the numpy.float alias, which was
            # deprecated in NumPy 1.20 and removed in 1.24.
            val = float(self._handleNan(cols[0]))

            if self._isFunction:
                # Function data reuses one element object for every value.
                self._genomeElement.val = val
                return self._genomeElement

            start = self._checkValidStart(
                self._chr, self._getFixedStepCurElStart())
        else:
            start = self._checkValidStart(self._chr, int(cols[0]) - 1)
            val = float(self._handleNan(cols[1]))

        end = None
        if not self._isPoints:
            end = self._checkValidEnd(self._chr, self._getEnd(start),
                                      start)
        if self._isStepFunction:
            # Step functions are stored by their end position only.
            start = None

        return GenomeElement(genome=self._genome,
                             chr=self._chr,
                             start=start,
                             end=end,
                             val=val)
    def _next(self, line):
        """Parse one GFF line into a GenomeElement.

        Lines whose first character is '#' return None. The nine
        tab-separated columns are URL-unquoted and stored as chr, source,
        type, start (column value minus 1), end, val, strand, phase and
        attributes. 'ID' and 'Name' entries (case-insensitive keys) found in
        the attributes column are additionally stored as id and name.

        Raises InvalidFormatError if the line does not have 9 columns.
        """
        if len(line) > 0 and line[0] == '#':
            return None

        origCols = line.split('\t')
        cols = [unquote(x) for x in origCols]

        if len(cols) != 9:
            raise InvalidFormatError("Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]
        ge.type = cols[2]
        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)
        # Builtin float replaces the numpy.float alias, which was deprecated
        # in NumPy 1.20 and removed in 1.24.
        ge.val = float(self._handleNan(cols[5]))
        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]

        # The attributes are split on the still-quoted text so that escaped
        # ';' and '=' characters inside values do not act as separators.
        for attr in origCols[8].split(';'):
            attrSplitted = attr.split('=')
            if len(attrSplitted) == 2:
                key, val = attrSplitted
                loweredKey = key.lower()
                if loweredKey == 'id':
                    ge.id = unquote(val)
                elif loweredKey == 'name':
                    ge.name = unquote(val)

        return ge
 def _wrappedTrackElsGenerator(self):
     """Generate GenomeElements for all track elements, region by region.

     For every bounding region the track view is fetched and each of its
     track elements is converted with GenomeElement.createGeFromTrackEl.
     """
     track = self._getTrack()
     for boundingRegion in self._boundingRegions:
         tv = self._getTrackView(track, boundingRegion)
         for te in tv:
             ge = GenomeElement.createGeFromTrackEl(te, tv.trackFormat, globalCoords=self._globalCoords)
             yield ge
示例#21
0
 def next(self):
     """Return the next track element converted to a GenomeElement.

     StopIteration from the underlying track view iterator propagates.
     """
     nextTrackEl = self._tvIter.next()
     return GenomeElement.createGeFromTrackEl(nextTrackEl, self._tv.trackFormat)
示例#22
0
 def next(self):
     """Convert and return the following element of the track view.

     The conversion uses the track format of self._tv. When the underlying
     iterator is exhausted its StopIteration is passed on to the caller.
     """
     te = self._tvIter.next()
     trackFormat = self._tv.trackFormat
     return GenomeElement.createGeFromTrackEl(te, trackFormat)
    def testExclude(self):
        # Every case excludes one interval from chr21:100-200 and lists the
        # pieces that must remain, as (chr, start, end) tuples.
        def makeGe(chrom, start, end):
            return GenomeElement('TestGenome', chrom, start, end)

        cases = [
            # Excluded interval does not overlap: element is unchanged.
            (('chr21', 90, 100), [('chr21', 100, 200)]),
            (('chr21', 200, 210), [('chr21', 100, 200)]),
            (('chrM', 100, 110), [('chr21', 100, 200)]),
            # Overlap at the left edge.
            (('chr21', 100, 110), [('chr21', 110, 200)]),
            (('chr21', 90, 110), [('chr21', 110, 200)]),
            # Overlap at the right edge.
            (('chr21', 190, 200), [('chr21', 100, 190)]),
            (('chr21', 190, 210), [('chr21', 100, 190)]),
            # Fully covered: nothing remains.
            (('chr21', 90, 210), []),
            # Excluded interval in the middle: two pieces remain.
            (('chr21', 140, 160), [('chr21', 100, 140), ('chr21', 160, 200)]),
        ]

        for excluded, remaining in cases:
            expected = [makeGe(*piece) for piece in remaining]
            self.assertEqual(
                expected,
                makeGe('chr21', 100, 200).exclude(makeGe(*excluded)))
    def testExtend(self):
        # Each case extends chr21:100-200 and gives the expected interval:
        # (expected start, expected end, extension size, keyword arguments).
        # Negative sizes move the start, positive sizes move the end.
        cases = [
            (100, 200, 0, {}),
            (0, 200, -100, {}),
            (-100, 200, -200, {'ensureValidity': False}),
            (0, 200, -200, {'ensureValidity': True}),
            (100, 300, 100, {}),
            (100, 50000200, 50000000, {'ensureValidity': False}),
            (100, 46944323, 50000000, {'ensureValidity': True}),
        ]

        for expStart, expEnd, size, kwArgs in cases:
            expected = GenomeElement('TestGenome', 'chr21', expStart, expEnd)
            original = GenomeElement('TestGenome', 'chr21', 100, 200)
            self.assertEqual(expected, original.extend(size, **kwArgs))
    def testEqual(self):
        # Elements built from identical arguments must compare equal, and
        # changing any single argument must make them unequal.
        def makeElement(changedPos=None, changedValue=None, extra=None):
            # Fresh argument objects on every call, so the two sides of a
            # comparison never share list or dict instances.
            posArgs = ['TestGenome', 'chr21', 10, 100, 5, True, 'id',
                       ['id2', 'id3'], [5, 6]]
            if changedPos is not None:
                posArgs[changedPos] = changedValue
            if extra is None:
                extra = {'source': 'source'}
            return GenomeElement(*posArgs, extra=extra)

        self.assertEqual(makeElement(), makeElement())

        # (index of the positional argument to change, replacement value)
        singleArgChanges = [
            (0, 'hg18'),
            (1, 'chrM'),
            (2, 20),
            (3, 110),
            (4, 6),
            (5, False),
            (6, 'id4'),
            (7, ['id2', 'id4']),
            (8, [5, 7]),
        ]
        for pos, value in singleArgChanges:
            self.assertNotEqual(makeElement(), makeElement(pos, value))

        # An additional entry in 'extra' must also break equality.
        self.assertNotEqual(
            makeElement(),
            makeElement(extra={
                'source': 'source',
                'other': 'value'
            }))
 def _compute(self):
     """Return two fixed elements: [0, 1) and [10, 11)."""
     from gold.origdata.GenomeElement import GenomeElement
     fixedIntervals = ((0, 1), (10, 11))
     return [GenomeElement(start=begin, end=stop) for begin, stop in fixedIntervals]
    def testContains(self):
        # chr21:10-100 is checked against each (chr, start, end) below; the
        # flag tells whether the element is expected to contain it.
        cases = [
            (('chr21', 10, 100), True),
            (('chr21', 20, 80), True),
            (('chr21', 10, 101), False),
            (('chr21', 9, 100), False),
            (('chr21', 9, 101), False),
            (('chr21', 0, 10), False),
            (('chrM', 20, 80), False),
        ]

        for (chrom, start, end), shouldContain in cases:
            outer = GenomeElement('TestGenome', 'chr21', 10, 100)
            inner = GenomeElement('TestGenome', chrom, start, end)
            if shouldContain:
                self.assertTrue(outer.contains(inner))
            else:
                self.assertFalse(outer.contains(inner))
示例#28
0
    def printGSuite(cls, choices, cols, rows, colListString, outFile):
        #print cols
        from quick.extra.ProgressViewer import ProgressViewer

        from gold.gsuite.GSuite import GSuite
        from gold.gsuite.GSuiteTrack import GSuiteTrack, GalaxyGSuiteTrack
        import gold.gsuite.GSuiteComposer as GSuiteComposer

        from gold.origdata.GtrackGenomeElementSource import GtrackGenomeElementSource
        from gold.origdata.GtrackComposer import ExtendedGtrackComposer
        from gold.origdata.GESourceWrapper import ListGESourceWrapper
        from gold.origdata.GenomeElement import GenomeElement

        from collections import defaultdict
        from copy import copy
        from urllib import quote

        from unidecode import unidecode
        from pyliftover import LiftOver

        gSuite = GSuite()

        diseaseColIndex = cols.index(cls.DISEASE_COLUMN_NAME)
        chrColIndex = cols.index(cls.CHR_COLUMN_NAME)
        startColIndex = cols.index(cls.START_COLUMN_NAME)
        valColIndex = cols.index(cls.VAL_COLUMN_NAME)
        
        orderedExtraKeys = copy(cols)
        extraIndexes = range(len(cols))
        for colName in [cls.DISEASE_COLUMN_NAME, cls.CHR_COLUMN_NAME,
                        cls.START_COLUMN_NAME, cls.VAL_COLUMN_NAME]:
            extraIndexes.remove(cols.index(colName))
            orderedExtraKeys.remove(colName)
        orderedExtraKeys = [cls._fixColNameForGTrack(key) for key in orderedExtraKeys]

        diseaseToRowsDict = defaultdict(list)
        for row in rows:
            disease = row[diseaseColIndex]
            if isinstance(disease, unicode):
                disease = unidecode(disease).replace('\x00', '')

            diseaseToRowsDict[disease].append(row)

        progressViewer = ProgressViewer([('Create GWAS tracks for diseases/traits', len(diseaseToRowsDict))],
                                        cls.extraGalaxyFn[cls.HISTORY_PROGRESS_TITLE] )

        for disease in sorted(diseaseToRowsDict.keys()):
            uri = GalaxyGSuiteTrack.generateURI(galaxyFn=cls.extraGalaxyFn[cls.HISTORY_HIDDEN_TRACK_STORAGE],
                                                extraFileName=disease.replace('/', '_') + '.gtrack')
            gSuiteTrack = GSuiteTrack(uri, title=disease, genome=cls.OUTPUT_GENOME)
            gSuite.addTrack(gSuiteTrack)

            shouldLiftOver = cls.DATABASE_GENOME != cls.OUTPUT_GENOME
            if shouldLiftOver:
                liftOver = LiftOver(cls.DATABASE_GENOME, cls.OUTPUT_GENOME)

            geList = []
            for row in diseaseToRowsDict[disease]:
                extra = {}
                for col, index in zip(orderedExtraKeys, extraIndexes):
                    cell = row[index].strip()
                    if isinstance(cell, unicode):
                        cell = unidecode(cell)

                    extra[col] = cell if cell != '' else '.'

                chrom = 'chr' + row[chrColIndex]
                if chrom == 'chr23':
                    chrom = 'chrX'
                if chrom == 'chr24':
                    chrom = 'chrY'
                if chrom == 'chrMT':
                    chrom = 'chrM'

                start = int(row[startColIndex])
                if shouldLiftOver:
                    newPosList = liftOver.convert_coordinate(chrom, start)
                    if newPosList is None or len(newPosList) != 1:
                        print 'SNP with position %s on chromosome %s ' % (chrom, start) +\
                              'could not be lifted over from reference genome ' +\
                              '%s to %s (for disease/trait "%s")' % \
                              (cls.DATABASE_GENOME, cls.OUTPUT_GENOME, disease)
                    else:
                        chrom, start = newPosList[0][0:2]
                #print extra
                geList.append(GenomeElement(chr=chrom, start=start,
                                            val=row[valColIndex], orderedExtraKeys=orderedExtraKeys,
                                            extra=extra))

            geSource = GtrackGenomeElementSource(cls.GTRACK_BLUEPRINT_PATH)
            wrappedGeSource = ListGESourceWrapper(geSource, geList)
            composer = ExtendedGtrackComposer(wrappedGeSource)
            composer.composeToFile(gSuiteTrack.path)

            progressViewer.update()

        GSuiteComposer.composeToFile(gSuite, outFile)
    def testOverlaps(self):
        # chr21:10-100 against each (chr, start, end); intervals that merely
        # touch at a border, or lie on another chromosome, do not overlap.
        overlapping = [
            ('chr21', 10, 100),
            ('chr21', 20, 80),
            ('chr21', 10, 101),
            ('chr21', 9, 100),
            ('chr21', 9, 101),
        ]
        notOverlapping = [
            ('chr21', 0, 10),
            ('chr21', 100, 110),
            ('chrM', 20, 80),
        ]

        for chrom, start, end in overlapping:
            self.assertTrue(
                GenomeElement('TestGenome', 'chr21', 10, 100).overlaps(
                    GenomeElement('TestGenome', chrom, start, end)))

        for chrom, start, end in notOverlapping:
            self.assertFalse(
                GenomeElement('TestGenome', 'chr21', 10, 100).overlaps(
                    GenomeElement('TestGenome', chrom, start, end)))
示例#30
0
    def _next(self, line):
        """Parse one GFF line into a GenomeElement.

        A line starting with '##FASTA' ends the iteration (StopIteration);
        any other line starting with '#' returns None. The third and sixth
        columns are handled by self._parseThirdCol and self._parseSixthCol.
        'ID' and 'Name' entries (case-insensitive keys) of the attributes
        column are additionally stored as id and name.

        Raises InvalidFormatError if the line does not have 9 columns.
        """
        if line.startswith('##FASTA'):
            raise StopIteration

        if line.startswith('#'):
            return None

        rawFields = line.split('\t')
        fields = [unquote(field) for field in rawFields]

        if len(fields) != 9:
            raise InvalidFormatError(
                "Error: GFF files must contain 9 tab-separated columns")

        element = GenomeElement(self._genome)
        element.chr = self._checkValidChr(fields[0])
        element.source = fields[1]

        self._parseThirdCol(element, fields[2])

        # The start column is stored minus 1; the end column is kept as is.
        element.start = self._checkValidStart(element.chr, int(fields[3]) - 1)
        element.end = self._checkValidEnd(element.chr, int(fields[4]),
                                          start=element.start)

        self._parseSixthCol(element, fields[5])

        element.strand = self._getStrandFromString(fields[6])
        element.phase = fields[7]
        element.attributes = fields[8]

        # Split the still-quoted attribute text and unquote each value
        # afterwards.
        for attrText in rawFields[8].split(';'):
            keyAndValue = attrText.split('=')
            if len(keyAndValue) != 2:
                continue

            key, val = keyAndValue
            loweredKey = key.lower()
            if loweredKey == 'id':
                element.id = unquote(val)
            elif loweredKey == 'name':
                element.name = unquote(val)

        return element
    def testTouches(self):
        # chr21:10-100 against each (chr, start, end); only intervals that
        # border the element without overlapping it are expected to touch.
        cases = [
            (('chr21', 10, 100), False),
            (('chr21', 20, 80), False),
            (('chr21', 10, 101), False),
            (('chr21', 9, 100), False),
            (('chr21', 9, 101), False),
            (('chr21', 0, 10), True),
            (('chr21', 100, 110), True),
            (('chr21', 0, 9), False),
            (('chr21', 101, 110), False),
            (('chrM', 20, 80), False),
        ]

        for otherArgs, shouldTouch in cases:
            element = GenomeElement('TestGenome', 'chr21', 10, 100)
            other = GenomeElement('TestGenome', *otherArgs)
            check = self.assertTrue if shouldTouch else self.assertFalse
            check(element.touches(other))