Пример #1
0
    def _next(self, line):
        if line.startswith('#'):
            return

        ge = GenomeElement(self._genome)
        cols = line.split('\t')

        if self._numCols is not None:
            if len(cols) != self._numCols:
                raise InvalidFormatError('Error: BED files must have the same number of columns in each data line.')
        else:
            self._numCols = len(cols)

        if self._numCols < self.MIN_NUM_COLS or self._numCols > self.MAX_NUM_COLS:
            raise InvalidFormatError('Error: BED file must contain between %s and %s columns.' % (self.MIN_NUM_COLS, self.MAX_NUM_COLS))

        ge.chr = self._checkValidChr(cols[0])
        ge.start = self._checkValidStart(ge.chr, int(cols[1]))

        self._parseEnd( ge, self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start))
        self._parseName( ge, cols )
        self._parseVal( ge, cols )

        if self._numCols >= 6:
            ge.strand = self._getStrandFromString(cols[5])

        for i,extraCol in enumerate(self.BED_EXTRA_COLUMNS):
            if self._numCols >= i+7:
                setattr(ge, extraCol, cols[i+6])

        return ge
Пример #2
0
    def _next(self, line):
        """Build a GenomeElement from one tab-separated line.

        Column 0 is the chromosome (validated by _checkValidChr), columns 1
        and 2 are the integer start and end, and column 3 is handed to
        _parseVal.
        """
        fields = line.split('\t')

        element = GenomeElement(self._genome)
        element.chr = self._checkValidChr(fields[0])
        element.start, element.end = int(fields[1]), int(fields[2])
        self._parseVal(element, fields[3])

        return element
 def _next(self, line):
     """Turn a tab-separated line into a GenomeElement.

     The first column is validated as a chromosome name, the second and
     third columns are converted to the integer start and end, and the
     fourth column is passed on to _parseVal.
     """
     columns = line.split('\t')
     startCol, endCol, valCol = 1, 2, 3

     ge = GenomeElement(self._genome)
     ge.chr = self._checkValidChr(columns[0])
     ge.start = int(columns[startCol])
     ge.end = int(columns[endCol])
     self._parseVal(ge, columns[valCol])
     return ge
    def _next(self, line):
        if line.startswith('>'):
            self._appendBoundingRegionTuple()
            self._elCount = 0
            self._chr = self._checkValidChr(line[1:].split()[0])
        else:
            if self._chr is None:
                raise InvalidFormatError('FASTA file does not start with the ">" character.')

            self._elCount += len(line)
            ge = GenomeElement(self._genome, self._chr)
            ge.val = np.fromstring(line, dtype='S1')
            return ge
Пример #5
0
    def _next(self, line):
        if line.startswith('>'):
            self._appendBoundingRegionTuple()
            self._elCount = 0
            self._chr = self._checkValidChr(line[1:].split()[0])
        else:
            if self._chr is None:
                raise InvalidFormatError(
                    'FASTA file does not start with the ">" character.')

            self._elCount += len(line)
            ge = GenomeElement(self._genome, self._chr)
            ge.val = np.fromstring(line, dtype='S1')
            return ge
Пример #6
0
    def next(self):
        """Advance one position and return the element starting there, if any.

        Each call increments self._curPos. StopIteration is raised once the
        position reaches len(self._tv.genomeAnchor). For every other
        position the method returns either a GenomeElement (when the pending
        track element starts exactly at the current position) or None.
        """
        self._curPos += 1
        # Progress indicator: one dot every 10e6 (ten million) positions.
        if self._curPos % 10e6 == 0:
            print '.',

        if self._curPos >= len(self._tv.genomeAnchor):
            raise StopIteration

        # The underlying track element iterator has run dry: every
        # remaining position yields None.
        if self._exhausted:
            return None

        # Fetch the next pending track element lazily; it is kept in
        # self._curEl until the current position reaches its start.
        if self._curEl is None:
            try:
                self._curEl = self._tvIter.next()
            except StopIteration:
                self._exhausted = True
                return None

        if self._curPos == self._curEl.start():
            trackEl = self._curEl
            genome = self._tv.genomeAnchor.genome
            chr = self._tv.genomeAnchor.chr
            #print 'EL: ',GenomeElement(genome,chr, trackEl.start(), trackEl.end(), trackEl.val(), trackEl.strand())
            outEl = GenomeElement(genome, chr, trackEl.start(), trackEl.end(),
                                  trackEl.val(), trackEl.strand())
            # Consumed: the next call fetches a new pending element.
            self._curEl = None
            return outEl
        else:
            #print self._curPos,' AND ', self._curEl.start()
            #print 'None'
            return None
Пример #7
0
def _getIter(elList,
             valDataType,
             valDim,
             edgeWeightDataType,
             edgeWeightDim,
             brList=None):
    """Build a MyGeIter from plain element and bounding region tuples.

    elList -- sequence of (genome, chr, start, end) entries; an entry may
        carry a fifth item, a mapping of attribute name to value, each of
        which is set on the created GenomeElement.
    valDataType, valDim, edgeWeightDataType, edgeWeightDim -- passed
        unchanged to the MyGeIter constructor.
    brList -- optional sequence of (genome, chr, start, end, x) entries;
        each becomes a BoundingRegionTuple built from a GenomeRegion and
        the fifth item. None (the default) means no bounding regions.

    Returns the populated MyGeIter.
    """
    geIter = MyGeIter(valDataType, valDim, edgeWeightDataType, edgeWeightDim)

    for el in elList:
        ge = GenomeElement(genome=el[0],
                           chr=el[1],
                           start=el[2],
                           end=el[3])
        if len(el) == 5:
            extras = el[4]
            for prefix in extras:
                setattr(ge, prefix, extras[prefix])
        geIter.iter.append(ge)

    # None replaces the former mutable [] default, so no list object is
    # shared between calls.
    if brList is not None:
        for brEntry in brList:
            br = GenomeRegion(genome=brEntry[0],
                              chr=brEntry[1],
                              start=brEntry[2],
                              end=brEntry[3])
            geIter.boundingRegionTuples.append(
                BoundingRegionTuple(br, brEntry[4]))

    return geIter
Пример #8
0
    def _parseDeclarationLine(self, line):
        returnGE = None

        chr, start, step, span = self._getDeclarationLineAttrValues(line)

        self._fixedStep = self._checkFixedStep(line, start, step)
        chr = self._handleChr(chr)
        self._span = self._handleSpan(span)

        self._isPoints = self._span == 1

        if self._fixedStep:
            start = self._handleStart(chr, start)
            self._step = self._handleStep(step)

            self._isStepFunction = (self._step == self._span
                                    and self._step > 1)
            self._isFunction = (self._step == self._span and self._step == 1)
            if self._isFunction:
                self._genomeElement.chr = chr

            if not self._shouldExpandBoundingRegion(chr, start):
                if self._chr is not None:  #self._chr is still the chromosome of the previous decl. line
                    self._appendBoundingRegionTuple()

                self._start = start
                self._curElCountInBoundingRegion = 0

                if self._isStepFunction:
                    returnGE = GenomeElement(genome=self._genome, chr=chr, end=self._start, \
                                             val=numpy.nan, isBlankElement=True)

        self._chr = chr

        return returnGE
Пример #9
0
    def next(self):
        """Return the single GenomeElement once, then raise StopIteration.

        The element carries self._genome, the chromosome of self._region
        and self._valSlice as its value.
        """
        if not self._returnedOneElement:
            self._returnedOneElement = True
            return GenomeElement(genome=self._genome, chr=self._region.chr, val=self._valSlice)

        raise StopIteration
Пример #10
0
 def testAssignAndRetrieve(self):
     """Attributes given to GenomeElement can be read back as attributes.

     Covers constructor keywords, the 'extra' dict with ordered keys,
     attributes assigned after construction, and AttributeError for an
     attribute that was never set.
     """
     e = GenomeElement('TestGenome', start=5, val=1.0, extra={'a':1,'b':2}, orderedExtraKeys=['a','b'])
     expectedAttrs = [
         ('genome', 'TestGenome'),
         ('chr', None),
         ('start', 5),
         ('end', None),
         ('val', 1.0),
         ('strand', None),
         ('a', 1),
         ('b', 2),
         ('extra', {'a':1,'b':2}),
         ('orderedExtraKeys', ['a', 'b']),
     ]
     for attrName, expected in expectedAttrs:
         self.assertEqual(getattr(e, attrName), expected)

     # Extra attributes may also arrive as plain keywords or be assigned
     # later on.
     e = GenomeElement('TestGenome', a=1)
     e.b = 2
     expectedAttrs = [
         ('genome', 'TestGenome'),
         ('a', 1),
         ('b', 2),
         ('extra', {'a':1,'b':2}),
         ('orderedExtraKeys', ['a', 'b']),
     ]
     for attrName, expected in expectedAttrs:
         self.assertEqual(getattr(e, attrName), expected)

     self.assertRaises(AttributeError, getattr, e, 'nonExisting')
Пример #11
0
    def testAssignAndRetrieve(self):
        """Values stored on a GenomeElement are retrievable as attributes."""

        def assertAttrs(element, pairs):
            # Checks the attributes one by one, in the order given.
            for attrName, expected in pairs:
                self.assertEqual(getattr(element, attrName), expected)

        e = GenomeElement("TestGenome", start=5, val=1.0, extra={"a": 1, "b": 2}, orderedExtraKeys=["a", "b"])
        assertAttrs(e, [
            ("genome", "TestGenome"),
            ("chr", None),
            ("start", 5),
            ("end", None),
            ("val", 1.0),
            ("strand", None),
            ("a", 1),
            ("b", 2),
            ("extra", {"a": 1, "b": 2}),
            ("orderedExtraKeys", ["a", "b"]),
        ])

        # Same extras, this time via an unknown keyword and a later
        # attribute assignment.
        e = GenomeElement("TestGenome", a=1)
        e.b = 2
        assertAttrs(e, [
            ("genome", "TestGenome"),
            ("a", 1),
            ("b", 2),
            ("extra", {"a": 1, "b": 2}),
            ("orderedExtraKeys", ["a", "b"]),
        ])

        self.assertRaises(AttributeError, getattr, e, "nonExisting")
Пример #12
0
    def _next(self, line):
        if line.startswith('##FASTA'):
            raise StopIteration

        if len(line)>0 and line[0]=='#':
            return None

        origCols = line.split('\t')
        cols = [unquote(x) for x in origCols]

        if len(cols) != 9:
            raise InvalidFormatError("Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]

        self._parseThirdCol(ge, cols[2])

        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end =  self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)

        self._parseSixthCol(ge, cols[5])

        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]

        for attr in origCols[8].split(';'):
            attrSplitted = attr.split('=')
            if len(attrSplitted) == 2:
                key, val = attrSplitted
                if key.lower() == 'id':
                    ge.id = unquote(val)
                elif key.lower() == 'name':
                    ge.name = unquote(val)

        return ge
Пример #13
0
 def __init__(self,
              fn,
              genome=None,
              trackName=None,
              external=False,
              printWarnings=True,
              strToUseInsteadOfFn='',
              *args,
              **kwArgs):  #, depth=0
     """Store the constructor arguments and initialise the parser state.

     fn, genome, trackName, external, printWarnings and
     strToUseInsteadOfFn are kept on the instance under the same names
     with a leading underscore. A GenomeElement for the genome is created
     up front and stored in self._genomeElement. *args and **kwArgs are
     accepted but not used here.
     """
     # Values taken directly from the arguments.
     self._fn = fn
     self._strToUseInsteadOfFn = strToUseInsteadOfFn
     self._genome = genome
     self._trackName = trackName
     self._external = external
     self._printWarnings = printWarnings

     # State that starts out empty.
     self._prefixList = None
     self._lastWarning = None

     self._genomeElement = GenomeElement(genome)
Пример #14
0
    def next(self):
        """Return the next GenomeElement assembled from the parallel lists.

        Advances self._index and raises StopIteration once it reaches
        len(self). Each attribute is taken from its list at the current
        index; an attribute whose list is too short for that index is set
        to None.
        """
        self._index += 1
        if self._index >= len(self):
            raise StopIteration

        idx = self._index

        def itemOrNone(values):
            # Lists may be shorter than the iterator (e.g. empty).
            if idx < len(values):
                return values[idx]
            return None

        return GenomeElement(start=itemOrNone(self._startList),
                             end=itemOrNone(self._endList),
                             strand=itemOrNone(self._strandList),
                             val=itemOrNone(self._valList),
                             id=itemOrNone(self._idList),
                             edges=itemOrNone(self._edgesList),
                             weights=itemOrNone(self._weightsList),
                             extra=itemOrNone(self._extraList))
Пример #15
0
    def _next(self, line):
        if self._isDeclarationLine(line):
            ge = self._parseDeclarationLine(line)
            if ge is not None:
                return ge
        else:
            if line.startswith('#'):
                return None

            cols = line.split()
            self._checkDataLineCols(cols)

            if self._fixedStep:
                self._curElCountInBoundingRegion += 1
                val = numpy.float(self._handleNan(cols[0]))

                if self._isFunction:
                    self._genomeElement.val = val
                    return self._genomeElement
                else:
                    start = self._checkValidStart(
                        self._chr, self._getFixedStepCurElStart())
            else:
                start = self._checkValidStart(self._chr, int(cols[0]) - 1)
                val = numpy.float(self._handleNan(cols[1]))

            end = None
            if not self._isPoints:
                end = self._checkValidEnd(self._chr, self._getEnd(start),
                                          start)
            if self._isStepFunction:
                start = None

            return GenomeElement(genome=self._genome,
                                 chr=self._chr,
                                 start=start,
                                 end=end,
                                 val=val)
Пример #16
0
 def _wrappedTrackElsGenerator(self):
     track = self._getTrack()
     for region,tv in ((region, self._getTrackView(track, region)) for region in self._boundingRegions):
         for te in tv:
             yield GenomeElement.createGeFromTrackEl(te, tv.trackFormat, globalCoords=self._globalCoords)
Пример #17
0
    def testExclude(self):
        """exclude() returns the parts of chr21:100-200 not covered by the argument."""

        def region(start, end, chr='chr21'):
            # A fresh element per use, as each assertion builds its own.
            return GenomeElement('TestGenome', chr, start, end)

        # (expected (start, end) pieces, arguments of the excluded region)
        cases = [
            # No overlap: adjacent on either side, or on another chromosome.
            ([(100, 200)], (90, 100)),
            ([(100, 200)], (200, 210)),
            ([(100, 200)], (100, 110, 'chrM')),
            # Overlap at the start.
            ([(110, 200)], (100, 110)),
            ([(110, 200)], (90, 110)),
            # Overlap at the end.
            ([(100, 190)], (190, 200)),
            ([(100, 190)], (190, 210)),
            # Fully covered.
            ([], (90, 210)),
            # Cut in the middle.
            ([(100, 140), (160, 200)], (140, 160)),
        ]

        for expectedPieces, excludedArgs in cases:
            self.assertEqual([region(start, end) for start, end in expectedPieces],
                             region(100, 200).exclude(region(*excludedArgs)))
Пример #18
0
    def testExtend(self):
        """extend() on chr21:100-200 with zero, negative and positive sizes."""

        def region(start, end):
            # A fresh element per use, as each assertion builds its own.
            return GenomeElement('TestGenome', 'chr21', start, end)

        # (expected (start, end), positional args, keyword args of extend)
        cases = [
            ((100, 200), (0,), {}),

            ((0, 200), (-100,), {}),
            ((-100, 200), (-200,), {'ensureValidity': False}),
            ((0, 200), (-200,), {'ensureValidity': True}),

            ((100, 300), (100,), {}),
            ((100, 50000200), (50000000,), {'ensureValidity': False}),
            ((100, 46944323), (50000000,), {'ensureValidity': True}),
        ]

        for expected, extendArgs, extendKwArgs in cases:
            self.assertEqual(region(*expected),
                             region(100, 200).extend(*extendArgs, **extendKwArgs))
 def _wrappedTrackElsGenerator(self):
     track = self._getTrack()
     for region,tv in ((region, self._getTrackView(track, region)) for region in self._boundingRegions):
         for te in tv:
             yield GenomeElement.createGeFromTrackEl(te, tv.trackFormat, globalCoords=self._globalCoords)
Пример #20
0
    def testEqual(self):
        """Two GenomeElements are equal only if every constructor value matches.

        A reference element is compared with elements that differ from it
        in exactly one positional argument, or in the 'extra' dict.
        """

        def build(position=None, value=None, extra=None):
            # Reference arguments; optionally one position is replaced.
            args = ['TestGenome', 'chr21', 10, 100, 5, True, 'id', ['id2', 'id3'], [5, 6]]
            if position is not None:
                args[position] = value
            if extra is None:
                extra = {'source': 'source'}
            return GenomeElement(*args, extra=extra)

        self.assertEqual(build(), build())

        # (index of the positional argument that differs, differing value)
        singleDifferences = [
            (0, 'NCBI46'),
            (1, 'chrM'),
            (2, 20),
            (3, 110),
            (4, 6),
            (5, False),
            (6, 'id4'),
            (7, ['id2', 'id4']),
            (8, [5, 7]),
        ]
        for position, value in singleDifferences:
            self.assertNotEqual(build(), build(position, value))

        self.assertNotEqual(build(),
                            build(extra={'source': 'source', 'other': 'value'}))
Пример #21
0
 def next(self):
     """Return the next track element converted to a GenomeElement.

     StopIteration raised by the underlying track element iterator is
     not caught here.
     """
     nextTrackEl = self._tvIter.next()
     return GenomeElement.createGeFromTrackEl(nextTrackEl, self._tv.trackFormat)
Пример #22
0
    def testContains(self):
        """contains() of chr21:10-100 against identical, inner and overhanging regions."""

        def region(start, end, chr='chr21'):
            # A fresh element per use, as each assertion builds its own.
            return GenomeElement('TestGenome', chr, start, end)

        # (expected result, arguments of the other region)
        cases = [
            (True, (10, 100)),
            (True, (20, 80)),
            (False, (10, 101)),
            (False, (9, 100)),
            (False, (9, 101)),
            (False, (0, 10)),
            (False, (20, 80, 'chrM')),
        ]

        for expected, otherArgs in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(region(10, 100).contains(region(*otherArgs)))
Пример #23
0
 def testWriteElement(self):
     """writeElement() hands the same element to every output file."""
     prefixes = ['start', 'end', 'val', 'strand', 'id', 'edges', 'weights', 'cat']
     setup = SetupDir(self.path, prefixes)

     element = GenomeElement()
     setup.od.writeElement(element)

     for outFile in setup.od._files.values():
         self.assertEqual(element, outFile.ge)
Пример #24
0
    def testOverlaps(self):
        """overlaps() of chr21:10-100 against overlapping, adjacent and foreign regions."""

        def region(start, end, chr='chr21'):
            # A fresh element per use, as each assertion builds its own.
            return GenomeElement('TestGenome', chr, start, end)

        # (expected result, arguments of the other region)
        cases = [
            (True, (10, 100)),
            (True, (20, 80)),
            (True, (10, 101)),
            (True, (9, 100)),
            (True, (9, 101)),
            # Adjacent regions do not overlap.
            (False, (0, 10)),
            (False, (100, 110)),
            # Same coordinates on another chromosome.
            (False, (20, 80, 'chrM')),
        ]

        for expected, otherArgs in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(region(10, 100).overlaps(region(*otherArgs)))
Пример #25
0
    def testTouches(self):
        """touches() of chr21:10-100 is true only for directly adjacent regions."""

        def region(start, end, chr='chr21'):
            # A fresh element per use, as each assertion builds its own.
            return GenomeElement('TestGenome', chr, start, end)

        # (expected result, arguments of the other region)
        cases = [
            # Overlapping regions do not touch.
            (False, (10, 100)),
            (False, (20, 80)),
            (False, (10, 101)),
            (False, (9, 100)),
            (False, (9, 101)),
            # Directly adjacent on either side.
            (True, (0, 10)),
            (True, (100, 110)),
            # Separated by a gap of one position.
            (False, (0, 9)),
            (False, (101, 110)),
            # Another chromosome.
            (False, (20, 80, 'chrM')),
        ]

        for expected, otherArgs in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(region(10, 100).touches(region(*otherArgs)))
Пример #26
0
 def next(self):
     trackEl = self._tvIter.next()
     ge = GenomeElement.createGeFromTrackEl(trackEl, self._tv.trackFormat)
     return ge
Пример #27
0
    def _next(self, line):
        if line.startswith('##FASTA'):
            raise StopIteration

        if len(line) > 0 and line[0] == '#':
            return None

        origCols = line.split('\t')
        cols = [unquote(x) for x in origCols]

        if len(cols) != 9:
            raise InvalidFormatError(
                "Error: GFF files must contain 9 tab-separated columns")

        ge = GenomeElement(self._genome)
        ge.chr = self._checkValidChr(cols[0])
        ge.source = cols[1]

        self._parseThirdCol(ge, cols[2])

        ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
        ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)

        self._parseSixthCol(ge, cols[5])

        ge.strand = self._getStrandFromString(cols[6])
        ge.phase = cols[7]
        ge.attributes = cols[8]

        for attr in origCols[8].split(';'):
            attrSplitted = attr.split('=')
            if len(attrSplitted) == 2:
                key, val = attrSplitted
                if key.lower() == 'id':
                    ge.id = unquote(val)
                elif key.lower() == 'name':
                    ge.name = unquote(val)

        return ge