def test_getCDSAlignment(self):
    """Test GeneLink.getCDSAlignment on two duplicated-gene scenarios.

    Scenario 1: both genes are declared on strand 1 and start at position 1
    of their chromosome. Scenario 2: the second gene is declared on strand -1
    and the Chr1 region of the duplication is declared on strand -1.

    In the expected strings, the upper-case stretches line up with the CDS
    coordinates declared for each gene and the other positions are lower-case
    (e.g. 1-6 and 13-21 for G00001 in scenario 1).
    """
    # --- Scenario 1: both genes on the forward strand -------------------
    gene1 = Gene('G00001', 'Chr1', 1, 27, 1, [
        Transcript('G00001.1', 'Chr1', 1, 27, 1, 'G00001', [
            CDS('G00001.1_cds_1', 'Chr1', 1, 6, 1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 13, 21, 1, 'G00001.1')
        ])
    ])
    gene2 = Gene('G00002', 'Chr5', 1, 27, 1, [
        Transcript('G00002.1', 'Chr5', 1, 27, 1, 'G00002', [
            CDS('G00002.1_cds_1', 'Chr5', 1, 6, 1, 'G00002.1'),
            CDS('G00002.1_cds_1', 'Chr5', 13, 27, 1, 'G00002.1')
        ])
    ])
    gl = GeneLink(
        Duplication('Chr1', 1, 58, 'Chr5', 1, 60, [
            (Region('Chr1', 1, 58, 1), Region('Chr5', 1, 60, 1))
        ], [(
            'ATGTATTCTATCTCATGTTAATGCTAATACTAGTCATGATCAGATACGATGATGAT--TA',
            'ATGTATTCTATCTCATGTTACTGCTAATACTAGTCATGATCAGATACGATGATGATCATA')
        ]), gene1, gene2)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        ('ATGTATtctatcTCATGTTAAtgctaa', 'ATGTATtctatcTCATGTTACTGCTAA',
         Region('Chr1', 1, 27, 1), Region('Chr5', 1, 27, 1)),
        gl.getCDSAlignment())

    # --- Scenario 2: second gene on the reverse strand ------------------
    # gene1 is rebuilt rather than reused so the second GeneLink starts from
    # fresh objects, exactly as the original test did.
    gene1 = Gene('G00001', 'Chr1', 1, 27, 1, [
        Transcript('G00001.1', 'Chr1', 1, 27, 1, 'G00001', [
            CDS('G00001.1_cds_1', 'Chr1', 1, 6, 1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 13, 21, 1, 'G00001.1')
        ])
    ])
    gene2 = Gene('G00002', 'Chr5', 27, 60, -1, [
        Transcript('G00002.1', 'Chr5', 27, 60, -1, 'G00002', [
            CDS('G00002.1_cds_1', 'Chr5', 27, 39, -1, 'G00002.1'),
            CDS('G00002.1_cds_1', 'Chr5', 48, 60, -1, 'G00002.1')
        ])
    ])
    gl = GeneLink(
        Duplication('Chr1', 1, 58, 'Chr5', 1, 60, [
            (Region('Chr1', 1, 58, -1), Region('Chr5', 1, 60, 1))
        ], [(
            'ATGTATTCTATCTCATGTTAATGCTAATACTAGTCATGATCAGATACGATGATGAG--TA',
            'ATGTATTCTATCTCATGTTACTGCTAATACTAGTCATGATCAGATACGATGATGATCATA')
        ]), gene1, gene2)
    self.assertEqual(
        ('atactagtcatGATCAGATAcgatgaTGAG--TA',
         'ATACTAGTCATGAtcagatacGATGATGATCATA',
         Region('Chr1', 1, 32, -1), Region('Chr5', 27, 60, 1)),
        gl.getCDSAlignment())
def test_getlGenesFromCoordinates(self):
    """Test getlGenesFromCoordinates.

    Two genes are inserted on Chr1; only the second one (239880-249190)
    lies inside the queried window 230000-250000, so it alone is expected.
    """
    gene1 = Gene('G00001', 'Chr1', 23988, 24919, -1, [
        Transcript('G00001.1', 'Chr1', 23988, 24919, -1, 'G00001', [
            CDS('G00001.1_cds_1', 'Chr1', 23988, 24083, -1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 24274, 24427, -1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 24489, 24919, -1, 'G00001.1')
        ])
    ])
    gene2 = Gene('G00002', 'Chr1', 239880, 249190, -1, [
        Transcript('G00002.1', 'Chr1', 239880, 249190, -1, 'G00002', [
            CDS('G00002.1_cds_1', 'Chr1', 239880, 240830, -1, 'G00002.1'),
            CDS('G00002.1_cds_1', 'Chr1', 242740, 244270, -1, 'G00002.1'),
            CDS('G00002.1_cds_1', 'Chr1', 244890, 249190, -1, 'G00002.1')
        ])
    ])
    lGenes = [gene1, gene2]
    # self.db.deleteAllGenes() TODO
    # NOTE(review): presumably meant to empty the database before inserting
    # so earlier rows cannot affect the result - confirm.
    self.db.insertlGenes(lGenes)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(
        [gene2],
        self.db.getlGenesFromCoordinates('Chr1', 230000, 250000))
def test_selectAllGenes(self):
    """Test selectAllGenes.

    Inserts a single gene (one transcript, three CDS) and expects
    selectAllGenes to return exactly that gene.
    """
    gene1 = Gene('G00001', 'Chr1', 23988, 24919, -1, [
        Transcript('G00001.1', 'Chr1', 23988, 24919, -1, 'G00001', [
            CDS('G00001.1_cds_1', 'Chr1', 23988, 24083, -1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 24274, 24427, -1, 'G00001.1'),
            CDS('G00001.1_cds_1', 'Chr1', 24489, 24919, -1, 'G00001.1')
        ])
    ])
    lGenes = [gene1]
    # self.db.deleteAllGenes() TODO
    # NOTE(review): presumably meant to empty the database before inserting
    # so earlier rows cannot affect the result - confirm.
    self.db.insertlGenes(lGenes)
    # assertEqual replaces the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual([gene1], self.db.selectAllGenes())
def getlGenesFromCoordinates(self, seqid, start, end):
    """Get genes included in a defined region.

    A gene is selected when it is on ``seqid`` and lies strictly inside the
    window (``gene.start > start`` and ``gene.end < end``); features that
    touch a boundary are therefore excluded. The same filter is applied to
    the transcript and cds tables, and each row is attached to its parent
    only if that parent was selected.

    Args:
        seqid: sequence (chromosome) identifier to search on.
        start: lower bound of the window (exclusive).
        end: upper bound of the window (exclusive).

    Returns:
        A list of Gene objects with their transcripts and CDS filled in;
        an empty list when no gene matches. On Python 3.7+ the list follows
        the ``order by start`` of the gene query.
    """
    # Values are bound as parameters instead of being formatted into the
    # SQL text, so a quote in seqid can neither break nor alter the query.
    # NOTE(review): '?' is the sqlite3 placeholder style - self.conn is
    # assumed to be a sqlite3 connection; confirm.
    bounds = (seqid, start, end)
    regionFilter = 'where seqid = ? and start > ? and end < ? order by start'

    dGenes = {}
    cursor = self.conn.execute(
        'select id, seqid, start, end, strand from gene ' + regionFilter,
        bounds)
    for row in cursor:
        dGenes[row[0]] = Gene(row[0], row[1], row[2], row[3], row[4])
    if not dGenes:
        return []

    dTranscripts = {}
    cursor = self.conn.execute(
        'select id, seqid, start, end, strand, gene_id from transcript '
        + regionFilter, bounds)
    for row in cursor:
        transcript = Transcript(row[0], row[1], row[2], row[3], row[4],
                                row[5])
        if transcript.gene_id not in dGenes:
            continue
        dTranscripts[row[0]] = transcript
        gene = dGenes[transcript.gene_id]
        # The first child replaces the list instead of being appended to it,
        # as in the original code (presumably so that an empty default list
        # is never shared between objects - confirm against Gene).
        if gene.lTranscripts:
            gene.lTranscripts.append(transcript)
        else:
            gene.lTranscripts = [transcript]

    if dTranscripts:
        cursor = self.conn.execute(
            'select cds_id, seqid, start, end, strand, transcript_id '
            'from cds ' + regionFilter, bounds)
        for row in cursor:
            cds = CDS(row[0], row[1], row[2], row[3], row[4], row[5])
            if cds.transcript_id not in dTranscripts:
                continue
            parent = dTranscripts[cds.transcript_id]
            if parent.lCDS:
                parent.lCDS.append(cds)
            else:
                parent.lCDS = [cds]

    # A real list, not a dict view: on Python 3 a view never compares equal
    # to a list, which is what callers (and the unit test) expect.
    return list(dGenes.values())
def _parse(self): """Parse the gff file""" dGenes = {} dTranscripts = {} dCDS = {} with open(self.inputGffFile, 'r') as input: for line in input: if not re.match('^#', line): line = line.rstrip('\n') values = line.split('\t') if values[2] == 'gene': id = self._getFeatureTagValue('ID',values[8]) currentGene = Gene(id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6])) dGenes[id] = currentGene self.lGenes.append(currentGene) if values[2] == 'mRNA': id = self._getFeatureTagValue('ID',values[8]) gene_id = self._getFeatureTagValue('Parent', values[8]) #gene_id = '{}_G'.format(id) #currentGene = Gene(gene_id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6])) #dGenes[gene_id] = currentGene #self.lGenes.append(currentGene) currentTranscript = Transcript(id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6]), gene_id) dTranscripts[id] = currentTranscript if len(dGenes[gene_id].lTranscripts) > 0: dGenes[gene_id].lTranscripts.append(currentTranscript) else: dGenes[gene_id].lTranscripts = [currentTranscript] if values[2] == 'CDS': id = self._getFeatureTagValue('ID',values[8]) transcript_id = self._getFeatureTagValue('Parent', values[8]) #id = '{}_CDS'.format(transcript_id) currentCDS = CDS(id, values[0], int(values[3]), int(values[4]), self._getStrand(values[6]), transcript_id) if len(dTranscripts[transcript_id].lCDS) > 0: dTranscripts[transcript_id].lCDS.append(currentCDS) else: dTranscripts[transcript_id].lCDS = [currentCDS]
def selectAllGenes(self):
    """Select all genes.

    Loads every row of the gene, transcript and cds tables through
    ``self.conn`` and rebuilds the hierarchy: each transcript is attached
    to the gene named by its ``gene_id`` and each CDS (read in ``start``
    order) to the transcript named by its ``transcript_id``.

    Returns:
        A list of Gene objects with their transcripts and CDS filled in;
        an empty list when the gene table is empty.

    Raises:
        KeyError: if a transcript or CDS row refers to a parent id that is
            not present in the parent table.
    """
    dGenes = {}
    cursor = self.conn.execute(
        '''select id, seqid, start, end, strand from gene''')
    for row in cursor:
        dGenes[row[0]] = Gene(row[0], row[1], row[2], row[3], row[4])

    dTranscripts = {}
    cursor = self.conn.execute(
        '''select id, seqid, start,end,strand,gene_id from transcript''')
    for row in cursor:
        transcript = Transcript(row[0], row[1], row[2], row[3], row[4],
                                row[5])
        dTranscripts[row[0]] = transcript
        gene = dGenes[transcript.gene_id]
        # The first child replaces the list instead of being appended to it,
        # as in the original code (presumably so that an empty default list
        # is never shared between objects - confirm against Gene).
        if gene.lTranscripts:
            gene.lTranscripts.append(transcript)
        else:
            gene.lTranscripts = [transcript]

    cursor = self.conn.execute(
        '''select cds_id, seqid, start,end,strand,transcript_id from cds order by start'''
    )
    for row in cursor:
        cds = CDS(row[0], row[1], row[2], row[3], row[4], row[5])
        parent = dTranscripts[cds.transcript_id]
        if parent.lCDS:
            parent.lCDS.append(cds)
        else:
            parent.lCDS = [cds]

    # A real list, not a dict view: on Python 3 a view never compares equal
    # to a list, which is what callers (and the unit test) expect.
    return list(dGenes.values())
def test_getAllGenes(self):
    """Check the first gene returned by GffGeneParser.getAllGenes.

    Parses test-data/gene.gff3 and compares the first gene with a
    hand-built G00001 (one transcript, three CDS on Chr1, strand -1).
    """
    gffParser = GffGeneParser("test-data/gene.gff3")

    # Expected hierarchy, assembled bottom-up: CDS -> transcript -> gene.
    expectedCds = [
        CDS('G00001.1_cds_1', 'Chr1', 23988, 24083, -1, 'G00001.1'),
        CDS('G00001.1_cds_1', 'Chr1', 24274, 24427, -1, 'G00001.1'),
        CDS('G00001.1_cds_1', 'Chr1', 24489, 24919, -1, 'G00001.1'),
    ]
    expectedTranscript = Transcript(
        'G00001.1', 'Chr1', 23988, 24919, -1, 'G00001', expectedCds)
    expectedGene = Gene(
        'G00001', 'Chr1', 23988, 24919, -1, [expectedTranscript])

    parsedGenes = gffParser.getAllGenes()
    self.assertEqual(parsedGenes[0], expectedGene)