def test_ucsc_refseq_genome():
    """
    Build a Genome from the small UCSC RefSeq GTF (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) using a temporary cache
    directory, then check its genes, transcripts and one locus lookup.
    """
    # NOTE(review): a second function with this exact name is visible further
    # down; if both live in the same module the later definition shadows this
    # one and this test never runs -- confirm and rename/remove one of them.
    with TemporaryDirectory() as cache_dir:
        refseq_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=cache_dir)
        refseq_genome.index()

        # Every gene must carry an ID and there must be exactly two of them.
        all_genes = refseq_genome.genes()
        for g in all_genes:
            assert g.id, \
                "Gene with missing ID in %s" % (
                    refseq_genome.gtf.dataframe(),)
        n_genes = len(all_genes)
        assert n_genes == 2, \
            "Expected 2 genes, got %d: %s" % (n_genes, all_genes)

        # Same checks for transcripts.
        all_transcripts = refseq_genome.transcripts()
        for t in all_transcripts:
            assert t.id, \
                "Transcript with missing ID in %s" % (
                    refseq_genome.gtf.dataframe(),)
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                n_transcripts, all_transcripts)

        # Both genes are expected to overlap position 67092176 on contig 1.
        overlapping = refseq_genome.genes_at_locus(1, 67092176)
        n_overlapping = len(overlapping)
        assert n_overlapping == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                n_overlapping, overlapping)
        found_ids = {g.id for g in overlapping}
        expected_ids = {"NM_001276352", "NR_075077"}
        eq_(expected_ids, found_ids)
def test_ucsc_refseq_genome():
    """
    Exercise a Genome created from a small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables, indexed into a temporary
    cache directory.
    """
    with TemporaryDirectory() as scratch_dir:
        genome_kwargs = dict(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_REFSEQ_PATH,
            cache_directory_path=scratch_dir)
        ucsc_genome = Genome(**genome_kwargs)
        ucsc_genome.index()

        # Genes: each has an ID, two in total.
        gene_list = ucsc_genome.genes()
        for entry in gene_list:
            assert entry.id, \
                "Gene with missing ID in %s" % (ucsc_genome.gtf.dataframe(),)
        gene_count = len(gene_list)
        assert gene_count == 2, \
            "Expected 2 genes, got %d: %s" % (gene_count, gene_list)

        # Transcripts: each has an ID, two in total.
        transcript_list = ucsc_genome.transcripts()
        for entry in transcript_list:
            assert entry.id, \
                "Transcript with missing ID in %s" % (
                    ucsc_genome.gtf.dataframe(),)
        transcript_count = len(transcript_list)
        assert transcript_count == 2, \
            "Expected 2 transcripts, got %d: %s" % (
                transcript_count, transcript_list)

        # Locus query on contig 1 should return both RefSeq entries.
        hits = ucsc_genome.genes_at_locus(1, 67092176)
        hit_count = len(hits)
        assert hit_count == 2, \
            "Expected 2 genes at locus chr1:67092176, got %d: %s" % (
                hit_count, hits)
        hit_ids = {hit.id for hit in hits}
        eq_({"NM_001276352", "NR_075077"}, hit_ids)
def test_ucsc_gencode_genome():
    """
    Build a Genome from the small UCSC GENCODE GTF (downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables) in a temporary cache
    directory and check counts, lookup by ID and lookups by locus.
    """
    # NOTE(review): another function with this same name is visible further
    # down (it queries loci with "chr1" instead of 1); if both are in one
    # module the later definition shadows this one -- confirm.
    with TemporaryDirectory() as cache_dir:
        gencode_genome = Genome(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=cache_dir,
        )
        gencode_genome.index()

        # Seven genes, none without an ID.
        all_genes = gencode_genome.genes()
        for g in all_genes:
            assert g.id, "Gene with missing ID in %s" % (
                gencode_genome.gtf.dataframe(),)
        n_genes = len(all_genes)
        assert n_genes == 7, "Expected 7 genes, got %d: %s" % (
            n_genes, all_genes)

        # Seven transcripts, none without an ID.
        all_transcripts = gencode_genome.transcripts()
        for t in all_transcripts:
            assert t.id, "Transcript with missing ID in %s" % (
                gencode_genome.gtf.dataframe(),)
        n_transcripts = len(all_transcripts)
        assert n_transcripts == 7, "Expected 7 transcripts, got %d: %s" % (
            n_transcripts, all_transcripts)

        # Lookup by ID: the gene is found but has neither name nor biotype.
        looked_up = gencode_genome.gene_by_id("uc001aak.4")
        eq_(looked_up.id, "uc001aak.4")
        eq_(looked_up.name, None)
        eq_(looked_up.biotype, None)

        # Lookups by position on contig 1.
        genes_here = gencode_genome.genes_at_locus(1, 17369)
        eq_(genes_here[0].id, "uc031tla.1")

        transcripts_here = gencode_genome.transcripts_at_locus(1, 30564)
        eq_(transcripts_here[0].id, "uc057aty.1")
def test_ucsc_refseq():
    """
    Install a Genome backed by the small RefSeq GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check gene/transcript
    counts plus the gene IDs found at one locus.
    """
    refseq_genome = Genome("GRCh38", gtf_path_or_url=UCSC_REFSEQ_PATH)
    refseq_genome.install()

    n_genes = len(refseq_genome.genes())
    eq_(n_genes, 2)
    n_transcripts = len(refseq_genome.transcripts())
    eq_(n_transcripts, 2)

    # Both genes are expected to overlap position 67092176 on contig 1.
    overlapping = refseq_genome.genes_at_locus(1, 67092176)
    eq_(len(overlapping), 2)
    found_ids = {g.id for g in overlapping}
    expected_ids = {"NM_001276352", "NR_075077"}
    eq_(expected_ids, found_ids)
def test_gtf_transcript_only():
    """
    Genome built from a GTF plus a transcript FASTA only: the single gene's
    first transcript has a sequence, while reading its protein sequence
    raises ValueError.
    """
    # NOTE(review): a second function with this exact name is visible further
    # down; if both live in the same module the later one shadows this one.
    mouse_genome = Genome(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=(
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH))
    mouse_genome.index()

    gene_count = len(mouse_genome.genes())
    eq_(1, gene_count)

    first_transcript = mouse_genome.transcripts()[0]
    ok_(first_transcript.sequence)

    # Attribute access alone is expected to raise.
    with assert_raises(ValueError) as cm:
        first_transcript.protein_sequence
    # Hand the captured exception context to the shared checker
    # (no_protein_ is defined elsewhere).
    no_protein_(cm)
def test_gtf_transcript_only():
    """
    With only a GTF and a transcript FASTA supplied, a transcript exposes
    its nucleotide sequence but asking for its protein sequence raises
    ValueError.
    """
    genome_kwargs = dict(
        reference_name="GRCm38",
        annotation_name="_test_mouse_ensembl81_subset",
        gtf_path_or_url=MOUSE_ENSMUSG00000017167_PATH,
        transcript_fasta_path_or_url=(
            MOUSE_ENSMUSG00000017167_TRANSCRIPT_FASTA_PATH),
    )
    subset_genome = Genome(**genome_kwargs)
    subset_genome.index()

    eq_(1, len(subset_genome.genes()))

    all_transcripts = subset_genome.transcripts()
    tx = all_transcripts[0]
    ok_(tx.sequence)

    # Merely touching the attribute should trigger the error.
    with assert_raises(ValueError) as cm:
        tx.protein_sequence
    # no_protein_ (defined elsewhere) receives the captured context.
    no_protein_(cm)
def test_ucsc_gencode():
    """
    Install a Genome backed by the small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables and check counts, a lookup by
    gene ID and lookups by locus.
    """
    gencode_genome = Genome("GRCh38", gtf_path_or_url=UCSC_GENCODE_PATH)
    gencode_genome.install()

    n_genes = len(gencode_genome.genes())
    eq_(n_genes, 7)
    n_transcripts = len(gencode_genome.transcripts())
    eq_(n_transcripts, 7)

    # Lookup by ID: the gene is found but has neither name nor biotype.
    looked_up = gencode_genome.gene_by_id("uc001aak.4")
    eq_(looked_up.id, "uc001aak.4")
    eq_(looked_up.name, None)
    eq_(looked_up.biotype, None)

    # Lookups by position on contig 1.
    genes_here = gencode_genome.genes_at_locus(1, 17369)
    eq_(genes_here[0].id, "uc031tla.1")

    transcripts_here = gencode_genome.transcripts_at_locus(1, 30564)
    eq_(transcripts_here[0].id, "uc057aty.1")
def test_ucsc_gencode_genome():
    """
    Exercise a Genome created from a small GENCODE GTF file downloaded from
    http://genome.ucsc.edu/cgi-bin/hgTables, indexed into a temporary
    cache directory; locus queries use the "chr1" contig name.
    """
    with TemporaryDirectory() as scratch_dir:
        genome_kwargs = dict(
            reference_name="GRCh38",
            annotation_name="ucsc_test",
            gtf_path_or_url=UCSC_GENCODE_PATH,
            cache_directory_path=scratch_dir)
        ucsc_genome = Genome(**genome_kwargs)
        ucsc_genome.index()

        # Genes: each has an ID, seven in total.
        gene_list = ucsc_genome.genes()
        for entry in gene_list:
            assert entry.id, \
                "Gene with missing ID in %s" % (ucsc_genome.gtf.dataframe(),)
        gene_count = len(gene_list)
        assert gene_count == 7, \
            "Expected 7 genes, got %d: %s" % (gene_count, gene_list)

        # Transcripts: each has an ID, seven in total.
        transcript_list = ucsc_genome.transcripts()
        for entry in transcript_list:
            assert entry.id, \
                "Transcript with missing ID in %s" % (
                    ucsc_genome.gtf.dataframe(),)
        transcript_count = len(transcript_list)
        assert transcript_count == 7, \
            "Expected 7 transcripts, got %d: %s" % (
                transcript_count, transcript_list)

        # Lookup by ID: the gene is found but has neither name nor biotype.
        target_gene = ucsc_genome.gene_by_id("uc001aak.4")
        eq_(target_gene.id, "uc001aak.4")
        eq_(target_gene.name, None)
        eq_(target_gene.biotype, None)

        # Lookups by position, addressing the contig as "chr1".
        gene_hits = ucsc_genome.genes_at_locus("chr1", 17369)
        eq_(gene_hits[0].id, "uc031tla.1")

        transcript_hits = ucsc_genome.transcripts_at_locus("chr1", 30564)
        eq_(transcript_hits[0].id, "uc057aty.1")