def test_build_ensembl_transcript_index(self):
    """Build the gtf portion of the ensembl transcript db.

    Builds the transcript index from the trimmed Saccharomyces cerevisiae GTF
    and FASTA test files, reopens it through Shove and checks that transcript
    "YDR529C" is present with a non-empty sequence and at least one CDS and
    one exon entry.  The generated index is removed at the end of the test.
    """
    # Shell commands kept from the original author.  The first one prints the ids
    # listed on the line after it; the grep writes the trimmed cDNA FASTA used below.
    # cat ~/oncotator_pycharm/oncotator/test/testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf | cut -f 9 | cut -f 5 --delimiter=" " | sort | uniq | sed -r "s/;//g" | sed -r "s/\"//g"
    # snR84, tK(UUU)K, YAL067C, YAL067W-A, YAL068C, YAL068W-A, YAL069W, YBR278W, YBR279W, YBR280C, YBR281C, YDR528W, YDR529C, YKR074W,
    #
    # grep -Pzo ">(snR84|tK\(UUU\)K|YAL067C|YAL067W-A|YAL068C|YAL068W-A|YAL069W|YBR278W|YBR279W|YBR280C|YBR281C|YDR528W|YDR529C|YKR074W)([A-Za-z_0-9 \:\-\n]+)" Saccharomyces_cerevisiae.EF4.71.cdna.all.fa >Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa
    #
    ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
    output_filename = "out/test_ensembl_gtf.db"
    protocol = "file"

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.build_ensembl_transcript_index([ensembl_input_gtf], [ensembl_input_fasta], output_filename, protocol=protocol)
    self.assertTrue(os.path.exists(output_filename))

    # Reopen the freshly written index and inspect one known transcript.
    shove = Shove(protocol + "://" + output_filename, "memory://")
    self.assertTrue(len(shove.keys()) > 0)
    self.assertTrue("YDR529C" in shove.keys())
    t = shove["YDR529C"]
    self.assertTrue(t.get_seq() is not None)
    # Compare by value: `is not ""` only tests object identity and is not a
    # reliable emptiness check for strings.
    self.assertTrue(t.get_seq() != "")
    self.assertTrue(len(t.get_cds()) > 0)
    self.assertTrue(len(t.get_exons()) > 0)

    MutUtils.removeDir(output_filename)
def test_gencode_small(self):
    """Test that we can create Transcript instances from a small gencode gtf and fasta."""
    base_output_filename = "out/test_small_gencode"
    gtf_path = "testdata/gencode/MAPK1.gencode.v19.annotation.gtf"
    fasta_path = "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa"

    # Start from a clean slate: drop any index left over from an earlier run.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

    # Lookup by gene name.
    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
    transcripts = by_gene["MAPK1"]
    self.assertTrue(len(transcripts) == 4)

    # Lookup by "<contig>_<bin>" key in the genomic-position index.
    by_gp_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://", optimize=False)
    transcripts = by_gp_bin["22_753"]
    self.assertTrue(transcripts[0].get_strand() == "-")
    self.assertTrue(len(transcripts) == 1)

    # Only the named transcript has its sequence prefix checked.
    for tx in transcripts:
        if tx.get_transcript_id() == "ENST00000215832.6":
            self.assertTrue(tx.get_seq().startswith("AGGCAATCGGTCCGAG"))
def _create_test_gencode_ds(base_output_filename, protein_id_mapping_file, gencode_version):
    """Build GENCODE test indices for a fixed gene list and return a datasource over them.

    :param base_output_filename: path prefix of the index paths that are (re)created
    :param protein_id_mapping_file: passed through to construct_ensembl_indices
    :param gencode_version: GENCODE version; used in the test data file names and in
        the datasource version string ("v" + version)
    :return: EnsemblTranscriptDatasource titled "GENCODE", created with tx_filter="basic"
    """
    version_part = ".gencode.v" + str(gencode_version)
    gene_names = ("MAPK1", "MUC16", "PIK3CA", "YPEL1", "KRTAP4-7", "MAT2A", "DDX11L10")
    per_gene_prefixes = ["testdata/gencode/" + gene + version_part for gene in gene_names]
    gtf_list = [prefix + ".annotation.gtf" for prefix in per_gene_prefixes]
    fasta_list = [prefix + ".pc_transcripts.fa" for prefix in per_gene_prefixes]

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    factory = GenomeBuildFactory()
    factory.construct_ensembl_indices(gtf_list, fasta_list, base_output_filename,
                                      protein_id_mapping_file=protein_id_mapping_file)

    return EnsemblTranscriptDatasource(base_output_filename, title="GENCODE",
                                       version="v" + str(gencode_version), tx_filter="basic")
def test_gencode_small(self):
    """Test that we can create Transcript instances from a small gencode gtf and fasta."""
    base_output_filename = "out/test_small_gencode"
    gtf_path = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
    fasta_path = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"

    # Start from a clean slate: drop any index left over from an earlier run.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

    # Lookup by gene name.
    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
    transcripts = by_gene["MAPK1"]
    self.assertTrue(len(transcripts) == 4)

    # Lookup by "<contig>_<bin>" key in the genomic-position index.
    by_gp_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://", optimize=False)
    transcripts = by_gp_bin["22_753"]
    self.assertTrue(transcripts[0].get_strand() == "-")
    self.assertTrue(len(transcripts) == 1)

    # Only the named transcript has its sequence prefix checked.
    for tx in transcripts:
        if tx.get_transcript_id() == "ENST00000215832.6":
            self.assertTrue(tx.get_seq().startswith("AGGCAATCGGTCCGAG"))
def _create_ensembl_ds_from_saccer(self):
    """Rebuild the indices from the trimmed S. cerevisiae test files and return a datasource.

    :return: EnsemblTranscriptDatasource titled "ensembl", version "71", backed by
        indices written under the "out/test_saccer_ds" prefix
    """
    base_output_filename = "out/test_saccer_ds"
    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)
    return EnsemblTranscriptDatasource(base_output_filename, title="ensembl", version="71")
def _create_ensembl_ds_from_testdata(self, gene):
    """Rebuild the indices from the GENCODE v18 test files of one gene and return a datasource.

    :param gene: gene name used to select the GTF and FASTA files under testdata/gencode/
    :return: EnsemblTranscriptDatasource titled "GENCODE", version "v18"
    """
    base_output_filename = "out/test_variant_classification"
    gene_prefix = "testdata/gencode/" + gene
    gtf_path = gene_prefix + ".gencode.v18.annotation.gtf"
    fasta_path = gene_prefix + ".gencode.v18.pc_transcripts.fa"

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)
    return EnsemblTranscriptDatasource(base_output_filename, title="GENCODE", version="v18")
def test_construct_full_indices(self):
    """Attempt to construct all three ensembl indices with one command.

    Removes any existing index paths under the output prefix, builds the
    indices from the trimmed S. cerevisiae test files and asserts that the
    transcript, by-gene and by-genomic-position-bin index paths all exist.
    """
    ensembl_input_gtf = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    ensembl_input_fasta = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"
    base_output_filename = "out/test_full_indices_ensembl"
    index_suffixes = (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx")

    # Clear ALL three indices first.  Leaving a stale gp_bin index in place
    # would let the existence check below pass without it being rebuilt.
    for suffix in index_suffixes:
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.construct_ensembl_indices([ensembl_input_gtf], [ensembl_input_fasta], base_output_filename)

    for suffix in index_suffixes:
        self.assertTrue(os.path.exists(base_output_filename + suffix))
def _create_test_gencode_ds(base_output_filename, protein_id_mapping_file="testdata/gencode/ensembl_id_mappingsGRCh37.p13.txt"):
    """Build GENCODE v18 test indices for a fixed gene list and return a datasource over them.

    :param base_output_filename: path prefix of the index paths that are (re)created
    :param protein_id_mapping_file: passed through to construct_ensembl_indices
    :return: EnsemblTranscriptDatasource titled "GENCODE", version "v18", tx_filter="basic"
    """
    gene_names = ("MAPK1", "MUC16", "PIK3CA", "YPEL1", "KRTAP4-7", "MAT2A")
    gtf_list = ["testdata/gencode/" + gene + ".gencode.v18.annotation.gtf" for gene in gene_names]
    fasta_list = ["testdata/gencode/" + gene + ".gencode.v18.pc_transcripts.fa" for gene in gene_names]

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    factory = GenomeBuildFactory()
    factory.construct_ensembl_indices(gtf_list, fasta_list, base_output_filename,
                                      protein_id_mapping_file=protein_id_mapping_file)

    return EnsemblTranscriptDatasource(base_output_filename, title="GENCODE", version="v18", tx_filter="basic")
def test_multiple_gtf_initialization(self):
    """Test that we can create a datasource from multiple gtf & fastas"""
    base_output_filename = "out/test_multi_gencode"
    gencode_input_gtfs = [
        "testdata/gencode/CP.gencode.annotation.gtf",
        "testdata/gencode/MAPK1.gencode.v18.annotation.gtf",
    ]
    gencode_input_fastas = [
        "testdata/gencode/CP.gencode.pc_transcripts.fa",
        "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa",
    ]

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices(gencode_input_gtfs, gencode_input_fastas, base_output_filename)

    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)

    cp_transcripts = by_gene["CP"]
    self.assertTrue(len(cp_transcripts) == 15)

    mapk1_transcripts = by_gene["MAPK1"]
    self.assertTrue(len(mapk1_transcripts) == 4)

    # Every MAPK1 transcript except ENST00000491588.1 must carry more than 100 bases of sequence.
    for tx in mapk1_transcripts:
        has_expected_seq = tx.get_transcript_id() == "ENST00000491588.1" or len(tx.get_seq()) > 100
        self.assertTrue(has_expected_seq, "No seq data for " + tx.get_transcript_id())
def test_convert_genomic_space_to_exon_space(self, loc, gt_d):
    """Test genomic --> exon transform on real data.

    Rebuilds the MAPK1 GENCODE v18 test indices, takes the first transcript
    overlapping 22:22108790 and converts the genomic interval ``loc`` into
    exon space for that transcript.

    :param loc: indexable pair (start, end) of genomic positions; both entries
        are passed to the converter and converted with int() for the length check
    :param gt_d: expected (ground truth) start position in exon space
    """
    gencode_input_gtf = "testdata/gencode/MAPK1.gencode.v18.annotation.gtf"
    gencode_input_fasta = "testdata/gencode/MAPK1.gencode.v18.pc_transcripts.fa"
    base_output_filename = "out/test_variant_classification"

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    genome_build_factory = GenomeBuildFactory()
    genome_build_factory.construct_ensembl_indices([gencode_input_gtf], [gencode_input_fasta], base_output_filename)
    ensembl_ds = EnsemblTranscriptDatasource(base_output_filename, version="TEST")

    tx = ensembl_ds.get_overlapping_transcripts("22", "22108790", "22108790")

    start, end = TranscriptProviderUtils.convert_genomic_space_to_exon_space(loc[0], loc[1], tx[0])

    # The transformed interval must keep the length of the genomic interval.
    loc_length = (int(loc[1]) - int(loc[0]))
    self.assertTrue((end - start) == loc_length, str(end) + " - " + str(start) + " was not correct length: " + str(loc_length))

    # Report the expected value (gt_d) in the message; it previously printed `end` by mistake.
    self.assertTrue(start == gt_d, "start position (" + str(start) + ") did not match gt (" + str(gt_d) + ")" + " exons: " + str(tx[0].get_exons()))
def test_multiple_gtf_initialization(self):
    """Test that we can create a datasource from multiple gtf & fastas"""
    base_output_filename = "out/test_multi_gencode"
    gencode_input_gtfs = [
        "testdata/gencode/CP.gencode.v19.annotation.gtf",
        "testdata/gencode/MAPK1.gencode.v19.annotation.gtf",
    ]
    gencode_input_fastas = [
        "testdata/gencode/CP.gencode.v19.pc_transcripts.fa",
        "testdata/gencode/MAPK1.gencode.v19.pc_transcripts.fa",
    ]

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices(gencode_input_gtfs, gencode_input_fastas, base_output_filename)

    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)

    cp_transcripts = by_gene["CP"]
    self.assertTrue(len(cp_transcripts) == 15)

    mapk1_transcripts = by_gene["MAPK1"]
    self.assertTrue(len(mapk1_transcripts) == 4)

    # Every MAPK1 transcript except ENST00000491588.1 must carry more than 100 bases of sequence.
    for tx in mapk1_transcripts:
        has_expected_seq = tx.get_transcript_id() == "ENST00000491588.1" or len(tx.get_seq()) > 100
        self.assertTrue(has_expected_seq, "No seq data for " + tx.get_transcript_id())
def test_build_ensembl_transcripts_by_gene_index(self):
    """Test building an index for getting a transcript given a gene."""
    protocol = "file"
    transcript_index_filename = "out/test_ensembl_gtf_for_gene.db"
    output_filename = transcript_index_filename + ".gene.idx"
    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    # Only the gene index is cleared up front; the transcript index path is left as is.
    shutil.rmtree(output_filename, ignore_errors=True)

    # Two steps: first the transcript index, then the by-gene index derived from it.
    factory = GenomeBuildFactory()
    factory.build_ensembl_transcript_index([gtf_path], [fasta_path], transcript_index_filename, protocol=protocol)
    factory.build_ensembl_transcripts_by_gene_index(transcript_index_filename, output_filename)

    # Now load the index and look something up.
    gene_index = Shove(protocol + "://" + output_filename, optimize=False)
    self.assertTrue(len(gene_index['SEO1']) == 1)
    seo1_tx = gene_index['SEO1'][0]
    self.assertTrue(seo1_tx.get_transcript_id() == "YAL067C")
def test_gencode_cp(self):
    """Test the indexing of a gene that was causing problems and make sure that it can be indexed."""
    base_output_filename = "out/test_cp_gencode"
    gtf_path = "testdata/gencode/CP.gencode.annotation.gtf"
    fasta_path = "testdata/gencode/CP.gencode.pc_transcripts.fa"

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
    transcripts = by_gene["CP"]
    self.assertTrue(len(transcripts) == 15)

    # The previously problematic transcript must be among the indexed CP transcripts.
    troubled_transcript = "ENST00000474204.1"
    is_troubled_transcript_seen = any(tx.get_transcript_id() == troubled_transcript for tx in transcripts)
    self.assertTrue(is_troubled_transcript_seen)
def test_gencode_cp(self):
    """Test the indexing of a gene that was causing problems and make sure that it can be indexed."""
    base_output_filename = "out/test_cp_gencode"
    gtf_path = "testdata/gencode/CP.gencode.v19.annotation.gtf"
    fasta_path = "testdata/gencode/CP.gencode.v19.pc_transcripts.fa"

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", "memory://", optimize=False)
    transcripts = by_gene["CP"]
    self.assertTrue(len(transcripts) == 15)

    # The previously problematic transcript must be among the indexed CP transcripts.
    troubled_transcript = "ENST00000474204.1"
    is_troubled_transcript_seen = any(tx.get_transcript_id() == troubled_transcript for tx in transcripts)
    self.assertTrue(is_troubled_transcript_seen)
def test_retrieving_sequence(self):
    """Ensure we can retrieve a sequence from an ensembl transcript given a gene. """

    def select_transcript(candidates, wanted_id):
        # Linear search with early exit: yields the first transcript whose id matches,
        # or the last transcript when none matches.  The [0] access keeps the
        # IndexError on an empty list.
        chosen = candidates[0]
        for candidate in candidates:
            chosen = candidate
            if candidate._transcript_id == wanted_id:
                break
        return chosen

    base_output_filename = "out/test_retrieving_full_indices_ensembl"
    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    # Remove indices from any previous run before rebuilding.
    for suffix in (".transcript.idx", ".transcript_by_gene.idx", ".transcript_by_gp_bin.idx"):
        shutil.rmtree(base_output_filename + suffix, ignore_errors=True)

    GenomeBuildFactory().construct_ensembl_indices([gtf_path], [fasta_path], base_output_filename)

    # Gene-name lookups.
    by_gene = Shove("file://" + base_output_filename + ".transcript_by_gene.idx", optimize=False)

    seo1_tx = select_transcript(by_gene['SEO1'], "YAL067C")
    self.assertTrue(seo1_tx.get_seq().startswith('ATGTATTCAATTGTTAAAGAGATTATTGTAGATCCTTACAAAAGACTAAAATGGGGTTTT'))

    pau8_tx = select_transcript(by_gene['PAU8'], "YAL068C")
    self.assertTrue(pau8_tx.get_strand() == "-")

    # Genomic-position-bin lookup.
    by_gp_bin = Shove("file://" + base_output_filename + ".transcript_by_gp_bin.idx", "memory://")
    bin_transcripts = by_gp_bin["I_585"]
    self.assertTrue(len(bin_transcripts) == 5, "There should be 5 transcripts.")

    yal069w_tx = select_transcript(bin_transcripts, "YAL069W")
    self.assertTrue(yal069w_tx.get_strand() == "+")
def test_build_ensembl_transcripts_by_genomic_location_index(self):
    """Test that we can get an ensembl transcript from a genomic position"""
    protocol = "file"
    transcript_index_filename = "out/test_ensemble_gtf_for_gp.db"
    output_filename = transcript_index_filename + ".idx"
    gtf_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.gtf"
    fasta_path = "testdata/Saccharomyces_cerevisiae.EF4.71_trim.cdna.all.fa"

    # Only the genomic-position index is cleared up front.
    shutil.rmtree(output_filename, ignore_errors=True)

    # Two steps: first the transcript index, then the genomic-position index derived from it.
    factory = GenomeBuildFactory()
    factory.build_ensembl_transcript_index([gtf_path], [fasta_path], transcript_index_filename, protocol=protocol)
    factory.build_ensembl_transcripts_by_genomic_location_index(transcript_index_filename, output_filename, protocol=protocol)

    # Now load the index and look something up.
    gp_index = Shove(protocol + "://" + output_filename)
    gt_transcript_id = "YAL067C"

    # Keys that are absent from the index are skipped, so this loop asserts nothing
    # when none of the candidate bins is present.
    # NOTE(review): the entry is compared directly with a transcript id string --
    # confirm that this is the stored value type for this index.
    for bin_number in region2bins(1496172, 1496400):
        key = 'I_' + str(bin_number)
        if key not in gp_index.keys():
            continue
        self.assertTrue(gp_index[key] == gt_transcript_id)
def main():
    """Build an ensembl/GENCODE transcript datasource and copy it to the output directory.

    Steps, all performed inside a fresh temporary directory:
      1. create the ``<genome_build>/`` build directory,
      2. write ``<name>.config`` (generated config plus the ``transcript_filter`` option),
      3. construct the transcript indices from the given GTF and FASTA files,
      4. create the datasource md5 file,
      5. copy the whole temporary tree to ``args.output_dir``.

    Any ``Exception`` raised during these steps is logged (not re-raised).  The
    temporary directory is removed in all cases.
    """
    setup_logging()
    args = parseOptions()
    logger = logging.getLogger(__name__)

    gtf_files = args.gtf_files.split(",")
    fasta_files = args.fasta_files.split(",")
    output_dir = args.output_dir
    genome_build = args.genome_build
    name = args.name
    ver = args.version
    tx_filter = args.filter
    protein_map_file = args.protein_map_file

    # create temp dir
    tmpDir = tempfile.mkdtemp(prefix="onco_ensembl_ds_")
    try:
        logger.info("Creating tmp dir (" + tmpDir + ") ....")
        ds_build_dir = tmpDir + "/" + genome_build + "/"
        os.mkdir(ds_build_dir)

        # The GENCODE check is a heuristic on the raw --gtf_files string only.
        if "gencode" not in args.gtf_files.lower() and tx_filter == "basic":
            logger.warning("basic filter requested for (apparently) a non-gencode set of GTFs.  If this is an ENSEMBL run (not GENCODE), please specify dummy, using --filter.")

        logger.info("Creating config file...")
        config_filename = ds_build_dir + "/" + name + ".config"
        logger.info("config file being written to: " + os.path.abspath(config_filename))
        config_file_creator = GenericTsvDatasourceCreator()
        idx_cols = DatasourceInstallUtils.indexCols("dummy_option", "dummy_values")
        config_file_creator._createConfigFile(configFilename=config_filename + ".tmp", baseDSFile=os.path.basename(gtf_files[0]), ds_type="ensembl", ds_version=ver, ds_name=name, indexCols=idx_cols)

        # Append the tx_filter to the generated config.
        # NOTE(review): the intermediate ".tmp" config is left in the build directory
        # and is therefore copied to the output as well -- confirm this is intended.
        config_parser = SafeConfigParser()
        with open(config_filename + ".tmp", 'r') as fp:
            config_parser.readfp(fp)
        config_parser.set("general", "transcript_filter", tx_filter)

        # Write updated config file
        with open(config_filename, 'w') as fp:
            config_parser.write(fp)

        logger.info("Starting index construction (temp location: " + ds_build_dir + ") ...")
        factory = GenomeBuildFactory()
        factory.construct_ensembl_indices(gtf_files, fasta_files, ds_build_dir + os.path.basename(gtf_files[0]), protein_id_mapping_file=protein_map_file)

        logger.info("Creating datasource md5...")
        DatasourceInstallUtils.create_datasource_md5_file(ds_build_dir)

        logger.info("Copying created datasource from temp directory to final location (" + output_dir + ")...")
        shutil.copytree(symlinks=True, src=tmpDir, dst=output_dir)

    except Exception as e:
        import traceback
        logger.fatal((e.__repr__()) + " " + traceback.format_exc())
        logger.info(""""If you are getting and error such as:  KeyError: 'ENST00000474204.1'), then you may be out of disk space in /tmp/.""")

    finally:
        # Remove the tempdir, also when the run is interrupted by a non-Exception
        # error such as KeyboardInterrupt.
        logger.info("Done...")
        logger.info("Removing ..." + tmpDir + '/')
        shutil.rmtree(tmpDir)